1 /* 2 * GRUB -- GRand Unified Bootloader 3 * Copyright (C) 1999,2000,2001,2002,2003,2004 Free Software Foundation, Inc. 4 * 5 * This program is free software; you can redistribute it and/or modify 6 * it under the terms of the GNU General Public License as published by 7 * the Free Software Foundation; either version 2 of the License, or 8 * (at your option) any later version. 9 * 10 * This program is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU General Public License for more details. 14 * 15 * You should have received a copy of the GNU General Public License 16 * along with this program; if not, write to the Free Software 17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 */ 19 20 /* 21 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 22 * Use is subject to license terms. 23 */ 24 25 /* 26 * Copyright (c) 2012 by Delphix. All rights reserved. 27 */ 28 29 /* 30 * The zfs plug-in routines for GRUB are: 31 * 32 * zfs_mount() - locates a valid uberblock of the root pool and reads 33 * in its MOS at the memory address MOS. 34 * 35 * zfs_open() - locates a plain file object by following the MOS 36 * and places its dnode at the memory address DNODE. 37 * 38 * zfs_read() - read in the data blocks pointed by the DNODE. 39 * 40 * ZFS_SCRATCH is used as a working area. 
 *
 * (memory addr)	  MOS		  DNODE	  ZFS_SCRATCH
 *			    |		    |		    |
 *	    +-------V---------V----------V---------------+
 *   memory |	    | dnode	  | dnode  |  scratch	  |
 *	    |	    | 512B	  | 512B   |  area	  |
 *	    +--------------------------------------------+
 */

#ifdef	FSYS_ZFS

#include "shared.h"
#include "filesys.h"
#include "fsys_zfs.h"

/* cache for a file block of the currently zfs_open()-ed file */
static void *file_buf = NULL;
static uint64_t file_start = 0;
static uint64_t file_end = 0;

/* cache for a dnode block (see dnode_get(); only valid while dnode_mdn set) */
static dnode_phys_t *dnode_buf = NULL;
static dnode_phys_t *dnode_mdn = NULL;
static uint64_t dnode_start = 0;
static uint64_t dnode_end = 0;

static uint64_t pool_guid = 0;
static uberblock_t current_uberblock;
static char *stackbase;

/*
 * Decompression dispatch table, indexed by the ZIO_COMPRESS_* enum
 * values; entry order must match that enum exactly.  A NULL (0)
 * decomp_func means "no decompression" or "not supported".
 */
decomp_entry_t decomp_table[ZIO_COMPRESS_FUNCTIONS] =
{
	{"inherit", 0},			/* ZIO_COMPRESS_INHERIT */
	{"on", lzjb_decompress},	/* ZIO_COMPRESS_ON */
	{"off", 0},			/* ZIO_COMPRESS_OFF */
	{"lzjb", lzjb_decompress},	/* ZIO_COMPRESS_LZJB */
	{"empty", 0}			/* ZIO_COMPRESS_EMPTY */
};

static int zio_read_data(blkptr_t *bp, void *buf, char *stack);

/*
 * Our own version of bcmp().
 *
 * Returns 0 when the first n bytes of s1 and s2 are equal (or when
 * n == 0 or s1 == s2), nonzero (1) otherwise.  Unlike the C library
 * memcmp(), this does not report ordering, only inequality.
 */
static int
zfs_bcmp(const void *s1, const void *s2, size_t n)
{
	const uchar_t *ps1 = s1;
	const uchar_t *ps2 = s2;

	if (s1 != s2 && n != 0) {
		do {
			if (*ps1++ != *ps2++)
				return (1);
		} while (--n != 0);
	}

	return (0);
}

/*
 * Our own version of log2(). Same thing as highbit()-1.
 */
static int
zfs_log2(uint64_t num)
{
	int i = 0;

	/* position of the highest set bit; zfs_log2(0) == 0 by convention */
	while (num > 1) {
		i++;
		num = num >> 1;
	}

	return (i);
}

/* Checksum Functions */
static void
zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	/* "off" checksum: always reports an all-zero checksum */
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

/*
 * Checksum Table and Values
 *
 * Indexed by the ZIO_CHECKSUM_* enum values; entry order must match
 * that enum exactly.  Fields per entry: {native func, byteswap func},
 * ci_correctable, ci_eck (checksum embedded at end of data), name.
 */
zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, 0, 0, "inherit"},
	{{NULL, NULL}, 0, 0, "on"},
	{{zio_checksum_off, zio_checksum_off}, 0, 0, "off"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "label"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "gang_header"},
	{{NULL, NULL}, 0, 0, "zilog"},
	{{fletcher_2_native, fletcher_2_byteswap}, 0, 0, "fletcher2"},
	{{fletcher_4_native, fletcher_4_byteswap}, 1, 0, "fletcher4"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, "SHA256"},
	{{NULL, NULL}, 0, 0, "zilog2"},
};

/*
 * zio_checksum_verify: Provides support for checksum verification.
 *
 * Fletcher2, Fletcher4, and SHA256 are supported.
142 * 143 * Return: 144 * -1 = Failure 145 * 0 = Success 146 */ 147 static int 148 zio_checksum_verify(blkptr_t *bp, char *data, int size) 149 { 150 zio_cksum_t zc = bp->blk_cksum; 151 uint32_t checksum = BP_GET_CHECKSUM(bp); 152 int byteswap = BP_SHOULD_BYTESWAP(bp); 153 zio_eck_t *zec = (zio_eck_t *)(data + size) - 1; 154 zio_checksum_info_t *ci = &zio_checksum_table[checksum]; 155 zio_cksum_t actual_cksum, expected_cksum; 156 157 /* byteswap is not supported */ 158 if (byteswap) 159 return (-1); 160 161 if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) 162 return (-1); 163 164 if (ci->ci_eck) { 165 expected_cksum = zec->zec_cksum; 166 zec->zec_cksum = zc; 167 ci->ci_func[0](data, size, &actual_cksum); 168 zec->zec_cksum = expected_cksum; 169 zc = expected_cksum; 170 171 } else { 172 ci->ci_func[byteswap](data, size, &actual_cksum); 173 } 174 175 if ((actual_cksum.zc_word[0] - zc.zc_word[0]) | 176 (actual_cksum.zc_word[1] - zc.zc_word[1]) | 177 (actual_cksum.zc_word[2] - zc.zc_word[2]) | 178 (actual_cksum.zc_word[3] - zc.zc_word[3])) 179 return (-1); 180 181 return (0); 182 } 183 184 /* 185 * vdev_label_start returns the physical disk offset (in bytes) of 186 * label "l". 187 */ 188 static uint64_t 189 vdev_label_start(uint64_t psize, int l) 190 { 191 return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? 192 0 : psize - VDEV_LABELS * sizeof (vdev_label_t))); 193 } 194 195 /* 196 * vdev_uberblock_compare takes two uberblock structures and returns an integer 197 * indicating the more recent of the two. 198 * Return Value = 1 if ub2 is more recent 199 * Return Value = -1 if ub1 is more recent 200 * The most recent uberblock is determined using its transaction number and 201 * timestamp. The uberblock with the highest transaction number is 202 * considered "newer". If the transaction numbers of the two blocks match, the 203 * timestamps are compared to determine the "newer" of the two. 
204 */ 205 static int 206 vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2) 207 { 208 if (ub1->ub_txg < ub2->ub_txg) 209 return (-1); 210 if (ub1->ub_txg > ub2->ub_txg) 211 return (1); 212 213 if (ub1->ub_timestamp < ub2->ub_timestamp) 214 return (-1); 215 if (ub1->ub_timestamp > ub2->ub_timestamp) 216 return (1); 217 218 return (0); 219 } 220 221 /* 222 * Three pieces of information are needed to verify an uberblock: the magic 223 * number, the version number, and the checksum. 224 * 225 * Return: 226 * 0 - Success 227 * -1 - Failure 228 */ 229 static int 230 uberblock_verify(uberblock_t *uber, uint64_t ub_size, uint64_t offset) 231 { 232 blkptr_t bp; 233 234 BP_ZERO(&bp); 235 BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL); 236 BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER); 237 ZIO_SET_CHECKSUM(&bp.blk_cksum, offset, 0, 0, 0); 238 239 if (zio_checksum_verify(&bp, (char *)uber, ub_size) != 0) 240 return (-1); 241 242 if (uber->ub_magic == UBERBLOCK_MAGIC && 243 SPA_VERSION_IS_SUPPORTED(uber->ub_version)) 244 return (0); 245 246 return (-1); 247 } 248 249 /* 250 * Find the best uberblock. 251 * Return: 252 * Success - Pointer to the best uberblock. 253 * Failure - NULL 254 */ 255 static uberblock_t * 256 find_bestub(char *ub_array, uint64_t ashift, uint64_t sector) 257 { 258 uberblock_t *ubbest = NULL; 259 uberblock_t *ubnext; 260 uint64_t offset, ub_size; 261 int i; 262 263 ub_size = VDEV_UBERBLOCK_SIZE(ashift); 264 265 for (i = 0; i < VDEV_UBERBLOCK_COUNT(ashift); i++) { 266 ubnext = (uberblock_t *)ub_array; 267 ub_array += ub_size; 268 offset = (sector << SPA_MINBLOCKSHIFT) + 269 VDEV_UBERBLOCK_OFFSET(ashift, i); 270 271 if (uberblock_verify(ubnext, ub_size, offset) != 0) 272 continue; 273 274 if (ubbest == NULL || 275 vdev_uberblock_compare(ubnext, ubbest) > 0) 276 ubbest = ubnext; 277 } 278 279 return (ubbest); 280 } 281 282 /* 283 * Read a block of data based on the gang block address dva, 284 * and put its data in buf. 
 *
 * Return:
 *	0 - success
 *	1 - failure
 */
static int
zio_read_gang(blkptr_t *bp, dva_t *dva, void *buf, char *stack)
{
	zio_gbh_phys_t *zio_gb;
	uint64_t offset, sector;
	blkptr_t tmpbp;
	int i;

	/* carve the gang block header out of the scratch stack */
	zio_gb = (zio_gbh_phys_t *)stack;
	stack += SPA_GANGBLOCKSIZE;
	offset = DVA_GET_OFFSET(dva);
	sector = DVA_OFFSET_TO_PHYS_SECTOR(offset);

	/* read in the gang block header; devread() returns 0 on failure */
	if (devread(sector, 0, SPA_GANGBLOCKSIZE, (char *)zio_gb) == 0) {
		grub_printf("failed to read in a gang block header\n");
		return (1);
	}

	/*
	 * Self-checksumming the gang block header: its embedded checksum
	 * verifier is derived from the dva and the block's birth txg.
	 */
	BP_ZERO(&tmpbp);
	BP_SET_CHECKSUM(&tmpbp, ZIO_CHECKSUM_GANG_HEADER);
	BP_SET_BYTEORDER(&tmpbp, ZFS_HOST_BYTEORDER);
	ZIO_SET_CHECKSUM(&tmpbp.blk_cksum, DVA_GET_VDEV(dva),
	    DVA_GET_OFFSET(dva), bp->blk_birth, 0);
	if (zio_checksum_verify(&tmpbp, (char *)zio_gb, SPA_GANGBLOCKSIZE)) {
		grub_printf("failed to checksum a gang block header\n");
		return (1);
	}

	/* read each constituent block, packing them contiguously into buf */
	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
		if (zio_gb->zg_blkptr[i].blk_birth == 0)
			continue;	/* unused gang slot */

		if (zio_read_data(&zio_gb->zg_blkptr[i], buf, stack))
			return (1);
		buf += BP_GET_PSIZE(&zio_gb->zg_blkptr[i]);
	}

	return (0);
}

/*
 * Read in a block of raw data to buf.
 *
 * Return:
 *	0 - success
 *	1 - failure
 */
static int
zio_read_data(blkptr_t *bp, void *buf, char *stack)
{
	int i, psize;

	psize = BP_GET_PSIZE(bp);

	/* pick a good dva from the block pointer */
	for (i = 0; i < SPA_DVAS_PER_BP; i++) {
		uint64_t offset, sector;

		/* an all-zero dva is an unused slot */
		if (bp->blk_dva[i].dva_word[0] == 0 &&
		    bp->blk_dva[i].dva_word[1] == 0)
			continue;

		if (DVA_GET_GANG(&bp->blk_dva[i])) {
			if (zio_read_gang(bp, &bp->blk_dva[i], buf, stack) == 0)
				return (0);
		} else {
			/* read in a data block */
			offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
			sector = DVA_OFFSET_TO_PHYS_SECTOR(offset);
			/* devread() returns nonzero on success */
			if (devread(sector, 0, psize, buf) != 0)
				return (0);
		}
	}

	/* every dva failed */
	return (1);
}

/*
 * Read in a block of data, verify its checksum, decompress if needed,
 * and put the uncompressed data in buf.
 *
 * Return:
 *	0 - success
 *	errnum - failure
 */
static int
zio_read(blkptr_t *bp, void *buf, char *stack)
{
	int lsize, psize, comp;
	char *retbuf;

	comp = BP_GET_COMPRESS(bp);
	lsize = BP_GET_LSIZE(bp);	/* logical (uncompressed) size */
	psize = BP_GET_PSIZE(bp);	/* physical (on-disk) size */

	if ((unsigned int)comp >= ZIO_COMPRESS_FUNCTIONS ||
	    (comp != ZIO_COMPRESS_OFF &&
	    decomp_table[comp].decomp_func == NULL)) {
		grub_printf("compression algorithm not supported\n");
		return (ERR_FSYS_CORRUPT);
	}

	/* the caller's buffer must not overlap the scratch stack */
	if ((char *)buf < stack && ((char *)buf) + lsize > stack) {
		grub_printf("not enough memory allocated\n");
		return (ERR_WONT_FIT);
	}

	/*
	 * For compressed blocks, read the raw data into the scratch
	 * area and decompress into the caller's buffer afterwards.
	 */
	retbuf = buf;
	if (comp != ZIO_COMPRESS_OFF) {
		buf = stack;
		stack += psize;
	}

	if (zio_read_data(bp, buf, stack) != 0) {
		grub_printf("zio_read_data failed\n");
		return (ERR_FSYS_CORRUPT);
	}

	if (zio_checksum_verify(bp, buf, psize) != 0) {
		grub_printf("checksum verification failed\n");
		return (ERR_FSYS_CORRUPT);
	}

	if (comp != ZIO_COMPRESS_OFF)
		decomp_table[comp].decomp_func(buf, retbuf, psize, lsize);

	return (0);
}

/*
 * Get the block from a block id, walking down the dnode's indirect
 * block tree; intermediate levels are read into the scratch stack.
 *
 * Return:
 *	0 - success
 *	errnum - failure
 */
static int
dmu_read(dnode_phys_t *dn, uint64_t blkid, void *buf, char *stack)
{
	int idx, level;
	blkptr_t *bp_array = dn->dn_blkptr;
	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	blkptr_t *bp, *tmpbuf;

	bp = (blkptr_t *)stack;
	stack += sizeof (blkptr_t);

	/* scratch buffer for one indirect block */
	tmpbuf = (blkptr_t *)stack;
	stack += 1<<dn->dn_indblkshift;

	for (level = dn->dn_nlevels - 1; level >= 0; level--) {
		idx = (blkid >> (epbs * level)) & ((1<<epbs)-1);
		*bp = bp_array[idx];
		/* at level 0 read directly into the caller's buffer */
		if (level == 0)
			tmpbuf = buf;
		if (BP_IS_HOLE(bp)) {
			/* a hole reads back as zeros */
			grub_memset(buf, 0,
			    dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
			break;
		} else if (errnum = zio_read(bp, tmpbuf, stack)) {
			return (errnum);
		}

		bp_array = tmpbuf;
	}

	return (0);
}

/*
 * mzap_lookup: Looks up property described by "name" and returns the value
 * in "value".
465 * 466 * Return: 467 * 0 - success 468 * errnum - failure 469 */ 470 static int 471 mzap_lookup(mzap_phys_t *zapobj, int objsize, const char *name, 472 uint64_t *value) 473 { 474 int i, chunks; 475 mzap_ent_phys_t *mzap_ent = zapobj->mz_chunk; 476 477 chunks = objsize / MZAP_ENT_LEN - 1; 478 for (i = 0; i < chunks; i++) { 479 if (grub_strcmp(mzap_ent[i].mze_name, name) == 0) { 480 *value = mzap_ent[i].mze_value; 481 return (0); 482 } 483 } 484 485 return (ERR_FSYS_CORRUPT); 486 } 487 488 static uint64_t 489 zap_hash(uint64_t salt, const char *name) 490 { 491 static uint64_t table[256]; 492 const uint8_t *cp; 493 uint8_t c; 494 uint64_t crc = salt; 495 496 if (table[128] == 0) { 497 uint64_t *ct; 498 int i, j; 499 for (i = 0; i < 256; i++) { 500 for (ct = table + i, *ct = i, j = 8; j > 0; j--) 501 *ct = (*ct >> 1) ^ (-(*ct & 1) & 502 ZFS_CRC64_POLY); 503 } 504 } 505 506 if (crc == 0 || table[128] != ZFS_CRC64_POLY) { 507 errnum = ERR_FSYS_CORRUPT; 508 return (0); 509 } 510 511 for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++) 512 crc = (crc >> 8) ^ table[(crc ^ c) & 0xFF]; 513 514 /* 515 * Only use 28 bits, since we need 4 bits in the cookie for the 516 * collision differentiator. We MUST use the high bits, since 517 * those are the ones that we first pay attention to when 518 * choosing the bucket. 519 */ 520 crc &= ~((1ULL << (64 - 28)) - 1); 521 522 return (crc); 523 } 524 525 /* 526 * Only to be used on 8-bit arrays. 527 * array_len is actual len in bytes (not encoded le_value_length). 528 * buf is null-terminated. 
 */
static int
zap_leaf_array_equal(zap_leaf_phys_t *l, int blksft, int chunk,
    int array_len, const char *buf)
{
	int bseen = 0;

	/* walk the chunk chain, comparing ZAP_LEAF_ARRAY_BYTES at a time */
	while (bseen < array_len) {
		struct zap_leaf_array *la =
		    &ZAP_LEAF_CHUNK(l, blksft, chunk).l_array;
		int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES);

		/* corrupt chain: chunk index out of range */
		if (chunk >= ZAP_LEAF_NUMCHUNKS(blksft))
			return (0);

		if (zfs_bcmp(la->la_array, buf + bseen, toread) != 0)
			break;
		chunk = la->la_next;
		bseen += toread;
	}
	return (bseen == array_len);
}

/*
 * Given a zap_leaf_phys_t, walk thru the zap leaf chunks to get the
 * value for the property "name".
 *
 * Return:
 *	0 - success
 *	errnum - failure
 */
static int
zap_leaf_lookup(zap_leaf_phys_t *l, int blksft, uint64_t h,
    const char *name, uint64_t *value)
{
	uint16_t chunk;
	struct zap_leaf_entry *le;

	/* Verify if this is a valid leaf block */
	if (l->l_hdr.lh_block_type != ZBT_LEAF)
		return (ERR_FSYS_CORRUPT);
	if (l->l_hdr.lh_magic != ZAP_LEAF_MAGIC)
		return (ERR_FSYS_CORRUPT);

	/* walk the hash bucket's entry chain */
	for (chunk = l->l_hash[LEAF_HASH(blksft, h)];
	    chunk != CHAIN_END; chunk = le->le_next) {

		if (chunk >= ZAP_LEAF_NUMCHUNKS(blksft))
			return (ERR_FSYS_CORRUPT);

		le = ZAP_LEAF_ENTRY(l, blksft, chunk);

		/* Verify the chunk entry */
		if (le->le_type != ZAP_CHUNK_ENTRY)
			return (ERR_FSYS_CORRUPT);

		if (le->le_hash != h)
			continue;

		if (zap_leaf_array_equal(l, blksft, le->le_name_chunk,
		    le->le_name_length, name)) {

			struct zap_leaf_array *la;
			uint8_t *ip;

			/* only a single uint64_t value is supported */
			if (le->le_int_size != 8 || le->le_value_length != 1)
				return (ERR_FSYS_CORRUPT);

			/* get the uint64_t property value */
			la = &ZAP_LEAF_CHUNK(l, blksft,
			    le->le_value_chunk).l_array;
			ip = la->la_array;

			/* the value bytes are stored big-endian */
			*value = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
			    (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
			    (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
			    (uint64_t)ip[6] << 8 | (uint64_t)ip[7];

			return (0);
		}
	}

	return (ERR_FSYS_CORRUPT);
}

/*
 * Fat ZAP lookup
 *
 * Return:
 *	0 - success
 *	errnum - failure
 */
static int
fzap_lookup(dnode_phys_t *zap_dnode, zap_phys_t *zap,
    const char *name, uint64_t *value, char *stack)
{
	zap_leaf_phys_t *l;
	uint64_t hash, idx, blkid;
	int blksft = zfs_log2(zap_dnode->dn_datablkszsec << DNODE_SHIFT);

	/* Verify if this is a fat zap header block */
	if (zap->zap_magic != (uint64_t)ZAP_MAGIC ||
	    zap->zap_flags != 0)
		return (ERR_FSYS_CORRUPT);

	hash = zap_hash(zap->zap_salt, name);
	if (errnum)
		return (errnum);

	/* get block id from index */
	if (zap->zap_ptrtbl.zt_numblks != 0) {
		/* external pointer tables not supported */
		return (ERR_FSYS_CORRUPT);
	}
	/* embedded pointer table lives in the second half of the header blk */
	idx = ZAP_HASH_IDX(hash, zap->zap_ptrtbl.zt_shift);
	blkid = ((uint64_t *)zap)[idx + (1<<(blksft-3-1))];

	/* Get the leaf block */
	l = (zap_leaf_phys_t *)stack;
	stack += 1<<blksft;
	if ((1<<blksft) < sizeof (zap_leaf_phys_t))
		return (ERR_FSYS_CORRUPT);
	if (errnum = dmu_read(zap_dnode, blkid, l, stack))
		return (errnum);

	return (zap_leaf_lookup(l, blksft, hash, name, value));
}

/*
 * Read in the data of a zap object and find the value for a matching
 * property name.
 *
 * Return:
 *	0 - success
 *	errnum - failure
 */
static int
zap_lookup(dnode_phys_t *zap_dnode, const char *name, uint64_t *val,
    char *stack)
{
	uint64_t block_type;
	int size;
	void *zapbuf;

	/* Read in the first block of the zap object data. */
	zapbuf = stack;
	size = zap_dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	stack += size;

	if ((errnum = dmu_read(zap_dnode, 0, zapbuf, stack)) != 0)
		return (errnum);

	/* the leading uint64_t identifies micro vs. fat zap */
	block_type = *((uint64_t *)zapbuf);

	if (block_type == ZBT_MICRO) {
		return (mzap_lookup(zapbuf, size, name, val));
	} else if (block_type == ZBT_HEADER) {
		/* this is a fat zap */
		return (fzap_lookup(zap_dnode, zapbuf, name,
		    val, stack));
	}

	return (ERR_FSYS_CORRUPT);
}

/* one attribute handed to a zap_iterate() callback */
typedef struct zap_attribute {
	int za_integer_length;
	uint64_t za_num_integers;
	uint64_t za_first_integer;
	char *za_name;
} zap_attribute_t;

typedef int (zap_cb_t)(zap_attribute_t *za, void *arg, char *stack);

/*
 * Invoke "cb" once per entry of a microzap object; stops and returns
 * the callback's value on the first nonzero result.
 *
 * Return:
 *	0 - success
 *	nonzero - errnum or callback failure
 */
static int
zap_iterate(dnode_phys_t *zap_dnode, zap_cb_t *cb, void *arg, char *stack)
{
	uint32_t size = zap_dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	zap_attribute_t za;
	int i;
	mzap_phys_t *mzp = (mzap_phys_t *)stack;
	stack += size;

	if ((errnum = dmu_read(zap_dnode, 0, mzp, stack)) != 0)
		return (errnum);

	/*
	 * Iteration over fatzap objects has not yet been implemented.
	 * If we encounter a pool in which there are more features for
	 * read than can fit inside a microzap (i.e., more than 2048
	 * features for read), we can add support for fatzap iteration.
	 * For now, fail.
	 */
	if (mzp->mz_block_type != ZBT_MICRO) {
		grub_printf("feature information stored in fatzap, pool "
		    "version not supported\n");
		return (1);
	}

	za.za_integer_length = 8;
	za.za_num_integers = 1;
	for (i = 0; i < size / MZAP_ENT_LEN - 1; i++) {
		mzap_ent_phys_t *mzep = &mzp->mz_chunk[i];
		int err;

		za.za_first_integer = mzep->mze_value;
		za.za_name = mzep->mze_name;
		err = cb(&za, arg, stack);
		if (err != 0)
			return (err);
	}

	return (0);
}

/*
 * Get the dnode of an object number from the metadnode of an object set.
 *
 * Input
 *	mdn - metadnode to get the object dnode
 *	objnum - object number for the object dnode
 *	buf - data buffer that holds the returning dnode
 *	stack - scratch area
 *
 * Return:
 *	0 - success
 *	errnum - failure
 */
static int
dnode_get(dnode_phys_t *mdn, uint64_t objnum, uint8_t type, dnode_phys_t *buf,
    char *stack)
{
	uint64_t blkid, blksz;	/* the block id this object dnode is in */
	int epbs;		/* shift of number of dnodes in a block */
	int idx;		/* index within a block */
	dnode_phys_t *dnbuf;

	blksz = mdn->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	epbs = zfs_log2(blksz) - DNODE_SHIFT;
	blkid = objnum >> epbs;
	idx = objnum & ((1<<epbs)-1);

	/* serve from the global dnode block cache when it matches */
	if (dnode_buf != NULL && dnode_mdn == mdn &&
	    objnum >= dnode_start && objnum < dnode_end) {
		grub_memmove(buf, &dnode_buf[idx], DNODE_SIZE);
		VERIFY_DN_TYPE(buf, type);
		return (0);
	}

	/* refill the cache when block size fits it; else use the stack */
	if (dnode_buf && blksz == 1<<DNODE_BLOCK_SHIFT) {
		dnbuf = dnode_buf;
		dnode_mdn = mdn;
		dnode_start = blkid << epbs;
		dnode_end = (blkid + 1) << epbs;
	} else {
		dnbuf = (dnode_phys_t *)stack;
		stack += blksz;
	}

	if (errnum = dmu_read(mdn, blkid, (char *)dnbuf, stack))
		return (errnum);

	grub_memmove(buf, &dnbuf[idx], DNODE_SIZE);
	VERIFY_DN_TYPE(buf, type);

	return (0);
}
796 797 /* 798 * Check if this is a special file that resides at the top 799 * dataset of the pool. Currently this is the GRUB menu, 800 * boot signature and boot signature backup. 801 * str starts with '/'. 802 */ 803 static int 804 is_top_dataset_file(char *str) 805 { 806 char *tptr; 807 808 if ((tptr = grub_strstr(str, "menu.lst")) && 809 (tptr[8] == '\0' || tptr[8] == ' ') && 810 *(tptr-1) == '/') 811 return (1); 812 813 if (grub_strncmp(str, BOOTSIGN_DIR"/", 814 grub_strlen(BOOTSIGN_DIR) + 1) == 0) 815 return (1); 816 817 if (grub_strcmp(str, BOOTSIGN_BACKUP) == 0) 818 return (1); 819 820 return (0); 821 } 822 823 static int 824 check_feature(zap_attribute_t *za, void *arg, char *stack) 825 { 826 const char **names = arg; 827 int i; 828 829 if (za->za_first_integer == 0) 830 return (0); 831 832 for (i = 0; names[i] != NULL; i++) { 833 if (grub_strcmp(za->za_name, names[i]) == 0) { 834 return (0); 835 } 836 } 837 grub_printf("missing feature for read '%s'\n", za->za_name); 838 return (ERR_NEWER_VERSION); 839 } 840 841 /* 842 * Get the file dnode for a given file name where mdn is the meta dnode 843 * for this ZFS object set. When found, place the file dnode in dn. 844 * The 'path' argument will be mangled. 
 *
 * Return:
 *	0 - success
 *	errnum - failure
 */
static int
dnode_get_path(dnode_phys_t *mdn, char *path, dnode_phys_t *dn,
    char *stack)
{
	uint64_t objnum, version;
	char *cname, ch;

	if (errnum = dnode_get(mdn, MASTER_NODE_OBJ, DMU_OT_MASTER_NODE,
	    dn, stack))
		return (errnum);

	/* refuse ZPL versions newer than we understand */
	if (errnum = zap_lookup(dn, ZPL_VERSION_STR, &version, stack))
		return (errnum);
	if (version > ZPL_VERSION)
		return (-1);

	if (errnum = zap_lookup(dn, ZFS_ROOT_OBJ, &objnum, stack))
		return (errnum);

	if (errnum = dnode_get(mdn, objnum, DMU_OT_DIRECTORY_CONTENTS,
	    dn, stack))
		return (errnum);

	/* skip leading slashes */
	while (*path == '/')
		path++;

	/* descend one path component at a time */
	while (*path && !grub_isspace(*path)) {

		/* get the next component name */
		cname = path;
		while (*path && !grub_isspace(*path) && *path != '/')
			path++;
		ch = *path;
		*path = 0;	/* ensure null termination */

		if (errnum = zap_lookup(dn, cname, &objnum, stack))
			return (errnum);

		/* directory entries encode extra bits around the objnum */
		objnum = ZFS_DIRENT_OBJ(objnum);
		if (errnum = dnode_get(mdn, objnum, 0, dn, stack))
			return (errnum);

		*path = ch;	/* restore the separator we overwrote */
		while (*path == '/')
			path++;
	}

	/* We found the dnode for this file. Verify if it is a plain file. */
	VERIFY_DN_TYPE(dn, DMU_OT_PLAIN_FILE_CONTENTS);

	return (0);
}

/*
 * Get the default 'bootfs' property value from the rootpool.
 *
 * Return:
 *	0 - success
 *	errnum - failure
 */
static int
get_default_bootfsobj(dnode_phys_t *mosmdn, uint64_t *obj, char *stack)
{
	uint64_t objnum = 0;
	dnode_phys_t *dn = (dnode_phys_t *)stack;
	stack += DNODE_SIZE;

	if (errnum = dnode_get(mosmdn, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_OT_OBJECT_DIRECTORY, dn, stack))
		return (errnum);

	/*
	 * find the object number for 'pool_props', and get the dnode
	 * of the 'pool_props'.
	 */
	if (zap_lookup(dn, DMU_POOL_PROPS, &objnum, stack))
		return (ERR_FILESYSTEM_NOT_FOUND);

	if (errnum = dnode_get(mosmdn, objnum, DMU_OT_POOL_PROPS, dn, stack))
		return (errnum);

	if (zap_lookup(dn, ZPOOL_PROP_BOOTFS, &objnum, stack))
		return (ERR_FILESYSTEM_NOT_FOUND);

	/* a bootfs objnum of 0 means the property is not set */
	if (!objnum)
		return (ERR_FILESYSTEM_NOT_FOUND);

	*obj = objnum;
	return (0);
}

/*
 * List of pool features that the grub implementation of ZFS supports for
 * read. Note that features that are only required for write do not need
 * to be listed here since grub opens pools in read-only mode.
 */
static const char *spa_feature_names[] = {
	NULL
};

/*
 * Checks whether the MOS features that are active are supported by this
 * (GRUB's) implementation of ZFS.
 *
 * Return:
 *	0: Success.
 *	errnum: Failure.
 */
static int
check_mos_features(dnode_phys_t *mosmdn, char *stack)
{
	uint64_t objnum;
	dnode_phys_t *dn;
	uint8_t error = 0;	/* NOTE(review): unused; kept as-is */

	dn = (dnode_phys_t *)stack;
	stack += DNODE_SIZE;

	if ((errnum = dnode_get(mosmdn, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_OT_OBJECT_DIRECTORY, dn, stack)) != 0)
		return (errnum);

	/*
	 * Find the object number for 'features_for_read' and retrieve its
	 * corresponding dnode. Note that we don't check features_for_write
	 * because GRUB is not opening the pool for write.
	 */
	if ((errnum = zap_lookup(dn, DMU_POOL_FEATURES_FOR_READ, &objnum,
	    stack)) != 0)
		return (errnum);

	if ((errnum = dnode_get(mosmdn, objnum, DMU_OTN_ZAP_METADATA,
	    dn, stack)) != 0)
		return (errnum);

	return (zap_iterate(dn, check_feature, spa_feature_names, stack));
}

/*
 * Given a MOS metadnode, get the metadnode of a given filesystem name (fsname),
 * e.g. pool/rootfs, or a given object number (obj), e.g. the object number
 * of pool/rootfs.
 *
 * If no fsname and no obj are given, return the DSL_DIR metadnode.
 * If fsname is given, return its metadnode and its matching object number.
 * If only obj is given, return the metadnode for this object number.
 *
 * Return:
 *	0 - success
 *	errnum - failure
 */
static int
get_objset_mdn(dnode_phys_t *mosmdn, char *fsname, uint64_t *obj,
    dnode_phys_t *mdn, char *stack)
{
	uint64_t objnum, headobj;
	char *cname, ch;
	blkptr_t *bp;
	objset_phys_t *osp;
	int issnapshot = 0;
	char *snapname;

	/* object number given directly: skip the name walk */
	if (fsname == NULL && obj) {
		headobj = *obj;
		goto skip;
	}

	if (errnum = dnode_get(mosmdn, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_OT_OBJECT_DIRECTORY, mdn, stack))
		return (errnum);

	if (errnum = zap_lookup(mdn, DMU_POOL_ROOT_DATASET, &objnum,
	    stack))
		return (errnum);

	if (errnum = dnode_get(mosmdn, objnum, DMU_OT_DSL_DIR, mdn, stack))
		return (errnum);

	if (fsname == NULL) {
		headobj =
		    ((dsl_dir_phys_t *)DN_BONUS(mdn))->dd_head_dataset_obj;
		goto skip;
	}

	/* take out the pool name */
	while (*fsname && !grub_isspace(*fsname) && *fsname != '/')
		fsname++;

	/* descend the DSL directory tree one component at a time */
	while (*fsname && !grub_isspace(*fsname)) {
		uint64_t childobj;

		while (*fsname == '/')
			fsname++;

		cname = fsname;
		while (*fsname && !grub_isspace(*fsname) && *fsname != '/')
			fsname++;
		ch = *fsname;
		*fsname = 0;	/* temporarily terminate the component */

		/* a '@' in the component names a snapshot */
		snapname = cname;
		while (*snapname && !grub_isspace(*snapname) && *snapname !=
		    '@')
			snapname++;
		if (*snapname == '@') {
			issnapshot = 1;
			*snapname = 0;
		}
		childobj =
		    ((dsl_dir_phys_t *)DN_BONUS(mdn))->dd_child_dir_zapobj;
		if (errnum = dnode_get(mosmdn, childobj,
		    DMU_OT_DSL_DIR_CHILD_MAP, mdn, stack))
			return (errnum);

		if (zap_lookup(mdn, cname, &objnum, stack))
			return (ERR_FILESYSTEM_NOT_FOUND);

		if (errnum = dnode_get(mosmdn, objnum, DMU_OT_DSL_DIR,
		    mdn, stack))
			return (errnum);

		*fsname = ch;	/* undo the in-place edits */
		if (issnapshot)
			*snapname = '@';
	}
	headobj = ((dsl_dir_phys_t *)DN_BONUS(mdn))->dd_head_dataset_obj;
	if (obj)
		*obj = headobj;

skip:
	if (errnum = dnode_get(mosmdn, headobj, DMU_OT_DSL_DATASET, mdn, stack))
		return (errnum);
	if (issnapshot) {
		uint64_t snapobj;

		/* resolve the snapshot name to its dataset object */
		snapobj = ((dsl_dataset_phys_t *)DN_BONUS(mdn))->
		    ds_snapnames_zapobj;

		if (errnum = dnode_get(mosmdn, snapobj,
		    DMU_OT_DSL_DS_SNAP_MAP, mdn, stack))
			return (errnum);
		if (zap_lookup(mdn, snapname + 1, &headobj, stack))
			return (ERR_FILESYSTEM_NOT_FOUND);
		if (errnum = dnode_get(mosmdn, headobj,
		    DMU_OT_DSL_DATASET, mdn, stack))
			return (errnum);
		if (obj)
			*obj = headobj;
	}

	/* read the dataset's objset and hand back its meta dnode */
	bp = &((dsl_dataset_phys_t *)DN_BONUS(mdn))->ds_bp;
	osp = (objset_phys_t *)stack;
	stack += sizeof (objset_phys_t);
	if (errnum = zio_read(bp, osp, stack))
		return (errnum);

	grub_memmove((char *)mdn, (char *)&osp->os_meta_dnode, DNODE_SIZE);

	return (0);
}

/*
 * For a given XDR packed nvlist, verify the first 4 bytes and move on.
1114 * 1115 * An XDR packed nvlist is encoded as (comments from nvs_xdr_create) : 1116 * 1117 * encoding method/host endian (4 bytes) 1118 * nvl_version (4 bytes) 1119 * nvl_nvflag (4 bytes) 1120 * encoded nvpairs: 1121 * encoded size of the nvpair (4 bytes) 1122 * decoded size of the nvpair (4 bytes) 1123 * name string size (4 bytes) 1124 * name string data (sizeof(NV_ALIGN4(string)) 1125 * data type (4 bytes) 1126 * # of elements in the nvpair (4 bytes) 1127 * data 1128 * 2 zero's for the last nvpair 1129 * (end of the entire list) (8 bytes) 1130 * 1131 * Return: 1132 * 0 - success 1133 * 1 - failure 1134 */ 1135 static int 1136 nvlist_unpack(char *nvlist, char **out) 1137 { 1138 /* Verify if the 1st and 2nd byte in the nvlist are valid. */ 1139 if (nvlist[0] != NV_ENCODE_XDR || nvlist[1] != HOST_ENDIAN) 1140 return (1); 1141 1142 *out = nvlist + 4; 1143 return (0); 1144 } 1145 1146 static char * 1147 nvlist_array(char *nvlist, int index) 1148 { 1149 int i, encode_size; 1150 1151 for (i = 0; i < index; i++) { 1152 /* skip the header, nvl_version, and nvl_nvflag */ 1153 nvlist = nvlist + 4 * 2; 1154 1155 while (encode_size = BSWAP_32(*(uint32_t *)nvlist)) 1156 nvlist += encode_size; /* goto the next nvpair */ 1157 1158 nvlist = nvlist + 4 * 2; /* skip the ending 2 zeros - 8 bytes */ 1159 } 1160 1161 return (nvlist); 1162 } 1163 1164 /* 1165 * The nvlist_next_nvpair() function returns a handle to the next nvpair in the 1166 * list following nvpair. If nvpair is NULL, the first pair is returned. If 1167 * nvpair is the last pair in the nvlist, NULL is returned. 
1168 */ 1169 static char * 1170 nvlist_next_nvpair(char *nvl, char *nvpair) 1171 { 1172 char *cur, *prev; 1173 int encode_size; 1174 1175 if (nvl == NULL) 1176 return (NULL); 1177 1178 if (nvpair == NULL) { 1179 /* skip over nvl_version and nvl_nvflag */ 1180 nvpair = nvl + 4 * 2; 1181 } else { 1182 /* skip to the next nvpair */ 1183 encode_size = BSWAP_32(*(uint32_t *)nvpair); 1184 nvpair += encode_size; 1185 } 1186 1187 /* 8 bytes of 0 marks the end of the list */ 1188 if (*(uint64_t *)nvpair == 0) 1189 return (NULL); 1190 1191 return (nvpair); 1192 } 1193 1194 /* 1195 * This function returns 0 on success and 1 on failure. On success, a string 1196 * containing the name of nvpair is saved in buf. 1197 */ 1198 static int 1199 nvpair_name(char *nvp, char *buf, int buflen) 1200 { 1201 int len; 1202 1203 /* skip over encode/decode size */ 1204 nvp += 4 * 2; 1205 1206 len = BSWAP_32(*(uint32_t *)nvp); 1207 if (buflen < len + 1) 1208 return (1); 1209 1210 grub_memmove(buf, nvp + 4, len); 1211 buf[len] = '\0'; 1212 1213 return (0); 1214 } 1215 1216 /* 1217 * This function retrieves the value of the nvpair in the form of enumerated 1218 * type data_type_t. This is used to determine the appropriate type to pass to 1219 * nvpair_value(). 
1220 */ 1221 static int 1222 nvpair_type(char *nvp) 1223 { 1224 int name_len, type; 1225 1226 /* skip over encode/decode size */ 1227 nvp += 4 * 2; 1228 1229 /* skip over name_len */ 1230 name_len = BSWAP_32(*(uint32_t *)nvp); 1231 nvp += 4; 1232 1233 /* skip over name */ 1234 nvp = nvp + ((name_len + 3) & ~3); /* align */ 1235 1236 type = BSWAP_32(*(uint32_t *)nvp); 1237 1238 return (type); 1239 } 1240 1241 static int 1242 nvpair_value(char *nvp, void *val, int valtype, int *nelmp) 1243 { 1244 int name_len, type, slen; 1245 char *strval = val; 1246 uint64_t *intval = val; 1247 1248 /* skip over encode/decode size */ 1249 nvp += 4 * 2; 1250 1251 /* skip over name_len */ 1252 name_len = BSWAP_32(*(uint32_t *)nvp); 1253 nvp += 4; 1254 1255 /* skip over name */ 1256 nvp = nvp + ((name_len + 3) & ~3); /* align */ 1257 1258 /* skip over type */ 1259 type = BSWAP_32(*(uint32_t *)nvp); 1260 nvp += 4; 1261 1262 if (type == valtype) { 1263 int nelm; 1264 1265 nelm = BSWAP_32(*(uint32_t *)nvp); 1266 if (valtype != DATA_TYPE_BOOLEAN && nelm < 1) 1267 return (1); 1268 nvp += 4; 1269 1270 switch (valtype) { 1271 case DATA_TYPE_BOOLEAN: 1272 return (0); 1273 1274 case DATA_TYPE_STRING: 1275 slen = BSWAP_32(*(uint32_t *)nvp); 1276 nvp += 4; 1277 grub_memmove(strval, nvp, slen); 1278 strval[slen] = '\0'; 1279 return (0); 1280 1281 case DATA_TYPE_UINT64: 1282 *intval = BSWAP_64(*(uint64_t *)nvp); 1283 return (0); 1284 1285 case DATA_TYPE_NVLIST: 1286 *(void **)val = (void *)nvp; 1287 return (0); 1288 1289 case DATA_TYPE_NVLIST_ARRAY: 1290 *(void **)val = (void *)nvp; 1291 if (nelmp) 1292 *nelmp = nelm; 1293 return (0); 1294 } 1295 } 1296 1297 return (1); 1298 } 1299 1300 static int 1301 nvlist_lookup_value(char *nvlist, char *name, void *val, int valtype, 1302 int *nelmp) 1303 { 1304 char *nvpair; 1305 1306 for (nvpair = nvlist_next_nvpair(nvlist, NULL); 1307 nvpair != NULL; 1308 nvpair = nvlist_next_nvpair(nvlist, nvpair)) { 1309 int name_len = BSWAP_32(*(uint32_t *)(nvpair + 4 * 
2)); 1310 char *nvp_name = nvpair + 4 * 3; 1311 1312 if ((grub_strncmp(nvp_name, name, name_len) == 0) && 1313 nvpair_type(nvpair) == valtype) { 1314 return (nvpair_value(nvpair, val, valtype, nelmp)); 1315 } 1316 } 1317 return (1); 1318 } 1319 1320 /* 1321 * Check if this vdev is online and is in a good state. 1322 */ 1323 static int 1324 vdev_validate(char *nv) 1325 { 1326 uint64_t ival; 1327 1328 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_OFFLINE, &ival, 1329 DATA_TYPE_UINT64, NULL) == 0 || 1330 nvlist_lookup_value(nv, ZPOOL_CONFIG_FAULTED, &ival, 1331 DATA_TYPE_UINT64, NULL) == 0 || 1332 nvlist_lookup_value(nv, ZPOOL_CONFIG_REMOVED, &ival, 1333 DATA_TYPE_UINT64, NULL) == 0) 1334 return (ERR_DEV_VALUES); 1335 1336 return (0); 1337 } 1338 1339 /* 1340 * Get a valid vdev pathname/devid from the boot device. 1341 * The caller should already allocate MAXPATHLEN memory for bootpath and devid. 1342 */ 1343 static int 1344 vdev_get_bootpath(char *nv, uint64_t inguid, char *devid, char *bootpath, 1345 int is_spare) 1346 { 1347 char type[16]; 1348 1349 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_TYPE, &type, DATA_TYPE_STRING, 1350 NULL)) 1351 return (ERR_FSYS_CORRUPT); 1352 1353 if (grub_strcmp(type, VDEV_TYPE_DISK) == 0) { 1354 uint64_t guid; 1355 1356 if (vdev_validate(nv) != 0) 1357 return (ERR_NO_BOOTPATH); 1358 1359 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_GUID, 1360 &guid, DATA_TYPE_UINT64, NULL) != 0) 1361 return (ERR_NO_BOOTPATH); 1362 1363 if (guid != inguid) 1364 return (ERR_NO_BOOTPATH); 1365 1366 /* for a spare vdev, pick the disk labeled with "is_spare" */ 1367 if (is_spare) { 1368 uint64_t spare = 0; 1369 (void) nvlist_lookup_value(nv, ZPOOL_CONFIG_IS_SPARE, 1370 &spare, DATA_TYPE_UINT64, NULL); 1371 if (!spare) 1372 return (ERR_NO_BOOTPATH); 1373 } 1374 1375 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_PHYS_PATH, 1376 bootpath, DATA_TYPE_STRING, NULL) != 0) 1377 bootpath[0] = '\0'; 1378 1379 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_DEVID, 1380 devid, 
DATA_TYPE_STRING, NULL) != 0) 1381 devid[0] = '\0'; 1382 1383 if (grub_strlen(bootpath) >= MAXPATHLEN || 1384 grub_strlen(devid) >= MAXPATHLEN) 1385 return (ERR_WONT_FIT); 1386 1387 return (0); 1388 1389 } else if (grub_strcmp(type, VDEV_TYPE_MIRROR) == 0 || 1390 grub_strcmp(type, VDEV_TYPE_REPLACING) == 0 || 1391 (is_spare = (grub_strcmp(type, VDEV_TYPE_SPARE) == 0))) { 1392 int nelm, i; 1393 char *child; 1394 1395 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_CHILDREN, &child, 1396 DATA_TYPE_NVLIST_ARRAY, &nelm)) 1397 return (ERR_FSYS_CORRUPT); 1398 1399 for (i = 0; i < nelm; i++) { 1400 char *child_i; 1401 1402 child_i = nvlist_array(child, i); 1403 if (vdev_get_bootpath(child_i, inguid, devid, 1404 bootpath, is_spare) == 0) 1405 return (0); 1406 } 1407 } 1408 1409 return (ERR_NO_BOOTPATH); 1410 } 1411 1412 /* 1413 * Check the disk label information and retrieve needed vdev name-value pairs. 1414 * 1415 * Return: 1416 * 0 - success 1417 * ERR_* - failure 1418 */ 1419 static int 1420 check_pool_label(uint64_t sector, char *stack, char *outdevid, 1421 char *outpath, uint64_t *outguid, uint64_t *outashift, uint64_t *outversion) 1422 { 1423 vdev_phys_t *vdev; 1424 uint64_t pool_state, txg = 0; 1425 char *nvlist, *nv, *features; 1426 uint64_t diskguid; 1427 1428 sector += (VDEV_SKIP_SIZE >> SPA_MINBLOCKSHIFT); 1429 1430 /* Read in the vdev name-value pair list (112K). 
*/ 1431 if (devread(sector, 0, VDEV_PHYS_SIZE, stack) == 0) 1432 return (ERR_READ); 1433 1434 vdev = (vdev_phys_t *)stack; 1435 stack += sizeof (vdev_phys_t); 1436 1437 if (nvlist_unpack(vdev->vp_nvlist, &nvlist)) 1438 return (ERR_FSYS_CORRUPT); 1439 1440 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_POOL_STATE, &pool_state, 1441 DATA_TYPE_UINT64, NULL)) 1442 return (ERR_FSYS_CORRUPT); 1443 1444 if (pool_state == POOL_STATE_DESTROYED) 1445 return (ERR_FILESYSTEM_NOT_FOUND); 1446 1447 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_POOL_NAME, 1448 current_rootpool, DATA_TYPE_STRING, NULL)) 1449 return (ERR_FSYS_CORRUPT); 1450 1451 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_POOL_TXG, &txg, 1452 DATA_TYPE_UINT64, NULL)) 1453 return (ERR_FSYS_CORRUPT); 1454 1455 /* not an active device */ 1456 if (txg == 0) 1457 return (ERR_NO_BOOTPATH); 1458 1459 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_VERSION, outversion, 1460 DATA_TYPE_UINT64, NULL)) 1461 return (ERR_FSYS_CORRUPT); 1462 if (!SPA_VERSION_IS_SUPPORTED(*outversion)) 1463 return (ERR_NEWER_VERSION); 1464 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_VDEV_TREE, &nv, 1465 DATA_TYPE_NVLIST, NULL)) 1466 return (ERR_FSYS_CORRUPT); 1467 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_GUID, &diskguid, 1468 DATA_TYPE_UINT64, NULL)) 1469 return (ERR_FSYS_CORRUPT); 1470 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_ASHIFT, outashift, 1471 DATA_TYPE_UINT64, NULL) != 0) 1472 return (ERR_FSYS_CORRUPT); 1473 if (vdev_get_bootpath(nv, diskguid, outdevid, outpath, 0)) 1474 return (ERR_NO_BOOTPATH); 1475 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_POOL_GUID, outguid, 1476 DATA_TYPE_UINT64, NULL)) 1477 return (ERR_FSYS_CORRUPT); 1478 1479 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_FEATURES_FOR_READ, 1480 &features, DATA_TYPE_NVLIST, NULL) == 0) { 1481 char *nvp; 1482 char *name = stack; 1483 stack += MAXNAMELEN; 1484 1485 for (nvp = nvlist_next_nvpair(features, NULL); 1486 nvp != NULL; 1487 nvp = nvlist_next_nvpair(features, nvp)) { 1488 
zap_attribute_t za; 1489 1490 if (nvpair_name(nvp, name, MAXNAMELEN) != 0) 1491 return (ERR_FSYS_CORRUPT); 1492 1493 za.za_integer_length = 8; 1494 za.za_num_integers = 1; 1495 za.za_first_integer = 1; 1496 za.za_name = name; 1497 if (check_feature(&za, spa_feature_names, stack) != 0) 1498 return (ERR_NEWER_VERSION); 1499 } 1500 } 1501 1502 return (0); 1503 } 1504 1505 /* 1506 * zfs_mount() locates a valid uberblock of the root pool and read in its MOS 1507 * to the memory address MOS. 1508 * 1509 * Return: 1510 * 1 - success 1511 * 0 - failure 1512 */ 1513 int 1514 zfs_mount(void) 1515 { 1516 char *stack, *ub_array; 1517 int label = 0; 1518 uberblock_t *ubbest; 1519 objset_phys_t *osp; 1520 char tmp_bootpath[MAXNAMELEN]; 1521 char tmp_devid[MAXNAMELEN]; 1522 uint64_t tmp_guid, ashift, version; 1523 uint64_t adjpl = (uint64_t)part_length << SPA_MINBLOCKSHIFT; 1524 int err = errnum; /* preserve previous errnum state */ 1525 1526 /* if it's our first time here, zero the best uberblock out */ 1527 if (best_drive == 0 && best_part == 0 && find_best_root) { 1528 grub_memset(¤t_uberblock, 0, sizeof (uberblock_t)); 1529 pool_guid = 0; 1530 } 1531 1532 stackbase = ZFS_SCRATCH; 1533 stack = stackbase; 1534 ub_array = stack; 1535 stack += VDEV_UBERBLOCK_RING; 1536 1537 osp = (objset_phys_t *)stack; 1538 stack += sizeof (objset_phys_t); 1539 adjpl = P2ALIGN(adjpl, (uint64_t)sizeof (vdev_label_t)); 1540 1541 for (label = 0; label < VDEV_LABELS; label++) { 1542 1543 /* 1544 * some eltorito stacks don't give us a size and 1545 * we end up setting the size to MAXUINT, further 1546 * some of these devices stop working once a single 1547 * read past the end has been issued. Checking 1548 * for a maximum part_length and skipping the backup 1549 * labels at the end of the slice/partition/device 1550 * avoids breaking down on such devices. 
1551 */ 1552 if (part_length == MAXUINT && label == 2) 1553 break; 1554 1555 uint64_t sector = vdev_label_start(adjpl, 1556 label) >> SPA_MINBLOCKSHIFT; 1557 1558 /* Read in the uberblock ring (128K). */ 1559 if (devread(sector + 1560 ((VDEV_SKIP_SIZE + VDEV_PHYS_SIZE) >> SPA_MINBLOCKSHIFT), 1561 0, VDEV_UBERBLOCK_RING, ub_array) == 0) 1562 continue; 1563 1564 if (check_pool_label(sector, stack, tmp_devid, 1565 tmp_bootpath, &tmp_guid, &ashift, &version)) 1566 continue; 1567 1568 if (pool_guid == 0) 1569 pool_guid = tmp_guid; 1570 1571 if ((ubbest = find_bestub(ub_array, ashift, sector)) == NULL || 1572 zio_read(&ubbest->ub_rootbp, osp, stack) != 0) 1573 continue; 1574 1575 VERIFY_OS_TYPE(osp, DMU_OST_META); 1576 1577 if (version >= SPA_VERSION_FEATURES && 1578 check_mos_features(&osp->os_meta_dnode, stack) != 0) 1579 continue; 1580 1581 if (find_best_root && ((pool_guid != tmp_guid) || 1582 vdev_uberblock_compare(ubbest, &(current_uberblock)) <= 0)) 1583 continue; 1584 1585 /* Got the MOS. Save it at the memory addr MOS. */ 1586 grub_memmove(MOS, &osp->os_meta_dnode, DNODE_SIZE); 1587 grub_memmove(¤t_uberblock, ubbest, sizeof (uberblock_t)); 1588 grub_memmove(current_bootpath, tmp_bootpath, MAXNAMELEN); 1589 grub_memmove(current_devid, tmp_devid, grub_strlen(tmp_devid)); 1590 is_zfs_mount = 1; 1591 return (1); 1592 } 1593 1594 /* 1595 * While some fs impls. (tftp) rely on setting and keeping 1596 * global errnums set, others won't reset it and will break 1597 * when issuing rawreads. The goal here is to simply not 1598 * have zfs mount attempts impact the previous state. 1599 */ 1600 errnum = err; 1601 return (0); 1602 } 1603 1604 /* 1605 * zfs_open() locates a file in the rootpool by following the 1606 * MOS and places the dnode of the file in the memory address DNODE. 
1607 * 1608 * Return: 1609 * 1 - success 1610 * 0 - failure 1611 */ 1612 int 1613 zfs_open(char *filename) 1614 { 1615 char *stack; 1616 dnode_phys_t *mdn; 1617 1618 file_buf = NULL; 1619 stackbase = ZFS_SCRATCH; 1620 stack = stackbase; 1621 1622 mdn = (dnode_phys_t *)stack; 1623 stack += sizeof (dnode_phys_t); 1624 1625 dnode_mdn = NULL; 1626 dnode_buf = (dnode_phys_t *)stack; 1627 stack += 1<<DNODE_BLOCK_SHIFT; 1628 1629 /* 1630 * menu.lst is placed at the root pool filesystem level, 1631 * do not goto 'current_bootfs'. 1632 */ 1633 if (is_top_dataset_file(filename)) { 1634 if (errnum = get_objset_mdn(MOS, NULL, NULL, mdn, stack)) 1635 return (0); 1636 1637 current_bootfs_obj = 0; 1638 } else { 1639 if (current_bootfs[0] == '\0') { 1640 /* Get the default root filesystem object number */ 1641 if (errnum = get_default_bootfsobj(MOS, 1642 ¤t_bootfs_obj, stack)) 1643 return (0); 1644 1645 if (errnum = get_objset_mdn(MOS, NULL, 1646 ¤t_bootfs_obj, mdn, stack)) 1647 return (0); 1648 } else { 1649 if (errnum = get_objset_mdn(MOS, current_bootfs, 1650 ¤t_bootfs_obj, mdn, stack)) { 1651 grub_memset(current_bootfs, 0, MAXNAMELEN); 1652 return (0); 1653 } 1654 } 1655 } 1656 1657 if (dnode_get_path(mdn, filename, DNODE, stack)) { 1658 errnum = ERR_FILE_NOT_FOUND; 1659 return (0); 1660 } 1661 1662 /* get the file size and set the file position to 0 */ 1663 1664 /* 1665 * For DMU_OT_SA we will need to locate the SIZE attribute 1666 * attribute, which could be either in the bonus buffer 1667 * or the "spill" block. 
1668 */ 1669 if (DNODE->dn_bonustype == DMU_OT_SA) { 1670 sa_hdr_phys_t *sahdrp; 1671 int hdrsize; 1672 1673 if (DNODE->dn_bonuslen != 0) { 1674 sahdrp = (sa_hdr_phys_t *)DN_BONUS(DNODE); 1675 } else { 1676 if (DNODE->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { 1677 blkptr_t *bp = &DNODE->dn_spill; 1678 void *buf; 1679 1680 buf = (void *)stack; 1681 stack += BP_GET_LSIZE(bp); 1682 1683 /* reset errnum to rawread() failure */ 1684 errnum = 0; 1685 if (zio_read(bp, buf, stack) != 0) { 1686 return (0); 1687 } 1688 sahdrp = buf; 1689 } else { 1690 errnum = ERR_FSYS_CORRUPT; 1691 return (0); 1692 } 1693 } 1694 hdrsize = SA_HDR_SIZE(sahdrp); 1695 filemax = *(uint64_t *)((char *)sahdrp + hdrsize + 1696 SA_SIZE_OFFSET); 1697 } else { 1698 filemax = ((znode_phys_t *)DN_BONUS(DNODE))->zp_size; 1699 } 1700 filepos = 0; 1701 1702 dnode_buf = NULL; 1703 return (1); 1704 } 1705 1706 /* 1707 * zfs_read reads in the data blocks pointed by the DNODE. 1708 * 1709 * Return: 1710 * len - the length successfully read in to the buffer 1711 * 0 - failure 1712 */ 1713 int 1714 zfs_read(char *buf, int len) 1715 { 1716 char *stack; 1717 int blksz, length, movesize; 1718 1719 if (file_buf == NULL) { 1720 file_buf = stackbase; 1721 stackbase += SPA_MAXBLOCKSIZE; 1722 file_start = file_end = 0; 1723 } 1724 stack = stackbase; 1725 1726 /* 1727 * If offset is in memory, move it into the buffer provided and return. 1728 */ 1729 if (filepos >= file_start && filepos+len <= file_end) { 1730 grub_memmove(buf, file_buf + filepos - file_start, len); 1731 filepos += len; 1732 return (len); 1733 } 1734 1735 blksz = DNODE->dn_datablkszsec << SPA_MINBLOCKSHIFT; 1736 1737 /* 1738 * Entire Dnode is too big to fit into the space available. We 1739 * will need to read it in chunks. This could be optimized to 1740 * read in as large a chunk as there is space available, but for 1741 * now, this only reads in one data block at a time. 
1742 */ 1743 length = len; 1744 while (length) { 1745 /* 1746 * Find requested blkid and the offset within that block. 1747 */ 1748 uint64_t blkid = filepos / blksz; 1749 1750 if (errnum = dmu_read(DNODE, blkid, file_buf, stack)) 1751 return (0); 1752 1753 file_start = blkid * blksz; 1754 file_end = file_start + blksz; 1755 1756 movesize = MIN(length, file_end - filepos); 1757 1758 grub_memmove(buf, file_buf + filepos - file_start, 1759 movesize); 1760 buf += movesize; 1761 length -= movesize; 1762 filepos += movesize; 1763 } 1764 1765 return (len); 1766 } 1767 1768 /* 1769 * No-Op 1770 */ 1771 int 1772 zfs_embed(int *start_sector, int needed_sectors) 1773 { 1774 return (1); 1775 } 1776 1777 #endif /* FSYS_ZFS */