1 /*
   2  *  GRUB  --  GRand Unified Bootloader
   3  *  Copyright (C) 1999,2000,2001,2002,2003,2004  Free Software Foundation, Inc.
   4  *
   5  *  This program is free software; you can redistribute it and/or modify
   6  *  it under the terms of the GNU General Public License as published by
   7  *  the Free Software Foundation; either version 2 of the License, or
   8  *  (at your option) any later version.
   9  *
  10  *  This program is distributed in the hope that it will be useful,
  11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  *  GNU General Public License for more details.
  14  *
  15  *  You should have received a copy of the GNU General Public License
  16  *  along with this program; if not, write to the Free Software
  17  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  18  */
  19 
  20 /*
  21  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  22  * Use is subject to license terms.
  23  */
  24 
  25 /*
  26  * Copyright (c) 2012 by Delphix. All rights reserved.
  27  */
  28 
  29 /*
  30  * The zfs plug-in routines for GRUB are:
  31  *
  32  * zfs_mount() - locates a valid uberblock of the root pool and reads
  33  *              in its MOS at the memory address MOS.
  34  *
  35  * zfs_open() - locates a plain file object by following the MOS
  36  *              and places its dnode at the memory address DNODE.
  37  *
  38  * zfs_read() - read in the data blocks pointed by the DNODE.
  39  *
  40  * ZFS_SCRATCH is used as a working area.
  41  *
  42  * (memory addr)   MOS      DNODE       ZFS_SCRATCH
  43  *                  |         |          |
  44  *          +-------V---------V----------V---------------+
  45  *   memory |       | dnode   | dnode    |  scratch      |
  46  *          |       | 512B    | 512B     |  area         |
  47  *          +--------------------------------------------+
  48  */
  49 
  50 #ifdef  FSYS_ZFS
  51 
  52 #include "shared.h"
  53 #include "filesys.h"
  54 #include "fsys_zfs.h"
  55 
/*
 * cache for a file block of the currently zfs_open()-ed file
 * NOTE(review): presumably [file_start, file_end) is the byte range held in
 * file_buf; the code that maintains it is outside this part of the file.
 */
static void *file_buf = NULL;
static uint64_t file_start = 0;
static uint64_t file_end = 0;

/*
 * cache for a dnode block
 *
 * dnode_get() serves lookups from dnode_buf when the request is against the
 * same metadnode pointer (dnode_mdn) and the object number falls inside
 * [dnode_start, dnode_end).
 */
static dnode_phys_t *dnode_buf = NULL;
static dnode_phys_t *dnode_mdn = NULL;
static uint64_t dnode_start = 0;
static uint64_t dnode_end = 0;

/* pool-wide state; set and consumed outside this part of the file */
static uint64_t pool_guid = 0;
static uberblock_t current_uberblock;
static char *stackbase;
  70 
/*
 * Decompression dispatch table, indexed by the compression id that
 * zio_read() extracts from a block pointer with BP_GET_COMPRESS().
 * An entry whose function is 0 is rejected by zio_read() unless the id is
 * ZIO_COMPRESS_OFF, in which case the data is used as read.
 */
decomp_entry_t decomp_table[ZIO_COMPRESS_FUNCTIONS] =
{
        {"inherit", 0},                 /* ZIO_COMPRESS_INHERIT */
        {"on", lzjb_decompress},        /* ZIO_COMPRESS_ON */
        {"off", 0},                     /* ZIO_COMPRESS_OFF */
        {"lzjb", lzjb_decompress},      /* ZIO_COMPRESS_LZJB */
        {"empty", 0}                    /* ZIO_COMPRESS_EMPTY */
};
  79 
/*
 * Forward declaration: zio_read_gang() calls zio_read_data() for each gang
 * member, and zio_read_data() calls zio_read_gang() for gang DVAs.
 */
static int zio_read_data(blkptr_t *bp, void *buf, char *stack);
  81 
  82 /*
  83  * Our own version of bcmp().
  84  */
  85 static int
  86 zfs_bcmp(const void *s1, const void *s2, size_t n)
  87 {
  88         const uchar_t *ps1 = s1;
  89         const uchar_t *ps2 = s2;
  90 
  91         if (s1 != s2 && n != 0) {
  92                 do {
  93                         if (*ps1++ != *ps2++)
  94                                 return (1);
  95                 } while (--n != 0);
  96         }
  97 
  98         return (0);
  99 }
 100 
 101 /*
 102  * Our own version of log2().  Same thing as highbit()-1.
 103  */
 104 static int
 105 zfs_log2(uint64_t num)
 106 {
 107         int i = 0;
 108 
 109         while (num > 1) {
 110                 i++;
 111                 num = num >> 1;
 112         }
 113 
 114         return (i);
 115 }
 116 
 117 /* Checksum Functions */
 118 static void
 119 zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
 120 {
 121         ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
 122 }
 123 
/*
 * Checksum Table and Values
 *
 * Indexed by the checksum id that zio_checksum_verify() reads from a block
 * pointer with BP_GET_CHECKSUM().  Each row holds the pair of checksum
 * functions (ci_func[0], ci_func[1]), two integer flags and a name.
 * A row whose ci_func[0] is NULL cannot be verified and makes
 * zio_checksum_verify() fail.
 * NOTE(review): one of the two flags is ci_eck (checksum embedded in a
 * trailer at the end of the data) - presumably the second, since only
 * "label" and "gang_header" set it; confirm against fsys_zfs.h.
 */
zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
        {{NULL,                 NULL},                  0, 0,   "inherit"},
        {{NULL,                 NULL},                  0, 0,   "on"},
        {{zio_checksum_off,     zio_checksum_off},      0, 0,   "off"},
        {{zio_checksum_SHA256,  zio_checksum_SHA256},   1, 1,   "label"},
        {{zio_checksum_SHA256,  zio_checksum_SHA256},   1, 1,   "gang_header"},
        {{NULL,                 NULL},                  0, 0,   "zilog"},
        {{fletcher_2_native,    fletcher_2_byteswap},   0, 0,   "fletcher2"},
        {{fletcher_4_native,    fletcher_4_byteswap},   1, 0,   "fletcher4"},
        {{zio_checksum_SHA256,  zio_checksum_SHA256},   1, 0,   "SHA256"},
        {{NULL,                 NULL},                  0, 0,   "zilog2"},
};
 137 
 138 /*
 139  * zio_checksum_verify: Provides support for checksum verification.
 140  *
 141  * Fletcher2, Fletcher4, and SHA256 are supported.
 142  *
 143  * Return:
 144  *      -1 = Failure
 145  *       0 = Success
 146  */
 147 static int
 148 zio_checksum_verify(blkptr_t *bp, char *data, int size)
 149 {
 150         zio_cksum_t zc = bp->blk_cksum;
 151         uint32_t checksum = BP_GET_CHECKSUM(bp);
 152         int byteswap = BP_SHOULD_BYTESWAP(bp);
 153         zio_eck_t *zec = (zio_eck_t *)(data + size) - 1;
 154         zio_checksum_info_t *ci = &zio_checksum_table[checksum];
 155         zio_cksum_t actual_cksum, expected_cksum;
 156 
 157         /* byteswap is not supported */
 158         if (byteswap)
 159                 return (-1);
 160 
 161         if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
 162                 return (-1);
 163 
 164         if (ci->ci_eck) {
 165                 expected_cksum = zec->zec_cksum;
 166                 zec->zec_cksum = zc;
 167                 ci->ci_func[0](data, size, &actual_cksum);
 168                 zec->zec_cksum = expected_cksum;
 169                 zc = expected_cksum;
 170 
 171         } else {
 172                 ci->ci_func[byteswap](data, size, &actual_cksum);
 173         }
 174 
 175         if ((actual_cksum.zc_word[0] - zc.zc_word[0]) |
 176             (actual_cksum.zc_word[1] - zc.zc_word[1]) |
 177             (actual_cksum.zc_word[2] - zc.zc_word[2]) |
 178             (actual_cksum.zc_word[3] - zc.zc_word[3]))
 179                 return (-1);
 180 
 181         return (0);
 182 }
 183 
 184 /*
 185  * vdev_label_start returns the physical disk offset (in bytes) of
 186  * label "l".
 187  */
 188 static uint64_t
 189 vdev_label_start(uint64_t psize, int l)
 190 {
 191         return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
 192             0 : psize - VDEV_LABELS * sizeof (vdev_label_t)));
 193 }
 194 
 195 /*
 196  * vdev_uberblock_compare takes two uberblock structures and returns an integer
 197  * indicating the more recent of the two.
 198  *      Return Value = 1 if ub2 is more recent
 199  *      Return Value = -1 if ub1 is more recent
 200  * The most recent uberblock is determined using its transaction number and
 201  * timestamp.  The uberblock with the highest transaction number is
 202  * considered "newer".  If the transaction numbers of the two blocks match, the
 203  * timestamps are compared to determine the "newer" of the two.
 204  */
 205 static int
 206 vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
 207 {
 208         if (ub1->ub_txg < ub2->ub_txg)
 209                 return (-1);
 210         if (ub1->ub_txg > ub2->ub_txg)
 211                 return (1);
 212 
 213         if (ub1->ub_timestamp < ub2->ub_timestamp)
 214                 return (-1);
 215         if (ub1->ub_timestamp > ub2->ub_timestamp)
 216                 return (1);
 217 
 218         return (0);
 219 }
 220 
 221 /*
 222  * Three pieces of information are needed to verify an uberblock: the magic
 223  * number, the version number, and the checksum.
 224  *
 225  * Return:
 226  *     0 - Success
 227  *    -1 - Failure
 228  */
 229 static int
 230 uberblock_verify(uberblock_t *uber, uint64_t ub_size, uint64_t offset)
 231 {
 232         blkptr_t bp;
 233 
 234         BP_ZERO(&bp);
 235         BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
 236         BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER);
 237         ZIO_SET_CHECKSUM(&bp.blk_cksum, offset, 0, 0, 0);
 238 
 239         if (zio_checksum_verify(&bp, (char *)uber, ub_size) != 0)
 240                 return (-1);
 241 
 242         if (uber->ub_magic == UBERBLOCK_MAGIC &&
 243             SPA_VERSION_IS_SUPPORTED(uber->ub_version))
 244                 return (0);
 245 
 246         return (-1);
 247 }
 248 
 249 /*
 250  * Find the best uberblock.
 251  * Return:
 252  *    Success - Pointer to the best uberblock.
 253  *    Failure - NULL
 254  */
 255 static uberblock_t *
 256 find_bestub(char *ub_array, uint64_t ashift, uint64_t sector)
 257 {
 258         uberblock_t *ubbest = NULL;
 259         uberblock_t *ubnext;
 260         uint64_t offset, ub_size;
 261         int i;
 262 
 263         ub_size = VDEV_UBERBLOCK_SIZE(ashift);
 264 
 265         for (i = 0; i < VDEV_UBERBLOCK_COUNT(ashift); i++) {
 266                 ubnext = (uberblock_t *)ub_array;
 267                 ub_array += ub_size;
 268                 offset = (sector << SPA_MINBLOCKSHIFT) +
 269                     VDEV_UBERBLOCK_OFFSET(ashift, i);
 270 
 271                 if (uberblock_verify(ubnext, ub_size, offset) != 0)
 272                         continue;
 273 
 274                 if (ubbest == NULL ||
 275                     vdev_uberblock_compare(ubnext, ubbest) > 0)
 276                         ubbest = ubnext;
 277         }
 278 
 279         return (ubbest);
 280 }
 281 
 282 /*
 283  * Read a block of data based on the gang block address dva,
 284  * and put its data in buf.
 285  *
 286  * Return:
 287  *      0 - success
 288  *      1 - failure
 289  */
 290 static int
 291 zio_read_gang(blkptr_t *bp, dva_t *dva, void *buf, char *stack)
 292 {
 293         zio_gbh_phys_t *zio_gb;
 294         uint64_t offset, sector;
 295         blkptr_t tmpbp;
 296         int i;
 297 
 298         zio_gb = (zio_gbh_phys_t *)stack;
 299         stack += SPA_GANGBLOCKSIZE;
 300         offset = DVA_GET_OFFSET(dva);
 301         sector = DVA_OFFSET_TO_PHYS_SECTOR(offset);
 302 
 303         /* read in the gang block header */
 304         if (devread(sector, 0, SPA_GANGBLOCKSIZE, (char *)zio_gb) == 0) {
 305                 grub_printf("failed to read in a gang block header\n");
 306                 return (1);
 307         }
 308 
 309         /* self checksuming the gang block header */
 310         BP_ZERO(&tmpbp);
 311         BP_SET_CHECKSUM(&tmpbp, ZIO_CHECKSUM_GANG_HEADER);
 312         BP_SET_BYTEORDER(&tmpbp, ZFS_HOST_BYTEORDER);
 313         ZIO_SET_CHECKSUM(&tmpbp.blk_cksum, DVA_GET_VDEV(dva),
 314             DVA_GET_OFFSET(dva), bp->blk_birth, 0);
 315         if (zio_checksum_verify(&tmpbp, (char *)zio_gb, SPA_GANGBLOCKSIZE)) {
 316                 grub_printf("failed to checksum a gang block header\n");
 317                 return (1);
 318         }
 319 
 320         for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
 321                 if (zio_gb->zg_blkptr[i].blk_birth == 0)
 322                         continue;
 323 
 324                 if (zio_read_data(&zio_gb->zg_blkptr[i], buf, stack))
 325                         return (1);
 326                 buf += BP_GET_PSIZE(&zio_gb->zg_blkptr[i]);
 327         }
 328 
 329         return (0);
 330 }
 331 
 332 /*
 333  * Read in a block of raw data to buf.
 334  *
 335  * Return:
 336  *      0 - success
 337  *      1 - failure
 338  */
 339 static int
 340 zio_read_data(blkptr_t *bp, void *buf, char *stack)
 341 {
 342         int i, psize;
 343 
 344         psize = BP_GET_PSIZE(bp);
 345 
 346         /* pick a good dva from the block pointer */
 347         for (i = 0; i < SPA_DVAS_PER_BP; i++) {
 348                 uint64_t offset, sector;
 349 
 350                 if (bp->blk_dva[i].dva_word[0] == 0 &&
 351                     bp->blk_dva[i].dva_word[1] == 0)
 352                         continue;
 353 
 354                 if (DVA_GET_GANG(&bp->blk_dva[i])) {
 355                         if (zio_read_gang(bp, &bp->blk_dva[i], buf, stack) == 0)
 356                                 return (0);
 357                 } else {
 358                         /* read in a data block */
 359                         offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
 360                         sector = DVA_OFFSET_TO_PHYS_SECTOR(offset);
 361                         if (devread(sector, 0, psize, buf) != 0)
 362                                 return (0);
 363                 }
 364         }
 365 
 366         return (1);
 367 }
 368 
 369 /*
 370  * Read in a block of data, verify its checksum, decompress if needed,
 371  * and put the uncompressed data in buf.
 372  *
 373  * Return:
 374  *      0 - success
 375  *      errnum - failure
 376  */
 377 static int
 378 zio_read(blkptr_t *bp, void *buf, char *stack)
 379 {
 380         int lsize, psize, comp;
 381         char *retbuf;
 382 
 383         comp = BP_GET_COMPRESS(bp);
 384         lsize = BP_GET_LSIZE(bp);
 385         psize = BP_GET_PSIZE(bp);
 386 
 387         if ((unsigned int)comp >= ZIO_COMPRESS_FUNCTIONS ||
 388             (comp != ZIO_COMPRESS_OFF &&
 389             decomp_table[comp].decomp_func == NULL)) {
 390                 grub_printf("compression algorithm not supported\n");
 391                 return (ERR_FSYS_CORRUPT);
 392         }
 393 
 394         if ((char *)buf < stack && ((char *)buf) + lsize > stack) {
 395                 grub_printf("not enough memory allocated\n");
 396                 return (ERR_WONT_FIT);
 397         }
 398 
 399         retbuf = buf;
 400         if (comp != ZIO_COMPRESS_OFF) {
 401                 buf = stack;
 402                 stack += psize;
 403         }
 404 
 405         if (zio_read_data(bp, buf, stack) != 0) {
 406                 grub_printf("zio_read_data failed\n");
 407                 return (ERR_FSYS_CORRUPT);
 408         }
 409 
 410         if (zio_checksum_verify(bp, buf, psize) != 0) {
 411                 grub_printf("checksum verification failed\n");
 412                 return (ERR_FSYS_CORRUPT);
 413         }
 414 
 415         if (comp != ZIO_COMPRESS_OFF)
 416                 decomp_table[comp].decomp_func(buf, retbuf, psize, lsize);
 417 
 418         return (0);
 419 }
 420 
 421 /*
 422  * Get the block from a block id.
 423  * push the block onto the stack.
 424  *
 425  * Return:
 426  *      0 - success
 427  *      errnum - failure
 428  */
 429 static int
 430 dmu_read(dnode_phys_t *dn, uint64_t blkid, void *buf, char *stack)
 431 {
 432         int idx, level;
 433         blkptr_t *bp_array = dn->dn_blkptr;
 434         int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 435         blkptr_t *bp, *tmpbuf;
 436 
 437         bp = (blkptr_t *)stack;
 438         stack += sizeof (blkptr_t);
 439 
 440         tmpbuf = (blkptr_t *)stack;
 441         stack += 1<<dn->dn_indblkshift;
 442 
 443         for (level = dn->dn_nlevels - 1; level >= 0; level--) {
 444                 idx = (blkid >> (epbs * level)) & ((1<<epbs)-1);
 445                 *bp = bp_array[idx];
 446                 if (level == 0)
 447                         tmpbuf = buf;
 448                 if (BP_IS_HOLE(bp)) {
 449                         grub_memset(buf, 0,
 450                             dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 451                         break;
 452                 } else if (errnum = zio_read(bp, tmpbuf, stack)) {
 453                         return (errnum);
 454                 }
 455 
 456                 bp_array = tmpbuf;
 457         }
 458 
 459         return (0);
 460 }
 461 
 462 /*
 463  * mzap_lookup: Looks up property described by "name" and returns the value
 464  * in "value".
 465  *
 466  * Return:
 467  *      0 - success
 468  *      errnum - failure
 469  */
 470 static int
 471 mzap_lookup(mzap_phys_t *zapobj, int objsize, const char *name,
 472         uint64_t *value)
 473 {
 474         int i, chunks;
 475         mzap_ent_phys_t *mzap_ent = zapobj->mz_chunk;
 476 
 477         chunks = objsize / MZAP_ENT_LEN - 1;
 478         for (i = 0; i < chunks; i++) {
 479                 if (grub_strcmp(mzap_ent[i].mze_name, name) == 0) {
 480                         *value = mzap_ent[i].mze_value;
 481                         return (0);
 482                 }
 483         }
 484 
 485         return (ERR_FSYS_CORRUPT);
 486 }
 487 
 488 static uint64_t
 489 zap_hash(uint64_t salt, const char *name)
 490 {
 491         static uint64_t table[256];
 492         const uint8_t *cp;
 493         uint8_t c;
 494         uint64_t crc = salt;
 495 
 496         if (table[128] == 0) {
 497                 uint64_t *ct;
 498                 int i, j;
 499                 for (i = 0; i < 256; i++) {
 500                         for (ct = table + i, *ct = i, j = 8; j > 0; j--)
 501                                 *ct = (*ct >> 1) ^ (-(*ct & 1) &
 502                                     ZFS_CRC64_POLY);
 503                 }
 504         }
 505 
 506         if (crc == 0 || table[128] != ZFS_CRC64_POLY) {
 507                 errnum = ERR_FSYS_CORRUPT;
 508                 return (0);
 509         }
 510 
 511         for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
 512                 crc = (crc >> 8) ^ table[(crc ^ c) & 0xFF];
 513 
 514         /*
 515          * Only use 28 bits, since we need 4 bits in the cookie for the
 516          * collision differentiator.  We MUST use the high bits, since
 517          * those are the ones that we first pay attention to when
 518          * choosing the bucket.
 519          */
 520         crc &= ~((1ULL << (64 - 28)) - 1);
 521 
 522         return (crc);
 523 }
 524 
 525 /*
 526  * Only to be used on 8-bit arrays.
 527  * array_len is actual len in bytes (not encoded le_value_length).
 528  * buf is null-terminated.
 529  */
 530 static int
 531 zap_leaf_array_equal(zap_leaf_phys_t *l, int blksft, int chunk,
 532     int array_len, const char *buf)
 533 {
 534         int bseen = 0;
 535 
 536         while (bseen < array_len) {
 537                 struct zap_leaf_array *la =
 538                     &ZAP_LEAF_CHUNK(l, blksft, chunk).l_array;
 539                 int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES);
 540 
 541                 if (chunk >= ZAP_LEAF_NUMCHUNKS(blksft))
 542                         return (0);
 543 
 544                 if (zfs_bcmp(la->la_array, buf + bseen, toread) != 0)
 545                         break;
 546                 chunk = la->la_next;
 547                 bseen += toread;
 548         }
 549         return (bseen == array_len);
 550 }
 551 
 552 /*
 553  * Given a zap_leaf_phys_t, walk thru the zap leaf chunks to get the
 554  * value for the property "name".
 555  *
 556  * Return:
 557  *      0 - success
 558  *      errnum - failure
 559  */
 560 static int
 561 zap_leaf_lookup(zap_leaf_phys_t *l, int blksft, uint64_t h,
 562     const char *name, uint64_t *value)
 563 {
 564         uint16_t chunk;
 565         struct zap_leaf_entry *le;
 566 
 567         /* Verify if this is a valid leaf block */
 568         if (l->l_hdr.lh_block_type != ZBT_LEAF)
 569                 return (ERR_FSYS_CORRUPT);
 570         if (l->l_hdr.lh_magic != ZAP_LEAF_MAGIC)
 571                 return (ERR_FSYS_CORRUPT);
 572 
 573         for (chunk = l->l_hash[LEAF_HASH(blksft, h)];
 574             chunk != CHAIN_END; chunk = le->le_next) {
 575 
 576                 if (chunk >= ZAP_LEAF_NUMCHUNKS(blksft))
 577                         return (ERR_FSYS_CORRUPT);
 578 
 579                 le = ZAP_LEAF_ENTRY(l, blksft, chunk);
 580 
 581                 /* Verify the chunk entry */
 582                 if (le->le_type != ZAP_CHUNK_ENTRY)
 583                         return (ERR_FSYS_CORRUPT);
 584 
 585                 if (le->le_hash != h)
 586                         continue;
 587 
 588                 if (zap_leaf_array_equal(l, blksft, le->le_name_chunk,
 589                     le->le_name_length, name)) {
 590 
 591                         struct zap_leaf_array *la;
 592                         uint8_t *ip;
 593 
 594                         if (le->le_int_size != 8 || le->le_value_length != 1)
 595                                 return (ERR_FSYS_CORRUPT);
 596 
 597                         /* get the uint64_t property value */
 598                         la = &ZAP_LEAF_CHUNK(l, blksft,
 599                             le->le_value_chunk).l_array;
 600                         ip = la->la_array;
 601 
 602                         *value = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
 603                             (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
 604                             (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
 605                             (uint64_t)ip[6] << 8 | (uint64_t)ip[7];
 606 
 607                         return (0);
 608                 }
 609         }
 610 
 611         return (ERR_FSYS_CORRUPT);
 612 }
 613 
 614 /*
 615  * Fat ZAP lookup
 616  *
 617  * Return:
 618  *      0 - success
 619  *      errnum - failure
 620  */
 621 static int
 622 fzap_lookup(dnode_phys_t *zap_dnode, zap_phys_t *zap,
 623     const char *name, uint64_t *value, char *stack)
 624 {
 625         zap_leaf_phys_t *l;
 626         uint64_t hash, idx, blkid;
 627         int blksft = zfs_log2(zap_dnode->dn_datablkszsec << DNODE_SHIFT);
 628 
 629         /* Verify if this is a fat zap header block */
 630         if (zap->zap_magic != (uint64_t)ZAP_MAGIC ||
 631             zap->zap_flags != 0)
 632                 return (ERR_FSYS_CORRUPT);
 633 
 634         hash = zap_hash(zap->zap_salt, name);
 635         if (errnum)
 636                 return (errnum);
 637 
 638         /* get block id from index */
 639         if (zap->zap_ptrtbl.zt_numblks != 0) {
 640                 /* external pointer tables not supported */
 641                 return (ERR_FSYS_CORRUPT);
 642         }
 643         idx = ZAP_HASH_IDX(hash, zap->zap_ptrtbl.zt_shift);
 644         blkid = ((uint64_t *)zap)[idx + (1<<(blksft-3-1))];
 645 
 646         /* Get the leaf block */
 647         l = (zap_leaf_phys_t *)stack;
 648         stack += 1<<blksft;
 649         if ((1<<blksft) < sizeof (zap_leaf_phys_t))
 650                 return (ERR_FSYS_CORRUPT);
 651         if (errnum = dmu_read(zap_dnode, blkid, l, stack))
 652                 return (errnum);
 653 
 654         return (zap_leaf_lookup(l, blksft, hash, name, value));
 655 }
 656 
 657 /*
 658  * Read in the data of a zap object and find the value for a matching
 659  * property name.
 660  *
 661  * Return:
 662  *      0 - success
 663  *      errnum - failure
 664  */
 665 static int
 666 zap_lookup(dnode_phys_t *zap_dnode, const char *name, uint64_t *val,
 667     char *stack)
 668 {
 669         uint64_t block_type;
 670         int size;
 671         void *zapbuf;
 672 
 673         /* Read in the first block of the zap object data. */
 674         zapbuf = stack;
 675         size = zap_dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 676         stack += size;
 677 
 678         if ((errnum = dmu_read(zap_dnode, 0, zapbuf, stack)) != 0)
 679                 return (errnum);
 680 
 681         block_type = *((uint64_t *)zapbuf);
 682 
 683         if (block_type == ZBT_MICRO) {
 684                 return (mzap_lookup(zapbuf, size, name, val));
 685         } else if (block_type == ZBT_HEADER) {
 686                 /* this is a fat zap */
 687                 return (fzap_lookup(zap_dnode, zapbuf, name,
 688                     val, stack));
 689         }
 690 
 691         return (ERR_FSYS_CORRUPT);
 692 }
 693 
/*
 * One zap entry as presented to a zap_iterate() callback.
 * zap_iterate() fills it from a micro-zap entry: integer length 8, one
 * integer, the entry's value and a pointer to the entry's name.
 */
typedef struct zap_attribute {
        int za_integer_length;          /* size of each integer, in bytes */
        uint64_t za_num_integers;       /* number of integers in the value */
        uint64_t za_first_integer;      /* the (first) value */
        char *za_name;                  /* entry name; points into the zap block */
} zap_attribute_t;

/*
 * Callback invoked by zap_iterate() for each entry; a non-zero return
 * stops the iteration and is passed back to zap_iterate()'s caller.
 */
typedef int (zap_cb_t)(zap_attribute_t *za, void *arg, char *stack);
 702 
 703 static int
 704 zap_iterate(dnode_phys_t *zap_dnode, zap_cb_t *cb, void *arg, char *stack)
 705 {
 706         uint32_t size = zap_dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 707         zap_attribute_t za;
 708         int i;
 709         mzap_phys_t *mzp = (mzap_phys_t *)stack;
 710         stack += size;
 711 
 712         if ((errnum = dmu_read(zap_dnode, 0, mzp, stack)) != 0)
 713                 return (errnum);
 714 
 715         /*
 716          * Iteration over fatzap objects has not yet been implemented.
 717          * If we encounter a pool in which there are more features for
 718          * read than can fit inside a microzap (i.e., more than 2048
 719          * features for read), we can add support for fatzap iteration.
 720          * For now, fail.
 721          */
 722         if (mzp->mz_block_type != ZBT_MICRO) {
 723                 grub_printf("feature information stored in fatzap, pool "
 724                     "version not supported\n");
 725                 return (1);
 726         }
 727 
 728         za.za_integer_length = 8;
 729         za.za_num_integers = 1;
 730         for (i = 0; i < size / MZAP_ENT_LEN - 1; i++) {
 731                 mzap_ent_phys_t *mzep = &mzp->mz_chunk[i];
 732                 int err;
 733 
 734                 za.za_first_integer = mzep->mze_value;
 735                 za.za_name = mzep->mze_name;
 736                 err = cb(&za, arg, stack);
 737                 if (err != 0)
 738                         return (err);
 739         }
 740 
 741         return (0);
 742 }
 743 
 744 /*
 745  * Get the dnode of an object number from the metadnode of an object set.
 746  *
 747  * Input
 748  *      mdn - metadnode to get the object dnode
 749  *      objnum - object number for the object dnode
 750  *      buf - data buffer that holds the returning dnode
 751  *      stack - scratch area
 752  *
 753  * Return:
 754  *      0 - success
 755  *      errnum - failure
 756  */
 757 static int
 758 dnode_get(dnode_phys_t *mdn, uint64_t objnum, uint8_t type, dnode_phys_t *buf,
 759         char *stack)
 760 {
 761         uint64_t blkid, blksz; /* the block id this object dnode is in */
 762         int epbs; /* shift of number of dnodes in a block */
 763         int idx; /* index within a block */
 764         dnode_phys_t *dnbuf;
 765 
 766         blksz = mdn->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 767         epbs = zfs_log2(blksz) - DNODE_SHIFT;
 768         blkid = objnum >> epbs;
 769         idx = objnum & ((1<<epbs)-1);
 770 
 771         if (dnode_buf != NULL && dnode_mdn == mdn &&
 772             objnum >= dnode_start && objnum < dnode_end) {
 773                 grub_memmove(buf, &dnode_buf[idx], DNODE_SIZE);
 774                 VERIFY_DN_TYPE(buf, type);
 775                 return (0);
 776         }
 777 
 778         if (dnode_buf && blksz == 1<<DNODE_BLOCK_SHIFT) {
 779                 dnbuf = dnode_buf;
 780                 dnode_mdn = mdn;
 781                 dnode_start = blkid << epbs;
 782                 dnode_end = (blkid + 1) << epbs;
 783         } else {
 784                 dnbuf = (dnode_phys_t *)stack;
 785                 stack += blksz;
 786         }
 787 
 788         if (errnum = dmu_read(mdn, blkid, (char *)dnbuf, stack))
 789                 return (errnum);
 790 
 791         grub_memmove(buf, &dnbuf[idx], DNODE_SIZE);
 792         VERIFY_DN_TYPE(buf, type);
 793 
 794         return (0);
 795 }
 796 
 797 /*
 798  * Check if this is a special file that resides at the top
 799  * dataset of the pool. Currently this is the GRUB menu,
 800  * boot signature and boot signature backup.
 801  * str starts with '/'.
 802  */
 803 static int
 804 is_top_dataset_file(char *str)
 805 {
 806         char *tptr;
 807 
 808         if ((tptr = grub_strstr(str, "menu.lst")) &&
 809             (tptr[8] == '\0' || tptr[8] == ' ') &&
 810             *(tptr-1) == '/')
 811                 return (1);
 812 
 813         if (grub_strncmp(str, BOOTSIGN_DIR"/",
 814             grub_strlen(BOOTSIGN_DIR) + 1) == 0)
 815                 return (1);
 816 
 817         if (grub_strcmp(str, BOOTSIGN_BACKUP) == 0)
 818                 return (1);
 819 
 820         return (0);
 821 }
 822 
 823 static int
 824 check_feature(zap_attribute_t *za, void *arg, char *stack)
 825 {
 826         const char **names = arg;
 827         int i;
 828 
 829         if (za->za_first_integer == 0)
 830                 return (0);
 831 
 832         for (i = 0; names[i] != NULL; i++) {
 833                 if (grub_strcmp(za->za_name, names[i]) == 0) {
 834                         return (0);
 835                 }
 836         }
 837         grub_printf("missing feature for read '%s'\n", za->za_name);
 838         return (ERR_NEWER_VERSION);
 839 }
 840 
 841 /*
 842  * Get the file dnode for a given file name where mdn is the meta dnode
 843  * for this ZFS object set. When found, place the file dnode in dn.
 844  * The 'path' argument will be mangled.
 845  *
 846  * Return:
 847  *      0 - success
 848  *      errnum - failure
 849  */
 850 static int
 851 dnode_get_path(dnode_phys_t *mdn, char *path, dnode_phys_t *dn,
 852     char *stack)
 853 {
 854         uint64_t objnum, version;
 855         char *cname, ch;
 856 
 857         if (errnum = dnode_get(mdn, MASTER_NODE_OBJ, DMU_OT_MASTER_NODE,
 858             dn, stack))
 859                 return (errnum);
 860 
 861         if (errnum = zap_lookup(dn, ZPL_VERSION_STR, &version, stack))
 862                 return (errnum);
 863         if (version > ZPL_VERSION)
 864                 return (-1);
 865 
 866         if (errnum = zap_lookup(dn, ZFS_ROOT_OBJ, &objnum, stack))
 867                 return (errnum);
 868 
 869         if (errnum = dnode_get(mdn, objnum, DMU_OT_DIRECTORY_CONTENTS,
 870             dn, stack))
 871                 return (errnum);
 872 
 873         /* skip leading slashes */
 874         while (*path == '/')
 875                 path++;
 876 
 877         while (*path && !grub_isspace(*path)) {
 878 
 879                 /* get the next component name */
 880                 cname = path;
 881                 while (*path && !grub_isspace(*path) && *path != '/')
 882                         path++;
 883                 ch = *path;
 884                 *path = 0;   /* ensure null termination */
 885 
 886                 if (errnum = zap_lookup(dn, cname, &objnum, stack))
 887                         return (errnum);
 888 
 889                 objnum = ZFS_DIRENT_OBJ(objnum);
 890                 if (errnum = dnode_get(mdn, objnum, 0, dn, stack))
 891                         return (errnum);
 892 
 893                 *path = ch;
 894                 while (*path == '/')
 895                         path++;
 896         }
 897 
 898         /* We found the dnode for this file. Verify if it is a plain file. */
 899         VERIFY_DN_TYPE(dn, DMU_OT_PLAIN_FILE_CONTENTS);
 900 
 901         return (0);
 902 }
 903 
 904 /*
 905  * Get the default 'bootfs' property value from the rootpool.
 906  *
 907  * Return:
 908  *      0 - success
 909  *      errnum -failure
 910  */
 911 static int
 912 get_default_bootfsobj(dnode_phys_t *mosmdn, uint64_t *obj, char *stack)
 913 {
 914         uint64_t objnum = 0;
 915         dnode_phys_t *dn = (dnode_phys_t *)stack;
 916         stack += DNODE_SIZE;
 917 
 918         if (errnum = dnode_get(mosmdn, DMU_POOL_DIRECTORY_OBJECT,
 919             DMU_OT_OBJECT_DIRECTORY, dn, stack))
 920                 return (errnum);
 921 
 922         /*
 923          * find the object number for 'pool_props', and get the dnode
 924          * of the 'pool_props'.
 925          */
 926         if (zap_lookup(dn, DMU_POOL_PROPS, &objnum, stack))
 927                 return (ERR_FILESYSTEM_NOT_FOUND);
 928 
 929         if (errnum = dnode_get(mosmdn, objnum, DMU_OT_POOL_PROPS, dn, stack))
 930                 return (errnum);
 931 
 932         if (zap_lookup(dn, ZPOOL_PROP_BOOTFS, &objnum, stack))
 933                 return (ERR_FILESYSTEM_NOT_FOUND);
 934 
 935         if (!objnum)
 936                 return (ERR_FILESYSTEM_NOT_FOUND);
 937 
 938         *obj = objnum;
 939         return (0);
 940 }
 941 
 942 /*
 943  * List of pool features that the grub implementation of ZFS supports for
 944  * read. Note that features that are only required for write do not need
 945  * to be listed here since grub opens pools in read-only mode.
 946  */
 947 static const char *spa_feature_names[] = {
 948         NULL
 949 };
 950 
 951 /*
 952  * Checks whether the MOS features that are active are supported by this
 953  * (GRUB's) implementation of ZFS.
 954  *
 955  * Return:
 956  *      0: Success.
 957  *      errnum: Failure.
 958  */
 959 static int
 960 check_mos_features(dnode_phys_t *mosmdn, char *stack)
 961 {
 962         uint64_t objnum;
 963         dnode_phys_t *dn;
 964         uint8_t error = 0;
 965 
 966         dn = (dnode_phys_t *)stack;
 967         stack += DNODE_SIZE;
 968 
 969         if ((errnum = dnode_get(mosmdn, DMU_POOL_DIRECTORY_OBJECT,
 970             DMU_OT_OBJECT_DIRECTORY, dn, stack)) != 0)
 971                 return (errnum);
 972 
 973         /*
 974          * Find the object number for 'features_for_read' and retrieve its
 975          * corresponding dnode. Note that we don't check features_for_write
 976          * because GRUB is not opening the pool for write.
 977          */
 978         if ((errnum = zap_lookup(dn, DMU_POOL_FEATURES_FOR_READ, &objnum,
 979             stack)) != 0)
 980                 return (errnum);
 981 
 982         if ((errnum = dnode_get(mosmdn, objnum, DMU_OTN_ZAP_METADATA,
 983             dn, stack)) != 0)
 984                 return (errnum);
 985 
 986         return (zap_iterate(dn, check_feature, spa_feature_names, stack));
 987 }
 988 
 989 /*
 990  * Given a MOS metadnode, get the metadnode of a given filesystem name (fsname),
 991  * e.g. pool/rootfs, or a given object number (obj), e.g. the object number
 992  * of pool/rootfs.
 993  *
 994  * If no fsname and no obj are given, return the DSL_DIR metadnode.
 995  * If fsname is given, return its metadnode and its matching object number.
 996  * If only obj is given, return the metadnode for this object number.
 997  *
 998  * Return:
 999  *      0 - success
1000  *      errnum - failure
1001  */
1002 static int
1003 get_objset_mdn(dnode_phys_t *mosmdn, char *fsname, uint64_t *obj,
1004     dnode_phys_t *mdn, char *stack)
1005 {
1006         uint64_t objnum, headobj;
1007         char *cname, ch;
1008         blkptr_t *bp;
1009         objset_phys_t *osp;
1010         int issnapshot = 0;
1011         char *snapname;
1012 
1013         if (fsname == NULL && obj) {
1014                 headobj = *obj;
1015                 goto skip;
1016         }
1017 
1018         if (errnum = dnode_get(mosmdn, DMU_POOL_DIRECTORY_OBJECT,
1019             DMU_OT_OBJECT_DIRECTORY, mdn, stack))
1020                 return (errnum);
1021 
1022         if (errnum = zap_lookup(mdn, DMU_POOL_ROOT_DATASET, &objnum,
1023             stack))
1024                 return (errnum);
1025 
1026         if (errnum = dnode_get(mosmdn, objnum, DMU_OT_DSL_DIR, mdn, stack))
1027                 return (errnum);
1028 
1029         if (fsname == NULL) {
1030                 headobj =
1031                     ((dsl_dir_phys_t *)DN_BONUS(mdn))->dd_head_dataset_obj;
1032                 goto skip;
1033         }
1034 
1035         /* take out the pool name */
1036         while (*fsname && !grub_isspace(*fsname) && *fsname != '/')
1037                 fsname++;
1038 
1039         while (*fsname && !grub_isspace(*fsname)) {
1040                 uint64_t childobj;
1041 
1042                 while (*fsname == '/')
1043                         fsname++;
1044 
1045                 cname = fsname;
1046                 while (*fsname && !grub_isspace(*fsname) && *fsname != '/')
1047                         fsname++;
1048                 ch = *fsname;
1049                 *fsname = 0;
1050 
1051                 snapname = cname;
1052                 while (*snapname && !grub_isspace(*snapname) && *snapname !=
1053                     '@')
1054                         snapname++;
1055                 if (*snapname == '@') {
1056                         issnapshot = 1;
1057                         *snapname = 0;
1058                 }
1059                 childobj =
1060                     ((dsl_dir_phys_t *)DN_BONUS(mdn))->dd_child_dir_zapobj;
1061                 if (errnum = dnode_get(mosmdn, childobj,
1062                     DMU_OT_DSL_DIR_CHILD_MAP, mdn, stack))
1063                         return (errnum);
1064 
1065                 if (zap_lookup(mdn, cname, &objnum, stack))
1066                         return (ERR_FILESYSTEM_NOT_FOUND);
1067 
1068                 if (errnum = dnode_get(mosmdn, objnum, DMU_OT_DSL_DIR,
1069                     mdn, stack))
1070                         return (errnum);
1071 
1072                 *fsname = ch;
1073                 if (issnapshot)
1074                         *snapname = '@';
1075         }
1076         headobj = ((dsl_dir_phys_t *)DN_BONUS(mdn))->dd_head_dataset_obj;
1077         if (obj)
1078                 *obj = headobj;
1079 
1080 skip:
1081         if (errnum = dnode_get(mosmdn, headobj, DMU_OT_DSL_DATASET, mdn, stack))
1082                 return (errnum);
1083         if (issnapshot) {
1084                 uint64_t snapobj;
1085 
1086                 snapobj = ((dsl_dataset_phys_t *)DN_BONUS(mdn))->
1087                     ds_snapnames_zapobj;
1088 
1089                 if (errnum = dnode_get(mosmdn, snapobj,
1090                     DMU_OT_DSL_DS_SNAP_MAP, mdn, stack))
1091                         return (errnum);
1092                 if (zap_lookup(mdn, snapname + 1, &headobj, stack))
1093                         return (ERR_FILESYSTEM_NOT_FOUND);
1094                 if (errnum = dnode_get(mosmdn, headobj,
1095                     DMU_OT_DSL_DATASET, mdn, stack))
1096                         return (errnum);
1097                 if (obj)
1098                         *obj = headobj;
1099         }
1100 
1101         bp = &((dsl_dataset_phys_t *)DN_BONUS(mdn))->ds_bp;
1102         osp = (objset_phys_t *)stack;
1103         stack += sizeof (objset_phys_t);
1104         if (errnum = zio_read(bp, osp, stack))
1105                 return (errnum);
1106 
1107         grub_memmove((char *)mdn, (char *)&osp->os_meta_dnode, DNODE_SIZE);
1108 
1109         return (0);
1110 }
1111 
1112 /*
1113  * For a given XDR packed nvlist, verify the first 4 bytes and move on.
1114  *
1115  * An XDR packed nvlist is encoded as (comments from nvs_xdr_create) :
1116  *
1117  *      encoding method/host endian     (4 bytes)
1118  *      nvl_version                     (4 bytes)
1119  *      nvl_nvflag                      (4 bytes)
1120  *      encoded nvpairs:
1121  *              encoded size of the nvpair      (4 bytes)
1122  *              decoded size of the nvpair      (4 bytes)
1123  *              name string size                (4 bytes)
1124  *              name string data                (sizeof(NV_ALIGN4(string))
1125  *              data type                       (4 bytes)
1126  *              # of elements in the nvpair     (4 bytes)
1127  *              data
1128  *      2 zero's for the last nvpair
1129  *              (end of the entire list)        (8 bytes)
1130  *
1131  * Return:
1132  *      0 - success
1133  *      1 - failure
1134  */
1135 static int
1136 nvlist_unpack(char *nvlist, char **out)
1137 {
1138         /* Verify if the 1st and 2nd byte in the nvlist are valid. */
1139         if (nvlist[0] != NV_ENCODE_XDR || nvlist[1] != HOST_ENDIAN)
1140                 return (1);
1141 
1142         *out = nvlist + 4;
1143         return (0);
1144 }
1145 
/*
 * Return the address of element 'index' of an array of packed nvlists
 * that starts at 'nvlist', by stepping over the 'index' lists in front
 * of it. With index 0 the start address is returned unchanged.
 */
static char *
nvlist_array(char *nvlist, int index)
{
        int skipped;

        for (skipped = 0; skipped < index; skipped++) {
                int pair_size;

                /* step over nvl_version and nvl_nvflag */
                nvlist += 2 * 4;

                /* hop from nvpair to nvpair until the size field is 0 */
                for (;;) {
                        pair_size = BSWAP_32(*(uint32_t *)nvlist);
                        if (pair_size == 0)
                                break;
                        nvlist += pair_size;
                }

                /* step over the 8 zero bytes that close the list */
                nvlist += 2 * 4;
        }

        return (nvlist);
}
1163 
/*
 * The nvlist_next_nvpair() function returns a handle to the next nvpair in the
 * list following nvpair. If nvpair is NULL, the first pair is returned. If
 * nvpair is the last pair in the nvlist, NULL is returned.
 *
 * nvl must point at the nvl_version field of the list (what nvlist_unpack()
 * hands out). A NULL nvl yields NULL.
 */
static char *
nvlist_next_nvpair(char *nvl, char *nvpair)
{
        int encode_size;

        if (nvl == NULL)
                return (NULL);

        if (nvpair == NULL) {
                /* skip over nvl_version and nvl_nvflag */
                nvpair = nvl + 4 * 2;
        } else {
                /* skip to the next nvpair */
                encode_size = BSWAP_32(*(uint32_t *)nvpair);
                nvpair += encode_size;
        }

        /* 8 bytes of 0 marks the end of the list */
        if (*(uint64_t *)nvpair == 0)
                return (NULL);

        return (nvpair);
}
1193 
/*
 * This function returns 0 on success and 1 on failure. On success, a string
 * containing the name of nvpair is saved in buf.
 *
 * nvp    - start of the encoded nvpair
 * buf    - destination for the NUL-terminated name
 * buflen - size of buf in bytes
 *
 * The call fails when the name plus its terminator does not fit in buf, or
 * when the encoded length is not a sane (non-negative) value.
 */
static int
nvpair_name(char *nvp, char *buf, int buflen)
{
        int len;

        /* skip over encode/decode size */
        nvp += 4 * 2;

        len = BSWAP_32(*(uint32_t *)nvp);

        /*
         * The length comes straight from disk. A value with the top bit
         * set turns negative here and must not reach grub_memmove(), and
         * "len >= buflen" avoids computing len + 1.
         */
        if (len < 0 || len >= buflen)
                return (1);

        grub_memmove(buf, nvp + 4, len);
        buf[len] = '\0';

        return (0);
}
1215 
/*
 * Return the data type field of an encoded nvpair, as a data_type_t
 * value. Callers use it to decide which type to ask nvpair_value() for.
 */
static int
nvpair_type(char *nvp)
{
        char *cursor;
        int name_len;

        /* the name length follows the encoded and decoded sizes */
        cursor = nvp + 4 * 2;
        name_len = BSWAP_32(*(uint32_t *)cursor);
        cursor += 4;

        /* the name is padded to a multiple of 4; the type comes next */
        cursor = cursor + ((name_len + 3) & ~3);

        return (BSWAP_32(*(uint32_t *)cursor));
}
1240 
1241 static int
1242 nvpair_value(char *nvp, void *val, int valtype, int *nelmp)
1243 {
1244         int name_len, type, slen;
1245         char *strval = val;
1246         uint64_t *intval = val;
1247 
1248         /* skip over encode/decode size */
1249         nvp += 4 * 2;
1250 
1251         /* skip over name_len */
1252         name_len = BSWAP_32(*(uint32_t *)nvp);
1253         nvp += 4;
1254 
1255         /* skip over name */
1256         nvp = nvp + ((name_len + 3) & ~3); /* align */
1257 
1258         /* skip over type */
1259         type = BSWAP_32(*(uint32_t *)nvp);
1260         nvp += 4;
1261 
1262         if (type == valtype) {
1263                 int nelm;
1264 
1265                 nelm = BSWAP_32(*(uint32_t *)nvp);
1266                 if (valtype != DATA_TYPE_BOOLEAN && nelm < 1)
1267                         return (1);
1268                 nvp += 4;
1269 
1270                 switch (valtype) {
1271                 case DATA_TYPE_BOOLEAN:
1272                         return (0);
1273 
1274                 case DATA_TYPE_STRING:
1275                         slen = BSWAP_32(*(uint32_t *)nvp);
1276                         nvp += 4;
1277                         grub_memmove(strval, nvp, slen);
1278                         strval[slen] = '\0';
1279                         return (0);
1280 
1281                 case DATA_TYPE_UINT64:
1282                         *intval = BSWAP_64(*(uint64_t *)nvp);
1283                         return (0);
1284 
1285                 case DATA_TYPE_NVLIST:
1286                         *(void **)val = (void *)nvp;
1287                         return (0);
1288 
1289                 case DATA_TYPE_NVLIST_ARRAY:
1290                         *(void **)val = (void *)nvp;
1291                         if (nelmp)
1292                                 *nelmp = nelm;
1293                         return (0);
1294                 }
1295         }
1296 
1297         return (1);
1298 }
1299 
/*
 * Find the nvpair called 'name' with type 'valtype' in a packed nvlist and
 * extract its value via nvpair_value().
 *
 * nvlist  - list to search (pointing at its nvl_version field)
 * name    - NUL-terminated name to look for; it must match a pair's name
 *           exactly, a pair whose name is merely a prefix of 'name' does
 *           not qualify
 * val     - passed through to nvpair_value()
 * valtype - required data type of the pair
 * nelmp   - passed through to nvpair_value()
 *
 * Return:
 *      0 - found and extracted
 *      1 - no such pair, or the value could not be extracted
 */
static int
nvlist_lookup_value(char *nvlist, char *name, void *val, int valtype,
    int *nelmp)
{
        char *nvpair;
        int want_len;

        if (name == NULL)
                return (1);
        want_len = grub_strlen(name);

        for (nvpair = nvlist_next_nvpair(nvlist, NULL);
            nvpair != NULL;
            nvpair = nvlist_next_nvpair(nvlist, nvpair)) {
                int name_len = BSWAP_32(*(uint32_t *)(nvpair + 4 * 2));
                char *nvp_name = nvpair + 4 * 3;

                /*
                 * Comparing only the stored name's length worth of bytes
                 * would let a stored "guid" satisfy a request for
                 * "guid_sum", so insist on equal lengths first.
                 */
                if (name_len != want_len)
                        continue;

                if ((grub_strncmp(nvp_name, name, name_len) == 0) &&
                    nvpair_type(nvpair) == valtype) {
                        return (nvpair_value(nvpair, val, valtype, nelmp));
                }
        }
        return (1);
}
1319 
1320 /*
1321  * Check if this vdev is online and is in a good state.
1322  */
1323 static int
1324 vdev_validate(char *nv)
1325 {
1326         uint64_t ival;
1327 
1328         if (nvlist_lookup_value(nv, ZPOOL_CONFIG_OFFLINE, &ival,
1329             DATA_TYPE_UINT64, NULL) == 0 ||
1330             nvlist_lookup_value(nv, ZPOOL_CONFIG_FAULTED, &ival,
1331             DATA_TYPE_UINT64, NULL) == 0 ||
1332             nvlist_lookup_value(nv, ZPOOL_CONFIG_REMOVED, &ival,
1333             DATA_TYPE_UINT64, NULL) == 0)
1334                 return (ERR_DEV_VALUES);
1335 
1336         return (0);
1337 }
1338 
1339 /*
1340  * Get a valid vdev pathname/devid from the boot device.
1341  * The caller should already allocate MAXPATHLEN memory for bootpath and devid.
1342  */
1343 static int
1344 vdev_get_bootpath(char *nv, uint64_t inguid, char *devid, char *bootpath,
1345     int is_spare)
1346 {
1347         char type[16];
1348 
1349         if (nvlist_lookup_value(nv, ZPOOL_CONFIG_TYPE, &type, DATA_TYPE_STRING,
1350             NULL))
1351                 return (ERR_FSYS_CORRUPT);
1352 
1353         if (grub_strcmp(type, VDEV_TYPE_DISK) == 0) {
1354                 uint64_t guid;
1355 
1356                 if (vdev_validate(nv) != 0)
1357                         return (ERR_NO_BOOTPATH);
1358 
1359                 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_GUID,
1360                     &guid, DATA_TYPE_UINT64, NULL) != 0)
1361                         return (ERR_NO_BOOTPATH);
1362 
1363                 if (guid != inguid)
1364                         return (ERR_NO_BOOTPATH);
1365 
1366                 /* for a spare vdev, pick the disk labeled with "is_spare" */
1367                 if (is_spare) {
1368                         uint64_t spare = 0;
1369                         (void) nvlist_lookup_value(nv, ZPOOL_CONFIG_IS_SPARE,
1370                             &spare, DATA_TYPE_UINT64, NULL);
1371                         if (!spare)
1372                                 return (ERR_NO_BOOTPATH);
1373                 }
1374 
1375                 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_PHYS_PATH,
1376                     bootpath, DATA_TYPE_STRING, NULL) != 0)
1377                         bootpath[0] = '\0';
1378 
1379                 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_DEVID,
1380                     devid, DATA_TYPE_STRING, NULL) != 0)
1381                         devid[0] = '\0';
1382 
1383                 if (grub_strlen(bootpath) >= MAXPATHLEN ||
1384                     grub_strlen(devid) >= MAXPATHLEN)
1385                         return (ERR_WONT_FIT);
1386 
1387                 return (0);
1388 
1389         } else if (grub_strcmp(type, VDEV_TYPE_MIRROR) == 0 ||
1390             grub_strcmp(type, VDEV_TYPE_REPLACING) == 0 ||
1391             (is_spare = (grub_strcmp(type, VDEV_TYPE_SPARE) == 0))) {
1392                 int nelm, i;
1393                 char *child;
1394 
1395                 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_CHILDREN, &child,
1396                     DATA_TYPE_NVLIST_ARRAY, &nelm))
1397                         return (ERR_FSYS_CORRUPT);
1398 
1399                 for (i = 0; i < nelm; i++) {
1400                         char *child_i;
1401 
1402                         child_i = nvlist_array(child, i);
1403                         if (vdev_get_bootpath(child_i, inguid, devid,
1404                             bootpath, is_spare) == 0)
1405                                 return (0);
1406                 }
1407         }
1408 
1409         return (ERR_NO_BOOTPATH);
1410 }
1411 
1412 /*
1413  * Check the disk label information and retrieve needed vdev name-value pairs.
1414  *
1415  * Return:
1416  *      0 - success
1417  *      ERR_* - failure
1418  */
1419 static int
1420 check_pool_label(uint64_t sector, char *stack, char *outdevid,
1421     char *outpath, uint64_t *outguid, uint64_t *outashift, uint64_t *outversion)
1422 {
1423         vdev_phys_t *vdev;
1424         uint64_t pool_state, txg = 0;
1425         char *nvlist, *nv, *features;
1426         uint64_t diskguid;
1427 
1428         sector += (VDEV_SKIP_SIZE >> SPA_MINBLOCKSHIFT);
1429 
1430         /* Read in the vdev name-value pair list (112K). */
1431         if (devread(sector, 0, VDEV_PHYS_SIZE, stack) == 0)
1432                 return (ERR_READ);
1433 
1434         vdev = (vdev_phys_t *)stack;
1435         stack += sizeof (vdev_phys_t);
1436 
1437         if (nvlist_unpack(vdev->vp_nvlist, &nvlist))
1438                 return (ERR_FSYS_CORRUPT);
1439 
1440         if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_POOL_STATE, &pool_state,
1441             DATA_TYPE_UINT64, NULL))
1442                 return (ERR_FSYS_CORRUPT);
1443 
1444         if (pool_state == POOL_STATE_DESTROYED)
1445                 return (ERR_FILESYSTEM_NOT_FOUND);
1446 
1447         if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_POOL_NAME,
1448             current_rootpool, DATA_TYPE_STRING, NULL))
1449                 return (ERR_FSYS_CORRUPT);
1450 
1451         if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_POOL_TXG, &txg,
1452             DATA_TYPE_UINT64, NULL))
1453                 return (ERR_FSYS_CORRUPT);
1454 
1455         /* not an active device */
1456         if (txg == 0)
1457                 return (ERR_NO_BOOTPATH);
1458 
1459         if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_VERSION, outversion,
1460             DATA_TYPE_UINT64, NULL))
1461                 return (ERR_FSYS_CORRUPT);
1462         if (!SPA_VERSION_IS_SUPPORTED(*outversion))
1463                 return (ERR_NEWER_VERSION);
1464         if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_VDEV_TREE, &nv,
1465             DATA_TYPE_NVLIST, NULL))
1466                 return (ERR_FSYS_CORRUPT);
1467         if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_GUID, &diskguid,
1468             DATA_TYPE_UINT64, NULL))
1469                 return (ERR_FSYS_CORRUPT);
1470         if (nvlist_lookup_value(nv, ZPOOL_CONFIG_ASHIFT, outashift,
1471             DATA_TYPE_UINT64, NULL) != 0)
1472                 return (ERR_FSYS_CORRUPT);
1473         if (vdev_get_bootpath(nv, diskguid, outdevid, outpath, 0))
1474                 return (ERR_NO_BOOTPATH);
1475         if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_POOL_GUID, outguid,
1476             DATA_TYPE_UINT64, NULL))
1477                 return (ERR_FSYS_CORRUPT);
1478 
1479         if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_FEATURES_FOR_READ,
1480             &features, DATA_TYPE_NVLIST, NULL) == 0) {
1481                 char *nvp;
1482                 char *name = stack;
1483                 stack += MAXNAMELEN;
1484 
1485                 for (nvp = nvlist_next_nvpair(features, NULL);
1486                     nvp != NULL;
1487                     nvp = nvlist_next_nvpair(features, nvp)) {
1488                         zap_attribute_t za;
1489 
1490                         if (nvpair_name(nvp, name, MAXNAMELEN) != 0)
1491                                 return (ERR_FSYS_CORRUPT);
1492 
1493                         za.za_integer_length = 8;
1494                         za.za_num_integers = 1;
1495                         za.za_first_integer = 1;
1496                         za.za_name = name;
1497                         if (check_feature(&za, spa_feature_names, stack) != 0)
1498                                 return (ERR_NEWER_VERSION);
1499                 }
1500         }
1501 
1502         return (0);
1503 }
1504 
1505 /*
1506  * zfs_mount() locates a valid uberblock of the root pool and read in its MOS
1507  * to the memory address MOS.
1508  *
1509  * Return:
1510  *      1 - success
1511  *      0 - failure
1512  */
1513 int
1514 zfs_mount(void)
1515 {
1516         char *stack, *ub_array;
1517         int label = 0;
1518         uberblock_t *ubbest;
1519         objset_phys_t *osp;
1520         char tmp_bootpath[MAXNAMELEN];
1521         char tmp_devid[MAXNAMELEN];
1522         uint64_t tmp_guid, ashift, version;
1523         uint64_t adjpl = (uint64_t)part_length << SPA_MINBLOCKSHIFT;
1524         int err = errnum; /* preserve previous errnum state */
1525 
1526         /* if it's our first time here, zero the best uberblock out */
1527         if (best_drive == 0 && best_part == 0 && find_best_root) {
1528                 grub_memset(&current_uberblock, 0, sizeof (uberblock_t));
1529                 pool_guid = 0;
1530         }
1531 
1532         stackbase = ZFS_SCRATCH;
1533         stack = stackbase;
1534         ub_array = stack;
1535         stack += VDEV_UBERBLOCK_RING;
1536 
1537         osp = (objset_phys_t *)stack;
1538         stack += sizeof (objset_phys_t);
1539         adjpl = P2ALIGN(adjpl, (uint64_t)sizeof (vdev_label_t));
1540 
1541         for (label = 0; label < VDEV_LABELS; label++) {
1542 
1543                 /*
1544                  * some eltorito stacks don't give us a size and
1545                  * we end up setting the size to MAXUINT, further
1546                  * some of these devices stop working once a single
1547                  * read past the end has been issued. Checking
1548                  * for a maximum part_length and skipping the backup
1549                  * labels at the end of the slice/partition/device
1550                  * avoids breaking down on such devices.
1551                  */
1552                 if (part_length == MAXUINT && label == 2)
1553                         break;
1554 
1555                 uint64_t sector = vdev_label_start(adjpl,
1556                     label) >> SPA_MINBLOCKSHIFT;
1557 
1558                 /* Read in the uberblock ring (128K). */
1559                 if (devread(sector  +
1560                     ((VDEV_SKIP_SIZE + VDEV_PHYS_SIZE) >> SPA_MINBLOCKSHIFT),
1561                     0, VDEV_UBERBLOCK_RING, ub_array) == 0)
1562                         continue;
1563 
1564                 if (check_pool_label(sector, stack, tmp_devid,
1565                     tmp_bootpath, &tmp_guid, &ashift, &version))
1566                         continue;
1567 
1568                 if (pool_guid == 0)
1569                         pool_guid = tmp_guid;
1570 
1571                 if ((ubbest = find_bestub(ub_array, ashift, sector)) == NULL ||
1572                     zio_read(&ubbest->ub_rootbp, osp, stack) != 0)
1573                         continue;
1574 
1575                 VERIFY_OS_TYPE(osp, DMU_OST_META);
1576 
1577                 if (version >= SPA_VERSION_FEATURES &&
1578                     check_mos_features(&osp->os_meta_dnode, stack) != 0)
1579                         continue;
1580 
1581                 if (find_best_root && ((pool_guid != tmp_guid) ||
1582                     vdev_uberblock_compare(ubbest, &(current_uberblock)) <= 0))
1583                         continue;
1584 
1585                 /* Got the MOS. Save it at the memory addr MOS. */
1586                 grub_memmove(MOS, &osp->os_meta_dnode, DNODE_SIZE);
1587                 grub_memmove(&current_uberblock, ubbest, sizeof (uberblock_t));
1588                 grub_memmove(current_bootpath, tmp_bootpath, MAXNAMELEN);
1589                 grub_memmove(current_devid, tmp_devid, grub_strlen(tmp_devid));
1590                 is_zfs_mount = 1;
1591                 return (1);
1592         }
1593 
1594         /*
1595          * While some fs impls. (tftp) rely on setting and keeping
1596          * global errnums set, others won't reset it and will break
1597          * when issuing rawreads. The goal here is to simply not
1598          * have zfs mount attempts impact the previous state.
1599          */
1600         errnum = err;
1601         return (0);
1602 }
1603 
1604 /*
1605  * zfs_open() locates a file in the rootpool by following the
1606  * MOS and places the dnode of the file in the memory address DNODE.
1607  *
1608  * Return:
1609  *      1 - success
1610  *      0 - failure
1611  */
1612 int
1613 zfs_open(char *filename)
1614 {
1615         char *stack;
1616         dnode_phys_t *mdn;
1617 
1618         file_buf = NULL;
1619         stackbase = ZFS_SCRATCH;
1620         stack = stackbase;
1621 
1622         mdn = (dnode_phys_t *)stack;
1623         stack += sizeof (dnode_phys_t);
1624 
1625         dnode_mdn = NULL;
1626         dnode_buf = (dnode_phys_t *)stack;
1627         stack += 1<<DNODE_BLOCK_SHIFT;
1628 
1629         /*
1630          * menu.lst is placed at the root pool filesystem level,
1631          * do not goto 'current_bootfs'.
1632          */
1633         if (is_top_dataset_file(filename)) {
1634                 if (errnum = get_objset_mdn(MOS, NULL, NULL, mdn, stack))
1635                         return (0);
1636 
1637                 current_bootfs_obj = 0;
1638         } else {
1639                 if (current_bootfs[0] == '\0') {
1640                         /* Get the default root filesystem object number */
1641                         if (errnum = get_default_bootfsobj(MOS,
1642                             &current_bootfs_obj, stack))
1643                                 return (0);
1644 
1645                         if (errnum = get_objset_mdn(MOS, NULL,
1646                             &current_bootfs_obj, mdn, stack))
1647                                 return (0);
1648                 } else {
1649                         if (errnum = get_objset_mdn(MOS, current_bootfs,
1650                             &current_bootfs_obj, mdn, stack)) {
1651                                 grub_memset(current_bootfs, 0, MAXNAMELEN);
1652                                 return (0);
1653                         }
1654                 }
1655         }
1656 
1657         if (dnode_get_path(mdn, filename, DNODE, stack)) {
1658                 errnum = ERR_FILE_NOT_FOUND;
1659                 return (0);
1660         }
1661 
1662         /* get the file size and set the file position to 0 */
1663 
1664         /*
1665          * For DMU_OT_SA we will need to locate the SIZE attribute
1666          * attribute, which could be either in the bonus buffer
1667          * or the "spill" block.
1668          */
1669         if (DNODE->dn_bonustype == DMU_OT_SA) {
1670                 sa_hdr_phys_t *sahdrp;
1671                 int hdrsize;
1672 
1673                 if (DNODE->dn_bonuslen != 0) {
1674                         sahdrp = (sa_hdr_phys_t *)DN_BONUS(DNODE);
1675                 } else {
1676                         if (DNODE->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
1677                                 blkptr_t *bp = &DNODE->dn_spill;
1678                                 void *buf;
1679 
1680                                 buf = (void *)stack;
1681                                 stack += BP_GET_LSIZE(bp);
1682 
1683                                 /* reset errnum to rawread() failure */
1684                                 errnum = 0;
1685                                 if (zio_read(bp, buf, stack) != 0) {
1686                                         return (0);
1687                                 }
1688                                 sahdrp = buf;
1689                         } else {
1690                                 errnum = ERR_FSYS_CORRUPT;
1691                                 return (0);
1692                         }
1693                 }
1694                 hdrsize = SA_HDR_SIZE(sahdrp);
1695                 filemax = *(uint64_t *)((char *)sahdrp + hdrsize +
1696                     SA_SIZE_OFFSET);
1697         } else {
1698                 filemax = ((znode_phys_t *)DN_BONUS(DNODE))->zp_size;
1699         }
1700         filepos = 0;
1701 
1702         dnode_buf = NULL;
1703         return (1);
1704 }
1705 
1706 /*
1707  * zfs_read reads in the data blocks pointed by the DNODE.
1708  *
1709  * Return:
1710  *      len - the length successfully read in to the buffer
1711  *      0   - failure
1712  */
1713 int
1714 zfs_read(char *buf, int len)
1715 {
1716         char *stack;
1717         int blksz, length, movesize;
1718 
1719         if (file_buf == NULL) {
1720                 file_buf = stackbase;
1721                 stackbase += SPA_MAXBLOCKSIZE;
1722                 file_start = file_end = 0;
1723         }
1724         stack = stackbase;
1725 
1726         /*
1727          * If offset is in memory, move it into the buffer provided and return.
1728          */
1729         if (filepos >= file_start && filepos+len <= file_end) {
1730                 grub_memmove(buf, file_buf + filepos - file_start, len);
1731                 filepos += len;
1732                 return (len);
1733         }
1734 
1735         blksz = DNODE->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1736 
1737         /*
1738          * Entire Dnode is too big to fit into the space available.  We
1739          * will need to read it in chunks.  This could be optimized to
1740          * read in as large a chunk as there is space available, but for
1741          * now, this only reads in one data block at a time.
1742          */
1743         length = len;
1744         while (length) {
1745                 /*
1746                  * Find requested blkid and the offset within that block.
1747                  */
1748                 uint64_t blkid = filepos / blksz;
1749 
1750                 if (errnum = dmu_read(DNODE, blkid, file_buf, stack))
1751                         return (0);
1752 
1753                 file_start = blkid * blksz;
1754                 file_end = file_start + blksz;
1755 
1756                 movesize = MIN(length, file_end - filepos);
1757 
1758                 grub_memmove(buf, file_buf + filepos - file_start,
1759                     movesize);
1760                 buf += movesize;
1761                 length -= movesize;
1762                 filepos += movesize;
1763         }
1764 
1765         return (len);
1766 }
1767 
/*
 * No-Op: both arguments are ignored and 1 is always returned.
 */
int
zfs_embed(int *start_sector, int needed_sectors)
{
        (void) start_sector;
        (void) needed_sectors;

        return (1);
}
1776 
1777 #endif /* FSYS_ZFS */