illumos-gate New usr/src/uts/i86pc/dboot/dboot

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright 2012 Joyent, Inc.  All rights reserved.
  27  */
  28 
  29 
  30 #include <sys/types.h>
  31 #include <sys/machparam.h>
  32 #include <sys/x86_archext.h>
  33 #include <sys/systm.h>
  34 #include <sys/mach_mmu.h>
  35 #include <sys/multiboot.h>
  36 #include <sys/sha1.h>
  37 
  38 #if defined(__xpv)
  39 
  40 #include <sys/hypervisor.h>
  41 uintptr_t xen_virt_start;
  42 pfn_t *mfn_to_pfn_mapping;
  43 
  44 #else /* !__xpv */
  45 
  46 extern multiboot_header_t mb_header;
  47 extern int have_cpuid(void);
  48 
  49 #endif /* !__xpv */
  50 
  51 #include <sys/inttypes.h>
  52 #include <sys/bootinfo.h>
  53 #include <sys/mach_mmu.h>
  54 #include <sys/boot_console.h>
  55 
  56 #include "dboot_asm.h"
  57 #include "dboot_printf.h"
  58 #include "dboot_xboot.h"
  59 #include "dboot_elfload.h"
  60 
  61 #define SHA1_ASCII_LENGTH       (SHA1_DIGEST_LENGTH * 2)
  62 
  63 /*
  64  * This file contains code that runs to transition us from either a multiboot
  65  * compliant loader (32 bit non-paging) or a XPV domain loader to
  66  * regular kernel execution. Its task is to setup the kernel memory image
  67  * and page tables.
  68  *
  69  * The code executes as:
  70  *      - 32 bits under GRUB (for 32 or 64 bit Solaris)
  71  *      - a 32 bit program for the 32-bit PV hypervisor
  72  *      - a 64 bit program for the 64-bit PV hypervisor (at least for now)
  73  *
  74  * Under the PV hypervisor, we must create mappings for any memory beyond the
  75  * initial start of day allocation (such as the kernel itself).
  76  *
  77  * When on the metal, the mapping between maddr_t and paddr_t is 1:1.
   78  * Since we are running in real mode, all such memory is accessible.
  79  */
  80 
  81 /*
  82  * Standard bits used in PTE (page level) and PTP (internal levels)
  83  */
  84 x86pte_t ptp_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_USER;
  85 x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST;
  86 
  87 /*
   88  * This is the target address (physical) where the kernel text and data
  89  * nucleus pages will be unpacked. On the hypervisor this is actually a
  90  * virtual address.
  91  */
  92 paddr_t ktext_phys;
  93 uint32_t ksize = 2 * FOUR_MEG;  /* kernel nucleus is 8Meg */
  94 
  95 static uint64_t target_kernel_text;     /* value to use for KERNEL_TEXT */
  96 
  97 /*
  98  * The stack is setup in assembler before entering startup_kernel()
  99  */
 100 char stack_space[STACK_SIZE];
 101 
 102 /*
 103  * Used to track physical memory allocation
 104  */
 105 static paddr_t next_avail_addr = 0;
 106 
 107 #if defined(__xpv)
 108 /*
 109  * Additional information needed for hypervisor memory allocation.
 110  * Only memory up to scratch_end is mapped by page tables.
 111  * mfn_base is the start of the hypervisor virtual image. It's ONE_GIG, so
 112  * to derive a pfn from a pointer, you subtract mfn_base.
 113  */
 114 
 115 static paddr_t scratch_end = 0; /* we can't write all of mem here */
 116 static paddr_t mfn_base;                /* addr corresponding to mfn_list[0] */
 117 start_info_t *xen_info;
 118 
 119 #else   /* __xpv */
 120 
 121 /*
 122  * If on the metal, then we have a multiboot loader.
 123  */
 124 multiboot_info_t *mb_info;
 125 
 126 #endif  /* __xpv */
 127 
 128 /*
 129  * This contains information passed to the kernel
 130  */
 131 struct xboot_info boot_info[2]; /* extra space to fix alignement for amd64 */
 132 struct xboot_info *bi;
 133 
 134 /*
 135  * Page table and memory stuff.
 136  */
 137 static paddr_t max_mem;                 /* maximum memory address */
 138 
 139 /*
 140  * Information about processor MMU
 141  */
 142 int amd64_support = 0;
 143 int largepage_support = 0;
 144 int pae_support = 0;
 145 int pge_support = 0;
 146 int NX_support = 0;
 147 
 148 /*
 149  * Low 32 bits of kernel entry address passed back to assembler.
 150  * When running a 64 bit kernel, the high 32 bits are 0xffffffff.
 151  */
 152 uint32_t entry_addr_low;
 153 
 154 /*
 155  * Memlists for the kernel. We shouldn't need a lot of these.
 156  */
 157 #define MAX_MEMLIST (50)
 158 struct boot_memlist memlists[MAX_MEMLIST];
 159 uint_t memlists_used = 0;
 160 struct boot_memlist pcimemlists[MAX_MEMLIST];
 161 uint_t pcimemlists_used = 0;
 162 struct boot_memlist rsvdmemlists[MAX_MEMLIST];
 163 uint_t rsvdmemlists_used = 0;
 164 
 165 #define MAX_MODULES (10)
 166 struct boot_modules modules[MAX_MODULES];
 167 uint_t modules_used = 0;
 168 
 169 /*
 170  * Debugging macros
 171  */
 172 uint_t prom_debug = 0;
 173 uint_t map_debug = 0;
 174 
 175 /*
 176  * Either hypervisor-specific or grub-specific code builds the initial
 177  * memlists. This code does the sort/merge/link for final use.
 178  */
 179 static void
 180 sort_physinstall(void)
 181 {
 182         int i;
 183 #if !defined(__xpv)
 184         int j;
 185         struct boot_memlist tmp;
 186 
 187         /*
 188          * Now sort the memlists, in case they weren't in order.
 189          * Yeah, this is a bubble sort; small, simple and easy to get right.
 190          */
 191         DBG_MSG("Sorting phys-installed list\n");
 192         for (j = memlists_used - 1; j > 0; --j) {
 193                 for (i = 0; i < j; ++i) {
 194                         if (memlists[i].addr < memlists[i + 1].addr)
 195                                 continue;
 196                         tmp = memlists[i];
 197                         memlists[i] = memlists[i + 1];
 198                         memlists[i + 1] = tmp;
 199                 }
 200         }
 201 
 202         /*
 203          * Merge any memlists that don't have holes between them.
 204          */
 205         for (i = 0; i <= memlists_used - 1; ++i) {
 206                 if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr)
 207                         continue;
 208 
 209                 if (prom_debug)
 210                         dboot_printf(
 211                             "merging mem segs %" PRIx64 "...%" PRIx64
 212                             " w/ %" PRIx64 "...%" PRIx64 "\n",
 213                             memlists[i].addr,
 214                             memlists[i].addr + memlists[i].size,
 215                             memlists[i + 1].addr,
 216                             memlists[i + 1].addr + memlists[i + 1].size);
 217 
 218                 memlists[i].size += memlists[i + 1].size;
 219                 for (j = i + 1; j < memlists_used - 1; ++j)
 220                         memlists[j] = memlists[j + 1];
 221                 --memlists_used;
 222                 DBG(memlists_used);
 223                 --i;    /* after merging we need to reexamine, so do this */
 224         }
 225 #endif  /* __xpv */
 226 
 227         if (prom_debug) {
 228                 dboot_printf("\nFinal memlists:\n");
 229                 for (i = 0; i < memlists_used; ++i) {
 230                         dboot_printf("\t%d: addr=%" PRIx64 " size=%"
 231                             PRIx64 "\n", i, memlists[i].addr, memlists[i].size);
 232                 }
 233         }
 234 
 235         /*
 236          * link together the memlists with native size pointers
 237          */
 238         memlists[0].next = 0;
 239         memlists[0].prev = 0;
 240         for (i = 1; i < memlists_used; ++i) {
 241                 memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1);
 242                 memlists[i].next = 0;
 243                 memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i);
 244         }
 245         bi->bi_phys_install = (native_ptr_t)(uintptr_t)memlists;
 246         DBG(bi->bi_phys_install);
 247 }
 248 
 249 /*
 250  * build bios reserved memlists
 251  */
 252 static void
 253 build_rsvdmemlists(void)
 254 {
 255         int i;
 256 
 257         rsvdmemlists[0].next = 0;
 258         rsvdmemlists[0].prev = 0;
 259         for (i = 1; i < rsvdmemlists_used; ++i) {
 260                 rsvdmemlists[i].prev =
 261                     (native_ptr_t)(uintptr_t)(rsvdmemlists + i - 1);
 262                 rsvdmemlists[i].next = 0;
 263                 rsvdmemlists[i - 1].next =
 264                     (native_ptr_t)(uintptr_t)(rsvdmemlists + i);
 265         }
 266         bi->bi_rsvdmem = (native_ptr_t)(uintptr_t)rsvdmemlists;
 267         DBG(bi->bi_rsvdmem);
 268 }
 269 
 270 #if defined(__xpv)
 271 
 272 /*
 273  * halt on the hypervisor after a delay to drain console output
 274  */
 275 void
 276 dboot_halt(void)
 277 {
 278         uint_t i = 10000;
 279 
 280         while (--i)
 281                 (void) HYPERVISOR_yield();
 282         (void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
 283 }
 284 
 285 /*
 286  * From a machine address, find the corresponding pseudo-physical address.
 287  * Pseudo-physical address are contiguous and run from mfn_base in each VM.
 288  * Machine addresses are the real underlying hardware addresses.
 289  * These are needed for page table entries. Note that this routine is
 290  * poorly protected. A bad value of "ma" will cause a page fault.
 291  */
 292 paddr_t
 293 ma_to_pa(maddr_t ma)
 294 {
 295         ulong_t pgoff = ma & MMU_PAGEOFFSET;
 296         ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)];
 297         paddr_t pa;
 298 
 299         if (pfn >= xen_info->nr_pages)
 300                 return (-(paddr_t)1);
 301         pa = mfn_base + mmu_ptob((paddr_t)pfn) + pgoff;
 302 #ifdef DEBUG
 303         if (ma != pa_to_ma(pa))
 304                 dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", "
 305                     "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa));
 306 #endif
 307         return (pa);
 308 }
 309 
 310 /*
 311  * From a pseudo-physical address, find the corresponding machine address.
 312  */
 313 maddr_t
 314 pa_to_ma(paddr_t pa)
 315 {
 316         pfn_t pfn;
 317         ulong_t mfn;
 318 
 319         pfn = mmu_btop(pa - mfn_base);
 320         if (pa < mfn_base || pfn >= xen_info->nr_pages)
 321                 dboot_panic("pa_to_ma(): illegal address 0x%lx", (ulong_t)pa);
 322         mfn = ((ulong_t *)xen_info->mfn_list)[pfn];
 323 #ifdef DEBUG
 324         if (mfn_to_pfn_mapping[mfn] != pfn)
 325                 dboot_printf("pa_to_ma(pfn=%lx) got %lx ma_to_pa() says %lx\n",
 326                     pfn, mfn, mfn_to_pfn_mapping[mfn]);
 327 #endif
 328         return (mfn_to_ma(mfn) | (pa & MMU_PAGEOFFSET));
 329 }
 330 
 331 #endif  /* __xpv */
 332 
 333 x86pte_t
 334 get_pteval(paddr_t table, uint_t index)
 335 {
 336         if (pae_support)
 337                 return (((x86pte_t *)(uintptr_t)table)[index]);
 338         return (((x86pte32_t *)(uintptr_t)table)[index]);
 339 }
 340 
 341 /*ARGSUSED*/
 342 void
 343 set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
 344 {
 345 #ifdef __xpv
 346         mmu_update_t t;
 347         maddr_t mtable = pa_to_ma(table);
 348         int retcnt;
 349 
 350         t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE;
 351         t.val = pteval;
 352         if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1)
 353                 dboot_panic("HYPERVISOR_mmu_update() failed");
 354 #else /* __xpv */
 355         uintptr_t tab_addr = (uintptr_t)table;
 356 
 357         if (pae_support)
 358                 ((x86pte_t *)tab_addr)[index] = pteval;
 359         else
 360                 ((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval;
 361         if (level == top_level && level == 2)
 362                 reload_cr3();
 363 #endif /* __xpv */
 364 }
 365 
 366 paddr_t
 367 make_ptable(x86pte_t *pteval, uint_t level)
 368 {
 369         paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
 370 
 371         if (level == top_level && level == 2)
 372                 *pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
 373         else
 374                 *pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits;
 375 
 376 #ifdef __xpv
 377         /* Remove write permission to the new page table. */
 378         if (HYPERVISOR_update_va_mapping(new_table,
 379             *pteval & ~(x86pte_t)PT_WRITABLE, UVMF_INVLPG | UVMF_LOCAL))
 380                 dboot_panic("HYP_update_va_mapping error");
 381 #endif
 382 
 383         if (map_debug)
 384                 dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
 385                     PRIx64 "\n", level, (ulong_t)new_table, *pteval);
 386         return (new_table);
 387 }
 388 
 389 x86pte_t *
 390 map_pte(paddr_t table, uint_t index)
 391 {
 392         return ((x86pte_t *)(uintptr_t)(table + index * pte_size));
 393 }
 394 
 395 /*
 396  * dump out the contents of page tables...
 397  */
 398 static void
 399 dump_tables(void)
 400 {
 401         uint_t save_index[4];   /* for recursion */
 402         char *save_table[4];    /* for recursion */
 403         uint_t  l;
 404         uint64_t va;
 405         uint64_t pgsize;
 406         int index;
 407         int i;
 408         x86pte_t pteval;
 409         char *table;
 410         static char *tablist = "\t\t\t";
 411         char *tabs = tablist + 3 - top_level;
 412         uint_t pa, pa1;
 413 #if !defined(__xpv)
 414 #define maddr_t paddr_t
 415 #endif /* !__xpv */
 416 
 417         dboot_printf("Finished pagetables:\n");
 418         table = (char *)(uintptr_t)top_page_table;
 419         l = top_level;
 420         va = 0;
 421         for (index = 0; index < ptes_per_table; ++index) {
 422                 pgsize = 1ull << shift_amt[l];
 423                 if (pae_support)
 424                         pteval = ((x86pte_t *)table)[index];
 425                 else
 426                         pteval = ((x86pte32_t *)table)[index];
 427                 if (pteval == 0)
 428                         goto next_entry;
 429 
 430                 dboot_printf("%s %p[0x%x] = %" PRIx64 ", va=%" PRIx64,
 431                     tabs + l, (void *)table, index, (uint64_t)pteval, va);
 432                 pa = ma_to_pa(pteval & MMU_PAGEMASK);
 433                 dboot_printf(" physaddr=%x\n", pa);
 434 
 435                 /*
 436                  * Don't try to walk hypervisor private pagetables
 437                  */
 438                 if ((l > 1 || (l == 1 && (pteval & PT_PAGESIZE) == 0))) {
 439                         save_table[l] = table;
 440                         save_index[l] = index;
 441                         --l;
 442                         index = -1;
 443                         table = (char *)(uintptr_t)
 444                             ma_to_pa(pteval & MMU_PAGEMASK);
 445                         goto recursion;
 446                 }
 447 
 448                 /*
 449                  * shorten dump for consecutive mappings
 450                  */
 451                 for (i = 1; index + i < ptes_per_table; ++i) {
 452                         if (pae_support)
 453                                 pteval = ((x86pte_t *)table)[index + i];
 454                         else
 455                                 pteval = ((x86pte32_t *)table)[index + i];
 456                         if (pteval == 0)
 457                                 break;
 458                         pa1 = ma_to_pa(pteval & MMU_PAGEMASK);
 459                         if (pa1 != pa + i * pgsize)
 460                                 break;
 461                 }
 462                 if (i > 2) {
 463                         dboot_printf("%s...\n", tabs + l);
 464                         va += pgsize * (i - 2);
 465                         index += i - 2;
 466                 }
 467 next_entry:
 468                 va += pgsize;
 469                 if (l == 3 && index == 256)     /* VA hole */
 470                         va = 0xffff800000000000ull;
 471 recursion:
 472                 ;
 473         }
 474         if (l < top_level) {
 475                 ++l;
 476                 index = save_index[l];
 477                 table = save_table[l];
 478                 goto recursion;
 479         }
 480 }
 481 
 482 /*
 483  * Add a mapping for the machine page at the given virtual address.
 484  */
 485 static void
 486 map_ma_at_va(maddr_t ma, native_ptr_t va, uint_t level)
 487 {
 488         x86pte_t *ptep;
 489         x86pte_t pteval;
 490 
 491         pteval = ma | pte_bits;
 492         if (level > 0)
 493                 pteval |= PT_PAGESIZE;
 494         if (va >= target_kernel_text && pge_support)
 495                 pteval |= PT_GLOBAL;
 496 
 497         if (map_debug && ma != va)
 498                 dboot_printf("mapping ma=0x%" PRIx64 " va=0x%" PRIx64
 499                     " pte=0x%" PRIx64 " l=%d\n",
 500                     (uint64_t)ma, (uint64_t)va, pteval, level);
 501 
 502 #if defined(__xpv)
 503         /*
 504          * see if we can avoid find_pte() on the hypervisor
 505          */
 506         if (HYPERVISOR_update_va_mapping(va, pteval,
 507             UVMF_INVLPG | UVMF_LOCAL) == 0)
 508                 return;
 509 #endif
 510 
 511         /*
 512          * Find the pte that will map this address. This creates any
 513          * missing intermediate level page tables
 514          */
 515         ptep = find_pte(va, NULL, level, 0);
 516 
 517         /*
 518          * When paravirtualized, we must use hypervisor calls to modify the
 519          * PTE, since paging is active. On real hardware we just write to
 520          * the pagetables which aren't in use yet.
 521          */
 522 #if defined(__xpv)
 523         ptep = ptep;    /* shut lint up */
 524         if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
 525                 dboot_panic("mmu_update failed-map_pa_at_va va=0x%" PRIx64
 526                     " l=%d ma=0x%" PRIx64 ", pte=0x%" PRIx64 "",
 527                     (uint64_t)va, level, (uint64_t)ma, pteval);
 528 #else
 529         if (va < 1024 * 1024)
 530                 pteval |= PT_NOCACHE;           /* for video RAM */
 531         if (pae_support)
 532                 *ptep = pteval;
 533         else
 534                 *((x86pte32_t *)ptep) = (x86pte32_t)pteval;
 535 #endif
 536 }
 537 
 538 /*
 539  * Add a mapping for the physical page at the given virtual address.
 540  */
 541 static void
 542 map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level)
 543 {
 544         map_ma_at_va(pa_to_ma(pa), va, level);
 545 }
 546 
 547 /*
 548  * This is called to remove start..end from the
 549  * possible range of PCI addresses.
 550  */
 551 const uint64_t pci_lo_limit = 0x00100000ul;
 552 const uint64_t pci_hi_limit = 0xfff00000ul;
 553 static void
 554 exclude_from_pci(uint64_t start, uint64_t end)
 555 {
 556         int i;
 557         int j;
 558         struct boot_memlist *ml;
 559 
 560         for (i = 0; i < pcimemlists_used; ++i) {
 561                 ml = &pcimemlists[i];
 562 
 563                 /* delete the entire range? */
 564                 if (start <= ml->addr && ml->addr + ml->size <= end) {
 565                         --pcimemlists_used;
 566                         for (j = i; j < pcimemlists_used; ++j)
 567                                 pcimemlists[j] = pcimemlists[j + 1];
 568                         --i;    /* to revisit the new one at this index */
 569                 }
 570 
 571                 /* split a range? */
 572                 else if (ml->addr < start && end < ml->addr + ml->size) {
 573 
 574                         ++pcimemlists_used;
 575                         if (pcimemlists_used > MAX_MEMLIST)
 576                                 dboot_panic("too many pcimemlists");
 577 
 578                         for (j = pcimemlists_used - 1; j > i; --j)
 579                                 pcimemlists[j] = pcimemlists[j - 1];
 580                         ml->size = start - ml->addr;
 581 
 582                         ++ml;
 583                         ml->size = (ml->addr + ml->size) - end;
 584                         ml->addr = end;
 585                         ++i;    /* skip on to next one */
 586                 }
 587 
 588                 /* cut memory off the start? */
 589                 else if (ml->addr < end && end < ml->addr + ml->size) {
 590                         ml->size -= end - ml->addr;
 591                         ml->addr = end;
 592                 }
 593 
 594                 /* cut memory off the end? */
 595                 else if (ml->addr <= start && start < ml->addr + ml->size) {
 596                         ml->size = start - ml->addr;
 597                 }
 598         }
 599 }
 600 
 601 /*
 602  * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
 603  * definition in Xen source.
 604  */
 605 #ifdef __xpv
 606 typedef struct {
 607         uint32_t        base_addr_low;
 608         uint32_t        base_addr_high;
 609         uint32_t        length_low;
 610         uint32_t        length_high;
 611         uint32_t        type;
 612 } mmap_t;
 613 #else
 614 typedef mb_memory_map_t mmap_t;
 615 #endif
 616 
 617 static void
 618 build_pcimemlists(mmap_t *mem, int num)
 619 {
 620         mmap_t *mmap;
 621         uint64_t page_offset = MMU_PAGEOFFSET;  /* needs to be 64 bits */
 622         uint64_t start;
 623         uint64_t end;
 624         int i;
 625 
 626         /*
 627          * initialize
 628          */
 629         pcimemlists[0].addr = pci_lo_limit;
 630         pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
 631         pcimemlists_used = 1;
 632 
 633         /*
 634          * Fill in PCI memlists.
 635          */
 636         for (mmap = mem, i = 0; i < num; ++i, ++mmap) {
 637                 start = ((uint64_t)mmap->base_addr_high << 32) +
 638                     mmap->base_addr_low;
 639                 end = start + ((uint64_t)mmap->length_high << 32) +
 640                     mmap->length_low;
 641 
 642                 if (prom_debug)
 643                         dboot_printf("\ttype: %d %" PRIx64 "..%"
 644                             PRIx64 "\n", mmap->type, start, end);
 645 
 646                 /*
 647                  * page align start and end
 648                  */
 649                 start = (start + page_offset) & ~page_offset;
 650                 end &= ~page_offset;
 651                 if (end <= start)
 652                         continue;
 653 
 654                 exclude_from_pci(start, end);
 655         }
 656 
 657         /*
 658          * Finish off the pcimemlist
 659          */
 660         if (prom_debug) {
 661                 for (i = 0; i < pcimemlists_used; ++i) {
 662                         dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%"
 663                             PRIx64 "\n", pcimemlists[i].addr,
 664                             pcimemlists[i].addr + pcimemlists[i].size);
 665                 }
 666         }
 667         pcimemlists[0].next = 0;
 668         pcimemlists[0].prev = 0;
 669         for (i = 1; i < pcimemlists_used; ++i) {
 670                 pcimemlists[i].prev =
 671                     (native_ptr_t)(uintptr_t)(pcimemlists + i - 1);
 672                 pcimemlists[i].next = 0;
 673                 pcimemlists[i - 1].next =
 674                     (native_ptr_t)(uintptr_t)(pcimemlists + i);
 675         }
 676         bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
 677         DBG(bi->bi_pcimem);
 678 }
 679 
 680 #if defined(__xpv)
 681 /*
 682  * Initialize memory allocator stuff from hypervisor-supplied start info.
 683  *
 684  * There is 512KB of scratch area after the boot stack page.
 685  * We'll use that for everything except the kernel nucleus pages which are too
 686  * big to fit there and are allocated last anyway.
 687  */
 688 #define MAXMAPS 100
 689 static mmap_t map_buffer[MAXMAPS];
 690 static void
 691 init_mem_alloc(void)
 692 {
 693         int     local;  /* variables needed to find start region */
 694         paddr_t scratch_start;
 695         xen_memory_map_t map;
 696 
 697         DBG_MSG("Entered init_mem_alloc()\n");
 698 
 699         /*
 700          * Free memory follows the stack. There's at least 512KB of scratch
 701          * space, rounded up to at least 2Mb alignment.  That should be enough
 702          * for the page tables we'll need to build.  The nucleus memory is
 703          * allocated last and will be outside the addressible range.  We'll
 704          * switch to new page tables before we unpack the kernel
 705          */
 706         scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE);
 707         DBG(scratch_start);
 708         scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG);
 709         DBG(scratch_end);
 710 
 711         /*
 712          * For paranoia, leave some space between hypervisor data and ours.
 713          * Use 500 instead of 512.
 714          */
 715         next_avail_addr = scratch_end - 500 * 1024;
 716         DBG(next_avail_addr);
 717 
 718         /*
 719          * The domain builder gives us at most 1 module
 720          */
 721         DBG(xen_info->mod_len);
 722         if (xen_info->mod_len > 0) {
 723                 DBG(xen_info->mod_start);
 724                 modules[0].bm_addr = xen_info->mod_start;
 725                 modules[0].bm_size = xen_info->mod_len;
 726                 bi->bi_module_cnt = 1;
 727                 bi->bi_modules = (native_ptr_t)modules;
 728         } else {
 729                 bi->bi_module_cnt = 0;
 730                 bi->bi_modules = NULL;
 731         }
 732         DBG(bi->bi_module_cnt);
 733         DBG(bi->bi_modules);
 734 
 735         DBG(xen_info->mfn_list);
 736         DBG(xen_info->nr_pages);
 737         max_mem = (paddr_t)xen_info->nr_pages << MMU_PAGESHIFT;
 738         DBG(max_mem);
 739 
 740         /*
 741          * Using pseudo-physical addresses, so only 1 memlist element
 742          */
 743         memlists[0].addr = 0;
 744         DBG(memlists[0].addr);
 745         memlists[0].size = max_mem;
 746         DBG(memlists[0].size);
 747         memlists_used = 1;
 748         DBG(memlists_used);
 749 
 750         /*
 751          * finish building physinstall list
 752          */
 753         sort_physinstall();
 754 
 755         /*
 756          * build bios reserved memlists
 757          */
 758         build_rsvdmemlists();
 759 
 760         if (DOMAIN_IS_INITDOMAIN(xen_info)) {
 761                 /*
 762                  * build PCI Memory list
 763                  */
 764                 map.nr_entries = MAXMAPS;
 765                 /*LINTED: constant in conditional context*/
 766                 set_xen_guest_handle(map.buffer, map_buffer);
 767                 if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0)
 768                         dboot_panic("getting XENMEM_machine_memory_map failed");
 769                 build_pcimemlists(map_buffer, map.nr_entries);
 770         }
 771 }
 772 
 773 #else   /* !__xpv */
 774 
 775 static uint8_t
 776 dboot_a2h(char v)
 777 {
 778         if (v >= 'a')
 779                 return (v - 'a' + 0xa);
 780         else if (v >= 'A')
 781                 return (v - 'A' + 0xa);
 782         else if (v >= '0')
 783                 return (v - '0');
 784         else
 785                 dboot_panic("bad ASCII hex character %c\n", v);
 786 
 787         return (0);
 788 }
 789 
 790 static void
 791 digest_a2h(const char *ascii, uint8_t *digest)
 792 {
 793         unsigned int i;
 794 
 795         for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
 796                 digest[i] = dboot_a2h(ascii[i * 2]) << 4;
 797                 digest[i] |= dboot_a2h(ascii[i * 2 + 1]);
 798         }
 799 }
 800 
 801 /*
 802  * Generate a SHA-1 hash of the first len bytes of image, and compare it with
 803  * the ASCII-format hash found in the 40-byte buffer at ascii.  If they
 804  * match, return 0, otherwise -1.  This works only for images smaller than
 805  * 4 GB, which should not be a problem.
 806  */
 807 static int
 808 check_image_hash(const char *ascii, const void *image, size_t len)
 809 {
 810         SHA1_CTX ctx;
 811         uint8_t digest[SHA1_DIGEST_LENGTH];
 812         uint8_t baseline[SHA1_DIGEST_LENGTH];
 813         unsigned int i;
 814 
 815         digest_a2h(ascii, baseline);
 816 
 817         SHA1Init(&ctx);
 818         SHA1Update(&ctx, image, len);
 819         SHA1Final(digest, &ctx);
 820 
 821         for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
 822                 if (digest[i] != baseline[i])
 823                         return (-1);
 824         }
 825 
 826         return (0);
 827 }
 828 
 829 static void
 830 check_images(void)
 831 {
 832         int i;
 833         char *hashes;
 834         mb_module_t *mod, *hashmod;
 835         char *hash;
 836         char displayhash[SHA1_ASCII_LENGTH + 1];
 837         size_t hashlen;
 838         size_t len;
 839 
 840         /*
 841          * A brief note on lengths and sizes: GRUB, for reasons unknown, passes
 842          * the address of the last valid byte in a module plus 1 as mod_end.
 843          * This is of course a bug; the multiboot specification simply states
 844          * that mod_start and mod_end "contain the start and end addresses of
 845          * the boot module itself" which is pretty obviously not what GRUB is
 846          * doing.  However, fixing it requires that not only this code be
 847          * changed but also that other code consuming this value and values
 848          * derived from it be fixed, and that the kernel and GRUB must either
 849          * both have the bug or neither.  While there are a lot of combinations
 850          * that will work, there are also some that won't, so for simplicity
 851          * we'll just cope with the bug.  That means we won't actually hash the
 852          * byte at mod_end, and we will expect that mod_end for the hash file
 853          * itself is one greater than some multiple of 41 (40 bytes of ASCII
 854          * hash plus a newline for each module).
 855          */
 856 
 857         if (mb_info->mods_count > 1) {
 858                 mod = (mb_module_t *)mb_info->mods_addr;
 859                 hashmod = mod + (mb_info->mods_count - 1);
 860                 hashes = (char *)hashmod->mod_start;
 861                 hashlen = (size_t)(hashmod->mod_end - hashmod->mod_start);
 862                 hash = hashes;
 863                 if (prom_debug) {
 864                         dboot_printf("Hash module found at %lx size %lx\n",
 865                             (ulong_t)hashes, (ulong_t)hashlen);
 866                 }
 867         } else {
 868                 DBG_MSG("Skipping hash check; no hash module found.\n");
 869                 return;
 870         }
 871 
 872         for (mod = (mb_module_t *)(mb_info->mods_addr), i = 0;
 873             i < mb_info->mods_count - 1; ++mod, ++i) {
 874                 if ((hash - hashes) + SHA1_ASCII_LENGTH + 1 > hashlen) {
 875                         dboot_printf("Short hash module of length 0x%lx bytes; "
 876                             "skipping hash checks\n", (ulong_t)hashlen);
 877                         break;
 878                 }
 879 
 880                 (void) memcpy(displayhash, hash, SHA1_ASCII_LENGTH);
 881                 displayhash[SHA1_ASCII_LENGTH] = '\0';
 882                 if (prom_debug) {
 883                         dboot_printf("Checking hash for module %d [%s]: ",
 884                             i, displayhash);
 885                 }
 886 
 887                 len = mod->mod_end - mod->mod_start;      /* see above */
 888                 if (check_image_hash(hash, (void *)mod->mod_start, len) != 0) {
 889                         dboot_panic("SHA-1 hash mismatch on %s; expected %s\n",
 890                             (char *)mod->mod_name, displayhash);
 891                 } else {
 892                         DBG_MSG("OK\n");
 893                 }
 894                 hash += SHA1_ASCII_LENGTH + 1;
 895         }
 896 }
 897 
 898 /*
 899  * During memory allocation, find the highest address not used yet.
 900  */
 901 static void
 902 check_higher(paddr_t a)
 903 {
 904         if (a < next_avail_addr)
 905                 return;
 906         next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
 907         DBG(next_avail_addr);
 908 }
 909 
 910 /*
 911  * Walk through the module information finding the last used address.
 912  * The first available address will become the top level page table.
 913  *
 914  * We then build the phys_install memlist from the multiboot information.
 915  */
 916 static void
 917 init_mem_alloc(void)
 918 {
 919         mb_memory_map_t *mmap;
 920         mb_module_t *mod;
 921         uint64_t start;
 922         uint64_t end;
 923         uint64_t page_offset = MMU_PAGEOFFSET;  /* needs to be 64 bits */
 924         extern char _end[];
 925         int i;
 926 
 927         DBG_MSG("Entered init_mem_alloc()\n");
 928         DBG((uintptr_t)mb_info);
 929 
 930         if (mb_info->mods_count > MAX_MODULES) {
 931                 dboot_panic("Too many modules (%d) -- the maximum is %d.",
 932                     mb_info->mods_count, MAX_MODULES);
 933         }
 934         /*
 935          * search the modules to find the last used address
 936          * we'll build the module list while we're walking through here
 937          */
 938         DBG_MSG("\nFinding Modules\n");
 939         check_higher((paddr_t)(uintptr_t)&_end);
 940         for (mod = (mb_module_t *)(mb_info->mods_addr), i = 0;
 941             i < mb_info->mods_count;
 942             ++mod, ++i) {
 943                 if (prom_debug) {
 944                         dboot_printf("\tmodule #%d: %s at: 0x%lx, end 0x%lx\n",
 945                             i, (char *)(mod->mod_name),
 946                             (ulong_t)mod->mod_start, (ulong_t)mod->mod_end);
 947                 }
 948                 modules[i].bm_addr = mod->mod_start;
 949                 if (mod->mod_start > mod->mod_end) {
 950                         dboot_panic("module[%d]: Invalid module start address "
 951                             "(0x%llx)", i, (uint64_t)mod->mod_start);
 952                 }
 953                 modules[i].bm_size = mod->mod_end - mod->mod_start;
 954 
 955                 check_higher(mod->mod_end);
 956         }
 957         bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
 958         DBG(bi->bi_modules);
 959         bi->bi_module_cnt = mb_info->mods_count;
 960         DBG(bi->bi_module_cnt);
 961 
 962         check_images();
 963 
 964         /*
 965          * Walk through the memory map from multiboot and build our memlist
 966          * structures. Note these will have native format pointers.
 967          */
 968         DBG_MSG("\nFinding Memory Map\n");
 969         DBG(mb_info->flags);
 970         max_mem = 0;
 971         if (mb_info->flags & 0x40) {
 972                 int cnt = 0;
 973 
 974                 DBG(mb_info->mmap_addr);
 975                 DBG(mb_info->mmap_length);
 976                 check_higher(mb_info->mmap_addr + mb_info->mmap_length);
 977 
 978                 for (mmap = (mb_memory_map_t *)mb_info->mmap_addr;
 979                     (uint32_t)mmap < mb_info->mmap_addr + mb_info->mmap_length;
 980                     mmap = (mb_memory_map_t *)((uint32_t)mmap + mmap->size
 981                     + sizeof (mmap->size))) {
 982                         ++cnt;
 983                         start = ((uint64_t)mmap->base_addr_high << 32) +
 984                             mmap->base_addr_low;
 985                         end = start + ((uint64_t)mmap->length_high << 32) +
 986                             mmap->length_low;
 987 
 988                         if (prom_debug)
 989                                 dboot_printf("\ttype: %d %" PRIx64 "..%"
 990                                     PRIx64 "\n", mmap->type, start, end);
 991 
 992                         /*
 993                          * page align start and end
 994                          */
 995                         start = (start + page_offset) & ~page_offset;
 996                         end &= ~page_offset;
 997                         if (end <= start)
 998                                 continue;
 999 
1000                         /*
1001                          * only type 1 is usable RAM
1002                          */
1003                         switch (mmap->type) {
1004                         case 1:
1005                                 if (end > max_mem)
1006                                         max_mem = end;
1007                                 memlists[memlists_used].addr = start;
1008                                 memlists[memlists_used].size = end - start;
1009                                 ++memlists_used;
1010                                 if (memlists_used > MAX_MEMLIST)
1011                                         dboot_panic("too many memlists");
1012                                 break;
1013                         case 2:
1014                                 rsvdmemlists[rsvdmemlists_used].addr = start;
1015                                 rsvdmemlists[rsvdmemlists_used].size =
1016                                     end - start;
1017                                 ++rsvdmemlists_used;
1018                                 if (rsvdmemlists_used > MAX_MEMLIST)
1019                                         dboot_panic("too many rsvdmemlists");
1020                                 break;
1021                         default:
1022                                 continue;
1023                         }
1024                 }
1025                 build_pcimemlists((mb_memory_map_t *)mb_info->mmap_addr, cnt);
1026         } else if (mb_info->flags & 0x01) {
1027                 DBG(mb_info->mem_lower);
1028                 memlists[memlists_used].addr = 0;
1029                 memlists[memlists_used].size = mb_info->mem_lower * 1024;
1030                 ++memlists_used;
1031                 DBG(mb_info->mem_upper);
1032                 memlists[memlists_used].addr = 1024 * 1024;
1033                 memlists[memlists_used].size = mb_info->mem_upper * 1024;
1034                 ++memlists_used;
1035 
1036                 /*
1037                  * Old platform - assume I/O space at the end of memory.
1038                  */
1039                 pcimemlists[0].addr =
1040                     (mb_info->mem_upper * 1024) + (1024 * 1024);
1041                 pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr;
1042                 pcimemlists[0].next = 0;
1043                 pcimemlists[0].prev = 0;
1044                 bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
1045                 DBG(bi->bi_pcimem);
1046         } else {
1047                 dboot_panic("No memory info from boot loader!!!");
1048         }
1049 
1050         check_higher(bi->bi_cmdline);
1051 
1052         /*
1053          * finish processing the physinstall list
1054          */
1055         sort_physinstall();
1056 
1057         /*
1058          * build bios reserved mem lists
1059          */
1060         build_rsvdmemlists();
1061 }
1062 #endif /* !__xpv */
1063 
1064 /*
1065  * Simple memory allocator, allocates aligned physical memory.
1066  * Note that startup_kernel() only allocates memory, never frees.
1067  * Memory usage just grows in an upward direction.
1068  */
1069 static void *
1070 do_mem_alloc(uint32_t size, uint32_t align)
1071 {
1072         uint_t i;
1073         uint64_t best;
1074         uint64_t start;
1075         uint64_t end;
1076 
1077         /*
1078          * make sure size is a multiple of pagesize
1079          */
1080         size = RNDUP(size, MMU_PAGESIZE);
1081         next_avail_addr = RNDUP(next_avail_addr, align);
1082 
1083         /*
1084          * XXPV fixme joe
1085          *
1086          * a really large bootarchive that causes you to run out of memory
1087          * may cause this to blow up
1088          */
1089         /* LINTED E_UNEXPECTED_UINT_PROMOTION */
1090         best = (uint64_t)-size;
1091         for (i = 0; i < memlists_used; ++i) {
1092                 start = memlists[i].addr;
1093 #if defined(__xpv)
1094                 start += mfn_base;
1095 #endif
1096                 end = start + memlists[i].size;
1097 
1098                 /*
1099                  * did we find the desired address?
1100                  */
1101                 if (start <= next_avail_addr && next_avail_addr + size <= end) {
1102                         best = next_avail_addr;
1103                         goto done;
1104                 }
1105 
1106                 /*
1107                  * if not is this address the best so far?
1108                  */
1109                 if (start > next_avail_addr && start < best &&
1110                     RNDUP(start, align) + size <= end)
1111                         best = RNDUP(start, align);
1112         }
1113 
1114         /*
1115          * We didn't find exactly the address we wanted, due to going off the
1116          * end of a memory region. Return the best found memory address.
1117          */
1118 done:
1119         next_avail_addr = best + size;
1120 #if defined(__xpv)
1121         if (next_avail_addr > scratch_end)
1122                 dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: "
1123                     "0x%lx", (ulong_t)next_avail_addr,
1124                     (ulong_t)scratch_end);
1125 #endif
1126         (void) memset((void *)(uintptr_t)best, 0, size);
1127         return ((void *)(uintptr_t)best);
1128 }
1129 
1130 void *
1131 mem_alloc(uint32_t size)
1132 {
1133         return (do_mem_alloc(size, MMU_PAGESIZE));
1134 }
1135 
1136 
1137 /*
1138  * Build page tables to map all of memory used so far as well as the kernel.
1139  */
1140 static void
1141 build_page_tables(void)
1142 {
1143         uint32_t psize;
1144         uint32_t level;
1145         uint32_t off;
1146         uint64_t start;
1147 #if !defined(__xpv)
1148         uint32_t i;
1149         uint64_t end;
1150 #endif  /* __xpv */
1151 
1152         /*
1153          * If we're on metal, we need to create the top level pagetable.
1154          */
1155 #if defined(__xpv)
1156         top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base;
1157 #else /* __xpv */
1158         top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
1159 #endif /* __xpv */
1160         DBG((uintptr_t)top_page_table);
1161 
1162         /*
1163          * Determine if we'll use large mappings for kernel, then map it.
1164          */
1165         if (largepage_support) {
1166                 psize = lpagesize;
1167                 level = 1;
1168         } else {
1169                 psize = MMU_PAGESIZE;
1170                 level = 0;
1171         }
1172 
1173         DBG_MSG("Mapping kernel\n");
1174         DBG(ktext_phys);
1175         DBG(target_kernel_text);
1176         DBG(ksize);
1177         DBG(psize);
1178         for (off = 0; off < ksize; off += psize)
1179                 map_pa_at_va(ktext_phys + off, target_kernel_text + off, level);
1180 
1181         /*
1182          * The kernel will need a 1 page window to work with page tables
1183          */
1184         bi->bi_pt_window = (uintptr_t)mem_alloc(MMU_PAGESIZE);
1185         DBG(bi->bi_pt_window);
1186         bi->bi_pte_to_pt_window =
1187             (uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0);
1188         DBG(bi->bi_pte_to_pt_window);
1189 
1190 #if defined(__xpv)
1191         if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
1192                 /* If this is a domU we're done. */
1193                 DBG_MSG("\nPage tables constructed\n");
1194                 return;
1195         }
1196 #endif /* __xpv */
1197 
1198         /*
1199          * We need 1:1 mappings for the lower 1M of memory to access
1200          * BIOS tables used by a couple of drivers during boot.
1201          *
1202          * The following code works because our simple memory allocator
1203          * only grows usage in an upwards direction.
1204          *
1205          * Note that by this point in boot some mappings for low memory
1206          * may already exist because we've already accessed device in low
1207          * memory.  (Specifically the video frame buffer and keyboard
1208          * status ports.)  If we're booting on raw hardware then GRUB
1209          * created these mappings for us.  If we're booting under a
1210          * hypervisor then we went ahead and remapped these devices into
1211          * memory allocated within dboot itself.
1212          */
1213         if (map_debug)
1214                 dboot_printf("1:1 map pa=0..1Meg\n");
1215         for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE) {
1216 #if defined(__xpv)
1217                 map_ma_at_va(start, start, 0);
1218 #else /* __xpv */
1219                 map_pa_at_va(start, start, 0);
1220 #endif /* __xpv */
1221         }
1222 
1223 #if !defined(__xpv)
1224         for (i = 0; i < memlists_used; ++i) {
1225                 start = memlists[i].addr;
1226 
1227                 end = start + memlists[i].size;
1228 
1229                 if (map_debug)
1230                         dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
1231                             start, end);
1232                 while (start < end && start < next_avail_addr) {
1233                         map_pa_at_va(start, start, 0);
1234                         start += MMU_PAGESIZE;
1235                 }
1236         }
1237 #endif /* !__xpv */
1238 
1239         DBG_MSG("\nPage tables constructed\n");
1240 }
1241 
/*
 * Text handed to dboot_panic() by startup_kernel() when the command line
 * contains "multiboot".
 */
#define NO_MULTIBOOT    \
        "multiboot is no longer used to boot the Solaris Operating System.\n" \
        "The grub entry should be changed to:\n" \
        "kernel$ /platform/i86pc/kernel/$ISADIR/unix\n" \
        "module$ /platform/i86pc/$ISADIR/boot_archive\n" \
        "See http://illumos.org/msg/SUNOS-8000-AK for details.\n"
1248 
1249 /*
1250  * startup_kernel has a pretty simple job. It builds pagetables which reflect
1251  * 1:1 mappings for all memory in use. It then also adds mappings for
1252  * the kernel nucleus at virtual address of target_kernel_text using large page
1253  * mappings. The page table pages are also accessible at 1:1 mapped
1254  * virtual addresses.
1255  */
1256 /*ARGSUSED*/
1257 void
1258 startup_kernel(void)
1259 {
1260         char *cmdline;
1261         uintptr_t addr;
1262 #if defined(__xpv)
1263         physdev_set_iopl_t set_iopl;
1264 #endif /* __xpv */
1265 
1266         /*
1267          * At this point we are executing in a 32 bit real mode.
1268          */
1269 #if defined(__xpv)
1270         cmdline = (char *)xen_info->cmd_line;
1271 #else /* __xpv */
1272         cmdline = (char *)mb_info->cmdline;
1273 #endif /* __xpv */
1274 
1275         prom_debug = (strstr(cmdline, "prom_debug") != NULL);
1276         map_debug = (strstr(cmdline, "map_debug") != NULL);
1277 
1278 #if defined(__xpv)
1279         /*
1280          * For dom0, before we initialize the console subsystem we'll
1281          * need to enable io operations, so set I/O priveldge level to 1.
1282          */
1283         if (DOMAIN_IS_INITDOMAIN(xen_info)) {
1284                 set_iopl.iopl = 1;
1285                 (void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
1286         }
1287 #endif /* __xpv */
1288 
1289         bcons_init(cmdline);
1290         DBG_MSG("\n\nSolaris prekernel set: ");
1291         DBG_MSG(cmdline);
1292         DBG_MSG("\n");
1293 
1294         if (strstr(cmdline, "multiboot") != NULL) {
1295                 dboot_panic(NO_MULTIBOOT);
1296         }
1297 
1298         /*
1299          * boot info must be 16 byte aligned for 64 bit kernel ABI
1300          */
1301         addr = (uintptr_t)boot_info;
1302         addr = (addr + 0xf) & ~0xf;
1303         bi = (struct xboot_info *)addr;
1304         DBG((uintptr_t)bi);
1305         bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline;
1306 
1307         /*
1308          * Need correct target_kernel_text value
1309          */
1310 #if defined(_BOOT_TARGET_amd64)
1311         target_kernel_text = KERNEL_TEXT_amd64;
1312 #elif defined(__xpv)
1313         target_kernel_text = KERNEL_TEXT_i386_xpv;
1314 #else
1315         target_kernel_text = KERNEL_TEXT_i386;
1316 #endif
1317         DBG(target_kernel_text);
1318 
1319 #if defined(__xpv)
1320 
1321         /*
1322          * XXPV Derive this stuff from CPUID / what the hypervisor has enabled
1323          */
1324 
1325 #if defined(_BOOT_TARGET_amd64)
1326         /*
1327          * 64-bit hypervisor.
1328          */
1329         amd64_support = 1;
1330         pae_support = 1;
1331 
1332 #else   /* _BOOT_TARGET_amd64 */
1333 
1334         /*
1335          * See if we are running on a PAE Hypervisor
1336          */
1337         {
1338                 xen_capabilities_info_t caps;
1339 
1340                 if (HYPERVISOR_xen_version(XENVER_capabilities, &caps) != 0)
1341                         dboot_panic("HYPERVISOR_xen_version(caps) failed");
1342                 caps[sizeof (caps) - 1] = 0;
1343                 if (prom_debug)
1344                         dboot_printf("xen capabilities %s\n", caps);
1345                 if (strstr(caps, "x86_32p") != NULL)
1346                         pae_support = 1;
1347         }
1348 
1349 #endif  /* _BOOT_TARGET_amd64 */
1350         {
1351                 xen_platform_parameters_t p;
1352 
1353                 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &p) != 0)
1354                         dboot_panic("HYPERVISOR_xen_version(parms) failed");
1355                 DBG(p.virt_start);
1356                 mfn_to_pfn_mapping = (pfn_t *)(xen_virt_start = p.virt_start);
1357         }
1358 
1359         /*
1360          * The hypervisor loads stuff starting at 1Gig
1361          */
1362         mfn_base = ONE_GIG;
1363         DBG(mfn_base);
1364 
1365         /*
1366          * enable writable page table mode for the hypervisor
1367          */
1368         if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
1369             VMASST_TYPE_writable_pagetables) < 0)
1370                 dboot_panic("HYPERVISOR_vm_assist(writable_pagetables) failed");
1371 
1372         /*
1373          * check for NX support
1374          */
1375         if (pae_support) {
1376                 uint32_t eax = 0x80000000;
1377                 uint32_t edx = get_cpuid_edx(&eax);
1378 
1379                 if (eax >= 0x80000001) {
1380                         eax = 0x80000001;
1381                         edx = get_cpuid_edx(&eax);
1382                         if (edx & CPUID_AMD_EDX_NX)
1383                                 NX_support = 1;
1384                 }
1385         }
1386 
1387 #if !defined(_BOOT_TARGET_amd64)
1388 
1389         /*
1390          * The 32-bit hypervisor uses segmentation to protect itself from
1391          * guests. This means when a guest attempts to install a flat 4GB
1392          * code or data descriptor the 32-bit hypervisor will protect itself
1393          * by silently shrinking the segment such that if the guest attempts
1394          * any access where the hypervisor lives a #gp fault is generated.
1395          * The problem is that some applications expect a full 4GB flat
1396          * segment for their current thread pointer and will use negative
1397          * offset segment wrap around to access data. TLS support in linux
1398          * brand is one example of this.
1399          *
1400          * The 32-bit hypervisor can catch the #gp fault in these cases
1401          * and emulate the access without passing the #gp fault to the guest
1402          * but only if VMASST_TYPE_4gb_segments is explicitly turned on.
1403          * Seems like this should have been the default.
1404          * Either way, we want the hypervisor -- and not Solaris -- to deal
1405          * to deal with emulating these accesses.
1406          */
1407         if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
1408             VMASST_TYPE_4gb_segments) < 0)
1409                 dboot_panic("HYPERVISOR_vm_assist(4gb_segments) failed");
1410 #endif  /* !_BOOT_TARGET_amd64 */
1411 
1412 #else   /* __xpv */
1413 
1414         /*
1415          * use cpuid to enable MMU features
1416          */
1417         if (have_cpuid()) {
1418                 uint32_t eax, edx;
1419 
1420                 eax = 1;
1421                 edx = get_cpuid_edx(&eax);
1422                 if (edx & CPUID_INTC_EDX_PSE)
1423                         largepage_support = 1;
1424                 if (edx & CPUID_INTC_EDX_PGE)
1425                         pge_support = 1;
1426                 if (edx & CPUID_INTC_EDX_PAE)
1427                         pae_support = 1;
1428 
1429                 eax = 0x80000000;
1430                 edx = get_cpuid_edx(&eax);
1431                 if (eax >= 0x80000001) {
1432                         eax = 0x80000001;
1433                         edx = get_cpuid_edx(&eax);
1434                         if (edx & CPUID_AMD_EDX_LM)
1435                                 amd64_support = 1;
1436                         if (edx & CPUID_AMD_EDX_NX)
1437                                 NX_support = 1;
1438                 }
1439         } else {
1440                 dboot_printf("cpuid not supported\n");
1441         }
1442 #endif /* __xpv */
1443 
1444 
1445 #if defined(_BOOT_TARGET_amd64)
1446         if (amd64_support == 0)
1447                 dboot_panic("long mode not supported, rebooting");
1448         else if (pae_support == 0)
1449                 dboot_panic("long mode, but no PAE; rebooting");
1450 #else
1451         /*
1452          * Allow the command line to over-ride use of PAE for 32 bit.
1453          */
1454         if (strstr(cmdline, "disablePAE=true") != NULL) {
1455                 pae_support = 0;
1456                 NX_support = 0;
1457                 amd64_support = 0;
1458         }
1459 #endif
1460 
1461         /*
1462          * initialize the simple memory allocator
1463          */
1464         init_mem_alloc();
1465 
1466 #if !defined(__xpv) && !defined(_BOOT_TARGET_amd64)
1467         /*
1468          * disable PAE on 32 bit h/w w/o NX and < 4Gig of memory
1469          */
1470         if (max_mem < FOUR_GIG && NX_support == 0)
1471                 pae_support = 0;
1472 #endif
1473 
1474         /*
1475          * configure mmu information
1476          */
1477         if (pae_support) {
1478                 shift_amt = shift_amt_pae;
1479                 ptes_per_table = 512;
1480                 pte_size = 8;
1481                 lpagesize = TWO_MEG;
1482 #if defined(_BOOT_TARGET_amd64)
1483                 top_level = 3;
1484 #else
1485                 top_level = 2;
1486 #endif
1487         } else {
1488                 pae_support = 0;
1489                 NX_support = 0;
1490                 shift_amt = shift_amt_nopae;
1491                 ptes_per_table = 1024;
1492                 pte_size = 4;
1493                 lpagesize = FOUR_MEG;
1494                 top_level = 1;
1495         }
1496 
1497         DBG(pge_support);
1498         DBG(NX_support);
1499         DBG(largepage_support);
1500         DBG(amd64_support);
1501         DBG(top_level);
1502         DBG(pte_size);
1503         DBG(ptes_per_table);
1504         DBG(lpagesize);
1505 
1506 #if defined(__xpv)
1507         ktext_phys = ONE_GIG;           /* from UNIX Mapfile */
1508 #else
1509         ktext_phys = FOUR_MEG;          /* from UNIX Mapfile */
1510 #endif
1511 
1512 #if !defined(__xpv) && defined(_BOOT_TARGET_amd64)
1513         /*
1514          * For grub, copy kernel bits from the ELF64 file to final place.
1515          */
1516         DBG_MSG("\nAllocating nucleus pages.\n");
1517         ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG);
1518         if (ktext_phys == 0)
1519                 dboot_panic("failed to allocate aligned kernel memory");
1520         if (dboot_elfload64(mb_header.load_addr) != 0)
1521                 dboot_panic("failed to parse kernel ELF image, rebooting");
1522 #endif
1523 
1524         DBG(ktext_phys);
1525 
1526         /*
1527          * Allocate page tables.
1528          */
1529         build_page_tables();
1530 
1531         /*
1532          * return to assembly code to switch to running kernel
1533          */
1534         entry_addr_low = (uint32_t)target_kernel_text;
1535         DBG(entry_addr_low);
1536         bi->bi_use_largepage = largepage_support;
1537         bi->bi_use_pae = pae_support;
1538         bi->bi_use_pge = pge_support;
1539         bi->bi_use_nx = NX_support;
1540 
1541 #if defined(__xpv)
1542 
1543         bi->bi_next_paddr = next_avail_addr - mfn_base;
1544         DBG(bi->bi_next_paddr);
1545         bi->bi_next_vaddr = (native_ptr_t)next_avail_addr;
1546         DBG(bi->bi_next_vaddr);
1547 
1548         /*
1549          * unmap unused pages in start area to make them available for DMA
1550          */
1551         while (next_avail_addr < scratch_end) {
1552                 (void) HYPERVISOR_update_va_mapping(next_avail_addr,
1553                     0, UVMF_INVLPG | UVMF_LOCAL);
1554                 next_avail_addr += MMU_PAGESIZE;
1555         }
1556 
1557         bi->bi_xen_start_info = (uintptr_t)xen_info;
1558         DBG((uintptr_t)HYPERVISOR_shared_info);
1559         bi->bi_shared_info = (native_ptr_t)HYPERVISOR_shared_info;
1560         bi->bi_top_page_table = (uintptr_t)top_page_table - mfn_base;
1561 
1562 #else /* __xpv */
1563 
1564         bi->bi_next_paddr = next_avail_addr;
1565         DBG(bi->bi_next_paddr);
1566         bi->bi_next_vaddr = (uintptr_t)next_avail_addr;
1567         DBG(bi->bi_next_vaddr);
1568         bi->bi_mb_info = (uintptr_t)mb_info;
1569         bi->bi_top_page_table = (uintptr_t)top_page_table;
1570 
1571 #endif /* __xpv */
1572 
1573         bi->bi_kseg_size = FOUR_MEG;
1574         DBG(bi->bi_kseg_size);
1575 
1576 #ifndef __xpv
1577         if (map_debug)
1578                 dump_tables();
1579 #endif
1580 
1581         DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n");
1582 }