1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 
  28 #include <sys/types.h>
  29 #include <sys/machparam.h>
  30 #include <sys/x86_archext.h>
  31 #include <sys/systm.h>
  32 #include <sys/mach_mmu.h>
  33 #include <sys/multiboot.h>
  34 
  35 #if defined(__xpv)
  36 
  37 #include <sys/hypervisor.h>
  38 uintptr_t xen_virt_start;
  39 pfn_t *mfn_to_pfn_mapping;
  40 
  41 #else /* !__xpv */
  42 
  43 extern multiboot_header_t mb_header;
  44 extern int have_cpuid(void);
  45 
  46 #endif /* !__xpv */
  47 
  48 #include <sys/inttypes.h>
  49 #include <sys/bootinfo.h>
  50 #include <sys/mach_mmu.h>
  51 #include <sys/boot_console.h>
  52 
  53 #include "dboot_asm.h"
  54 #include "dboot_printf.h"
  55 #include "dboot_xboot.h"
  56 #include "dboot_elfload.h"
  57 
  58 /*
 * This file contains the code that transitions us from either a multiboot-
 * compliant loader (32 bit, non-paging) or an XPV domain loader to
 * regular kernel execution. Its task is to set up the kernel memory image
 * and page tables.
  63  *
  64  * The code executes as:
  65  *      - 32 bits under GRUB (for 32 or 64 bit Solaris)
  66  *      - a 32 bit program for the 32-bit PV hypervisor
  67  *      - a 64 bit program for the 64-bit PV hypervisor (at least for now)
  68  *
  69  * Under the PV hypervisor, we must create mappings for any memory beyond the
  70  * initial start of day allocation (such as the kernel itself).
  71  *
 * When on the metal, the mapping between maddr_t and paddr_t is 1:1.
 * Since we are running with paging disabled, all such memory is directly
 * accessible.
  74  */
  75 
  76 /*
  77  * Standard bits used in PTE (page level) and PTP (internal levels)
  78  */
  79 x86pte_t ptp_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_USER;
  80 x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST;
  81 
  82 /*
 * This is the target address (physical) where the kernel text and data
  84  * nucleus pages will be unpacked. On the hypervisor this is actually a
  85  * virtual address.
  86  */
  87 paddr_t ktext_phys;
  88 uint32_t ksize = 2 * FOUR_MEG;  /* kernel nucleus is 8Meg */
  89 
  90 static uint64_t target_kernel_text;     /* value to use for KERNEL_TEXT */
  91 
  92 /*
 * The stack is set up in assembler before entering startup_kernel()
  94  */
  95 char stack_space[STACK_SIZE];
  96 
  97 /*
  98  * Used to track physical memory allocation
  99  */
 100 static paddr_t next_avail_addr = 0;
 101 
 102 #if defined(__xpv)
 103 /*
 104  * Additional information needed for hypervisor memory allocation.
 105  * Only memory up to scratch_end is mapped by page tables.
 106  * mfn_base is the start of the hypervisor virtual image. It's ONE_GIG, so
 107  * to derive a pfn from a pointer, you subtract mfn_base.
 108  */
 109 
 110 static paddr_t scratch_end = 0; /* we can't write all of mem here */
 111 static paddr_t mfn_base;                /* addr corresponding to mfn_list[0] */
 112 start_info_t *xen_info;
 113 
 114 #else   /* __xpv */
 115 
 116 /*
 117  * If on the metal, then we have a multiboot loader.
 118  */
 119 multiboot_info_t *mb_info;
 120 
 121 #endif  /* __xpv */
 122 
 123 /*
 124  * This contains information passed to the kernel
 125  */
struct xboot_info boot_info[2]; /* extra space to fix alignment for amd64 */
 127 struct xboot_info *bi;
 128 
 129 /*
 130  * Page table and memory stuff.
 131  */
 132 static paddr_t max_mem;                 /* maximum memory address */
 133 
 134 /*
 135  * Information about processor MMU
 136  */
 137 int amd64_support = 0;
 138 int largepage_support = 0;
 139 int pae_support = 0;
 140 int pge_support = 0;
 141 int NX_support = 0;
 142 
 143 /*
 144  * Low 32 bits of kernel entry address passed back to assembler.
 145  * When running a 64 bit kernel, the high 32 bits are 0xffffffff.
 146  */
 147 uint32_t entry_addr_low;
 148 
 149 /*
 150  * Memlists for the kernel. We shouldn't need a lot of these.
 151  */
 152 #define MAX_MEMLIST (50)
 153 struct boot_memlist memlists[MAX_MEMLIST];
 154 uint_t memlists_used = 0;
 155 struct boot_memlist pcimemlists[MAX_MEMLIST];
 156 uint_t pcimemlists_used = 0;
 157 struct boot_memlist rsvdmemlists[MAX_MEMLIST];
 158 uint_t rsvdmemlists_used = 0;
 159 
 160 #define MAX_MODULES (10)
 161 struct boot_modules modules[MAX_MODULES];
 162 uint_t modules_used = 0;
 163 
 164 /*
 165  * Debugging macros
 166  */
 167 uint_t prom_debug = 0;
 168 uint_t map_debug = 0;
 169 
 170 /*
 171  * Either hypervisor-specific or grub-specific code builds the initial
 172  * memlists. This code does the sort/merge/link for final use.
 173  */
 174 static void
 175 sort_physinstall(void)
 176 {
 177         int i;
 178 #if !defined(__xpv)
 179         int j;
 180         struct boot_memlist tmp;
 181 
 182         /*
 183          * Now sort the memlists, in case they weren't in order.
 184          * Yeah, this is a bubble sort; small, simple and easy to get right.
 185          */
 186         DBG_MSG("Sorting phys-installed list\n");
 187         for (j = memlists_used - 1; j > 0; --j) {
 188                 for (i = 0; i < j; ++i) {
 189                         if (memlists[i].addr < memlists[i + 1].addr)
 190                                 continue;
 191                         tmp = memlists[i];
 192                         memlists[i] = memlists[i + 1];
 193                         memlists[i + 1] = tmp;
 194                 }
 195         }
 196 
 197         /*
 198          * Merge any memlists that don't have holes between them.
 199          */
	for (i = 0; i < memlists_used - 1; ++i) {
 201                 if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr)
 202                         continue;
 203 
 204                 if (prom_debug)
 205                         dboot_printf(
 206                             "merging mem segs %" PRIx64 "...%" PRIx64
 207                             " w/ %" PRIx64 "...%" PRIx64 "\n",
 208                             memlists[i].addr,
 209                             memlists[i].addr + memlists[i].size,
 210                             memlists[i + 1].addr,
 211                             memlists[i + 1].addr + memlists[i + 1].size);
 212 
 213                 memlists[i].size += memlists[i + 1].size;
 214                 for (j = i + 1; j < memlists_used - 1; ++j)
 215                         memlists[j] = memlists[j + 1];
 216                 --memlists_used;
 217                 DBG(memlists_used);
 218                 --i;    /* after merging we need to reexamine, so do this */
 219         }
 220 #endif  /* __xpv */
 221 
 222         if (prom_debug) {
 223                 dboot_printf("\nFinal memlists:\n");
 224                 for (i = 0; i < memlists_used; ++i) {
 225                         dboot_printf("\t%d: addr=%" PRIx64 " size=%"
 226                             PRIx64 "\n", i, memlists[i].addr, memlists[i].size);
 227                 }
 228         }
 229 
 230         /*
 231          * link together the memlists with native size pointers
 232          */
 233         memlists[0].next = 0;
 234         memlists[0].prev = 0;
 235         for (i = 1; i < memlists_used; ++i) {
 236                 memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1);
 237                 memlists[i].next = 0;
 238                 memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i);
 239         }
 240         bi->bi_phys_install = (native_ptr_t)(uintptr_t)memlists;
 241         DBG(bi->bi_phys_install);
 242 }
 243 
 244 /*
 245  * build bios reserved memlists
 246  */
 247 static void
 248 build_rsvdmemlists(void)
 249 {
 250         int i;
 251 
 252         rsvdmemlists[0].next = 0;
 253         rsvdmemlists[0].prev = 0;
 254         for (i = 1; i < rsvdmemlists_used; ++i) {
 255                 rsvdmemlists[i].prev =
 256                     (native_ptr_t)(uintptr_t)(rsvdmemlists + i - 1);
 257                 rsvdmemlists[i].next = 0;
 258                 rsvdmemlists[i - 1].next =
 259                     (native_ptr_t)(uintptr_t)(rsvdmemlists + i);
 260         }
 261         bi->bi_rsvdmem = (native_ptr_t)(uintptr_t)rsvdmemlists;
 262         DBG(bi->bi_rsvdmem);
 263 }
 264 
 265 #if defined(__xpv)
 266 
 267 /*
 268  * halt on the hypervisor after a delay to drain console output
 269  */
 270 void
 271 dboot_halt(void)
 272 {
 273         uint_t i = 10000;
 274 
 275         while (--i)
 276                 (void) HYPERVISOR_yield();
 277         (void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
 278 }
 279 
 280 /*
 281  * From a machine address, find the corresponding pseudo-physical address.
 * Pseudo-physical addresses are contiguous and run from mfn_base in each VM.
 283  * Machine addresses are the real underlying hardware addresses.
 284  * These are needed for page table entries. Note that this routine is
 285  * poorly protected. A bad value of "ma" will cause a page fault.
 286  */
 287 paddr_t
 288 ma_to_pa(maddr_t ma)
 289 {
 290         ulong_t pgoff = ma & MMU_PAGEOFFSET;
 291         ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)];
 292         paddr_t pa;
 293 
 294         if (pfn >= xen_info->nr_pages)
 295                 return (-(paddr_t)1);
 296         pa = mfn_base + mmu_ptob((paddr_t)pfn) + pgoff;
 297 #ifdef DEBUG
 298         if (ma != pa_to_ma(pa))
 299                 dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", "
 300                     "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa));
 301 #endif
 302         return (pa);
 303 }
 304 
 305 /*
 306  * From a pseudo-physical address, find the corresponding machine address.
 307  */
 308 maddr_t
 309 pa_to_ma(paddr_t pa)
 310 {
 311         pfn_t pfn;
 312         ulong_t mfn;
 313 
 314         pfn = mmu_btop(pa - mfn_base);
 315         if (pa < mfn_base || pfn >= xen_info->nr_pages)
 316                 dboot_panic("pa_to_ma(): illegal address 0x%lx", (ulong_t)pa);
 317         mfn = ((ulong_t *)xen_info->mfn_list)[pfn];
 318 #ifdef DEBUG
 319         if (mfn_to_pfn_mapping[mfn] != pfn)
 320                 dboot_printf("pa_to_ma(pfn=%lx) got %lx ma_to_pa() says %lx\n",
 321                     pfn, mfn, mfn_to_pfn_mapping[mfn]);
 322 #endif
 323         return (mfn_to_ma(mfn) | (pa & MMU_PAGEOFFSET));
 324 }
 325 
 326 #endif  /* __xpv */
 327 
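/*
 * Read a page table entry. With PAE each entry is 64 bits wide, otherwise
 * 32 bits. Updates go through set_pteval() below; under the hypervisor they
 * must be done via HYPERVISOR_mmu_update(), since the page tables are live
 * (and write protected) while dboot runs.
 */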
 328 x86pte_t
 329 get_pteval(paddr_t table, uint_t index)
 330 {
 331         if (pae_support)
 332                 return (((x86pte_t *)(uintptr_t)table)[index]);
 333         return (((x86pte32_t *)(uintptr_t)table)[index]);
 334 }
 335 
 336 /*ARGSUSED*/
 337 void
 338 set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
 339 {
 340 #ifdef __xpv
 341         mmu_update_t t;
 342         maddr_t mtable = pa_to_ma(table);
 343         int retcnt;
 344 
 345         t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE;
 346         t.val = pteval;
 347         if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1)
 348                 dboot_panic("HYPERVISOR_mmu_update() failed");
 349 #else /* __xpv */
 350         uintptr_t tab_addr = (uintptr_t)table;
 351 
 352         if (pae_support)
 353                 ((x86pte_t *)tab_addr)[index] = pteval;
 354         else
 355                 ((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval;
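	/*
	 * When level == top_level == 2 we just changed an entry in a 32-bit
	 * PAE top level table (PDPT). The processor caches PDPT entries when
	 * %cr3 is loaded, so reload %cr3 to make the change visible.
	 */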
 356         if (level == top_level && level == 2)
 357                 reload_cr3();
 358 #endif /* __xpv */
 359 }
 360 
 361 paddr_t
 362 make_ptable(x86pte_t *pteval, uint_t level)
 363 {
 364         paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
 365 
 366         if (level == top_level && level == 2)
 367                 *pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
 368         else
 369                 *pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits;
 370 
 371 #ifdef __xpv
 372         /* Remove write permission to the new page table. */
 373         if (HYPERVISOR_update_va_mapping(new_table,
 374             *pteval & ~(x86pte_t)PT_WRITABLE, UVMF_INVLPG | UVMF_LOCAL))
 375                 dboot_panic("HYP_update_va_mapping error");
 376 #endif
 377 
 378         if (map_debug)
 379                 dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
 380                     PRIx64 "\n", level, (ulong_t)new_table, *pteval);
 381         return (new_table);
 382 }
 383 
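/*
 * Return a pointer to the PTE at the given index of a page table. This
 * works because the page tables we build are accessible at their 1:1
 * (pseudo-)physical addresses while dboot runs.
 */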
 384 x86pte_t *
 385 map_pte(paddr_t table, uint_t index)
 386 {
 387         return ((x86pte_t *)(uintptr_t)(table + index * pte_size));
 388 }
 389 
 390 /*
 391  * dump out the contents of page tables...
 392  */
 393 static void
 394 dump_tables(void)
 395 {
 396         uint_t save_index[4];   /* for recursion */
 397         char *save_table[4];    /* for recursion */
 398         uint_t  l;
 399         uint64_t va;
 400         uint64_t pgsize;
 401         int index;
 402         int i;
 403         x86pte_t pteval;
 404         char *table;
 405         static char *tablist = "\t\t\t";
 406         char *tabs = tablist + 3 - top_level;
 407         uint_t pa, pa1;
 408 #if !defined(__xpv)
 409 #define maddr_t paddr_t
 410 #endif /* !__xpv */
 411 
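	/*
	 * Walk the page table tree iteratively: save_table/save_index form an
	 * explicit stack, so descending into and returning from a lower level
	 * table is done with gotos instead of real recursion.
	 */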
 412         dboot_printf("Finished pagetables:\n");
 413         table = (char *)(uintptr_t)top_page_table;
 414         l = top_level;
 415         va = 0;
 416         for (index = 0; index < ptes_per_table; ++index) {
 417                 pgsize = 1ull << shift_amt[l];
 418                 if (pae_support)
 419                         pteval = ((x86pte_t *)table)[index];
 420                 else
 421                         pteval = ((x86pte32_t *)table)[index];
 422                 if (pteval == 0)
 423                         goto next_entry;
 424 
 425                 dboot_printf("%s %p[0x%x] = %" PRIx64 ", va=%" PRIx64,
 426                     tabs + l, (void *)table, index, (uint64_t)pteval, va);
 427                 pa = ma_to_pa(pteval & MMU_PAGEMASK);
 428                 dboot_printf(" physaddr=%x\n", pa);
 429 
 430                 /*
 431                  * Don't try to walk hypervisor private pagetables
 432                  */
 433                 if ((l > 1 || (l == 1 && (pteval & PT_PAGESIZE) == 0))) {
 434                         save_table[l] = table;
 435                         save_index[l] = index;
 436                         --l;
 437                         index = -1;
 438                         table = (char *)(uintptr_t)
 439                             ma_to_pa(pteval & MMU_PAGEMASK);
 440                         goto recursion;
 441                 }
 442 
 443                 /*
 444                  * shorten dump for consecutive mappings
 445                  */
 446                 for (i = 1; index + i < ptes_per_table; ++i) {
 447                         if (pae_support)
 448                                 pteval = ((x86pte_t *)table)[index + i];
 449                         else
 450                                 pteval = ((x86pte32_t *)table)[index + i];
 451                         if (pteval == 0)
 452                                 break;
 453                         pa1 = ma_to_pa(pteval & MMU_PAGEMASK);
 454                         if (pa1 != pa + i * pgsize)
 455                                 break;
 456                 }
 457                 if (i > 2) {
 458                         dboot_printf("%s...\n", tabs + l);
 459                         va += pgsize * (i - 2);
 460                         index += i - 2;
 461                 }
 462 next_entry:
 463                 va += pgsize;
 464                 if (l == 3 && index == 256)     /* VA hole */
 465                         va = 0xffff800000000000ull;
 466 recursion:
 467                 ;
 468         }
 469         if (l < top_level) {
 470                 ++l;
 471                 index = save_index[l];
 472                 table = save_table[l];
 473                 goto recursion;
 474         }
 475 }
 476 
 477 /*
 478  * Add a mapping for the machine page at the given virtual address.
 479  */
 480 static void
 481 map_ma_at_va(maddr_t ma, native_ptr_t va, uint_t level)
 482 {
 483         x86pte_t *ptep;
 484         x86pte_t pteval;
 485 
 486         pteval = ma | pte_bits;
 487         if (level > 0)
 488                 pteval |= PT_PAGESIZE;
 489         if (va >= target_kernel_text && pge_support)
 490                 pteval |= PT_GLOBAL;
 491 
 492         if (map_debug && ma != va)
 493                 dboot_printf("mapping ma=0x%" PRIx64 " va=0x%" PRIx64
 494                     " pte=0x%" PRIx64 " l=%d\n",
 495                     (uint64_t)ma, (uint64_t)va, pteval, level);
 496 
 497 #if defined(__xpv)
 498         /*
 499          * see if we can avoid find_pte() on the hypervisor
 500          */
 501         if (HYPERVISOR_update_va_mapping(va, pteval,
 502             UVMF_INVLPG | UVMF_LOCAL) == 0)
 503                 return;
 504 #endif
 505 
 506         /*
 507          * Find the pte that will map this address. This creates any
 508          * missing intermediate level page tables
 509          */
 510         ptep = find_pte(va, NULL, level, 0);
 511 
 512         /*
 513          * When paravirtualized, we must use hypervisor calls to modify the
 514          * PTE, since paging is active. On real hardware we just write to
 515          * the pagetables which aren't in use yet.
 516          */
 517 #if defined(__xpv)
 518         ptep = ptep;    /* shut lint up */
 519         if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
 520                 dboot_panic("mmu_update failed-map_pa_at_va va=0x%" PRIx64
 521                     " l=%d ma=0x%" PRIx64 ", pte=0x%" PRIx64 "",
 522                     (uint64_t)va, level, (uint64_t)ma, pteval);
 523 #else
 524         if (va < 1024 * 1024)
 525                 pteval |= PT_NOCACHE;           /* for video RAM */
 526         if (pae_support)
 527                 *ptep = pteval;
 528         else
 529                 *((x86pte32_t *)ptep) = (x86pte32_t)pteval;
 530 #endif
 531 }
 532 
 533 /*
 534  * Add a mapping for the physical page at the given virtual address.
 535  */
 536 static void
 537 map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level)
 538 {
 539         map_ma_at_va(pa_to_ma(pa), va, level);
 540 }
 541 
 542 /*
 543  * This is called to remove start..end from the
 544  * possible range of PCI addresses.
 545  */
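/*
 * PCI addresses are assumed to lie between 1 MB (pci_lo_limit) and
 * 0xfff00000 (pci_hi_limit, just below 4 GB). build_pcimemlists() starts
 * with this whole window and then carves out the RAM ranges reported by
 * the memory map.
 */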
 546 const uint64_t pci_lo_limit = 0x00100000ul;
 547 const uint64_t pci_hi_limit = 0xfff00000ul;
 548 static void
 549 exclude_from_pci(uint64_t start, uint64_t end)
 550 {
 551         int i;
 552         int j;
 553         struct boot_memlist *ml;
 554 
 555         for (i = 0; i < pcimemlists_used; ++i) {
 556                 ml = &pcimemlists[i];
 557 
 558                 /* delete the entire range? */
 559                 if (start <= ml->addr && ml->addr + ml->size <= end) {
 560                         --pcimemlists_used;
 561                         for (j = i; j < pcimemlists_used; ++j)
 562                                 pcimemlists[j] = pcimemlists[j + 1];
 563                         --i;    /* to revisit the new one at this index */
 564                 }
 565 
 566                 /* split a range? */
 567                 else if (ml->addr < start && end < ml->addr + ml->size) {
 568 
 569                         ++pcimemlists_used;
 570                         if (pcimemlists_used > MAX_MEMLIST)
 571                                 dboot_panic("too many pcimemlists");
 572 
 573                         for (j = pcimemlists_used - 1; j > i; --j)
 574                                 pcimemlists[j] = pcimemlists[j - 1];
 575                         ml->size = start - ml->addr;
 576 
 577                         ++ml;
 578                         ml->size = (ml->addr + ml->size) - end;
 579                         ml->addr = end;
 580                         ++i;    /* skip on to next one */
 581                 }
 582 
 583                 /* cut memory off the start? */
 584                 else if (ml->addr < end && end < ml->addr + ml->size) {
 585                         ml->size -= end - ml->addr;
 586                         ml->addr = end;
 587                 }
 588 
 589                 /* cut memory off the end? */
 590                 else if (ml->addr <= start && start < ml->addr + ml->size) {
 591                         ml->size = start - ml->addr;
 592                 }
 593         }
 594 }
 595 
 596 /*
 597  * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
 598  * definition in Xen source.
 599  */
 600 #ifdef __xpv
 601 typedef struct {
 602         uint32_t        base_addr_low;
 603         uint32_t        base_addr_high;
 604         uint32_t        length_low;
 605         uint32_t        length_high;
 606         uint32_t        type;
 607 } mmap_t;
 608 #else
 609 typedef mb_memory_map_t mmap_t;
 610 #endif
 611 
 612 static void
 613 build_pcimemlists(mmap_t *mem, int num)
 614 {
 615         mmap_t *mmap;
 616         uint64_t page_offset = MMU_PAGEOFFSET;  /* needs to be 64 bits */
 617         uint64_t start;
 618         uint64_t end;
 619         int i;
 620 
 621         /*
 622          * initialize
 623          */
 624         pcimemlists[0].addr = pci_lo_limit;
 625         pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
 626         pcimemlists_used = 1;
 627 
 628         /*
 629          * Fill in PCI memlists.
 630          */
 631         for (mmap = mem, i = 0; i < num; ++i, ++mmap) {
 632                 start = ((uint64_t)mmap->base_addr_high << 32) +
 633                     mmap->base_addr_low;
 634                 end = start + ((uint64_t)mmap->length_high << 32) +
 635                     mmap->length_low;
 636 
 637                 if (prom_debug)
 638                         dboot_printf("\ttype: %d %" PRIx64 "..%"
 639                             PRIx64 "\n", mmap->type, start, end);
 640 
 641                 /*
 642                  * page align start and end
 643                  */
 644                 start = (start + page_offset) & ~page_offset;
 645                 end &= ~page_offset;
 646                 if (end <= start)
 647                         continue;
 648 
 649                 exclude_from_pci(start, end);
 650         }
 651 
 652         /*
 653          * Finish off the pcimemlist
 654          */
 655         if (prom_debug) {
 656                 for (i = 0; i < pcimemlists_used; ++i) {
 657                         dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%"
 658                             PRIx64 "\n", pcimemlists[i].addr,
 659                             pcimemlists[i].addr + pcimemlists[i].size);
 660                 }
 661         }
 662         pcimemlists[0].next = 0;
 663         pcimemlists[0].prev = 0;
 664         for (i = 1; i < pcimemlists_used; ++i) {
 665                 pcimemlists[i].prev =
 666                     (native_ptr_t)(uintptr_t)(pcimemlists + i - 1);
 667                 pcimemlists[i].next = 0;
 668                 pcimemlists[i - 1].next =
 669                     (native_ptr_t)(uintptr_t)(pcimemlists + i);
 670         }
 671         bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
 672         DBG(bi->bi_pcimem);
 673 }
 674 
 675 #if defined(__xpv)
 676 /*
 677  * Initialize memory allocator stuff from hypervisor-supplied start info.
 678  *
 679  * There is 512KB of scratch area after the boot stack page.
 680  * We'll use that for everything except the kernel nucleus pages which are too
 681  * big to fit there and are allocated last anyway.
 682  */
 683 #define MAXMAPS 100
 684 static mmap_t map_buffer[MAXMAPS];
 685 static void
 686 init_mem_alloc(void)
 687 {
 688         int     local;  /* variables needed to find start region */
 689         paddr_t scratch_start;
 690         xen_memory_map_t map;
 691 
 692         DBG_MSG("Entered init_mem_alloc()\n");
 693 
 694         /*
 695          * Free memory follows the stack. There's at least 512KB of scratch
 696          * space, rounded up to at least 2Mb alignment.  That should be enough
 697          * for the page tables we'll need to build.  The nucleus memory is
 * allocated last and will be outside the addressable range.  We'll
 * switch to new page tables before we unpack the kernel.
 700          */
 701         scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE);
 702         DBG(scratch_start);
 703         scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG);
 704         DBG(scratch_end);
 705 
 706         /*
 707          * For paranoia, leave some space between hypervisor data and ours.
 708          * Use 500 instead of 512.
 709          */
 710         next_avail_addr = scratch_end - 500 * 1024;
 711         DBG(next_avail_addr);
 712 
 713         /*
 714          * The domain builder gives us at most 1 module
 715          */
 716         DBG(xen_info->mod_len);
 717         if (xen_info->mod_len > 0) {
 718                 DBG(xen_info->mod_start);
 719                 modules[0].bm_addr = xen_info->mod_start;
 720                 modules[0].bm_size = xen_info->mod_len;
 721                 bi->bi_module_cnt = 1;
 722                 bi->bi_modules = (native_ptr_t)modules;
 723         } else {
 724                 bi->bi_module_cnt = 0;
 725                 bi->bi_modules = NULL;
 726         }
 727         DBG(bi->bi_module_cnt);
 728         DBG(bi->bi_modules);
 729 
 730         DBG(xen_info->mfn_list);
 731         DBG(xen_info->nr_pages);
 732         max_mem = (paddr_t)xen_info->nr_pages << MMU_PAGESHIFT;
 733         DBG(max_mem);
 734 
 735         /*
 736          * Using pseudo-physical addresses, so only 1 memlist element
 737          */
 738         memlists[0].addr = 0;
 739         DBG(memlists[0].addr);
 740         memlists[0].size = max_mem;
 741         DBG(memlists[0].size);
 742         memlists_used = 1;
 743         DBG(memlists_used);
 744 
 745         /*
 746          * finish building physinstall list
 747          */
 748         sort_physinstall();
 749 
 750         /*
 751          * build bios reserved memlists
 752          */
 753         build_rsvdmemlists();
 754 
 755         if (DOMAIN_IS_INITDOMAIN(xen_info)) {
 756                 /*
 757                  * build PCI Memory list
 758                  */
 759                 map.nr_entries = MAXMAPS;
 760                 /*LINTED: constant in conditional context*/
 761                 set_xen_guest_handle(map.buffer, map_buffer);
 762                 if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0)
 763                         dboot_panic("getting XENMEM_machine_memory_map failed");
 764                 build_pcimemlists(map_buffer, map.nr_entries);
 765         }
 766 }
 767 
 768 #else   /* !__xpv */
 769 
 770 /*
 771  * During memory allocation, find the highest address not used yet.
 772  */
 773 static void
 774 check_higher(paddr_t a)
 775 {
 776         if (a < next_avail_addr)
 777                 return;
 778         next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
 779         DBG(next_avail_addr);
 780 }
 781 
 782 /*
 783  * Walk through the module information finding the last used address.
 784  * The first available address will become the top level page table.
 785  *
 786  * We then build the phys_install memlist from the multiboot information.
 787  */
 788 static void
 789 init_mem_alloc(void)
 790 {
 791         mb_memory_map_t *mmap;
 792         mb_module_t *mod;
 793         uint64_t start;
 794         uint64_t end;
 795         uint64_t page_offset = MMU_PAGEOFFSET;  /* needs to be 64 bits */
 796         extern char _end[];
 797         int i;
 798 
 799         DBG_MSG("Entered init_mem_alloc()\n");
 800         DBG((uintptr_t)mb_info);
 801 
 802         if (mb_info->mods_count > MAX_MODULES) {
 803                 dboot_panic("Too many modules (%d) -- the maximum is %d.",
 804                     mb_info->mods_count, MAX_MODULES);
 805         }
 806         /*
	 * Search the modules to find the last used address;
	 * we'll build the module list while we're walking through here.
 809          */
 810         DBG_MSG("\nFinding Modules\n");
 811         check_higher((paddr_t)(uintptr_t)&_end);
 812         for (mod = (mb_module_t *)(mb_info->mods_addr), i = 0;
 813             i < mb_info->mods_count;
 814             ++mod, ++i) {
 815                 if (prom_debug) {
 816                         dboot_printf("\tmodule #%d: %s at: 0x%lx, len 0x%lx\n",
 817                             i, (char *)(mod->mod_name),
 818                             (ulong_t)mod->mod_start, (ulong_t)mod->mod_end);
 819                 }
 820                 modules[i].bm_addr = mod->mod_start;
 821                 if (mod->mod_start > mod->mod_end) {
 822                         dboot_panic("module[%d]: Invalid module start address "
 823                             "(0x%llx)", i, (uint64_t)mod->mod_start);
 824                 }
 825                 modules[i].bm_size = mod->mod_end - mod->mod_start;
 826 
 827                 check_higher(mod->mod_end);
 828         }
 829         bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
 830         DBG(bi->bi_modules);
 831         bi->bi_module_cnt = mb_info->mods_count;
 832         DBG(bi->bi_module_cnt);
 833 
 834         /*
 835          * Walk through the memory map from multiboot and build our memlist
 836          * structures. Note these will have native format pointers.
 837          */
 838         DBG_MSG("\nFinding Memory Map\n");
 839         DBG(mb_info->flags);
 840         max_mem = 0;
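	/*
	 * Multiboot info flag bit 6 (0x40) means a full memory map was
	 * provided; flag bit 0 (0x01) means only mem_lower/mem_upper are
	 * valid.
	 */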
 841         if (mb_info->flags & 0x40) {
 842                 int cnt = 0;
 843 
 844                 DBG(mb_info->mmap_addr);
 845                 DBG(mb_info->mmap_length);
 846                 check_higher(mb_info->mmap_addr + mb_info->mmap_length);
 847 
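		/*
		 * Each multiboot mmap entry starts with a 32-bit 'size' field
		 * whose value does not include the field itself, hence the
		 * stride of mmap->size + sizeof (mmap->size) below.
		 */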
 848                 for (mmap = (mb_memory_map_t *)mb_info->mmap_addr;
 849                     (uint32_t)mmap < mb_info->mmap_addr + mb_info->mmap_length;
 850                     mmap = (mb_memory_map_t *)((uint32_t)mmap + mmap->size
 851                     + sizeof (mmap->size))) {
 852                         ++cnt;
 853                         start = ((uint64_t)mmap->base_addr_high << 32) +
 854                             mmap->base_addr_low;
 855                         end = start + ((uint64_t)mmap->length_high << 32) +
 856                             mmap->length_low;
 857 
 858                         if (prom_debug)
 859                                 dboot_printf("\ttype: %d %" PRIx64 "..%"
 860                                     PRIx64 "\n", mmap->type, start, end);
 861 
 862                         /*
 863                          * page align start and end
 864                          */
 865                         start = (start + page_offset) & ~page_offset;
 866                         end &= ~page_offset;
 867                         if (end <= start)
 868                                 continue;
 869 
			/*
			 * type 1 is usable RAM, type 2 is BIOS reserved;
			 * anything else is ignored
			 */
 873                         switch (mmap->type) {
 874                         case 1:
 875                                 if (end > max_mem)
 876                                         max_mem = end;
 877                                 memlists[memlists_used].addr = start;
 878                                 memlists[memlists_used].size = end - start;
 879                                 ++memlists_used;
 880                                 if (memlists_used > MAX_MEMLIST)
 881                                         dboot_panic("too many memlists");
 882                                 break;
 883                         case 2:
 884                                 rsvdmemlists[rsvdmemlists_used].addr = start;
 885                                 rsvdmemlists[rsvdmemlists_used].size =
 886                                     end - start;
 887                                 ++rsvdmemlists_used;
 888                                 if (rsvdmemlists_used > MAX_MEMLIST)
 889                                         dboot_panic("too many rsvdmemlists");
 890                                 break;
 891                         default:
 892                                 continue;
 893                         }
 894                 }
 895                 build_pcimemlists((mb_memory_map_t *)mb_info->mmap_addr, cnt);
 896         } else if (mb_info->flags & 0x01) {
 897                 DBG(mb_info->mem_lower);
 898                 memlists[memlists_used].addr = 0;
 899                 memlists[memlists_used].size = mb_info->mem_lower * 1024;
 900                 ++memlists_used;
 901                 DBG(mb_info->mem_upper);
 902                 memlists[memlists_used].addr = 1024 * 1024;
 903                 memlists[memlists_used].size = mb_info->mem_upper * 1024;
 904                 ++memlists_used;
 905 
 906                 /*
 907                  * Old platform - assume I/O space at the end of memory.
 908                  */
 909                 pcimemlists[0].addr =
 910                     (mb_info->mem_upper * 1024) + (1024 * 1024);
 911                 pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr;
 912                 pcimemlists[0].next = 0;
 913                 pcimemlists[0].prev = 0;
 914                 bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
 915                 DBG(bi->bi_pcimem);
 916         } else {
 917                 dboot_panic("No memory info from boot loader!!!");
 918         }
 919 
 920         check_higher(bi->bi_cmdline);
 921 
 922         /*
 923          * finish processing the physinstall list
 924          */
 925         sort_physinstall();
 926 
 927         /*
 928          * build bios reserved mem lists
 929          */
 930         build_rsvdmemlists();
 931 }
 932 #endif /* !__xpv */
 933 
 934 /*
 935  * Simple memory allocator, allocates aligned physical memory.
 936  * Note that startup_kernel() only allocates memory, never frees.
 937  * Memory usage just grows in an upward direction.
 938  */
 939 static void *
 940 do_mem_alloc(uint32_t size, uint32_t align)
 941 {
 942         uint_t i;
 943         uint64_t best;
 944         uint64_t start;
 945         uint64_t end;
 946 
 947         /*
 948          * make sure size is a multiple of pagesize
 949          */
 950         size = RNDUP(size, MMU_PAGESIZE);
 951         next_avail_addr = RNDUP(next_avail_addr, align);
 952 
 953         /*
 954          * XXPV fixme joe
 955          *
 956          * a really large bootarchive that causes you to run out of memory
 957          * may cause this to blow up
 958          */
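	/*
	 * Start 'best' at an impossibly high sentinel, (uint64_t)-size, so
	 * that any suitable region found below replaces it.
	 */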
 959         /* LINTED E_UNEXPECTED_UINT_PROMOTION */
 960         best = (uint64_t)-size;
 961         for (i = 0; i < memlists_used; ++i) {
 962                 start = memlists[i].addr;
 963 #if defined(__xpv)
 964                 start += mfn_base;
 965 #endif
 966                 end = start + memlists[i].size;
 967 
 968                 /*
 969                  * did we find the desired address?
 970                  */
 971                 if (start <= next_avail_addr && next_avail_addr + size <= end) {
 972                         best = next_avail_addr;
 973                         goto done;
 974                 }
 975 
 976                 /*
 977                  * if not is this address the best so far?
 978                  */
 979                 if (start > next_avail_addr && start < best &&
 980                     RNDUP(start, align) + size <= end)
 981                         best = RNDUP(start, align);
 982         }
 983 
 984         /*
	 * If we didn't find the exact address we wanted (we walked off the
	 * end of a memory region above), fall through and use the best
	 * aligned candidate found.
 987          */
 988 done:
 989         next_avail_addr = best + size;
 990 #if defined(__xpv)
 991         if (next_avail_addr > scratch_end)
 992                 dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: "
 993                     "0x%lx", (ulong_t)next_avail_addr,
 994                     (ulong_t)scratch_end);
 995 #endif
 996         (void) memset((void *)(uintptr_t)best, 0, size);
 997         return ((void *)(uintptr_t)best);
 998 }
 999 
1000 void *
1001 mem_alloc(uint32_t size)
1002 {
1003         return (do_mem_alloc(size, MMU_PAGESIZE));
1004 }
1005 
1006 
1007 /*
1008  * Build page tables to map all of memory used so far as well as the kernel.
1009  */
1010 static void
1011 build_page_tables(void)
1012 {
1013         uint32_t psize;
1014         uint32_t level;
1015         uint32_t off;
1016         uint64_t start;
1017 #if !defined(__xpv)
1018         uint32_t i;
1019         uint64_t end;
1020 #endif  /* __xpv */
1021 
1022         /*
	 * Get the top level page table: the hypervisor supplies it in the
	 * start-of-day info; when on the metal we have to allocate it.
1024          */
1025 #if defined(__xpv)
1026         top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base;
1027 #else /* __xpv */
1028         top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
1029 #endif /* __xpv */
1030         DBG((uintptr_t)top_page_table);
1031 
1032         /*
1033          * Determine if we'll use large mappings for kernel, then map it.
1034          */
1035         if (largepage_support) {
1036                 psize = lpagesize;
1037                 level = 1;
1038         } else {
1039                 psize = MMU_PAGESIZE;
1040                 level = 0;
1041         }
1042 
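	/*
	 * With large pages the 8 MB nucleus takes only a handful of mappings
	 * (four 2 MB pages with PAE, two 4 MB pages without); otherwise it is
	 * mapped with 4K pages.
	 */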
1043         DBG_MSG("Mapping kernel\n");
1044         DBG(ktext_phys);
1045         DBG(target_kernel_text);
1046         DBG(ksize);
1047         DBG(psize);
1048         for (off = 0; off < ksize; off += psize)
1049                 map_pa_at_va(ktext_phys + off, target_kernel_text + off, level);
1050 
1051         /*
1052          * The kernel will need a 1 page window to work with page tables
1053          */
1054         bi->bi_pt_window = (uintptr_t)mem_alloc(MMU_PAGESIZE);
1055         DBG(bi->bi_pt_window);
1056         bi->bi_pte_to_pt_window =
1057             (uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0);
1058         DBG(bi->bi_pte_to_pt_window);
1059 
1060 #if defined(__xpv)
1061         if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
1062                 /* If this is a domU we're done. */
1063                 DBG_MSG("\nPage tables constructed\n");
1064                 return;
1065         }
1066 #endif /* __xpv */
1067 
1068         /*
1069          * We need 1:1 mappings for the lower 1M of memory to access
1070          * BIOS tables used by a couple of drivers during boot.
1071          *
1072          * The following code works because our simple memory allocator
1073          * only grows usage in an upwards direction.
1074          *
1075          * Note that by this point in boot some mappings for low memory
	 * may already exist because we've already accessed devices in low
1077          * memory.  (Specifically the video frame buffer and keyboard
1078          * status ports.)  If we're booting on raw hardware then GRUB
1079          * created these mappings for us.  If we're booting under a
1080          * hypervisor then we went ahead and remapped these devices into
1081          * memory allocated within dboot itself.
1082          */
1083         if (map_debug)
1084                 dboot_printf("1:1 map pa=0..1Meg\n");
1085         for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE) {
1086 #if defined(__xpv)
1087                 map_ma_at_va(start, start, 0);
1088 #else /* __xpv */
1089                 map_pa_at_va(start, start, 0);
1090 #endif /* __xpv */
1091         }
1092 
1093 #if !defined(__xpv)
1094         for (i = 0; i < memlists_used; ++i) {
1095                 start = memlists[i].addr;
1096 
1097                 end = start + memlists[i].size;
1098 
1099                 if (map_debug)
1100                         dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
1101                             start, end);
1102                 while (start < end && start < next_avail_addr) {
1103                         map_pa_at_va(start, start, 0);
1104                         start += MMU_PAGESIZE;
1105                 }
1106         }
1107 #endif /* !__xpv */
1108 
1109         DBG_MSG("\nPage tables constructed\n");
1110 }
1111 
1112 #define NO_MULTIBOOT    \
1113 "multiboot is no longer used to boot the Solaris Operating System.\n\
1114 The grub entry should be changed to:\n\
1115 kernel$ /platform/i86pc/kernel/$ISADIR/unix\n\
1116 module$ /platform/i86pc/$ISADIR/boot_archive\n\
1117 See http://illumos.org/msg/SUNOS-8000-AK for details.\n"
1118 
1119 /*
 * startup_kernel() has a fairly simple job. It builds page tables that
 * reflect 1:1 mappings for all memory in use, and then adds mappings for
 * the kernel nucleus at the virtual address target_kernel_text, using large
 * page mappings where possible. The page table pages themselves are also
 * accessible at 1:1 mapped virtual addresses.
1125  */
1126 /*ARGSUSED*/
1127 void
1128 startup_kernel(void)
1129 {
1130         char *cmdline;
1131         uintptr_t addr;
1132 #if defined(__xpv)
1133         physdev_set_iopl_t set_iopl;
1134 #endif /* __xpv */
1135 
1136         /*
	 * At this point we are executing in 32-bit protected mode on the
	 * metal, or as a 32/64 bit PV program under the hypervisor.
1138          */
1139 #if defined(__xpv)
1140         cmdline = (char *)xen_info->cmd_line;
1141 #else /* __xpv */
1142         cmdline = (char *)mb_info->cmdline;
1143 #endif /* __xpv */
1144 
1145         prom_debug = (strstr(cmdline, "prom_debug") != NULL);
1146         map_debug = (strstr(cmdline, "map_debug") != NULL);
1147 
1148 #if defined(__xpv)
1149         /*
	 * For dom0, before we initialize the console subsystem we'll
	 * need to enable I/O operations, so set the I/O privilege level to 1.
1152          */
1153         if (DOMAIN_IS_INITDOMAIN(xen_info)) {
1154                 set_iopl.iopl = 1;
1155                 (void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
1156         }
1157 #endif /* __xpv */
1158 
1159         bcons_init(cmdline);
1160         DBG_MSG("\n\nSolaris prekernel set: ");
1161         DBG_MSG(cmdline);
1162         DBG_MSG("\n");
1163 
1164         if (strstr(cmdline, "multiboot") != NULL) {
1165                 dboot_panic(NO_MULTIBOOT);
1166         }
1167 
1168         /*
1169          * boot info must be 16 byte aligned for 64 bit kernel ABI
1170          */
1171         addr = (uintptr_t)boot_info;
1172         addr = (addr + 0xf) & ~0xf;
1173         bi = (struct xboot_info *)addr;
1174         DBG((uintptr_t)bi);
1175         bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline;
1176 
1177         /*
1178          * Need correct target_kernel_text value
1179          */
1180 #if defined(_BOOT_TARGET_amd64)
1181         target_kernel_text = KERNEL_TEXT_amd64;
1182 #elif defined(__xpv)
1183         target_kernel_text = KERNEL_TEXT_i386_xpv;
1184 #else
1185         target_kernel_text = KERNEL_TEXT_i386;
1186 #endif
1187         DBG(target_kernel_text);
1188 
1189 #if defined(__xpv)
1190 
1191         /*
1192          * XXPV Derive this stuff from CPUID / what the hypervisor has enabled
1193          */
1194 
1195 #if defined(_BOOT_TARGET_amd64)
1196         /*
1197          * 64-bit hypervisor.
1198          */
1199         amd64_support = 1;
1200         pae_support = 1;
1201 
1202 #else   /* _BOOT_TARGET_amd64 */
1203 
1204         /*
1205          * See if we are running on a PAE Hypervisor
1206          */
1207         {
1208                 xen_capabilities_info_t caps;
1209 
1210                 if (HYPERVISOR_xen_version(XENVER_capabilities, &caps) != 0)
1211                         dboot_panic("HYPERVISOR_xen_version(caps) failed");
1212                 caps[sizeof (caps) - 1] = 0;
1213                 if (prom_debug)
1214                         dboot_printf("xen capabilities %s\n", caps);
1215                 if (strstr(caps, "x86_32p") != NULL)
1216                         pae_support = 1;
1217         }
1218 
1219 #endif  /* _BOOT_TARGET_amd64 */
1220         {
1221                 xen_platform_parameters_t p;
1222 
1223                 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &p) != 0)
1224                         dboot_panic("HYPERVISOR_xen_version(parms) failed");
1225                 DBG(p.virt_start);
1226                 mfn_to_pfn_mapping = (pfn_t *)(xen_virt_start = p.virt_start);
1227         }
1228 
1229         /*
1230          * The hypervisor loads stuff starting at 1Gig
1231          */
1232         mfn_base = ONE_GIG;
1233         DBG(mfn_base);
1234 
1235         /*
1236          * enable writable page table mode for the hypervisor
1237          */
1238         if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
1239             VMASST_TYPE_writable_pagetables) < 0)
1240                 dboot_panic("HYPERVISOR_vm_assist(writable_pagetables) failed");
1241 
1242         /*
1243          * check for NX support
1244          */
1245         if (pae_support) {
1246                 uint32_t eax = 0x80000000;
1247                 uint32_t edx = get_cpuid_edx(&eax);
1248 
1249                 if (eax >= 0x80000001) {
1250                         eax = 0x80000001;
1251                         edx = get_cpuid_edx(&eax);
1252                         if (edx & CPUID_AMD_EDX_NX)
1253                                 NX_support = 1;
1254                 }
1255         }
1256 
1257 #if !defined(_BOOT_TARGET_amd64)
1258 
1259         /*
1260          * The 32-bit hypervisor uses segmentation to protect itself from
1261          * guests. This means when a guest attempts to install a flat 4GB
1262          * code or data descriptor the 32-bit hypervisor will protect itself
1263          * by silently shrinking the segment such that if the guest attempts
	 * any access to where the hypervisor lives, a #gp fault is generated.
1265          * The problem is that some applications expect a full 4GB flat
1266          * segment for their current thread pointer and will use negative
1267          * offset segment wrap around to access data. TLS support in linux
1268          * brand is one example of this.
1269          *
1270          * The 32-bit hypervisor can catch the #gp fault in these cases
1271          * and emulate the access without passing the #gp fault to the guest
1272          * but only if VMASST_TYPE_4gb_segments is explicitly turned on.
1273          * Seems like this should have been the default.
	 * Either way, we want the hypervisor -- and not Solaris -- to deal
	 * with emulating these accesses.
1276          */
1277         if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
1278             VMASST_TYPE_4gb_segments) < 0)
1279                 dboot_panic("HYPERVISOR_vm_assist(4gb_segments) failed");
1280 #endif  /* !_BOOT_TARGET_amd64 */
1281 
1282 #else   /* __xpv */
1283 
1284         /*
1285          * use cpuid to enable MMU features
1286          */
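	/*
	 * CPUID leaf 1 %edx reports PSE (large pages), PGE (global pages) and
	 * PAE; extended leaf 0x80000001 %edx reports long mode (LM) and
	 * no-execute (NX) support.
	 */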
1287         if (have_cpuid()) {
1288                 uint32_t eax, edx;
1289 
1290                 eax = 1;
1291                 edx = get_cpuid_edx(&eax);
1292                 if (edx & CPUID_INTC_EDX_PSE)
1293                         largepage_support = 1;
1294                 if (edx & CPUID_INTC_EDX_PGE)
1295                         pge_support = 1;
1296                 if (edx & CPUID_INTC_EDX_PAE)
1297                         pae_support = 1;
1298 
1299                 eax = 0x80000000;
1300                 edx = get_cpuid_edx(&eax);
1301                 if (eax >= 0x80000001) {
1302                         eax = 0x80000001;
1303                         edx = get_cpuid_edx(&eax);
1304                         if (edx & CPUID_AMD_EDX_LM)
1305                                 amd64_support = 1;
1306                         if (edx & CPUID_AMD_EDX_NX)
1307                                 NX_support = 1;
1308                 }
1309         } else {
1310                 dboot_printf("cpuid not supported\n");
1311         }
1312 #endif /* __xpv */
1313 
1314 
1315 #if defined(_BOOT_TARGET_amd64)
1316         if (amd64_support == 0)
1317                 dboot_panic("long mode not supported, rebooting");
1318         else if (pae_support == 0)
1319                 dboot_panic("long mode, but no PAE; rebooting");
1320 #else
1321         /*
	 * Allow the command line to override the use of PAE for 32 bit.
1323          */
1324         if (strstr(cmdline, "disablePAE=true") != NULL) {
1325                 pae_support = 0;
1326                 NX_support = 0;
1327                 amd64_support = 0;
1328         }
1329 #endif
1330 
1331         /*
1332          * initialize the simple memory allocator
1333          */
1334         init_mem_alloc();
1335 
1336 #if !defined(__xpv) && !defined(_BOOT_TARGET_amd64)
1337         /*
1338          * disable PAE on 32 bit h/w w/o NX and < 4Gig of memory
1339          */
1340         if (max_mem < FOUR_GIG && NX_support == 0)
1341                 pae_support = 0;
1342 #endif
1343 
1344         /*
1345          * configure mmu information
1346          */
1347         if (pae_support) {
1348                 shift_amt = shift_amt_pae;
1349                 ptes_per_table = 512;
1350                 pte_size = 8;
1351                 lpagesize = TWO_MEG;
1352 #if defined(_BOOT_TARGET_amd64)
1353                 top_level = 3;
1354 #else
1355                 top_level = 2;
1356 #endif
1357         } else {
1358                 pae_support = 0;
1359                 NX_support = 0;
1360                 shift_amt = shift_amt_nopae;
1361                 ptes_per_table = 1024;
1362                 pte_size = 4;
1363                 lpagesize = FOUR_MEG;
1364                 top_level = 1;
1365         }
1366 
1367         DBG(pge_support);
1368         DBG(NX_support);
1369         DBG(largepage_support);
1370         DBG(amd64_support);
1371         DBG(top_level);
1372         DBG(pte_size);
1373         DBG(ptes_per_table);
1374         DBG(lpagesize);
1375 
1376 #if defined(__xpv)
1377         ktext_phys = ONE_GIG;           /* from UNIX Mapfile */
1378 #else
1379         ktext_phys = FOUR_MEG;          /* from UNIX Mapfile */
1380 #endif
1381 
1382 #if !defined(__xpv) && defined(_BOOT_TARGET_amd64)
1383         /*
	 * For GRUB, copy the kernel bits from the ELF64 file to their final place.
1385          */
1386         DBG_MSG("\nAllocating nucleus pages.\n");
1387         ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG);
1388         if (ktext_phys == 0)
1389                 dboot_panic("failed to allocate aligned kernel memory");
1390         if (dboot_elfload64(mb_header.load_addr) != 0)
1391                 dboot_panic("failed to parse kernel ELF image, rebooting");
1392 #endif
1393 
1394         DBG(ktext_phys);
1395 
1396         /*
1397          * Allocate page tables.
1398          */
1399         build_page_tables();
1400 
1401         /*
1402          * return to assembly code to switch to running kernel
1403          */
1404         entry_addr_low = (uint32_t)target_kernel_text;
1405         DBG(entry_addr_low);
1406         bi->bi_use_largepage = largepage_support;
1407         bi->bi_use_pae = pae_support;
1408         bi->bi_use_pge = pge_support;
1409         bi->bi_use_nx = NX_support;
1410 
1411 #if defined(__xpv)
1412 
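	/*
	 * Subtracting mfn_base converts dboot's virtual addresses back into
	 * pseudo-physical addresses for the kernel (see the mfn_base comment
	 * near the top of this file).
	 */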
1413         bi->bi_next_paddr = next_avail_addr - mfn_base;
1414         DBG(bi->bi_next_paddr);
1415         bi->bi_next_vaddr = (native_ptr_t)next_avail_addr;
1416         DBG(bi->bi_next_vaddr);
1417 
1418         /*
1419          * unmap unused pages in start area to make them available for DMA
1420          */
1421         while (next_avail_addr < scratch_end) {
1422                 (void) HYPERVISOR_update_va_mapping(next_avail_addr,
1423                     0, UVMF_INVLPG | UVMF_LOCAL);
1424                 next_avail_addr += MMU_PAGESIZE;
1425         }
1426 
1427         bi->bi_xen_start_info = (uintptr_t)xen_info;
1428         DBG((uintptr_t)HYPERVISOR_shared_info);
1429         bi->bi_shared_info = (native_ptr_t)HYPERVISOR_shared_info;
1430         bi->bi_top_page_table = (uintptr_t)top_page_table - mfn_base;
1431 
1432 #else /* __xpv */
1433 
1434         bi->bi_next_paddr = next_avail_addr;
1435         DBG(bi->bi_next_paddr);
1436         bi->bi_next_vaddr = (uintptr_t)next_avail_addr;
1437         DBG(bi->bi_next_vaddr);
1438         bi->bi_mb_info = (uintptr_t)mb_info;
1439         bi->bi_top_page_table = (uintptr_t)top_page_table;
1440 
1441 #endif /* __xpv */
1442 
1443         bi->bi_kseg_size = FOUR_MEG;
1444         DBG(bi->bi_kseg_size);
1445 
1446 #ifndef __xpv
1447         if (map_debug)
1448                 dump_tables();
1449 #endif
1450 
1451         DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n");
1452 }