1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 * 26 * Copyright 2012 Joyent, Inc. All rights reserved. 27 */ 28 29 30 #include <sys/types.h> 31 #include <sys/machparam.h> 32 #include <sys/x86_archext.h> 33 #include <sys/systm.h> 34 #include <sys/mach_mmu.h> 35 #include <sys/multiboot.h> 36 #include <sys/sha1.h> 37 38 #if defined(__xpv) 39 40 #include <sys/hypervisor.h> 41 uintptr_t xen_virt_start; 42 pfn_t *mfn_to_pfn_mapping; 43 44 #else /* !__xpv */ 45 46 extern multiboot_header_t mb_header; 47 extern int have_cpuid(void); 48 49 #endif /* !__xpv */ 50 51 #include <sys/inttypes.h> 52 #include <sys/bootinfo.h> 53 #include <sys/mach_mmu.h> 54 #include <sys/boot_console.h> 55 56 #include "dboot_asm.h" 57 #include "dboot_printf.h" 58 #include "dboot_xboot.h" 59 #include "dboot_elfload.h" 60 61 #define SHA1_ASCII_LENGTH (SHA1_DIGEST_LENGTH * 2) 62 63 /* 64 * This file contains code that runs to transition us from either a multiboot 65 * compliant loader (32 bit non-paging) or a XPV domain loader to 66 * regular kernel execution. 
Its task is to set up the kernel memory image
 * and page tables.
 *
 * The code executes as:
 *	- 32 bits under GRUB (for 32 or 64 bit Solaris)
 *	- a 32 bit program for the 32-bit PV hypervisor
 *	- a 64 bit program for the 64-bit PV hypervisor (at least for now)
 *
 * Under the PV hypervisor, we must create mappings for any memory beyond the
 * initial start of day allocation (such as the kernel itself).
 *
 * When on the metal, the mapping between maddr_t and paddr_t is 1:1.
 * Since we are running in real mode, all such memory is accessible.
 */

/*
 * Standard bits used in PTE (page level) and PTP (internal levels)
 */
x86pte_t ptp_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_USER;
x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST;

/*
 * This is the target address (physical) where the kernel text and data
 * nucleus pages will be unpacked. On the hypervisor this is actually a
 * virtual address.
 */
paddr_t ktext_phys;
uint32_t ksize = 2 * FOUR_MEG;	/* kernel nucleus is 8Meg */

static uint64_t target_kernel_text;	/* value to use for KERNEL_TEXT */

/*
 * The stack is set up in assembler before entering startup_kernel()
 */
char stack_space[STACK_SIZE];

/*
 * Used to track physical memory allocation
 */
static paddr_t next_avail_addr = 0;

#if defined(__xpv)
/*
 * Additional information needed for hypervisor memory allocation.
 * Only memory up to scratch_end is mapped by page tables.
 * mfn_base is the start of the hypervisor virtual image. It's ONE_GIG, so
 * to derive a pfn from a pointer, you subtract mfn_base.
 */

static paddr_t scratch_end = 0;	/* we can't write all of mem here */
static paddr_t mfn_base;		/* addr corresponding to mfn_list[0] */
start_info_t *xen_info;

#else	/* __xpv */

/*
 * If on the metal, then we have a multiboot loader.
123 */ 124 multiboot_info_t *mb_info; 125 126 #endif /* __xpv */ 127 128 /* 129 * This contains information passed to the kernel 130 */ 131 struct xboot_info boot_info[2]; /* extra space to fix alignement for amd64 */ 132 struct xboot_info *bi; 133 134 /* 135 * Page table and memory stuff. 136 */ 137 static paddr_t max_mem; /* maximum memory address */ 138 139 /* 140 * Information about processor MMU 141 */ 142 int amd64_support = 0; 143 int largepage_support = 0; 144 int pae_support = 0; 145 int pge_support = 0; 146 int NX_support = 0; 147 148 /* 149 * Low 32 bits of kernel entry address passed back to assembler. 150 * When running a 64 bit kernel, the high 32 bits are 0xffffffff. 151 */ 152 uint32_t entry_addr_low; 153 154 /* 155 * Memlists for the kernel. We shouldn't need a lot of these. 156 */ 157 #define MAX_MEMLIST (50) 158 struct boot_memlist memlists[MAX_MEMLIST]; 159 uint_t memlists_used = 0; 160 struct boot_memlist pcimemlists[MAX_MEMLIST]; 161 uint_t pcimemlists_used = 0; 162 struct boot_memlist rsvdmemlists[MAX_MEMLIST]; 163 uint_t rsvdmemlists_used = 0; 164 165 #define MAX_MODULES (10) 166 struct boot_modules modules[MAX_MODULES]; 167 uint_t modules_used = 0; 168 169 /* 170 * Debugging macros 171 */ 172 uint_t prom_debug = 0; 173 uint_t map_debug = 0; 174 175 /* 176 * Either hypervisor-specific or grub-specific code builds the initial 177 * memlists. This code does the sort/merge/link for final use. 178 */ 179 static void 180 sort_physinstall(void) 181 { 182 int i; 183 #if !defined(__xpv) 184 int j; 185 struct boot_memlist tmp; 186 187 /* 188 * Now sort the memlists, in case they weren't in order. 189 * Yeah, this is a bubble sort; small, simple and easy to get right. 
190 */ 191 DBG_MSG("Sorting phys-installed list\n"); 192 for (j = memlists_used - 1; j > 0; --j) { 193 for (i = 0; i < j; ++i) { 194 if (memlists[i].addr < memlists[i + 1].addr) 195 continue; 196 tmp = memlists[i]; 197 memlists[i] = memlists[i + 1]; 198 memlists[i + 1] = tmp; 199 } 200 } 201 202 /* 203 * Merge any memlists that don't have holes between them. 204 */ 205 for (i = 0; i <= memlists_used - 1; ++i) { 206 if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr) 207 continue; 208 209 if (prom_debug) 210 dboot_printf( 211 "merging mem segs %" PRIx64 "...%" PRIx64 212 " w/ %" PRIx64 "...%" PRIx64 "\n", 213 memlists[i].addr, 214 memlists[i].addr + memlists[i].size, 215 memlists[i + 1].addr, 216 memlists[i + 1].addr + memlists[i + 1].size); 217 218 memlists[i].size += memlists[i + 1].size; 219 for (j = i + 1; j < memlists_used - 1; ++j) 220 memlists[j] = memlists[j + 1]; 221 --memlists_used; 222 DBG(memlists_used); 223 --i; /* after merging we need to reexamine, so do this */ 224 } 225 #endif /* __xpv */ 226 227 if (prom_debug) { 228 dboot_printf("\nFinal memlists:\n"); 229 for (i = 0; i < memlists_used; ++i) { 230 dboot_printf("\t%d: addr=%" PRIx64 " size=%" 231 PRIx64 "\n", i, memlists[i].addr, memlists[i].size); 232 } 233 } 234 235 /* 236 * link together the memlists with native size pointers 237 */ 238 memlists[0].next = 0; 239 memlists[0].prev = 0; 240 for (i = 1; i < memlists_used; ++i) { 241 memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1); 242 memlists[i].next = 0; 243 memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i); 244 } 245 bi->bi_phys_install = (native_ptr_t)(uintptr_t)memlists; 246 DBG(bi->bi_phys_install); 247 } 248 249 /* 250 * build bios reserved memlists 251 */ 252 static void 253 build_rsvdmemlists(void) 254 { 255 int i; 256 257 rsvdmemlists[0].next = 0; 258 rsvdmemlists[0].prev = 0; 259 for (i = 1; i < rsvdmemlists_used; ++i) { 260 rsvdmemlists[i].prev = 261 (native_ptr_t)(uintptr_t)(rsvdmemlists + i - 
1); 262 rsvdmemlists[i].next = 0; 263 rsvdmemlists[i - 1].next = 264 (native_ptr_t)(uintptr_t)(rsvdmemlists + i); 265 } 266 bi->bi_rsvdmem = (native_ptr_t)(uintptr_t)rsvdmemlists; 267 DBG(bi->bi_rsvdmem); 268 } 269 270 #if defined(__xpv) 271 272 /* 273 * halt on the hypervisor after a delay to drain console output 274 */ 275 void 276 dboot_halt(void) 277 { 278 uint_t i = 10000; 279 280 while (--i) 281 (void) HYPERVISOR_yield(); 282 (void) HYPERVISOR_shutdown(SHUTDOWN_poweroff); 283 } 284 285 /* 286 * From a machine address, find the corresponding pseudo-physical address. 287 * Pseudo-physical address are contiguous and run from mfn_base in each VM. 288 * Machine addresses are the real underlying hardware addresses. 289 * These are needed for page table entries. Note that this routine is 290 * poorly protected. A bad value of "ma" will cause a page fault. 291 */ 292 paddr_t 293 ma_to_pa(maddr_t ma) 294 { 295 ulong_t pgoff = ma & MMU_PAGEOFFSET; 296 ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)]; 297 paddr_t pa; 298 299 if (pfn >= xen_info->nr_pages) 300 return (-(paddr_t)1); 301 pa = mfn_base + mmu_ptob((paddr_t)pfn) + pgoff; 302 #ifdef DEBUG 303 if (ma != pa_to_ma(pa)) 304 dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", " 305 "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa)); 306 #endif 307 return (pa); 308 } 309 310 /* 311 * From a pseudo-physical address, find the corresponding machine address. 
*/
maddr_t
pa_to_ma(paddr_t pa)
{
	pfn_t pfn;
	ulong_t mfn;

	pfn = mmu_btop(pa - mfn_base);
	/* panic on addresses below mfn_base or beyond the domain's pages */
	if (pa < mfn_base || pfn >= xen_info->nr_pages)
		dboot_panic("pa_to_ma(): illegal address 0x%lx", (ulong_t)pa);
	mfn = ((ulong_t *)xen_info->mfn_list)[pfn];
#ifdef DEBUG
	if (mfn_to_pfn_mapping[mfn] != pfn)
		dboot_printf("pa_to_ma(pfn=%lx) got %lx ma_to_pa() says %lx\n",
		    pfn, mfn, mfn_to_pfn_mapping[mfn]);
#endif
	return (mfn_to_ma(mfn) | (pa & MMU_PAGEOFFSET));
}

#endif	/* __xpv */

/*
 * Read entry "index" of the page table located at "table". Entries are read
 * as x86pte_t when pae_support is set and as x86pte32_t otherwise.
 */
x86pte_t
get_pteval(paddr_t table, uint_t index)
{
	if (pae_support)
		return (((x86pte_t *)(uintptr_t)table)[index]);
	return (((x86pte32_t *)(uintptr_t)table)[index]);
}

/*
 * Store "pteval" into entry "index" of the page table located at "table".
 *
 * Under the hypervisor the store is requested with HYPERVISOR_mmu_update()
 * on the machine address of the entry, and any failure panics. Otherwise the
 * entry is written directly, using the entry width selected by pae_support,
 * and cr3 is reloaded when the table written is a level 2 top level table.
 */
/*ARGSUSED*/
void
set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
{
#ifdef __xpv
	mmu_update_t t;
	maddr_t mtable = pa_to_ma(table);
	int retcnt;

	t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE;
	t.val = pteval;
	if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1)
		dboot_panic("HYPERVISOR_mmu_update() failed");
#else /* __xpv */
	uintptr_t tab_addr = (uintptr_t)table;

	if (pae_support)
		((x86pte_t *)tab_addr)[index] = pteval;
	else
		((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval;
	if (level == top_level && level == 2)
		reload_cr3();
#endif /* __xpv */
}

/*
 * Allocate one page with mem_alloc() for a new page table and store in
 * *pteval the entry that refers to it: the table's machine address combined
 * with PT_VALID alone when "level" is a level 2 top level, or with ptp_bits
 * otherwise. Returns the address of the new table.
 */
paddr_t
make_ptable(x86pte_t *pteval, uint_t level)
{
	paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);

	if (level == top_level && level == 2)
		*pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
	else
		*pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits;

#ifdef __xpv
	/* Remove write permission to the new page table. */
	if (HYPERVISOR_update_va_mapping(new_table,
	    *pteval & ~(x86pte_t)PT_WRITABLE, UVMF_INVLPG | UVMF_LOCAL))
		dboot_panic("HYP_update_va_mapping error");
#endif

	if (map_debug)
		dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
		    PRIx64 "\n", level, (ulong_t)new_table, *pteval);
	return (new_table);
}

/*
 * Return a pointer to entry "index" of the page table located at "table".
 * The entry offset is index * pte_size.
 */
x86pte_t *
map_pte(paddr_t table, uint_t index)
{
	return ((x86pte_t *)(uintptr_t)(table + index * pte_size));
}

/*
 * dump out the contents of page tables...
 *
 * The walk starts at top_page_table/top_level and descends without real
 * recursion: on entering a lower level table the current table and index are
 * saved in save_table[]/save_index[], and once a table has been scanned the
 * saved position of the level above is restored and the scan resumes there.
 * Runs of entries that map consecutive addresses are abbreviated to "...".
 */
static void
dump_tables(void)
{
	uint_t save_index[4];	/* for recursion */
	char *save_table[4];	/* for recursion */
	uint_t l;
	uint64_t va;
	uint64_t pgsize;
	int index;
	int i;
	x86pte_t pteval;
	char *table;
	static char *tablist = "\t\t\t";
	char *tabs = tablist + 3 - top_level;
	/*
	 * NOTE(review): pa/pa1 are uint_t while ma_to_pa() yields a paddr_t
	 * here; if paddr_t is wider than uint_t the printed address and the
	 * consecutive-mapping test work on truncated values -- confirm.
	 */
	uint_t pa, pa1;
#if !defined(__xpv)
#define	maddr_t paddr_t
#endif /* !__xpv */

	dboot_printf("Finished pagetables:\n");
	table = (char *)(uintptr_t)top_page_table;
	l = top_level;
	va = 0;
	for (index = 0; index < ptes_per_table; ++index) {
		pgsize = 1ull << shift_amt[l];
		if (pae_support)
			pteval = ((x86pte_t *)table)[index];
		else
			pteval = ((x86pte32_t *)table)[index];
		if (pteval == 0)
			goto next_entry;

		dboot_printf("%s %p[0x%x] = %" PRIx64 ", va=%" PRIx64,
		    tabs + l, (void *)table, index, (uint64_t)pteval, va);
		pa = ma_to_pa(pteval & MMU_PAGEMASK);
		dboot_printf(" physaddr=%x\n", pa);

		/*
		 * Don't try to walk hypervisor private pagetables
		 */
		if ((l > 1 || (l == 1 && (pteval & PT_PAGESIZE) == 0))) {
			/* descend: remember where to resume at this level */
			save_table[l] = table;
			save_index[l] = index;
			--l;
			index = -1;	/* the loop's ++index restarts at 0 */
			table = (char *)(uintptr_t)
			    ma_to_pa(pteval & MMU_PAGEMASK);
			goto recursion;
		}

		/*
		 * shorten dump for consecutive mappings
		 */
		for (i = 1; index + i < ptes_per_table; ++i) {
			if (pae_support)
				pteval = ((x86pte_t *)table)[index + i];
			else
				pteval = ((x86pte32_t *)table)[index + i];
			if (pteval == 0)
				break;
			pa1 = ma_to_pa(pteval & MMU_PAGEMASK);
			if (pa1 != pa + i * pgsize)
				break;
		}
		if (i > 2) {
			dboot_printf("%s...\n", tabs + l);
			va += pgsize * (i - 2);
			index += i - 2;
		}
next_entry:
		va += pgsize;
		/*
		 * NOTE(review): the jump over the VA hole is applied after
		 * entry 256 has been handled, i.e. it takes effect for entry
		 * 257 -- confirm whether index 255 was intended.
		 */
		if (l == 3 && index == 256)	/* VA hole */
			va = 0xffff800000000000ull;
recursion:
		;
	}
	/* finished a lower level table: resume the scan one level up */
	if (l < top_level) {
		++l;
		index = save_index[l];
		table = save_table[l];
		goto recursion;
	}
}

/*
 * Add a mapping for the machine page at the given virtual address.
 *
 * "level" greater than 0 requests a large page mapping (PT_PAGESIZE).
 * PT_GLOBAL is added for addresses at or above target_kernel_text when
 * pge_support is set.
 */
static void
map_ma_at_va(maddr_t ma, native_ptr_t va, uint_t level)
{
	x86pte_t *ptep;
	x86pte_t pteval;

	pteval = ma | pte_bits;
	if (level > 0)
		pteval |= PT_PAGESIZE;
	if (va >= target_kernel_text && pge_support)
		pteval |= PT_GLOBAL;

	if (map_debug && ma != va)
		dboot_printf("mapping ma=0x%" PRIx64 " va=0x%" PRIx64
		    " pte=0x%" PRIx64 " l=%d\n",
		    (uint64_t)ma, (uint64_t)va, pteval, level);

#if defined(__xpv)
	/*
	 * see if we can avoid find_pte() on the hypervisor
	 */
	if (HYPERVISOR_update_va_mapping(va, pteval,
	    UVMF_INVLPG | UVMF_LOCAL) == 0)
		return;
#endif

	/*
	 * Find the pte that will map this address. This creates any
	 * missing intermediate level page tables
	 */
	ptep = find_pte(va, NULL, level, 0);

	/*
	 * When paravirtualized, we must use hypervisor calls to modify the
	 * PTE, since paging is active. On real hardware we just write to
	 * the pagetables which aren't in use yet.
	 */
#if defined(__xpv)
	ptep = ptep;	/* shut lint up */
	if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
		dboot_panic("mmu_update failed-map_pa_at_va va=0x%" PRIx64
		    " l=%d ma=0x%" PRIx64 ", pte=0x%" PRIx64 "",
		    (uint64_t)va, level, (uint64_t)ma, pteval);
#else
	if (va < 1024 * 1024)
		pteval |= PT_NOCACHE;		/* for video RAM */
	if (pae_support)
		*ptep = pteval;
	else
		*((x86pte32_t *)ptep) = (x86pte32_t)pteval;
#endif
}

/*
 * Add a mapping for the physical page at the given virtual address.
 */
static void
map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level)
{
	map_ma_at_va(pa_to_ma(pa), va, level);
}

/*
 * This is called to remove start..end from the
 * possible range of PCI addresses.
 *
 * Each entry of pcimemlists[] is compared with start..end and is deleted,
 * split in two, or trimmed at its start or end as needed. Splitting adds an
 * entry and panics if that would exceed MAX_MEMLIST.
 */
const uint64_t pci_lo_limit = 0x00100000ul;
const uint64_t pci_hi_limit = 0xfff00000ul;
static void
exclude_from_pci(uint64_t start, uint64_t end)
{
	int i;
	int j;
	struct boot_memlist *ml;

	for (i = 0; i < pcimemlists_used; ++i) {
		ml = &pcimemlists[i];

		/* delete the entire range? */
		if (start <= ml->addr && ml->addr + ml->size <= end) {
			--pcimemlists_used;
			for (j = i; j < pcimemlists_used; ++j)
				pcimemlists[j] = pcimemlists[j + 1];
			--i;	/* to revisit the new one at this index */
		}

		/* split a range? */
		else if (ml->addr < start && end < ml->addr + ml->size) {

			++pcimemlists_used;
			if (pcimemlists_used > MAX_MEMLIST)
				dboot_panic("too many pcimemlists");

			/* open a slot after i; slot i + 1 starts as a copy */
			for (j = pcimemlists_used - 1; j > i; --j)
				pcimemlists[j] = pcimemlists[j - 1];
			ml->size = start - ml->addr;

			++ml;
			ml->size = (ml->addr + ml->size) - end;
			ml->addr = end;
			++i;	/* skip on to next one */
		}

		/* cut memory off the start? */
		else if (ml->addr < end && end < ml->addr + ml->size) {
			ml->size -= end - ml->addr;
			ml->addr = end;
		}

		/* cut memory off the end? */
		else if (ml->addr <= start && start < ml->addr + ml->size) {
			ml->size = start - ml->addr;
		}
	}
}

/*
 * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
 * definition in Xen source.
 */
#ifdef __xpv
typedef struct {
	uint32_t	base_addr_low;
	uint32_t	base_addr_high;
	uint32_t	length_low;
	uint32_t	length_high;
	uint32_t	type;
} mmap_t;
#else
typedef mb_memory_map_t mmap_t;
#endif

/*
 * Build the PCI memlist from the "num" memory map entries at "mem".
 *
 * The list starts as the single range pci_lo_limit..pci_hi_limit. Every map
 * entry, whatever its type, is then removed from it after rounding the
 * entry's start up and its end down to a page boundary. The result is
 * chained through prev/next and published via bi->bi_pcimem.
 */
static void
build_pcimemlists(mmap_t *mem, int num)
{
	mmap_t *mmap;
	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
	uint64_t start;
	uint64_t end;
	int i;

	/*
	 * initialize
	 */
	pcimemlists[0].addr = pci_lo_limit;
	pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
	pcimemlists_used = 1;

	/*
	 * Fill in PCI memlists.
	 */
	for (mmap = mem, i = 0; i < num; ++i, ++mmap) {
		start = ((uint64_t)mmap->base_addr_high << 32) +
		    mmap->base_addr_low;
		end = start + ((uint64_t)mmap->length_high << 32) +
		    mmap->length_low;

		if (prom_debug)
			dboot_printf("\ttype: %d %" PRIx64 "..%"
			    PRIx64 "\n", mmap->type, start, end);

		/*
		 * page align start and end
		 */
		start = (start + page_offset) & ~page_offset;
		end &= ~page_offset;
		if (end <= start)
			continue;

		exclude_from_pci(start, end);
	}

	/*
	 * Finish off the pcimemlist
	 */
	if (prom_debug) {
		for (i = 0; i < pcimemlists_used; ++i) {
			dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%"
			    PRIx64 "\n", pcimemlists[i].addr,
			    pcimemlists[i].addr + pcimemlists[i].size);
		}
	}
	pcimemlists[0].next = 0;
	pcimemlists[0].prev = 0;
	for (i = 1; i < pcimemlists_used; ++i) {
		pcimemlists[i].prev =
		    (native_ptr_t)(uintptr_t)(pcimemlists + i - 1);
		pcimemlists[i].next = 0;
		pcimemlists[i - 1].next =
		    (native_ptr_t)(uintptr_t)(pcimemlists + i);
	}
	bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
	DBG(bi->bi_pcimem);
}

#if defined(__xpv)
/*
 * Initialize memory allocator stuff from hypervisor-supplied start info.
 *
 * There is 512KB of scratch area after the boot stack page.
 * We'll use that for everything except the kernel nucleus pages which are too
 * big to fit there and are allocated last anyway.
 */
#define	MAXMAPS	100
static mmap_t map_buffer[MAXMAPS];
static void
init_mem_alloc(void)
{
	int	local;	/* variables needed to find start region */
	paddr_t	scratch_start;
	xen_memory_map_t map;

	DBG_MSG("Entered init_mem_alloc()\n");

	/*
	 * Free memory follows the stack. There's at least 512KB of scratch
	 * space, rounded up to at least 2Mb alignment.  That should be enough
	 * for the page tables we'll need to build.  The nucleus memory is
	 * allocated last and will be outside the addressable range.  We'll
	 * switch to new page tables before we unpack the kernel
	 */
	scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE);
	DBG(scratch_start);
	scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG);
	DBG(scratch_end);

	/*
	 * For paranoia, leave some space between hypervisor data and ours.
	 * Use 500 instead of 512.
	 */
	next_avail_addr = scratch_end - 500 * 1024;
	DBG(next_avail_addr);

	/*
	 * The domain builder gives us at most 1 module
	 */
	DBG(xen_info->mod_len);
	if (xen_info->mod_len > 0) {
		DBG(xen_info->mod_start);
		modules[0].bm_addr = xen_info->mod_start;
		modules[0].bm_size = xen_info->mod_len;
		bi->bi_module_cnt = 1;
		bi->bi_modules = (native_ptr_t)modules;
	} else {
		bi->bi_module_cnt = 0;
		bi->bi_modules = NULL;
	}
	DBG(bi->bi_module_cnt);
	DBG(bi->bi_modules);

	DBG(xen_info->mfn_list);
	DBG(xen_info->nr_pages);
	max_mem = (paddr_t)xen_info->nr_pages << MMU_PAGESHIFT;
	DBG(max_mem);

	/*
	 * Using pseudo-physical addresses, so only 1 memlist element
	 */
	memlists[0].addr = 0;
	DBG(memlists[0].addr);
	memlists[0].size = max_mem;
	DBG(memlists[0].size);
	memlists_used = 1;
	DBG(memlists_used);

	/*
	 * finish building physinstall list
	 */
	sort_physinstall();

	/*
	 * build bios reserved memlists
	 */
	build_rsvdmemlists();

	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
		/*
		 * build PCI Memory list
		 */
		map.nr_entries = MAXMAPS;
		/*LINTED: constant in conditional context*/
		set_xen_guest_handle(map.buffer, map_buffer);
		if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0)
			dboot_panic("getting XENMEM_machine_memory_map failed");
		build_pcimemlists(map_buffer, map.nr_entries);
	}
}

#else	/* !__xpv */

static uint8_t
dboot_a2h(char v) 777 { 778 if (v >= 'a') 779 return (v - 'a' + 0xa); 780 else if (v >= 'A') 781 return (v - 'A' + 0xa); 782 else if (v >= '0') 783 return (v - '0'); 784 else 785 dboot_panic("bad ASCII hex character %c\n", v); 786 787 return (0); 788 } 789 790 static void 791 digest_a2h(const char *ascii, uint8_t *digest) 792 { 793 unsigned int i; 794 795 for (i = 0; i < SHA1_DIGEST_LENGTH; i++) { 796 digest[i] = dboot_a2h(ascii[i * 2]) << 4; 797 digest[i] |= dboot_a2h(ascii[i * 2 + 1]); 798 } 799 } 800 801 /* 802 * Generate a SHA-1 hash of the first len bytes of image, and compare it with 803 * the ASCII-format hash found in the 40-byte buffer at ascii. If they 804 * match, return 0, otherwise -1. This works only for images smaller than 805 * 4 GB, which should not be a problem. 806 */ 807 static int 808 check_image_hash(const char *ascii, const void *image, size_t len) 809 { 810 SHA1_CTX ctx; 811 uint8_t digest[SHA1_DIGEST_LENGTH]; 812 uint8_t baseline[SHA1_DIGEST_LENGTH]; 813 unsigned int i; 814 815 digest_a2h(ascii, baseline); 816 817 SHA1Init(&ctx); 818 SHA1Update(&ctx, image, len); 819 SHA1Final(digest, &ctx); 820 821 for (i = 0; i < SHA1_DIGEST_LENGTH; i++) { 822 if (digest[i] != baseline[i]) 823 return (-1); 824 } 825 826 return (0); 827 } 828 829 static void 830 check_images(void) 831 { 832 int i; 833 char *hashes; 834 mb_module_t *mod, *hashmod; 835 char *hash; 836 char displayhash[SHA1_ASCII_LENGTH + 1]; 837 size_t hashlen; 838 size_t len; 839 840 /* 841 * A brief note on lengths and sizes: GRUB, for reasons unknown, passes 842 * the address of the last valid byte in a module plus 1 as mod_end. 843 * This is of course a bug; the multiboot specification simply states 844 * that mod_start and mod_end "contain the start and end addresses of 845 * the boot module itself" which is pretty obviously not what GRUB is 846 * doing. 
However, fixing it requires that not only this code be 847 * changed but also that other code consuming this value and values 848 * derived from it be fixed, and that the kernel and GRUB must either 849 * both have the bug or neither. While there are a lot of combinations 850 * that will work, there are also some that won't, so for simplicity 851 * we'll just cope with the bug. That means we won't actually hash the 852 * byte at mod_end, and we will expect that mod_end for the hash file 853 * itself is one greater than some multiple of 41 (40 bytes of ASCII 854 * hash plus a newline for each module). 855 */ 856 857 if (mb_info->mods_count > 1) { 858 mod = (mb_module_t *)mb_info->mods_addr; 859 hashmod = mod + (mb_info->mods_count - 1); 860 hashes = (char *)hashmod->mod_start; 861 hashlen = (size_t)(hashmod->mod_end - hashmod->mod_start); 862 hash = hashes; 863 if (prom_debug) { 864 dboot_printf("Hash module found at %lx size %lx\n", 865 (ulong_t)hashes, (ulong_t)hashlen); 866 } 867 } else { 868 DBG_MSG("Skipping hash check; no hash module found.\n"); 869 return; 870 } 871 872 for (mod = (mb_module_t *)(mb_info->mods_addr), i = 0; 873 i < mb_info->mods_count - 1; ++mod, ++i) { 874 if ((hash - hashes) + SHA1_ASCII_LENGTH + 1 > hashlen) { 875 dboot_printf("Short hash module of length 0x%lx bytes; " 876 "skipping hash checks\n", (ulong_t)hashlen); 877 break; 878 } 879 880 (void) memcpy(displayhash, hash, SHA1_ASCII_LENGTH); 881 displayhash[SHA1_ASCII_LENGTH] = '\0'; 882 if (prom_debug) { 883 dboot_printf("Checking hash for module %d [%s]: ", 884 i, displayhash); 885 } 886 887 len = mod->mod_end - mod->mod_start; /* see above */ 888 if (check_image_hash(hash, (void *)mod->mod_start, len) != 0) { 889 dboot_panic("SHA-1 hash mismatch on %s; expected %s\n", 890 (char *)mod->mod_name, displayhash); 891 } else { 892 DBG_MSG("OK\n"); 893 } 894 hash += SHA1_ASCII_LENGTH + 1; 895 } 896 } 897 898 /* 899 * During memory allocation, find the highest address not used yet. 
900 */ 901 static void 902 check_higher(paddr_t a) 903 { 904 if (a < next_avail_addr) 905 return; 906 next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE); 907 DBG(next_avail_addr); 908 } 909 910 /* 911 * Walk through the module information finding the last used address. 912 * The first available address will become the top level page table. 913 * 914 * We then build the phys_install memlist from the multiboot information. 915 */ 916 static void 917 init_mem_alloc(void) 918 { 919 mb_memory_map_t *mmap; 920 mb_module_t *mod; 921 uint64_t start; 922 uint64_t end; 923 uint64_t page_offset = MMU_PAGEOFFSET; /* needs to be 64 bits */ 924 extern char _end[]; 925 int i; 926 927 DBG_MSG("Entered init_mem_alloc()\n"); 928 DBG((uintptr_t)mb_info); 929 930 if (mb_info->mods_count > MAX_MODULES) { 931 dboot_panic("Too many modules (%d) -- the maximum is %d.", 932 mb_info->mods_count, MAX_MODULES); 933 } 934 /* 935 * search the modules to find the last used address 936 * we'll build the module list while we're walking through here 937 */ 938 DBG_MSG("\nFinding Modules\n"); 939 check_higher((paddr_t)(uintptr_t)&_end); 940 for (mod = (mb_module_t *)(mb_info->mods_addr), i = 0; 941 i < mb_info->mods_count; 942 ++mod, ++i) { 943 if (prom_debug) { 944 dboot_printf("\tmodule #%d: %s at: 0x%lx, end 0x%lx\n", 945 i, (char *)(mod->mod_name), 946 (ulong_t)mod->mod_start, (ulong_t)mod->mod_end); 947 } 948 modules[i].bm_addr = mod->mod_start; 949 if (mod->mod_start > mod->mod_end) { 950 dboot_panic("module[%d]: Invalid module start address " 951 "(0x%llx)", i, (uint64_t)mod->mod_start); 952 } 953 modules[i].bm_size = mod->mod_end - mod->mod_start; 954 955 check_higher(mod->mod_end); 956 } 957 bi->bi_modules = (native_ptr_t)(uintptr_t)modules; 958 DBG(bi->bi_modules); 959 bi->bi_module_cnt = mb_info->mods_count; 960 DBG(bi->bi_module_cnt); 961 962 check_images(); 963 964 /* 965 * Walk through the memory map from multiboot and build our memlist 966 * structures. 
Note these will have native format pointers. 967 */ 968 DBG_MSG("\nFinding Memory Map\n"); 969 DBG(mb_info->flags); 970 max_mem = 0; 971 if (mb_info->flags & 0x40) { 972 int cnt = 0; 973 974 DBG(mb_info->mmap_addr); 975 DBG(mb_info->mmap_length); 976 check_higher(mb_info->mmap_addr + mb_info->mmap_length); 977 978 for (mmap = (mb_memory_map_t *)mb_info->mmap_addr; 979 (uint32_t)mmap < mb_info->mmap_addr + mb_info->mmap_length; 980 mmap = (mb_memory_map_t *)((uint32_t)mmap + mmap->size 981 + sizeof (mmap->size))) { 982 ++cnt; 983 start = ((uint64_t)mmap->base_addr_high << 32) + 984 mmap->base_addr_low; 985 end = start + ((uint64_t)mmap->length_high << 32) + 986 mmap->length_low; 987 988 if (prom_debug) 989 dboot_printf("\ttype: %d %" PRIx64 "..%" 990 PRIx64 "\n", mmap->type, start, end); 991 992 /* 993 * page align start and end 994 */ 995 start = (start + page_offset) & ~page_offset; 996 end &= ~page_offset; 997 if (end <= start) 998 continue; 999 1000 /* 1001 * only type 1 is usable RAM 1002 */ 1003 switch (mmap->type) { 1004 case 1: 1005 if (end > max_mem) 1006 max_mem = end; 1007 memlists[memlists_used].addr = start; 1008 memlists[memlists_used].size = end - start; 1009 ++memlists_used; 1010 if (memlists_used > MAX_MEMLIST) 1011 dboot_panic("too many memlists"); 1012 break; 1013 case 2: 1014 rsvdmemlists[rsvdmemlists_used].addr = start; 1015 rsvdmemlists[rsvdmemlists_used].size = 1016 end - start; 1017 ++rsvdmemlists_used; 1018 if (rsvdmemlists_used > MAX_MEMLIST) 1019 dboot_panic("too many rsvdmemlists"); 1020 break; 1021 default: 1022 continue; 1023 } 1024 } 1025 build_pcimemlists((mb_memory_map_t *)mb_info->mmap_addr, cnt); 1026 } else if (mb_info->flags & 0x01) { 1027 DBG(mb_info->mem_lower); 1028 memlists[memlists_used].addr = 0; 1029 memlists[memlists_used].size = mb_info->mem_lower * 1024; 1030 ++memlists_used; 1031 DBG(mb_info->mem_upper); 1032 memlists[memlists_used].addr = 1024 * 1024; 1033 memlists[memlists_used].size = mb_info->mem_upper * 1024; 1034 
++memlists_used; 1035 1036 /* 1037 * Old platform - assume I/O space at the end of memory. 1038 */ 1039 pcimemlists[0].addr = 1040 (mb_info->mem_upper * 1024) + (1024 * 1024); 1041 pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr; 1042 pcimemlists[0].next = 0; 1043 pcimemlists[0].prev = 0; 1044 bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists; 1045 DBG(bi->bi_pcimem); 1046 } else { 1047 dboot_panic("No memory info from boot loader!!!"); 1048 } 1049 1050 check_higher(bi->bi_cmdline); 1051 1052 /* 1053 * finish processing the physinstall list 1054 */ 1055 sort_physinstall(); 1056 1057 /* 1058 * build bios reserved mem lists 1059 */ 1060 build_rsvdmemlists(); 1061 } 1062 #endif /* !__xpv */ 1063 1064 /* 1065 * Simple memory allocator, allocates aligned physical memory. 1066 * Note that startup_kernel() only allocates memory, never frees. 1067 * Memory usage just grows in an upward direction. 1068 */ 1069 static void * 1070 do_mem_alloc(uint32_t size, uint32_t align) 1071 { 1072 uint_t i; 1073 uint64_t best; 1074 uint64_t start; 1075 uint64_t end; 1076 1077 /* 1078 * make sure size is a multiple of pagesize 1079 */ 1080 size = RNDUP(size, MMU_PAGESIZE); 1081 next_avail_addr = RNDUP(next_avail_addr, align); 1082 1083 /* 1084 * XXPV fixme joe 1085 * 1086 * a really large bootarchive that causes you to run out of memory 1087 * may cause this to blow up 1088 */ 1089 /* LINTED E_UNEXPECTED_UINT_PROMOTION */ 1090 best = (uint64_t)-size; 1091 for (i = 0; i < memlists_used; ++i) { 1092 start = memlists[i].addr; 1093 #if defined(__xpv) 1094 start += mfn_base; 1095 #endif 1096 end = start + memlists[i].size; 1097 1098 /* 1099 * did we find the desired address? 1100 */ 1101 if (start <= next_avail_addr && next_avail_addr + size <= end) { 1102 best = next_avail_addr; 1103 goto done; 1104 } 1105 1106 /* 1107 * if not is this address the best so far? 
1108 */ 1109 if (start > next_avail_addr && start < best && 1110 RNDUP(start, align) + size <= end) 1111 best = RNDUP(start, align); 1112 } 1113 1114 /* 1115 * We didn't find exactly the address we wanted, due to going off the 1116 * end of a memory region. Return the best found memory address. 1117 */ 1118 done: 1119 next_avail_addr = best + size; 1120 #if defined(__xpv) 1121 if (next_avail_addr > scratch_end) 1122 dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: " 1123 "0x%lx", (ulong_t)next_avail_addr, 1124 (ulong_t)scratch_end); 1125 #endif 1126 (void) memset((void *)(uintptr_t)best, 0, size); 1127 return ((void *)(uintptr_t)best); 1128 } 1129 1130 void * 1131 mem_alloc(uint32_t size) 1132 { 1133 return (do_mem_alloc(size, MMU_PAGESIZE)); 1134 } 1135 1136 1137 /* 1138 * Build page tables to map all of memory used so far as well as the kernel. 1139 */ 1140 static void 1141 build_page_tables(void) 1142 { 1143 uint32_t psize; 1144 uint32_t level; 1145 uint32_t off; 1146 uint64_t start; 1147 #if !defined(__xpv) 1148 uint32_t i; 1149 uint64_t end; 1150 #endif /* __xpv */ 1151 1152 /* 1153 * If we're on metal, we need to create the top level pagetable. 1154 */ 1155 #if defined(__xpv) 1156 top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base; 1157 #else /* __xpv */ 1158 top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE); 1159 #endif /* __xpv */ 1160 DBG((uintptr_t)top_page_table); 1161 1162 /* 1163 * Determine if we'll use large mappings for kernel, then map it. 
1164 */ 1165 if (largepage_support) { 1166 psize = lpagesize; 1167 level = 1; 1168 } else { 1169 psize = MMU_PAGESIZE; 1170 level = 0; 1171 } 1172 1173 DBG_MSG("Mapping kernel\n"); 1174 DBG(ktext_phys); 1175 DBG(target_kernel_text); 1176 DBG(ksize); 1177 DBG(psize); 1178 for (off = 0; off < ksize; off += psize) 1179 map_pa_at_va(ktext_phys + off, target_kernel_text + off, level); 1180 1181 /* 1182 * The kernel will need a 1 page window to work with page tables 1183 */ 1184 bi->bi_pt_window = (uintptr_t)mem_alloc(MMU_PAGESIZE); 1185 DBG(bi->bi_pt_window); 1186 bi->bi_pte_to_pt_window = 1187 (uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0); 1188 DBG(bi->bi_pte_to_pt_window); 1189 1190 #if defined(__xpv) 1191 if (!DOMAIN_IS_INITDOMAIN(xen_info)) { 1192 /* If this is a domU we're done. */ 1193 DBG_MSG("\nPage tables constructed\n"); 1194 return; 1195 } 1196 #endif /* __xpv */ 1197 1198 /* 1199 * We need 1:1 mappings for the lower 1M of memory to access 1200 * BIOS tables used by a couple of drivers during boot. 1201 * 1202 * The following code works because our simple memory allocator 1203 * only grows usage in an upwards direction. 1204 * 1205 * Note that by this point in boot some mappings for low memory 1206 * may already exist because we've already accessed device in low 1207 * memory. (Specifically the video frame buffer and keyboard 1208 * status ports.) If we're booting on raw hardware then GRUB 1209 * created these mappings for us. If we're booting under a 1210 * hypervisor then we went ahead and remapped these devices into 1211 * memory allocated within dboot itself. 
1212 */ 1213 if (map_debug) 1214 dboot_printf("1:1 map pa=0..1Meg\n"); 1215 for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE) { 1216 #if defined(__xpv) 1217 map_ma_at_va(start, start, 0); 1218 #else /* __xpv */ 1219 map_pa_at_va(start, start, 0); 1220 #endif /* __xpv */ 1221 } 1222 1223 #if !defined(__xpv) 1224 for (i = 0; i < memlists_used; ++i) { 1225 start = memlists[i].addr; 1226 1227 end = start + memlists[i].size; 1228 1229 if (map_debug) 1230 dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n", 1231 start, end); 1232 while (start < end && start < next_avail_addr) { 1233 map_pa_at_va(start, start, 0); 1234 start += MMU_PAGESIZE; 1235 } 1236 } 1237 #endif /* !__xpv */ 1238 1239 DBG_MSG("\nPage tables constructed\n"); 1240 } 1241 1242 #define NO_MULTIBOOT \ 1243 "multiboot is no longer used to boot the Solaris Operating System.\n\ 1244 The grub entry should be changed to:\n\ 1245 kernel$ /platform/i86pc/kernel/$ISADIR/unix\n\ 1246 module$ /platform/i86pc/$ISADIR/boot_archive\n\ 1247 See http://illumos.org/msg/SUNOS-8000-AK for details.\n" 1248 1249 /* 1250 * startup_kernel has a pretty simple job. It builds pagetables which reflect 1251 * 1:1 mappings for all memory in use. It then also adds mappings for 1252 * the kernel nucleus at virtual address of target_kernel_text using large page 1253 * mappings. The page table pages are also accessible at 1:1 mapped 1254 * virtual addresses. 1255 */ 1256 /*ARGSUSED*/ 1257 void 1258 startup_kernel(void) 1259 { 1260 char *cmdline; 1261 uintptr_t addr; 1262 #if defined(__xpv) 1263 physdev_set_iopl_t set_iopl; 1264 #endif /* __xpv */ 1265 1266 /* 1267 * At this point we are executing in a 32 bit real mode. 
1268 */ 1269 #if defined(__xpv) 1270 cmdline = (char *)xen_info->cmd_line; 1271 #else /* __xpv */ 1272 cmdline = (char *)mb_info->cmdline; 1273 #endif /* __xpv */ 1274 1275 prom_debug = (strstr(cmdline, "prom_debug") != NULL); 1276 map_debug = (strstr(cmdline, "map_debug") != NULL); 1277 1278 #if defined(__xpv) 1279 /* 1280 * For dom0, before we initialize the console subsystem we'll 1281 * need to enable io operations, so set I/O priveldge level to 1. 1282 */ 1283 if (DOMAIN_IS_INITDOMAIN(xen_info)) { 1284 set_iopl.iopl = 1; 1285 (void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 1286 } 1287 #endif /* __xpv */ 1288 1289 bcons_init(cmdline); 1290 DBG_MSG("\n\nSolaris prekernel set: "); 1291 DBG_MSG(cmdline); 1292 DBG_MSG("\n"); 1293 1294 if (strstr(cmdline, "multiboot") != NULL) { 1295 dboot_panic(NO_MULTIBOOT); 1296 } 1297 1298 /* 1299 * boot info must be 16 byte aligned for 64 bit kernel ABI 1300 */ 1301 addr = (uintptr_t)boot_info; 1302 addr = (addr + 0xf) & ~0xf; 1303 bi = (struct xboot_info *)addr; 1304 DBG((uintptr_t)bi); 1305 bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline; 1306 1307 /* 1308 * Need correct target_kernel_text value 1309 */ 1310 #if defined(_BOOT_TARGET_amd64) 1311 target_kernel_text = KERNEL_TEXT_amd64; 1312 #elif defined(__xpv) 1313 target_kernel_text = KERNEL_TEXT_i386_xpv; 1314 #else 1315 target_kernel_text = KERNEL_TEXT_i386; 1316 #endif 1317 DBG(target_kernel_text); 1318 1319 #if defined(__xpv) 1320 1321 /* 1322 * XXPV Derive this stuff from CPUID / what the hypervisor has enabled 1323 */ 1324 1325 #if defined(_BOOT_TARGET_amd64) 1326 /* 1327 * 64-bit hypervisor. 
1328 */ 1329 amd64_support = 1; 1330 pae_support = 1; 1331 1332 #else /* _BOOT_TARGET_amd64 */ 1333 1334 /* 1335 * See if we are running on a PAE Hypervisor 1336 */ 1337 { 1338 xen_capabilities_info_t caps; 1339 1340 if (HYPERVISOR_xen_version(XENVER_capabilities, &caps) != 0) 1341 dboot_panic("HYPERVISOR_xen_version(caps) failed"); 1342 caps[sizeof (caps) - 1] = 0; 1343 if (prom_debug) 1344 dboot_printf("xen capabilities %s\n", caps); 1345 if (strstr(caps, "x86_32p") != NULL) 1346 pae_support = 1; 1347 } 1348 1349 #endif /* _BOOT_TARGET_amd64 */ 1350 { 1351 xen_platform_parameters_t p; 1352 1353 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &p) != 0) 1354 dboot_panic("HYPERVISOR_xen_version(parms) failed"); 1355 DBG(p.virt_start); 1356 mfn_to_pfn_mapping = (pfn_t *)(xen_virt_start = p.virt_start); 1357 } 1358 1359 /* 1360 * The hypervisor loads stuff starting at 1Gig 1361 */ 1362 mfn_base = ONE_GIG; 1363 DBG(mfn_base); 1364 1365 /* 1366 * enable writable page table mode for the hypervisor 1367 */ 1368 if (HYPERVISOR_vm_assist(VMASST_CMD_enable, 1369 VMASST_TYPE_writable_pagetables) < 0) 1370 dboot_panic("HYPERVISOR_vm_assist(writable_pagetables) failed"); 1371 1372 /* 1373 * check for NX support 1374 */ 1375 if (pae_support) { 1376 uint32_t eax = 0x80000000; 1377 uint32_t edx = get_cpuid_edx(&eax); 1378 1379 if (eax >= 0x80000001) { 1380 eax = 0x80000001; 1381 edx = get_cpuid_edx(&eax); 1382 if (edx & CPUID_AMD_EDX_NX) 1383 NX_support = 1; 1384 } 1385 } 1386 1387 #if !defined(_BOOT_TARGET_amd64) 1388 1389 /* 1390 * The 32-bit hypervisor uses segmentation to protect itself from 1391 * guests. This means when a guest attempts to install a flat 4GB 1392 * code or data descriptor the 32-bit hypervisor will protect itself 1393 * by silently shrinking the segment such that if the guest attempts 1394 * any access where the hypervisor lives a #gp fault is generated. 
1395 * The problem is that some applications expect a full 4GB flat 1396 * segment for their current thread pointer and will use negative 1397 * offset segment wrap around to access data. TLS support in linux 1398 * brand is one example of this. 1399 * 1400 * The 32-bit hypervisor can catch the #gp fault in these cases 1401 * and emulate the access without passing the #gp fault to the guest 1402 * but only if VMASST_TYPE_4gb_segments is explicitly turned on. 1403 * Seems like this should have been the default. 1404 * Either way, we want the hypervisor -- and not Solaris -- to deal 1405 * to deal with emulating these accesses. 1406 */ 1407 if (HYPERVISOR_vm_assist(VMASST_CMD_enable, 1408 VMASST_TYPE_4gb_segments) < 0) 1409 dboot_panic("HYPERVISOR_vm_assist(4gb_segments) failed"); 1410 #endif /* !_BOOT_TARGET_amd64 */ 1411 1412 #else /* __xpv */ 1413 1414 /* 1415 * use cpuid to enable MMU features 1416 */ 1417 if (have_cpuid()) { 1418 uint32_t eax, edx; 1419 1420 eax = 1; 1421 edx = get_cpuid_edx(&eax); 1422 if (edx & CPUID_INTC_EDX_PSE) 1423 largepage_support = 1; 1424 if (edx & CPUID_INTC_EDX_PGE) 1425 pge_support = 1; 1426 if (edx & CPUID_INTC_EDX_PAE) 1427 pae_support = 1; 1428 1429 eax = 0x80000000; 1430 edx = get_cpuid_edx(&eax); 1431 if (eax >= 0x80000001) { 1432 eax = 0x80000001; 1433 edx = get_cpuid_edx(&eax); 1434 if (edx & CPUID_AMD_EDX_LM) 1435 amd64_support = 1; 1436 if (edx & CPUID_AMD_EDX_NX) 1437 NX_support = 1; 1438 } 1439 } else { 1440 dboot_printf("cpuid not supported\n"); 1441 } 1442 #endif /* __xpv */ 1443 1444 1445 #if defined(_BOOT_TARGET_amd64) 1446 if (amd64_support == 0) 1447 dboot_panic("long mode not supported, rebooting"); 1448 else if (pae_support == 0) 1449 dboot_panic("long mode, but no PAE; rebooting"); 1450 #else 1451 /* 1452 * Allow the command line to over-ride use of PAE for 32 bit. 
1453 */ 1454 if (strstr(cmdline, "disablePAE=true") != NULL) { 1455 pae_support = 0; 1456 NX_support = 0; 1457 amd64_support = 0; 1458 } 1459 #endif 1460 1461 /* 1462 * initialize the simple memory allocator 1463 */ 1464 init_mem_alloc(); 1465 1466 #if !defined(__xpv) && !defined(_BOOT_TARGET_amd64) 1467 /* 1468 * disable PAE on 32 bit h/w w/o NX and < 4Gig of memory 1469 */ 1470 if (max_mem < FOUR_GIG && NX_support == 0) 1471 pae_support = 0; 1472 #endif 1473 1474 /* 1475 * configure mmu information 1476 */ 1477 if (pae_support) { 1478 shift_amt = shift_amt_pae; 1479 ptes_per_table = 512; 1480 pte_size = 8; 1481 lpagesize = TWO_MEG; 1482 #if defined(_BOOT_TARGET_amd64) 1483 top_level = 3; 1484 #else 1485 top_level = 2; 1486 #endif 1487 } else { 1488 pae_support = 0; 1489 NX_support = 0; 1490 shift_amt = shift_amt_nopae; 1491 ptes_per_table = 1024; 1492 pte_size = 4; 1493 lpagesize = FOUR_MEG; 1494 top_level = 1; 1495 } 1496 1497 DBG(pge_support); 1498 DBG(NX_support); 1499 DBG(largepage_support); 1500 DBG(amd64_support); 1501 DBG(top_level); 1502 DBG(pte_size); 1503 DBG(ptes_per_table); 1504 DBG(lpagesize); 1505 1506 #if defined(__xpv) 1507 ktext_phys = ONE_GIG; /* from UNIX Mapfile */ 1508 #else 1509 ktext_phys = FOUR_MEG; /* from UNIX Mapfile */ 1510 #endif 1511 1512 #if !defined(__xpv) && defined(_BOOT_TARGET_amd64) 1513 /* 1514 * For grub, copy kernel bits from the ELF64 file to final place. 1515 */ 1516 DBG_MSG("\nAllocating nucleus pages.\n"); 1517 ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG); 1518 if (ktext_phys == 0) 1519 dboot_panic("failed to allocate aligned kernel memory"); 1520 if (dboot_elfload64(mb_header.load_addr) != 0) 1521 dboot_panic("failed to parse kernel ELF image, rebooting"); 1522 #endif 1523 1524 DBG(ktext_phys); 1525 1526 /* 1527 * Allocate page tables. 
1528 */ 1529 build_page_tables(); 1530 1531 /* 1532 * return to assembly code to switch to running kernel 1533 */ 1534 entry_addr_low = (uint32_t)target_kernel_text; 1535 DBG(entry_addr_low); 1536 bi->bi_use_largepage = largepage_support; 1537 bi->bi_use_pae = pae_support; 1538 bi->bi_use_pge = pge_support; 1539 bi->bi_use_nx = NX_support; 1540 1541 #if defined(__xpv) 1542 1543 bi->bi_next_paddr = next_avail_addr - mfn_base; 1544 DBG(bi->bi_next_paddr); 1545 bi->bi_next_vaddr = (native_ptr_t)next_avail_addr; 1546 DBG(bi->bi_next_vaddr); 1547 1548 /* 1549 * unmap unused pages in start area to make them available for DMA 1550 */ 1551 while (next_avail_addr < scratch_end) { 1552 (void) HYPERVISOR_update_va_mapping(next_avail_addr, 1553 0, UVMF_INVLPG | UVMF_LOCAL); 1554 next_avail_addr += MMU_PAGESIZE; 1555 } 1556 1557 bi->bi_xen_start_info = (uintptr_t)xen_info; 1558 DBG((uintptr_t)HYPERVISOR_shared_info); 1559 bi->bi_shared_info = (native_ptr_t)HYPERVISOR_shared_info; 1560 bi->bi_top_page_table = (uintptr_t)top_page_table - mfn_base; 1561 1562 #else /* __xpv */ 1563 1564 bi->bi_next_paddr = next_avail_addr; 1565 DBG(bi->bi_next_paddr); 1566 bi->bi_next_vaddr = (uintptr_t)next_avail_addr; 1567 DBG(bi->bi_next_vaddr); 1568 bi->bi_mb_info = (uintptr_t)mb_info; 1569 bi->bi_top_page_table = (uintptr_t)top_page_table; 1570 1571 #endif /* __xpv */ 1572 1573 bi->bi_kseg_size = FOUR_MEG; 1574 DBG(bi->bi_kseg_size); 1575 1576 #ifndef __xpv 1577 if (map_debug) 1578 dump_tables(); 1579 #endif 1580 1581 DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n"); 1582 }