Print this page
3364 dboot should check boot archive integrity
Reviewed by: Hans Rosenfeld <hans.rosenfeld@nexenta.com>
Reviewed by: Dan McDonald <danmcd@nexenta.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Reviewed by: Garrett D'Amore <garrett@damore.org>
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/i86pc/dboot/dboot_startkern.c
+++ new/usr/src/uts/i86pc/dboot/dboot_startkern.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
↓ open down ↓ |
14 lines elided |
↑ open up ↑ |
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
25 + *
26 + * Copyright 2012 Joyent, Inc. All rights reserved.
25 27 */
26 28
27 29
28 30 #include <sys/types.h>
29 31 #include <sys/machparam.h>
30 32 #include <sys/x86_archext.h>
31 33 #include <sys/systm.h>
32 34 #include <sys/mach_mmu.h>
33 35 #include <sys/multiboot.h>
36 +#include <sys/sha1.h>
34 37
35 38 #if defined(__xpv)
36 39
37 40 #include <sys/hypervisor.h>
38 41 uintptr_t xen_virt_start;
39 42 pfn_t *mfn_to_pfn_mapping;
40 43
41 44 #else /* !__xpv */
42 45
43 46 extern multiboot_header_t mb_header;
44 47 extern int have_cpuid(void);
45 48
46 49 #endif /* !__xpv */
47 50
↓ open down ↓ |
4 lines elided |
↑ open up ↑ |
48 51 #include <sys/inttypes.h>
49 52 #include <sys/bootinfo.h>
50 53 #include <sys/mach_mmu.h>
51 54 #include <sys/boot_console.h>
52 55
53 56 #include "dboot_asm.h"
54 57 #include "dboot_printf.h"
55 58 #include "dboot_xboot.h"
56 59 #include "dboot_elfload.h"
57 60
61 +#define SHA1_ASCII_LENGTH (SHA1_DIGEST_LENGTH * 2)
62 +
58 63 /*
59 64 * This file contains code that runs to transition us from either a multiboot
60 65 * compliant loader (32 bit non-paging) or a XPV domain loader to
61 66 * regular kernel execution. Its task is to setup the kernel memory image
62 67 * and page tables.
63 68 *
64 69 * The code executes as:
65 70 * - 32 bits under GRUB (for 32 or 64 bit Solaris)
66 71 * - a 32 bit program for the 32-bit PV hypervisor
67 72 * - a 64 bit program for the 64-bit PV hypervisor (at least for now)
68 73 *
69 74 * Under the PV hypervisor, we must create mappings for any memory beyond the
70 75 * initial start of day allocation (such as the kernel itself).
71 76 *
72 77 * When on the metal, the mapping between maddr_t and paddr_t is 1:1.
73 78 * Since we are running in real mode, so all such memory is accessible.
74 79 */
75 80
76 81 /*
77 82 * Standard bits used in PTE (page level) and PTP (internal levels)
78 83 */
79 84 x86pte_t ptp_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_USER;
80 85 x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST;
81 86
82 87 /*
83 88 * This is the target addresses (physical) where the kernel text and data
84 89 * nucleus pages will be unpacked. On the hypervisor this is actually a
85 90 * virtual address.
86 91 */
87 92 paddr_t ktext_phys;
88 93 uint32_t ksize = 2 * FOUR_MEG; /* kernel nucleus is 8Meg */
89 94
90 95 static uint64_t target_kernel_text; /* value to use for KERNEL_TEXT */
91 96
92 97 /*
93 98 * The stack is setup in assembler before entering startup_kernel()
94 99 */
95 100 char stack_space[STACK_SIZE];
96 101
97 102 /*
98 103 * Used to track physical memory allocation
99 104 */
100 105 static paddr_t next_avail_addr = 0;
101 106
102 107 #if defined(__xpv)
103 108 /*
104 109 * Additional information needed for hypervisor memory allocation.
105 110 * Only memory up to scratch_end is mapped by page tables.
106 111 * mfn_base is the start of the hypervisor virtual image. It's ONE_GIG, so
107 112 * to derive a pfn from a pointer, you subtract mfn_base.
108 113 */
109 114
110 115 static paddr_t scratch_end = 0; /* we can't write all of mem here */
111 116 static paddr_t mfn_base; /* addr corresponding to mfn_list[0] */
112 117 start_info_t *xen_info;
113 118
114 119 #else /* __xpv */
115 120
116 121 /*
117 122 * If on the metal, then we have a multiboot loader.
118 123 */
119 124 multiboot_info_t *mb_info;
120 125
121 126 #endif /* __xpv */
122 127
123 128 /*
124 129 * This contains information passed to the kernel
125 130 */
126 131 struct xboot_info boot_info[2]; /* extra space to fix alignement for amd64 */
127 132 struct xboot_info *bi;
128 133
129 134 /*
130 135 * Page table and memory stuff.
131 136 */
132 137 static paddr_t max_mem; /* maximum memory address */
133 138
134 139 /*
135 140 * Information about processor MMU
136 141 */
137 142 int amd64_support = 0;
138 143 int largepage_support = 0;
139 144 int pae_support = 0;
140 145 int pge_support = 0;
141 146 int NX_support = 0;
142 147
143 148 /*
144 149 * Low 32 bits of kernel entry address passed back to assembler.
145 150 * When running a 64 bit kernel, the high 32 bits are 0xffffffff.
146 151 */
147 152 uint32_t entry_addr_low;
148 153
149 154 /*
150 155 * Memlists for the kernel. We shouldn't need a lot of these.
151 156 */
152 157 #define MAX_MEMLIST (50)
153 158 struct boot_memlist memlists[MAX_MEMLIST];
154 159 uint_t memlists_used = 0;
155 160 struct boot_memlist pcimemlists[MAX_MEMLIST];
156 161 uint_t pcimemlists_used = 0;
157 162 struct boot_memlist rsvdmemlists[MAX_MEMLIST];
158 163 uint_t rsvdmemlists_used = 0;
159 164
160 165 #define MAX_MODULES (10)
161 166 struct boot_modules modules[MAX_MODULES];
162 167 uint_t modules_used = 0;
163 168
164 169 /*
165 170 * Debugging macros
166 171 */
167 172 uint_t prom_debug = 0;
168 173 uint_t map_debug = 0;
169 174
170 175 /*
171 176 * Either hypervisor-specific or grub-specific code builds the initial
172 177 * memlists. This code does the sort/merge/link for final use.
173 178 */
174 179 static void
175 180 sort_physinstall(void)
176 181 {
177 182 int i;
178 183 #if !defined(__xpv)
179 184 int j;
180 185 struct boot_memlist tmp;
181 186
182 187 /*
183 188 * Now sort the memlists, in case they weren't in order.
184 189 * Yeah, this is a bubble sort; small, simple and easy to get right.
185 190 */
186 191 DBG_MSG("Sorting phys-installed list\n");
187 192 for (j = memlists_used - 1; j > 0; --j) {
188 193 for (i = 0; i < j; ++i) {
189 194 if (memlists[i].addr < memlists[i + 1].addr)
190 195 continue;
191 196 tmp = memlists[i];
192 197 memlists[i] = memlists[i + 1];
193 198 memlists[i + 1] = tmp;
194 199 }
195 200 }
196 201
197 202 /*
198 203 * Merge any memlists that don't have holes between them.
199 204 */
200 205 for (i = 0; i <= memlists_used - 1; ++i) {
201 206 if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr)
202 207 continue;
203 208
204 209 if (prom_debug)
205 210 dboot_printf(
206 211 "merging mem segs %" PRIx64 "...%" PRIx64
207 212 " w/ %" PRIx64 "...%" PRIx64 "\n",
208 213 memlists[i].addr,
209 214 memlists[i].addr + memlists[i].size,
210 215 memlists[i + 1].addr,
211 216 memlists[i + 1].addr + memlists[i + 1].size);
212 217
213 218 memlists[i].size += memlists[i + 1].size;
214 219 for (j = i + 1; j < memlists_used - 1; ++j)
215 220 memlists[j] = memlists[j + 1];
216 221 --memlists_used;
217 222 DBG(memlists_used);
218 223 --i; /* after merging we need to reexamine, so do this */
219 224 }
220 225 #endif /* __xpv */
221 226
222 227 if (prom_debug) {
223 228 dboot_printf("\nFinal memlists:\n");
224 229 for (i = 0; i < memlists_used; ++i) {
225 230 dboot_printf("\t%d: addr=%" PRIx64 " size=%"
226 231 PRIx64 "\n", i, memlists[i].addr, memlists[i].size);
227 232 }
228 233 }
229 234
230 235 /*
231 236 * link together the memlists with native size pointers
232 237 */
233 238 memlists[0].next = 0;
234 239 memlists[0].prev = 0;
235 240 for (i = 1; i < memlists_used; ++i) {
236 241 memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1);
237 242 memlists[i].next = 0;
238 243 memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i);
239 244 }
240 245 bi->bi_phys_install = (native_ptr_t)(uintptr_t)memlists;
241 246 DBG(bi->bi_phys_install);
242 247 }
243 248
244 249 /*
245 250 * build bios reserved memlists
246 251 */
247 252 static void
248 253 build_rsvdmemlists(void)
249 254 {
250 255 int i;
251 256
252 257 rsvdmemlists[0].next = 0;
253 258 rsvdmemlists[0].prev = 0;
254 259 for (i = 1; i < rsvdmemlists_used; ++i) {
255 260 rsvdmemlists[i].prev =
256 261 (native_ptr_t)(uintptr_t)(rsvdmemlists + i - 1);
257 262 rsvdmemlists[i].next = 0;
258 263 rsvdmemlists[i - 1].next =
259 264 (native_ptr_t)(uintptr_t)(rsvdmemlists + i);
260 265 }
261 266 bi->bi_rsvdmem = (native_ptr_t)(uintptr_t)rsvdmemlists;
262 267 DBG(bi->bi_rsvdmem);
263 268 }
264 269
265 270 #if defined(__xpv)
266 271
267 272 /*
268 273 * halt on the hypervisor after a delay to drain console output
269 274 */
270 275 void
271 276 dboot_halt(void)
272 277 {
273 278 uint_t i = 10000;
274 279
275 280 while (--i)
276 281 (void) HYPERVISOR_yield();
277 282 (void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
278 283 }
279 284
280 285 /*
281 286 * From a machine address, find the corresponding pseudo-physical address.
282 287 * Pseudo-physical address are contiguous and run from mfn_base in each VM.
283 288 * Machine addresses are the real underlying hardware addresses.
284 289 * These are needed for page table entries. Note that this routine is
285 290 * poorly protected. A bad value of "ma" will cause a page fault.
286 291 */
paddr_t
ma_to_pa(maddr_t ma)
{
	/* Split the machine address into frame number and page offset. */
	ulong_t pgoff = ma & MMU_PAGEOFFSET;
	ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)];
	paddr_t pa;

	/* A pfn outside this domain's page count has no pseudo-phys addr. */
	if (pfn >= xen_info->nr_pages)
		return (-(paddr_t)1);
	pa = mfn_base + mmu_ptob((paddr_t)pfn) + pgoff;
#ifdef DEBUG
	/* Sanity-check the translation against its inverse. */
	if (ma != pa_to_ma(pa))
		dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", "
		    "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa));
#endif
	return (pa);
}
304 309
305 310 /*
306 311 * From a pseudo-physical address, find the corresponding machine address.
307 312 */
maddr_t
pa_to_ma(paddr_t pa)
{
	pfn_t pfn;
	ulong_t mfn;

	/* Pseudo-physical pages are contiguous starting at mfn_base. */
	pfn = mmu_btop(pa - mfn_base);
	if (pa < mfn_base || pfn >= xen_info->nr_pages)
		dboot_panic("pa_to_ma(): illegal address 0x%lx", (ulong_t)pa);
	/* Look up the machine frame backing this pseudo-physical frame. */
	mfn = ((ulong_t *)xen_info->mfn_list)[pfn];
#ifdef DEBUG
	/* Sanity-check the translation against its inverse. */
	if (mfn_to_pfn_mapping[mfn] != pfn)
		dboot_printf("pa_to_ma(pfn=%lx) got %lx ma_to_pa() says %lx\n",
		    pfn, mfn, mfn_to_pfn_mapping[mfn]);
#endif
	return (mfn_to_ma(mfn) | (pa & MMU_PAGEOFFSET));
}
325 330
326 331 #endif /* __xpv */
327 332
328 333 x86pte_t
329 334 get_pteval(paddr_t table, uint_t index)
330 335 {
331 336 if (pae_support)
332 337 return (((x86pte_t *)(uintptr_t)table)[index]);
333 338 return (((x86pte32_t *)(uintptr_t)table)[index]);
334 339 }
335 340
/*ARGSUSED*/
void
set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
{
#ifdef __xpv
	mmu_update_t t;
	maddr_t mtable = pa_to_ma(table);
	int retcnt;

	/*
	 * Page tables are write-protected under the hypervisor (see
	 * make_ptable()), so the entry must be updated via a hypercall.
	 */
	t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE;
	t.val = pteval;
	if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1)
		dboot_panic("HYPERVISOR_mmu_update() failed");
#else /* __xpv */
	uintptr_t tab_addr = (uintptr_t)table;

	if (pae_support)
		((x86pte_t *)tab_addr)[index] = pteval;
	else
		((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval;
	/*
	 * NOTE(review): a top level of 2 appears to correspond to the
	 * 32-bit PAE layout; reloading %cr3 after changing a top-level
	 * entry presumably refreshes the CPU's cached PDPTEs — confirm.
	 */
	if (level == top_level && level == 2)
		reload_cr3();
#endif /* __xpv */
}
360 365
/*
 * Allocate one page for a new page table at the given level and store
 * the parent-table entry for it through *pteval.  Returns the new
 * table's address (physical on metal, pseudo-physical under Xen).
 */
paddr_t
make_ptable(x86pte_t *pteval, uint_t level)
{
	paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);

	/*
	 * NOTE(review): the top level at level 2 (apparently the 32-bit
	 * PAE PDPTE) takes only PT_VALID; other levels get full ptp_bits.
	 */
	if (level == top_level && level == 2)
		*pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
	else
		*pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits;

#ifdef __xpv
	/* Remove write permission to the new page table. */
	if (HYPERVISOR_update_va_mapping(new_table,
	    *pteval & ~(x86pte_t)PT_WRITABLE, UVMF_INVLPG | UVMF_LOCAL))
		dboot_panic("HYP_update_va_mapping error");
#endif

	if (map_debug)
		dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
		    PRIx64 "\n", level, (ulong_t)new_table, *pteval);
	return (new_table);
}
383 388
384 389 x86pte_t *
385 390 map_pte(paddr_t table, uint_t index)
386 391 {
387 392 return ((x86pte_t *)(uintptr_t)(table + index * pte_size));
388 393 }
389 394
390 395 /*
391 396 * dump out the contents of page tables...
392 397 */
/*
 * Walk the completed page tables and print every valid entry, for
 * debugging.  Recursion into lower-level tables is done iteratively
 * with explicit save_table/save_index stacks and gotos, since dboot's
 * stack is small.
 */
static void
dump_tables(void)
{
	uint_t save_index[4];	/* for recursion */
	char *save_table[4];	/* for recursion */
	uint_t l;
	uint64_t va;
	uint64_t pgsize;
	int index;
	int i;
	x86pte_t pteval;
	char *table;
	static char *tablist = "\t\t\t";
	char *tabs = tablist + 3 - top_level;	/* indent by table depth */
	uint_t pa, pa1;
#if !defined(__xpv)
	/* On metal machine addresses are physical addresses. */
#define	maddr_t paddr_t
#endif /* !__xpv */

	dboot_printf("Finished pagetables:\n");
	table = (char *)(uintptr_t)top_page_table;
	l = top_level;
	va = 0;
	for (index = 0; index < ptes_per_table; ++index) {
		pgsize = 1ull << shift_amt[l];
		if (pae_support)
			pteval = ((x86pte_t *)table)[index];
		else
			pteval = ((x86pte32_t *)table)[index];
		if (pteval == 0)
			goto next_entry;

		dboot_printf("%s %p[0x%x] = %" PRIx64 ", va=%" PRIx64,
		    tabs + l, (void *)table, index, (uint64_t)pteval, va);
		pa = ma_to_pa(pteval & MMU_PAGEMASK);
		dboot_printf(" physaddr=%x\n", pa);

		/*
		 * Don't try to walk hypervisor private pagetables
		 */
		if ((l > 1 || (l == 1 && (pteval & PT_PAGESIZE) == 0))) {
			/* Descend: push current table/index, restart loop. */
			save_table[l] = table;
			save_index[l] = index;
			--l;
			index = -1;
			table = (char *)(uintptr_t)
			    ma_to_pa(pteval & MMU_PAGEMASK);
			goto recursion;
		}

		/*
		 * shorten dump for consecutive mappings
		 */
		for (i = 1; index + i < ptes_per_table; ++i) {
			if (pae_support)
				pteval = ((x86pte_t *)table)[index + i];
			else
				pteval = ((x86pte32_t *)table)[index + i];
			if (pteval == 0)
				break;
			pa1 = ma_to_pa(pteval & MMU_PAGEMASK);
			if (pa1 != pa + i * pgsize)
				break;
		}
		if (i > 2) {
			/* Print "..." and skip ahead over the contiguous run. */
			dboot_printf("%s...\n", tabs + l);
			va += pgsize * (i - 2);
			index += i - 2;
		}
next_entry:
		va += pgsize;
		if (l == 3 && index == 256)	/* VA hole */
			va = 0xffff800000000000ull;
recursion:
		;
	}
	/* Finished one table: pop back up to the parent, if any. */
	if (l < top_level) {
		++l;
		index = save_index[l];
		table = save_table[l];
		goto recursion;
	}
}
476 481
477 482 /*
478 483 * Add a mapping for the machine page at the given virtual address.
479 484 */
static void
map_ma_at_va(maddr_t ma, native_ptr_t va, uint_t level)
{
	x86pte_t *ptep;
	x86pte_t pteval;

	pteval = ma | pte_bits;
	if (level > 0)
		pteval |= PT_PAGESIZE;	/* level > 0 means a large page */
	if (va >= target_kernel_text && pge_support)
		pteval |= PT_GLOBAL;	/* kernel text/data stays in the TLB */

	if (map_debug && ma != va)
		dboot_printf("mapping ma=0x%" PRIx64 " va=0x%" PRIx64
		    " pte=0x%" PRIx64 " l=%d\n",
		    (uint64_t)ma, (uint64_t)va, pteval, level);

#if defined(__xpv)
	/*
	 * see if we can avoid find_pte() on the hypervisor
	 */
	if (HYPERVISOR_update_va_mapping(va, pteval,
	    UVMF_INVLPG | UVMF_LOCAL) == 0)
		return;
#endif

	/*
	 * Find the pte that will map this address. This creates any
	 * missing intermediate level page tables
	 */
	ptep = find_pte(va, NULL, level, 0);

	/*
	 * When paravirtualized, we must use hypervisor calls to modify the
	 * PTE, since paging is active. On real hardware we just write to
	 * the pagetables which aren't in use yet.
	 */
#if defined(__xpv)
	ptep = ptep;	/* shut lint up */
	if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
		dboot_panic("mmu_update failed-map_pa_at_va va=0x%" PRIx64
		    " l=%d ma=0x%" PRIx64 ", pte=0x%" PRIx64 "",
		    (uint64_t)va, level, (uint64_t)ma, pteval);
#else
	if (va < 1024 * 1024)
		pteval |= PT_NOCACHE;		/* for video RAM */
	if (pae_support)
		*ptep = pteval;
	else
		*((x86pte32_t *)ptep) = (x86pte32_t)pteval;
#endif
}
532 537
533 538 /*
534 539 * Add a mapping for the physical page at the given virtual address.
535 540 */
static void
map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level)
{
	/* Translate to a machine address first; on metal this is 1:1. */
	map_ma_at_va(pa_to_ma(pa), va, level);
}
541 546
542 547 /*
543 548 * This is called to remove start..end from the
544 549 * possible range of PCI addresses.
545 550 */
546 551 const uint64_t pci_lo_limit = 0x00100000ul;
547 552 const uint64_t pci_hi_limit = 0xfff00000ul;
static void
exclude_from_pci(uint64_t start, uint64_t end)
{
	int i;
	int j;
	struct boot_memlist *ml;

	/*
	 * Each pcimemlists entry overlapping [start, end) is deleted,
	 * split in two, or trimmed at one end, in place.
	 */
	for (i = 0; i < pcimemlists_used; ++i) {
		ml = &pcimemlists[i];

		/* delete the entire range? */
		if (start <= ml->addr && ml->addr + ml->size <= end) {
			--pcimemlists_used;
			for (j = i; j < pcimemlists_used; ++j)
				pcimemlists[j] = pcimemlists[j + 1];
			--i;	/* to revisit the new one at this index */
		}

		/* split a range? */
		else if (ml->addr < start && end < ml->addr + ml->size) {

			++pcimemlists_used;
			if (pcimemlists_used > MAX_MEMLIST)
				dboot_panic("too many pcimemlists");

			/*
			 * Shift the tail up one slot; entry i+1 becomes a
			 * copy of the (still untrimmed) entry i.
			 */
			for (j = pcimemlists_used - 1; j > i; --j)
				pcimemlists[j] = pcimemlists[j - 1];
			ml->size = start - ml->addr;	/* low fragment */

			++ml;	/* now fix up the high fragment */
			ml->size = (ml->addr + ml->size) - end;
			ml->addr = end;
			++i;	/* skip on to next one */
		}

		/* cut memory off the start? */
		else if (ml->addr < end && end < ml->addr + ml->size) {
			ml->size -= end - ml->addr;
			ml->addr = end;
		}

		/* cut memory off the end? */
		else if (ml->addr <= start && start < ml->addr + ml->size) {
			ml->size = start - ml->addr;
		}
	}
}
595 600
596 601 /*
597 602 * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
598 603 * definition in Xen source.
599 604 */
600 605 #ifdef __xpv
601 606 typedef struct {
602 607 uint32_t base_addr_low;
603 608 uint32_t base_addr_high;
604 609 uint32_t length_low;
605 610 uint32_t length_high;
606 611 uint32_t type;
607 612 } mmap_t;
608 613 #else
609 614 typedef mb_memory_map_t mmap_t;
610 615 #endif
611 616
/*
 * Build the list of address ranges available for PCI, starting from
 * [pci_lo_limit, pci_hi_limit) and carving out every range reported in
 * the BIOS/hypervisor memory map.  Publishes the result via xboot_info.
 */
static void
build_pcimemlists(mmap_t *mem, int num)
{
	mmap_t *mmap;
	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
	uint64_t start;
	uint64_t end;
	int i;

	/*
	 * initialize
	 */
	pcimemlists[0].addr = pci_lo_limit;
	pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
	pcimemlists_used = 1;

	/*
	 * Fill in PCI memlists.
	 */
	for (mmap = mem, i = 0; i < num; ++i, ++mmap) {
		/* Assemble 64-bit address/length from the 32-bit halves. */
		start = ((uint64_t)mmap->base_addr_high << 32) +
		    mmap->base_addr_low;
		end = start + ((uint64_t)mmap->length_high << 32) +
		    mmap->length_low;

		if (prom_debug)
			dboot_printf("\ttype: %d %" PRIx64 "..%"
			    PRIx64 "\n", mmap->type, start, end);

		/*
		 * page align start and end
		 */
		start = (start + page_offset) & ~page_offset;
		end &= ~page_offset;
		if (end <= start)
			continue;

		/* All map entries, regardless of type, are excluded. */
		exclude_from_pci(start, end);
	}

	/*
	 * Finish off the pcimemlist
	 */
	if (prom_debug) {
		for (i = 0; i < pcimemlists_used; ++i) {
			dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%"
			    PRIx64 "\n", pcimemlists[i].addr,
			    pcimemlists[i].addr + pcimemlists[i].size);
		}
	}
	/* Link the entries into a doubly linked list for the kernel. */
	pcimemlists[0].next = 0;
	pcimemlists[0].prev = 0;
	for (i = 1; i < pcimemlists_used; ++i) {
		pcimemlists[i].prev =
		    (native_ptr_t)(uintptr_t)(pcimemlists + i - 1);
		pcimemlists[i].next = 0;
		pcimemlists[i - 1].next =
		    (native_ptr_t)(uintptr_t)(pcimemlists + i);
	}
	bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
	DBG(bi->bi_pcimem);
}
674 679
675 680 #if defined(__xpv)
676 681 /*
677 682 * Initialize memory allocator stuff from hypervisor-supplied start info.
678 683 *
679 684 * There is 512KB of scratch area after the boot stack page.
680 685 * We'll use that for everything except the kernel nucleus pages which are too
681 686 * big to fit there and are allocated last anyway.
682 687 */
683 688 #define MAXMAPS 100
684 689 static mmap_t map_buffer[MAXMAPS];
static void
init_mem_alloc(void)
{
	int local;	/* variables needed to find start region */
	paddr_t scratch_start;
	xen_memory_map_t map;

	DBG_MSG("Entered init_mem_alloc()\n");

	/*
	 * Free memory follows the stack. There's at least 512KB of scratch
	 * space, rounded up to at least 2Mb alignment. That should be enough
	 * for the page tables we'll need to build. The nucleus memory is
	 * allocated last and will be outside the addressible range. We'll
	 * switch to new page tables before we unpack the kernel
	 */
	/* &local is a current stack address; round up to find free space. */
	scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE);
	DBG(scratch_start);
	scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG);
	DBG(scratch_end);

	/*
	 * For paranoia, leave some space between hypervisor data and ours.
	 * Use 500 instead of 512.
	 */
	next_avail_addr = scratch_end - 500 * 1024;
	DBG(next_avail_addr);

	/*
	 * The domain builder gives us at most 1 module
	 */
	DBG(xen_info->mod_len);
	if (xen_info->mod_len > 0) {
		DBG(xen_info->mod_start);
		modules[0].bm_addr = xen_info->mod_start;
		modules[0].bm_size = xen_info->mod_len;
		bi->bi_module_cnt = 1;
		/*
		 * NOTE(review): elsewhere this cast is written
		 * (native_ptr_t)(uintptr_t); and NULL below is assigned to
		 * what appears to be an integer field — confirm both.
		 */
		bi->bi_modules = (native_ptr_t)modules;
	} else {
		bi->bi_module_cnt = 0;
		bi->bi_modules = NULL;
	}
	DBG(bi->bi_module_cnt);
	DBG(bi->bi_modules);

	DBG(xen_info->mfn_list);
	DBG(xen_info->nr_pages);
	max_mem = (paddr_t)xen_info->nr_pages << MMU_PAGESHIFT;
	DBG(max_mem);

	/*
	 * Using pseudo-physical addresses, so only 1 memlist element
	 */
	memlists[0].addr = 0;
	DBG(memlists[0].addr);
	memlists[0].size = max_mem;
	DBG(memlists[0].size);
	memlists_used = 1;
	DBG(memlists_used);

	/*
	 * finish building physinstall list
	 */
	sort_physinstall();

	/*
	 * build bios reserved memlists
	 */
	build_rsvdmemlists();

	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
		/*
		 * build PCI Memory list
		 */
		map.nr_entries = MAXMAPS;
		/*LINTED: constant in conditional context*/
		set_xen_guest_handle(map.buffer, map_buffer);
		if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0)
			dboot_panic("getting XENMEM_machine_memory_map failed");
		build_pcimemlists(map_buffer, map.nr_entries);
	}
}
767 772
768 773 #else /* !__xpv */
769 774
775 +static uint8_t
776 +dboot_a2h(char v)
777 +{
778 + if (v >= 'a')
779 + return (v - 'a' + 0xa);
780 + else if (v >= 'A')
781 + return (v - 'A' + 0xa);
782 + else if (v >= '0')
783 + return (v - '0');
784 + else
785 + dboot_panic("bad ASCII hex character %c\n", v);
786 +
787 + return (0);
788 +}
789 +
790 +static void
791 +digest_a2h(const char *ascii, uint8_t *digest)
792 +{
793 + unsigned int i;
794 +
795 + for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
796 + digest[i] = dboot_a2h(ascii[i * 2]) << 4;
797 + digest[i] |= dboot_a2h(ascii[i * 2 + 1]);
798 + }
799 +}
800 +
770 801 /*
802 + * Generate a SHA-1 hash of the first len bytes of image, and compare it with
803 + * the ASCII-format hash found in the 40-byte buffer at ascii. If they
804 + * match, return 0, otherwise -1. This works only for images smaller than
805 + * 4 GB, which should not be a problem.
806 + */
807 +static int
808 +check_image_hash(const char *ascii, const void *image, size_t len)
809 +{
810 + SHA1_CTX ctx;
811 + uint8_t digest[SHA1_DIGEST_LENGTH];
812 + uint8_t baseline[SHA1_DIGEST_LENGTH];
813 + unsigned int i;
814 +
815 + digest_a2h(ascii, baseline);
816 +
817 + SHA1Init(&ctx);
818 + SHA1Update(&ctx, image, len);
819 + SHA1Final(digest, &ctx);
820 +
821 + for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
822 + if (digest[i] != baseline[i])
823 + return (-1);
824 + }
825 +
826 + return (0);
827 +}
828 +
/*
 * Verify the SHA-1 hash of every multiboot module against the hash
 * list carried in the final module (one 40-char ASCII hash plus a
 * newline per module).  Panics on a mismatch; silently skips the check
 * if no hash module is present, and warns if the hash module is short.
 */
static void
check_images(void)
{
	int i;
	char *hashes;
	mb_module_t *mod, *hashmod;
	char *hash;
	char displayhash[SHA1_ASCII_LENGTH + 1];
	size_t hashlen;
	size_t len;

	/*
	 * A brief note on lengths and sizes: GRUB, for reasons unknown, passes
	 * the address of the last valid byte in a module plus 1 as mod_end.
	 * This is of course a bug; the multiboot specification simply states
	 * that mod_start and mod_end "contain the start and end addresses of
	 * the boot module itself" which is pretty obviously not what GRUB is
	 * doing. However, fixing it requires that not only this code be
	 * changed but also that other code consuming this value and values
	 * derived from it be fixed, and that the kernel and GRUB must either
	 * both have the bug or neither. While there are a lot of combinations
	 * that will work, there are also some that won't, so for simplicity
	 * we'll just cope with the bug. That means we won't actually hash the
	 * byte at mod_end, and we will expect that mod_end for the hash file
	 * itself is one greater than some multiple of 41 (40 bytes of ASCII
	 * hash plus a newline for each module).
	 */

	/* The hash list, if present, is the last module. */
	if (mb_info->mods_count > 1) {
		mod = (mb_module_t *)mb_info->mods_addr;
		hashmod = mod + (mb_info->mods_count - 1);
		hashes = (char *)hashmod->mod_start;
		hashlen = (size_t)(hashmod->mod_end - hashmod->mod_start);
		hash = hashes;
		if (prom_debug) {
			dboot_printf("Hash module found at %lx size %lx\n",
			    (ulong_t)hashes, (ulong_t)hashlen);
		}
	} else {
		DBG_MSG("Skipping hash check; no hash module found.\n");
		return;
	}

	/* Check every module except the hash module itself. */
	for (mod = (mb_module_t *)(mb_info->mods_addr), i = 0;
	    i < mb_info->mods_count - 1; ++mod, ++i) {
		/* Make sure a full 41-byte hash record remains. */
		if ((hash - hashes) + SHA1_ASCII_LENGTH + 1 > hashlen) {
			dboot_printf("Short hash module of length 0x%lx bytes; "
			    "skipping hash checks\n", (ulong_t)hashlen);
			break;
		}

		/* NUL-terminated copy of the expected hash, for messages. */
		(void) memcpy(displayhash, hash, SHA1_ASCII_LENGTH);
		displayhash[SHA1_ASCII_LENGTH] = '\0';
		if (prom_debug) {
			dboot_printf("Checking hash for module %d [%s]: ",
			    i, displayhash);
		}

		len = mod->mod_end - mod->mod_start;	/* see above */
		if (check_image_hash(hash, (void *)mod->mod_start, len) != 0) {
			dboot_panic("SHA-1 hash mismatch on %s; expected %s\n",
			    (char *)mod->mod_name, displayhash);
		} else {
			DBG_MSG("OK\n");
		}
		/* Advance past this hash and its trailing newline. */
		hash += SHA1_ASCII_LENGTH + 1;
	}
}
897 +
898 +/*
771 899 * During memory allocation, find the highest address not used yet.
772 900 */
773 901 static void
774 902 check_higher(paddr_t a)
775 903 {
776 904 if (a < next_avail_addr)
777 905 return;
778 906 next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
779 907 DBG(next_avail_addr);
780 908 }
781 909
782 910 /*
783 911 * Walk through the module information finding the last used address.
784 912 * The first available address will become the top level page table.
785 913 *
786 914 * We then build the phys_install memlist from the multiboot information.
787 915 */
788 916 static void
789 917 init_mem_alloc(void)
790 918 {
791 919 mb_memory_map_t *mmap;
792 920 mb_module_t *mod;
793 921 uint64_t start;
794 922 uint64_t end;
795 923 uint64_t page_offset = MMU_PAGEOFFSET; /* needs to be 64 bits */
796 924 extern char _end[];
797 925 int i;
798 926
799 927 DBG_MSG("Entered init_mem_alloc()\n");
800 928 DBG((uintptr_t)mb_info);
801 929
802 930 if (mb_info->mods_count > MAX_MODULES) {
803 931 dboot_panic("Too many modules (%d) -- the maximum is %d.",
804 932 mb_info->mods_count, MAX_MODULES);
805 933 }
↓ open down ↓ |
25 lines elided |
↑ open up ↑ |
806 934 /*
807 935 * search the modules to find the last used address
808 936 * we'll build the module list while we're walking through here
809 937 */
810 938 DBG_MSG("\nFinding Modules\n");
811 939 check_higher((paddr_t)(uintptr_t)&_end);
812 940 for (mod = (mb_module_t *)(mb_info->mods_addr), i = 0;
813 941 i < mb_info->mods_count;
814 942 ++mod, ++i) {
815 943 if (prom_debug) {
816 - dboot_printf("\tmodule #%d: %s at: 0x%lx, len 0x%lx\n",
944 + dboot_printf("\tmodule #%d: %s at: 0x%lx, end 0x%lx\n",
817 945 i, (char *)(mod->mod_name),
818 946 (ulong_t)mod->mod_start, (ulong_t)mod->mod_end);
819 947 }
820 948 modules[i].bm_addr = mod->mod_start;
821 949 if (mod->mod_start > mod->mod_end) {
822 950 dboot_panic("module[%d]: Invalid module start address "
823 951 "(0x%llx)", i, (uint64_t)mod->mod_start);
824 952 }
825 953 modules[i].bm_size = mod->mod_end - mod->mod_start;
826 954
827 955 check_higher(mod->mod_end);
828 956 }
829 957 bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
830 958 DBG(bi->bi_modules);
831 959 bi->bi_module_cnt = mb_info->mods_count;
832 960 DBG(bi->bi_module_cnt);
833 961
962 + check_images();
963 +
834 964 /*
835 965 * Walk through the memory map from multiboot and build our memlist
836 966 * structures. Note these will have native format pointers.
837 967 */
838 968 DBG_MSG("\nFinding Memory Map\n");
839 969 DBG(mb_info->flags);
840 970 max_mem = 0;
841 971 if (mb_info->flags & 0x40) {
842 972 int cnt = 0;
843 973
844 974 DBG(mb_info->mmap_addr);
845 975 DBG(mb_info->mmap_length);
846 976 check_higher(mb_info->mmap_addr + mb_info->mmap_length);
847 977
848 978 for (mmap = (mb_memory_map_t *)mb_info->mmap_addr;
849 979 (uint32_t)mmap < mb_info->mmap_addr + mb_info->mmap_length;
850 980 mmap = (mb_memory_map_t *)((uint32_t)mmap + mmap->size
851 981 + sizeof (mmap->size))) {
852 982 ++cnt;
853 983 start = ((uint64_t)mmap->base_addr_high << 32) +
854 984 mmap->base_addr_low;
855 985 end = start + ((uint64_t)mmap->length_high << 32) +
856 986 mmap->length_low;
857 987
858 988 if (prom_debug)
859 989 dboot_printf("\ttype: %d %" PRIx64 "..%"
860 990 PRIx64 "\n", mmap->type, start, end);
861 991
862 992 /*
863 993 * page align start and end
864 994 */
865 995 start = (start + page_offset) & ~page_offset;
866 996 end &= ~page_offset;
867 997 if (end <= start)
868 998 continue;
869 999
870 1000 /*
871 1001 * only type 1 is usable RAM
872 1002 */
873 1003 switch (mmap->type) {
874 1004 case 1:
875 1005 if (end > max_mem)
876 1006 max_mem = end;
877 1007 memlists[memlists_used].addr = start;
878 1008 memlists[memlists_used].size = end - start;
879 1009 ++memlists_used;
880 1010 if (memlists_used > MAX_MEMLIST)
881 1011 dboot_panic("too many memlists");
882 1012 break;
883 1013 case 2:
884 1014 rsvdmemlists[rsvdmemlists_used].addr = start;
885 1015 rsvdmemlists[rsvdmemlists_used].size =
886 1016 end - start;
887 1017 ++rsvdmemlists_used;
888 1018 if (rsvdmemlists_used > MAX_MEMLIST)
889 1019 dboot_panic("too many rsvdmemlists");
890 1020 break;
891 1021 default:
892 1022 continue;
893 1023 }
894 1024 }
895 1025 build_pcimemlists((mb_memory_map_t *)mb_info->mmap_addr, cnt);
896 1026 } else if (mb_info->flags & 0x01) {
897 1027 DBG(mb_info->mem_lower);
898 1028 memlists[memlists_used].addr = 0;
899 1029 memlists[memlists_used].size = mb_info->mem_lower * 1024;
900 1030 ++memlists_used;
901 1031 DBG(mb_info->mem_upper);
902 1032 memlists[memlists_used].addr = 1024 * 1024;
903 1033 memlists[memlists_used].size = mb_info->mem_upper * 1024;
904 1034 ++memlists_used;
905 1035
906 1036 /*
907 1037 * Old platform - assume I/O space at the end of memory.
908 1038 */
909 1039 pcimemlists[0].addr =
910 1040 (mb_info->mem_upper * 1024) + (1024 * 1024);
911 1041 pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr;
912 1042 pcimemlists[0].next = 0;
913 1043 pcimemlists[0].prev = 0;
914 1044 bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
915 1045 DBG(bi->bi_pcimem);
916 1046 } else {
917 1047 dboot_panic("No memory info from boot loader!!!");
918 1048 }
919 1049
920 1050 check_higher(bi->bi_cmdline);
921 1051
922 1052 /*
923 1053 * finish processing the physinstall list
924 1054 */
925 1055 sort_physinstall();
926 1056
927 1057 /*
928 1058 * build bios reserved mem lists
929 1059 */
930 1060 build_rsvdmemlists();
931 1061 }
932 1062 #endif /* !__xpv */
933 1063
/*
 * Simple memory allocator, allocates aligned physical memory.
 * Note that startup_kernel() only allocates memory, never frees.
 * Memory usage just grows in an upward direction.
 *
 * Allocation strategy: first try to place the block at next_avail_addr
 * (the bump pointer); if that doesn't fit in any memlist segment, fall
 * back to the lowest suitably-aligned address above next_avail_addr.
 * The returned memory is zeroed.
 */
static void *
do_mem_alloc(uint32_t size, uint32_t align)
{
	uint_t i;
	uint64_t best;
	uint64_t start;
	uint64_t end;

	/*
	 * make sure size is a multiple of pagesize
	 */
	size = RNDUP(size, MMU_PAGESIZE);
	next_avail_addr = RNDUP(next_avail_addr, align);

	/*
	 * XXPV fixme joe
	 *
	 * a really large bootarchive that causes you to run out of memory
	 * may cause this to blow up
	 */
	/*
	 * Sentinel: (uint64_t)-size is larger than any real candidate, so
	 * any fit found below replaces it.  NOTE(review): if NO segment
	 * fits, "best" keeps this sentinel and the memset below writes to
	 * a bogus address — there is no explicit failure check here
	 * (see the XXPV note above); confirm callers can never exhaust
	 * the memlists.
	 */
	/* LINTED E_UNEXPECTED_UINT_PROMOTION */
	best = (uint64_t)-size;
	for (i = 0; i < memlists_used; ++i) {
		start = memlists[i].addr;
#if defined(__xpv)
		/* under xen, memlists are pseudo-physical; bias to mfn space */
		start += mfn_base;
#endif
		end = start + memlists[i].size;

		/*
		 * did we find the desired address?
		 */
		if (start <= next_avail_addr && next_avail_addr + size <= end) {
			best = next_avail_addr;
			goto done;
		}

		/*
		 * if not is this address the best so far?
		 */
		if (start > next_avail_addr && start < best &&
		    RNDUP(start, align) + size <= end)
			best = RNDUP(start, align);
	}

	/*
	 * We didn't find exactly the address we wanted, due to going off the
	 * end of a memory region. Return the best found memory address.
	 */
done:
	/* advance the bump pointer past this allocation */
	next_avail_addr = best + size;
#if defined(__xpv)
	if (next_avail_addr > scratch_end)
		dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: "
		    "0x%lx", (ulong_t)next_avail_addr,
		    (ulong_t)scratch_end);
#endif
	(void) memset((void *)(uintptr_t)best, 0, size);
	return ((void *)(uintptr_t)best);
}
999 1129
1000 1130 void *
1001 1131 mem_alloc(uint32_t size)
1002 1132 {
1003 1133 return (do_mem_alloc(size, MMU_PAGESIZE));
1004 1134 }
1005 1135
1006 1136
/*
 * Build page tables to map all of memory used so far as well as the kernel.
 *
 * Mappings created (in order):
 *   1. the kernel nucleus at target_kernel_text, using large pages when
 *      the CPU supports them;
 *   2. a one-page VA window (bi_pt_window) the kernel uses to edit
 *      pagetables, plus the address of its PTE;
 *   3. (dom0 / bare metal only) 1:1 mappings of the low 1M for BIOS
 *      tables, and 1:1 mappings of all memory dboot has allocated so far.
 */
static void
build_page_tables(void)
{
	uint32_t psize;		/* page size used to map the kernel */
	uint32_t level;		/* pagetable level matching psize */
	uint32_t off;
	uint64_t start;
#if !defined(__xpv)
	uint32_t i;
	uint64_t end;
#endif	/* __xpv */

	/*
	 * If we're on metal, we need to create the top level pagetable.
	 * Under xen the hypervisor already handed us one.
	 */
#if defined(__xpv)
	top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base;
#else /* __xpv */
	top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
#endif /* __xpv */
	DBG((uintptr_t)top_page_table);

	/*
	 * Determine if we'll use large mappings for kernel, then map it.
	 */
	if (largepage_support) {
		psize = lpagesize;
		level = 1;
	} else {
		psize = MMU_PAGESIZE;
		level = 0;
	}

	DBG_MSG("Mapping kernel\n");
	DBG(ktext_phys);
	DBG(target_kernel_text);
	DBG(ksize);
	DBG(psize);
	for (off = 0; off < ksize; off += psize)
		map_pa_at_va(ktext_phys + off, target_kernel_text + off, level);

	/*
	 * The kernel will need a 1 page window to work with page tables
	 */
	bi->bi_pt_window = (uintptr_t)mem_alloc(MMU_PAGESIZE);
	DBG(bi->bi_pt_window);
	bi->bi_pte_to_pt_window =
	    (uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0);
	DBG(bi->bi_pte_to_pt_window);

#if defined(__xpv)
	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
		/* If this is a domU we're done. */
		DBG_MSG("\nPage tables constructed\n");
		return;
	}
#endif /* __xpv */

	/*
	 * We need 1:1 mappings for the lower 1M of memory to access
	 * BIOS tables used by a couple of drivers during boot.
	 *
	 * The following code works because our simple memory allocator
	 * only grows usage in an upwards direction.
	 *
	 * Note that by this point in boot some mappings for low memory
	 * may already exist because we've already accessed device in low
	 * memory.  (Specifically the video frame buffer and keyboard
	 * status ports.)  If we're booting on raw hardware then GRUB
	 * created these mappings for us.  If we're booting under a
	 * hypervisor then we went ahead and remapped these devices into
	 * memory allocated within dboot itself.
	 */
	if (map_debug)
		dboot_printf("1:1 map pa=0..1Meg\n");
	for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE) {
#if defined(__xpv)
		map_ma_at_va(start, start, 0);
#else /* __xpv */
		map_pa_at_va(start, start, 0);
#endif /* __xpv */
	}

#if !defined(__xpv)
	/*
	 * 1:1 map every memlist range up to next_avail_addr, i.e. all
	 * physical memory dboot has touched or allocated so far.
	 */
	for (i = 0; i < memlists_used; ++i) {
		start = memlists[i].addr;

		end = start + memlists[i].size;

		if (map_debug)
			dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
			    start, end);
		while (start < end && start < next_avail_addr) {
			map_pa_at_va(start, start, 0);
			start += MMU_PAGESIZE;
		}
	}
#endif /* !__xpv */

	DBG_MSG("\nPage tables constructed\n");
}
1111 1241
1112 1242 #define NO_MULTIBOOT \
1113 1243 "multiboot is no longer used to boot the Solaris Operating System.\n\
1114 1244 The grub entry should be changed to:\n\
1115 1245 kernel$ /platform/i86pc/kernel/$ISADIR/unix\n\
1116 1246 module$ /platform/i86pc/$ISADIR/boot_archive\n\
1117 1247 See http://illumos.org/msg/SUNOS-8000-AK for details.\n"
1118 1248
/*
 * startup_kernel has a pretty simple job. It builds pagetables which reflect
 * 1:1 mappings for all memory in use. It then also adds mappings for
 * the kernel nucleus at virtual address of target_kernel_text using large page
 * mappings. The page table pages are also accessible at 1:1 mapped
 * virtual addresses.
 *
 * On the way it: parses the boot command line, initializes the boot
 * console, detects MMU features via cpuid (or the hypervisor), runs the
 * boot-time memory allocator, and fills in the xboot_info structure
 * consumed by the 64/32-bit kernel after the asm trampoline.
 */
/*ARGSUSED*/
void
startup_kernel(void)
{
	char *cmdline;
	uintptr_t addr;
#if defined(__xpv)
	physdev_set_iopl_t set_iopl;
#endif /* __xpv */

	/*
	 * At this point we are executing in 32 bit protected mode
	 * (NOTE(review): the original comment said "real mode", but
	 * 32-bit execution implies protected mode).
	 */
#if defined(__xpv)
	cmdline = (char *)xen_info->cmd_line;
#else /* __xpv */
	cmdline = (char *)mb_info->cmdline;
#endif /* __xpv */

	/* debug flags are simple substring matches on the command line */
	prom_debug = (strstr(cmdline, "prom_debug") != NULL);
	map_debug = (strstr(cmdline, "map_debug") != NULL);

#if defined(__xpv)
	/*
	 * For dom0, before we initialize the console subsystem we'll
	 * need to enable io operations, so set I/O privilege level to 1.
	 */
	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
		set_iopl.iopl = 1;
		(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
	}
#endif /* __xpv */

	bcons_init(cmdline);
	DBG_MSG("\n\nSolaris prekernel set: ");
	DBG_MSG(cmdline);
	DBG_MSG("\n");

	/* reject obsolete multiboot-style grub entries with guidance */
	if (strstr(cmdline, "multiboot") != NULL) {
		dboot_panic(NO_MULTIBOOT);
	}

	/*
	 * boot info must be 16 byte aligned for 64 bit kernel ABI
	 */
	addr = (uintptr_t)boot_info;
	addr = (addr + 0xf) & ~0xf;
	bi = (struct xboot_info *)addr;
	DBG((uintptr_t)bi);
	bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline;

	/*
	 * Need correct target_kernel_text value
	 */
#if defined(_BOOT_TARGET_amd64)
	target_kernel_text = KERNEL_TEXT_amd64;
#elif defined(__xpv)
	target_kernel_text = KERNEL_TEXT_i386_xpv;
#else
	target_kernel_text = KERNEL_TEXT_i386;
#endif
	DBG(target_kernel_text);

#if defined(__xpv)

	/*
	 * XXPV	Derive this stuff from CPUID / what the hypervisor has enabled
	 */

#if defined(_BOOT_TARGET_amd64)
	/*
	 * 64-bit hypervisor: long mode and PAE are a given.
	 */
	amd64_support = 1;
	pae_support = 1;

#else	/* _BOOT_TARGET_amd64 */

	/*
	 * See if we are running on a PAE Hypervisor
	 */
	{
		xen_capabilities_info_t caps;

		if (HYPERVISOR_xen_version(XENVER_capabilities, &caps) != 0)
			dboot_panic("HYPERVISOR_xen_version(caps) failed");
		/* ensure NUL termination before treating caps as a string */
		caps[sizeof (caps) - 1] = 0;
		if (prom_debug)
			dboot_printf("xen capabilities %s\n", caps);
		if (strstr(caps, "x86_32p") != NULL)
			pae_support = 1;
	}

#endif	/* _BOOT_TARGET_amd64 */
	{
		xen_platform_parameters_t p;

		if (HYPERVISOR_xen_version(XENVER_platform_parameters, &p) != 0)
			dboot_panic("HYPERVISOR_xen_version(parms) failed");
		DBG(p.virt_start);
		mfn_to_pfn_mapping = (pfn_t *)(xen_virt_start = p.virt_start);
	}

	/*
	 * The hypervisor loads stuff starting at 1Gig
	 */
	mfn_base = ONE_GIG;
	DBG(mfn_base);

	/*
	 * enable writable page table mode for the hypervisor
	 */
	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
	    VMASST_TYPE_writable_pagetables) < 0)
		dboot_panic("HYPERVISOR_vm_assist(writable_pagetables) failed");

	/*
	 * check for NX support
	 */
	if (pae_support) {
		uint32_t eax = 0x80000000;
		uint32_t edx = get_cpuid_edx(&eax);

		if (eax >= 0x80000001) {
			eax = 0x80000001;
			edx = get_cpuid_edx(&eax);
			if (edx & CPUID_AMD_EDX_NX)
				NX_support = 1;
		}
	}

#if !defined(_BOOT_TARGET_amd64)

	/*
	 * The 32-bit hypervisor uses segmentation to protect itself from
	 * guests. This means when a guest attempts to install a flat 4GB
	 * code or data descriptor the 32-bit hypervisor will protect itself
	 * by silently shrinking the segment such that if the guest attempts
	 * any access where the hypervisor lives a #gp fault is generated.
	 * The problem is that some applications expect a full 4GB flat
	 * segment for their current thread pointer and will use negative
	 * offset segment wrap around to access data. TLS support in linux
	 * brand is one example of this.
	 *
	 * The 32-bit hypervisor can catch the #gp fault in these cases
	 * and emulate the access without passing the #gp fault to the guest
	 * but only if VMASST_TYPE_4gb_segments is explicitly turned on.
	 * Seems like this should have been the default.
	 * Either way, we want the hypervisor -- and not Solaris -- to deal
	 * with emulating these accesses.
	 */
	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
	    VMASST_TYPE_4gb_segments) < 0)
		dboot_panic("HYPERVISOR_vm_assist(4gb_segments) failed");
#endif	/* !_BOOT_TARGET_amd64 */

#else	/* __xpv */

	/*
	 * use cpuid to enable MMU features
	 */
	if (have_cpuid()) {
		uint32_t eax, edx;

		/* leaf 1: basic feature flags (PSE, PGE, PAE) */
		eax = 1;
		edx = get_cpuid_edx(&eax);
		if (edx & CPUID_INTC_EDX_PSE)
			largepage_support = 1;
		if (edx & CPUID_INTC_EDX_PGE)
			pge_support = 1;
		if (edx & CPUID_INTC_EDX_PAE)
			pae_support = 1;

		/* extended leaf 0x80000001: long mode (LM) and NX */
		eax = 0x80000000;
		edx = get_cpuid_edx(&eax);
		if (eax >= 0x80000001) {
			eax = 0x80000001;
			edx = get_cpuid_edx(&eax);
			if (edx & CPUID_AMD_EDX_LM)
				amd64_support = 1;
			if (edx & CPUID_AMD_EDX_NX)
				NX_support = 1;
		}
	} else {
		dboot_printf("cpuid not supported\n");
	}
#endif /* __xpv */


#if defined(_BOOT_TARGET_amd64)
	if (amd64_support == 0)
		dboot_panic("long mode not supported, rebooting");
	else if (pae_support == 0)
		dboot_panic("long mode, but no PAE; rebooting");
#else
	/*
	 * Allow the command line to over-ride use of PAE for 32 bit.
	 */
	if (strstr(cmdline, "disablePAE=true") != NULL) {
		pae_support = 0;
		NX_support = 0;
		amd64_support = 0;
	}
#endif

	/*
	 * initialize the simple memory allocator
	 */
	init_mem_alloc();

#if !defined(__xpv) && !defined(_BOOT_TARGET_amd64)
	/*
	 * disable PAE on 32 bit h/w w/o NX and < 4Gig of memory
	 */
	if (max_mem < FOUR_GIG && NX_support == 0)
		pae_support = 0;
#endif

	/*
	 * configure mmu information
	 */
	if (pae_support) {
		shift_amt = shift_amt_pae;
		ptes_per_table = 512;
		pte_size = 8;
		lpagesize = TWO_MEG;
#if defined(_BOOT_TARGET_amd64)
		top_level = 3;
#else
		top_level = 2;
#endif
	} else {
		pae_support = 0;
		NX_support = 0;
		shift_amt = shift_amt_nopae;
		ptes_per_table = 1024;
		pte_size = 4;
		lpagesize = FOUR_MEG;
		top_level = 1;
	}

	DBG(pge_support);
	DBG(NX_support);
	DBG(largepage_support);
	DBG(amd64_support);
	DBG(top_level);
	DBG(pte_size);
	DBG(ptes_per_table);
	DBG(lpagesize);

#if defined(__xpv)
	ktext_phys = ONE_GIG;		/* from UNIX Mapfile */
#else
	ktext_phys = FOUR_MEG;		/* from UNIX Mapfile */
#endif

#if !defined(__xpv) && defined(_BOOT_TARGET_amd64)
	/*
	 * For grub, copy kernel bits from the ELF64 file to final place.
	 */
	DBG_MSG("\nAllocating nucleus pages.\n");
	ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG);
	if (ktext_phys == 0)
		dboot_panic("failed to allocate aligned kernel memory");
	if (dboot_elfload64(mb_header.load_addr) != 0)
		dboot_panic("failed to parse kernel ELF image, rebooting");
#endif

	DBG(ktext_phys);

	/*
	 * Allocate page tables.
	 */
	build_page_tables();

	/*
	 * return to assembly code to switch to running kernel
	 */
	entry_addr_low = (uint32_t)target_kernel_text;
	DBG(entry_addr_low);
	bi->bi_use_largepage = largepage_support;
	bi->bi_use_pae = pae_support;
	bi->bi_use_pge = pge_support;
	bi->bi_use_nx = NX_support;

#if defined(__xpv)

	/* under xen, addresses are biased by mfn_base for paddr fields */
	bi->bi_next_paddr = next_avail_addr - mfn_base;
	DBG(bi->bi_next_paddr);
	bi->bi_next_vaddr = (native_ptr_t)next_avail_addr;
	DBG(bi->bi_next_vaddr);

	/*
	 * unmap unused pages in start area to make them available for DMA
	 */
	while (next_avail_addr < scratch_end) {
		(void) HYPERVISOR_update_va_mapping(next_avail_addr,
		    0, UVMF_INVLPG | UVMF_LOCAL);
		next_avail_addr += MMU_PAGESIZE;
	}

	bi->bi_xen_start_info = (uintptr_t)xen_info;
	DBG((uintptr_t)HYPERVISOR_shared_info);
	bi->bi_shared_info = (native_ptr_t)HYPERVISOR_shared_info;
	bi->bi_top_page_table = (uintptr_t)top_page_table - mfn_base;

#else /* __xpv */

	bi->bi_next_paddr = next_avail_addr;
	DBG(bi->bi_next_paddr);
	bi->bi_next_vaddr = (uintptr_t)next_avail_addr;
	DBG(bi->bi_next_vaddr);
	bi->bi_mb_info = (uintptr_t)mb_info;
	bi->bi_top_page_table = (uintptr_t)top_page_table;

#endif /* __xpv */

	bi->bi_kseg_size = FOUR_MEG;
	DBG(bi->bi_kseg_size);

#ifndef __xpv
	if (map_debug)
		dump_tables();
#endif

	DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n");
}
↓ open down ↓ |
609 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX