1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2012 Gary Mills
23 * Copyright 2016 PALO, Richard.
24 *
25 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
26 *
27 * Copyright 2018 Joyent, Inc.
28 */
29
30 #include <sys/types.h>
31 #include <sys/clock.h>
32 #include <sys/psm.h>
33 #include <sys/archsystm.h>
34 #include <sys/machsystm.h>
35 #include <sys/compress.h>
36 #include <sys/modctl.h>
37 #include <sys/trap.h>
38 #include <sys/panic.h>
39 #include <sys/regset.h>
40 #include <sys/frame.h>
41 #include <sys/kobj.h>
42 #include <sys/apic.h>
43 #include <sys/apic_timer.h>
44 #include <sys/dumphdr.h>
45 #include <sys/mem.h>
46 #include <sys/x86_archext.h>
47 #include <sys/xpv_panic.h>
48 #include <sys/boot_console.h>
49 #include <sys/bootsvcs.h>
50 #include <sys/consdev.h>
51 #include <vm/hat_pte.h>
52 #include <vm/hat_i86.h>
53
/* XXX: need to add a PAE version too, if we ever support both PAE and non */
/* Path of the Xen image that init_xen_module() opens for symbols and CTF. */
#if defined(__i386)
#define	XPV_FILENAME	"/boot/xen-syms"
#else
#define	XPV_FILENAME	"/boot/amd64/xen-syms"
#endif
/* Module name recorded in the fake modctl built by init_xen_module(). */
#define	XPV_MODNAME	"xpv"

/* Incremented on every entry to xpv_do_panic(); a second entry panics. */
int xpv_panicking = 0;

/*
 * Fake module and modctl describing Xen.  They are only set when
 * init_xen_module() succeeds, and are only linked into the kernel's
 * module list by xpv_do_panic().
 */
struct module *xpv_module;
struct modctl *xpv_modctl;

/*
 * Round x up to a multiple of a; an alignment of 0 leaves x unchanged.
 * The mask arithmetic is only correct when a is a power of two.
 */
#define	ALIGN(x, a)	((a) == 0 ? (uintptr_t)(x) : \
	(((uintptr_t)(x) + (uintptr_t)(a) - 1l) & ~((uintptr_t)(a) - 1l)))

/* Pointer to the xpv_panic_info structure handed to us by Xen. */
static struct panic_info *xpv_panic_info = NULL;

/* Timer support */
#define	NSEC_SHIFT	5	/* scaling shift used by xpv_panic_gethrtime() */
#define	T_XPV_TIMER	0xd1	/* IDT vector programmed into the APIC timer */
#define	XPV_TIMER_INTERVAL	1000	/* 1000 microseconds */
static uint32_t *xpv_apicadr = NULL;	/* set from pi_apic in xpv_do_panic() */
static uint_t nsec_scale;		/* set by xpv_panic_time_init() */

/* IDT support */
#pragma align 16(xpv_panic_idt)
static gate_desc_t xpv_panic_idt[NIDT];	/* interrupt descriptor table */

/* Xen pagetables mapped into our HAT's ptable windows */
static pfn_t ptable_pfn[MAX_NUM_LEVEL];

/* Number of MMU_PAGESIZE pages we're adding to the Solaris dump */
static int xpv_dump_pages;

/*
 * There are up to two large swathes of RAM that we don't want to include
 * in the dump: those that comprise the Xen version of segkpm.  On 32-bit
 * systems there is no such region of memory.  On 64-bit systems, there
 * should be just a single contiguous region that corresponds to all of
 * physical memory.  The tricky bit is that Xen's heap sometimes lives in
 * the middle of their segkpm, and is mapped using only kpm-like addresses.
 * In that case, we need to skip the swathes before and after Xen's heap.
 *
 * Each region is the half-open range [low, high); xpv_va_walk() skips any
 * address that falls inside either one.
 */
uintptr_t kpm1_low = 0;
uintptr_t kpm1_high = 0;
uintptr_t kpm2_low = 0;
uintptr_t kpm2_high = 0;

/*
 * Some commonly used values that we don't want to recompute over and over.
 * All three are filled in by xpv_do_panic().
 */
static int xpv_panic_nptes[MAX_NUM_LEVEL];	/* entries per table, by level */
static ulong_t xpv_panic_cr3;			/* %cr3 at panic time */
static uintptr_t xpv_end;			/* upper bound of the Xen VA scan */

/*
 * Output routine used by this file.  It starts out as the private console
 * printer and is switched to printf() by xpv_do_panic() just before
 * panicsys() is entered.
 */
static void xpv_panic_console_print(const char *fmt, ...);
static void (*xpv_panic_printf)(const char *, ...) = xpv_panic_console_print;

/* Formatting buffer for xpv_panic_console_print(); longer output truncates. */
#define	CONSOLE_BUF_SIZE	256
static char console_buffer[CONSOLE_BUF_SIZE];
/* B_TRUE when xpv_panic_putc() should emit through cons_polledio. */
static boolean_t use_polledio;

/*
 * Pointers to machine check panic info (if any).
 */
xpv_mca_panic_data_t *xpv_mca_panic_data = NULL;
122
123 static void
124 xpv_panic_putc(int m)
125 {
126 struct cons_polledio *c = cons_polledio;
127
128 /* This really shouldn't happen */
129 if (boot_console_type(NULL) == CONS_HYPERVISOR)
130 return;
131
132 if (use_polledio == B_TRUE)
133 c->cons_polledio_putchar(c->cons_polledio_argument, m);
134 else
135 bcons_putchar(m);
136 }
137
138 static void
139 xpv_panic_puts(char *msg)
140 {
141 char *m;
142
143 dump_timeleft = dump_timeout;
144 for (m = msg; *m; m++)
145 xpv_panic_putc((int)*m);
146 }
147
148 static void
149 xpv_panic_console_print(const char *fmt, ...)
150 {
151 va_list ap;
152
153 va_start(ap, fmt);
154 (void) vsnprintf(console_buffer, sizeof (console_buffer), fmt, ap);
155 va_end(ap);
156
157 xpv_panic_puts(console_buffer);
158 }
159
160 static void
161 xpv_panic_map(int level, pfn_t pfn)
162 {
163 x86pte_t pte, *pteptr;
164
165 /*
166 * The provided pfn represents a level 'level' page table. Map it
167 * into the 'level' slot in the list of page table windows.
168 */
169 pteptr = (x86pte_t *)PWIN_PTE_VA(level);
170 pte = pfn_to_pa(pfn) | PT_VALID;
171
172 XPV_ALLOW_PAGETABLE_UPDATES();
173 if (mmu.pae_hat)
174 *pteptr = pte;
175 else
176 *(x86pte32_t *)pteptr = pte;
177 XPV_DISALLOW_PAGETABLE_UPDATES();
178
179 mmu_flush_tlb_page((uintptr_t)PWIN_VA(level));
180 }
181
/*
 * Walk the page tables to find the pfn mapped by the given va.
 *
 * Starting at *vaddr, search forward through the page tables rooted at
 * xpv_panic_cr3 for the next address that is mapped by a page and that is
 * neither the APIC page nor inside one of the two kpm-like regions.
 *
 * On success, *vaddr is updated to the address that was found and the
 * backing MFN is returned with PFN_IS_FOREIGN_MFN or'ed in.  PFN_INVALID
 * is returned once the search reaches xpv_end or the address drops below
 * the starting value of *vaddr.
 *
 * The function keeps static state (lastva) and relies on the file-scope
 * ptable_pfn[] cache, so it expects to be called from a single thread.
 */
static pfn_t
xpv_va_walk(uintptr_t *vaddr)
{
	int l, idx;
	pfn_t pfn;
	x86pte_t pte;
	x86pte_t *ptep;
	uintptr_t va = *vaddr;
	uintptr_t scan_va;
	caddr_t ptable_window;
	static pfn_t toplevel_pfn;
	static uintptr_t lastva;	/* last address successfully returned */

	/*
	 * If we do anything other than a simple scan through memory, don't
	 * trust the mapped page tables.
	 */
	if (va != lastva + MMU_PAGESIZE)
		for (l = mmu.max_level; l >= 0; l--)
			ptable_pfn[l] = PFN_INVALID;

	toplevel_pfn = mmu_btop(xpv_panic_cr3);

	/* The second test stops the walk if va moves below its start. */
	while (va < xpv_end && va >= *vaddr) {
		/* Find the lowest table with any entry for va */
		pfn = toplevel_pfn;
		for (l = mmu.max_level; l >= 0; l--) {
			/* Remap the window only when the table has changed. */
			if (ptable_pfn[l] != pfn) {
				xpv_panic_map(l, pfn);
				ptable_pfn[l] = pfn;
			}

			/*
			 * Search this pagetable for any mapping to an
			 * address >= va.
			 */
			ptable_window = PWIN_VA(l);
			/*
			 * With a PAE hat, the in-page offset bits of cr3 are
			 * applied to the top-level table's window address.
			 */
			if (l == mmu.max_level && mmu.pae_hat)
				ptable_window +=
				    (xpv_panic_cr3 & MMU_PAGEOFFSET);

			idx = (va >> LEVEL_SHIFT(l)) & (xpv_panic_nptes[l] - 1);
			scan_va = va;
			while (idx < xpv_panic_nptes[l] && scan_va < xpv_end &&
			    scan_va >= *vaddr) {
				ptep = (x86pte_t *)(ptable_window +
				    (idx << mmu.pte_size_shift));
				pte = GET_PTE(ptep);
				if (pte & PTE_VALID)
					break;
				idx++;
				scan_va += mmu.level_size[l];
			}

			/*
			 * If there are no valid mappings in this table, we
			 * can skip to the end of the VA range it covers.
			 */
			if (idx == xpv_panic_nptes[l]) {
				va = NEXT_ENTRY_VA(va, l + 1);
				break;
			}

			va = scan_va;
			/*
			 * See if we've hit the end of the range.
			 */
			if (va >= xpv_end || va < *vaddr)
				break;

			/*
			 * If this mapping is for a pagetable, we drop down
			 * to the next level in the hierarchy and look for
			 * a mapping in it.
			 */
			pfn = PTE2MFN(pte, l);
			if (!PTE_ISPAGE(pte, l))
				continue;

			/*
			 * The APIC page is magic.  Nothing to see here;
			 * move along.
			 */
			if (((uintptr_t)xpv_apicadr & MMU_PAGEMASK) ==
			    (va & MMU_PAGEMASK)) {
				va += MMU_PAGESIZE;
				break;
			}

			/*
			 * See if the address is within one of the two
			 * kpm-like regions we want to skip.
			 */
			if (va >= kpm1_low && va < kpm1_high) {
				va = kpm1_high;
				break;
			}
			if (va >= kpm2_low && va < kpm2_high) {
				va = kpm2_high;
				break;
			}

			/*
			 * The Xen panic code only handles small pages.  If
			 * this mapping is for a large page, we need to
			 * identify the constituent page that covers the
			 * specific VA we were looking for.
			 */
			if (l > 0) {
				if (l > 1)
					panic("Xen panic can't cope with "
					    "giant pages.");
				/* Offset of va's small page within the page. */
				idx = (va >> LEVEL_SHIFT(0)) &
				    (xpv_panic_nptes[0] - 1);
				pfn += idx;
			}

			/* Found a page: report it and remember where we were. */
			*vaddr = va;
			lastva = va;
			return (pfn | PFN_IS_FOREIGN_MFN);
		}
	}
	return (PFN_INVALID);
}
309
310 /*
311 * Walk through the Xen VA space, finding pages that are mapped in.
312 *
313 * These pages all have MFNs rather than PFNs, meaning they may be outside
314 * the physical address space the kernel knows about, or they may collide
315 * with PFNs the kernel is using.
316 *
317 * The obvious trick of just adding the PFN_IS_FOREIGN_MFN bit to the MFNs
318 * to avoid collisions doesn't work. The pages need to be written to disk
319 * in PFN-order or savecore gets confused. We can't allocate memory to
320 * contruct a sorted pfn->VA reverse mapping, so we have to write the pages
321 * to disk in VA order.
322 *
323 * To square this circle, we simply make up PFNs for each of Xen's pages.
324 * We assign each mapped page a fake PFN in ascending order. These fake
325 * PFNs each have the FOREIGN bit set, ensuring that they fall outside the
326 * range of Solaris PFNs written by the kernel.
327 */
328 int
329 dump_xpv_addr()
330 {
331 uintptr_t va;
332 mem_vtop_t mem_vtop;
333
334 xpv_dump_pages = 0;
335 va = xen_virt_start;
336
337 while (xpv_va_walk(&va) != PFN_INVALID) {
338 mem_vtop.m_as = &kas;
339 mem_vtop.m_va = (void *)va;
340 mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN;
341
342 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
343 xpv_dump_pages++;
344
345 va += MMU_PAGESIZE;
346 }
347
348 /*
349 * Add the shared_info page. This page actually ends up in the
350 * dump twice: once for the Xen va and once for the Solaris va.
351 * This isn't ideal, but we don't know the address Xen is using for
352 * the page, so we can't share it.
353 */
354 mem_vtop.m_as = &kas;
355 mem_vtop.m_va = HYPERVISOR_shared_info;
356 mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN;
357 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
358 xpv_dump_pages++;
359
360 return (xpv_dump_pages);
361 }
362
363 void
364 dump_xpv_pfn()
365 {
366 pfn_t pfn;
367 int cnt;
368
369 for (cnt = 0; cnt < xpv_dump_pages; cnt++) {
370 pfn = (pfn_t)cnt | PFN_IS_FOREIGN_MFN;
371 dumpvp_write(&pfn, sizeof (pfn));
372 }
373 }
374
375 int
376 dump_xpv_data(void *dump_cbuf)
377 {
378 uintptr_t va;
379 uint32_t csize;
380 int cnt = 0;
381
382 /*
383 * XXX: we should probably run this data through a UE check. The
384 * catch is that the UE code relies on on_trap() and getpfnum()
385 * working.
386 */
387 va = xen_virt_start;
388
389 while (xpv_va_walk(&va) != PFN_INVALID) {
390 csize = (uint32_t)compress((void *)va, dump_cbuf, PAGESIZE);
391 dumpvp_write(&csize, sizeof (uint32_t));
392 dumpvp_write(dump_cbuf, csize);
393 if (dump_ioerr) {
394 dumphdr->dump_flags &= ~DF_COMPLETE;
395 return (cnt);
396 }
397 cnt++;
398 va += MMU_PAGESIZE;
399 }
400
401 /*
402 * Finally, dump the shared_info page
403 */
404 csize = (uint32_t)compress((void *)HYPERVISOR_shared_info, dump_cbuf,
405 PAGESIZE);
406 dumpvp_write(&csize, sizeof (uint32_t));
407 dumpvp_write(dump_cbuf, csize);
408 if (dump_ioerr)
409 dumphdr->dump_flags &= ~DF_COMPLETE;
410 cnt++;
411
412 return (cnt);
413 }
414
415 static void *
416 showstack(void *fpreg, int xpv_only)
417 {
418 struct frame *fpp;
419 ulong_t off;
420 char *sym;
421 uintptr_t pc, fp, lastfp;
422 uintptr_t minaddr = min(KERNELBASE, xen_virt_start);
423
424 fp = (uintptr_t)fpreg;
425 if (fp < minaddr) {
426 xpv_panic_printf("Bad frame ptr: 0x%p\n", fpreg);
427 return (fpreg);
428 }
429
430 do {
431 fpp = (struct frame *)fp;
432 pc = fpp->fr_savpc;
433
434 if ((xpv_only != 0) &&
435 (fp > xpv_end || fp < xen_virt_start))
436 break;
437 if ((sym = kobj_getsymname(pc, &off)) != NULL)
438 xpv_panic_printf("%08lx %s:%s+%lx\n", fp,
439 mod_containing_pc((caddr_t)pc), sym, off);
440 else if ((pc >= xen_virt_start) && (pc <= xpv_end))
441 xpv_panic_printf("%08lx 0x%lx (in Xen)\n", fp, pc);
442 else
443 xpv_panic_printf("%08lx %lx\n", fp, pc);
444
445 lastfp = fp;
446 fp = fpp->fr_savfp;
447
448 /*
449 * Xen marks an exception frame by inverting the frame
450 * pointer.
451 */
452 if (fp < lastfp) {
453 if ((~fp > minaddr) && ((~fp) ^ lastfp) < 0xfff)
454 fp = ~fp;
455 }
456 } while (fp > lastfp);
457 return ((void *)fp);
458 }
459
/*
 * Print the Xen portion of a stack trace starting at frame pointer
 * 'fpreg'.  Returns the frame pointer at which showstack() stopped.
 */
void *
xpv_traceback(void *fpreg)
{
	void *stopped_at;

	stopped_at = showstack(fpreg, 1);
	return (stopped_at);
}
465
466 #if defined(__amd64)
467 static void
468 xpv_panic_hypercall(ulong_t call)
469 {
470 panic("Illegally issued hypercall %d during panic!\n", (int)call);
471 }
472 #endif
473
474 void
475 xpv_die(struct regs *rp)
476 {
477 struct panic_trap_info ti;
478 struct cregs creg;
479
480 ti.trap_regs = rp;
481 ti.trap_type = rp->r_trapno;
482
483 curthread->t_panic_trap = &ti;
484 if (ti.trap_type == T_PGFLT) {
485 getcregs(&creg);
486 ti.trap_addr = (caddr_t)creg.cr_cr2;
487 panic("Fatal pagefault at 0x%lx. fault addr=0x%p rp=0x%p",
488 rp->r_pc, (void *)ti.trap_addr, (void *)rp);
489 } else {
490 ti.trap_addr = (caddr_t)rp->r_pc;
491 panic("Fatal trap %ld at 0x%lx. rp=0x%p", rp->r_trapno,
492 rp->r_pc, (void *)rp);
493 }
494 }
495
496 /*
497 * Build IDT to handle a Xen panic
498 */
499 static void
500 switch_to_xpv_panic_idt()
501 {
502 int i;
503 desctbr_t idtr;
504 gate_desc_t *idt = xpv_panic_idt;
505 selector_t cs = get_cs_register();
506
507 for (i = 0; i < 32; i++)
508 set_gatesegd(&idt[i], &xpv_invaltrap, cs, SDT_SYSIGT, TRP_XPL,
509 0);
510
511 set_gatesegd(&idt[T_ZERODIV], &xpv_div0trap, cs, SDT_SYSIGT, TRP_XPL,
512 0);
513 set_gatesegd(&idt[T_SGLSTP], &xpv_dbgtrap, cs, SDT_SYSIGT, TRP_XPL, 0);
514 set_gatesegd(&idt[T_NMIFLT], &xpv_nmiint, cs, SDT_SYSIGT, TRP_XPL, 0);
515 set_gatesegd(&idt[T_BOUNDFLT], &xpv_boundstrap, cs, SDT_SYSIGT,
516 TRP_XPL, 0);
517 set_gatesegd(&idt[T_ILLINST], &xpv_invoptrap, cs, SDT_SYSIGT, TRP_XPL,
518 0);
519 set_gatesegd(&idt[T_NOEXTFLT], &xpv_ndptrap, cs, SDT_SYSIGT, TRP_XPL,
520 0);
521 set_gatesegd(&idt[T_TSSFLT], &xpv_invtsstrap, cs, SDT_SYSIGT, TRP_XPL,
522 0);
523 set_gatesegd(&idt[T_SEGFLT], &xpv_segnptrap, cs, SDT_SYSIGT, TRP_XPL,
524 0);
525 set_gatesegd(&idt[T_STKFLT], &xpv_stktrap, cs, SDT_SYSIGT, TRP_XPL, 0);
526 set_gatesegd(&idt[T_GPFLT], &xpv_gptrap, cs, SDT_SYSIGT, TRP_XPL, 0);
527 set_gatesegd(&idt[T_PGFLT], &xpv_pftrap, cs, SDT_SYSIGT, TRP_XPL, 0);
528 set_gatesegd(&idt[T_EXTERRFLT], &xpv_ndperr, cs, SDT_SYSIGT, TRP_XPL,
529 0);
530 set_gatesegd(&idt[T_ALIGNMENT], &xpv_achktrap, cs, SDT_SYSIGT, TRP_XPL,
531 0);
532 set_gatesegd(&idt[T_MCE], &xpv_mcetrap, cs, SDT_SYSIGT, TRP_XPL, 0);
533 set_gatesegd(&idt[T_SIMDFPE], &xpv_xmtrap, cs, SDT_SYSIGT, TRP_XPL, 0);
534
535 /*
536 * We have no double fault handler. Any single fault represents a
537 * catastrophic failure for us, so there is no attempt to handle
538 * them cleanly: we just print a message and reboot. If we
539 * encounter a second fault while doing that, there is nothing
540 * else we can do.
541 */
542
543 /*
544 * Be prepared to absorb any stray device interrupts received
545 * while writing the core to disk.
546 */
547 for (i = 33; i < NIDT; i++)
548 set_gatesegd(&idt[i], &xpv_surprise_intr, cs, SDT_SYSIGT,
549 TRP_XPL, 0);
550
551 /* The one interrupt we expect to get is from the APIC timer. */
552 set_gatesegd(&idt[T_XPV_TIMER], &xpv_timer_trap, cs, SDT_SYSIGT,
553 TRP_XPL, 0);
554
555 idtr.dtr_base = (uintptr_t)xpv_panic_idt;
556 idtr.dtr_limit = sizeof (xpv_panic_idt) - 1;
557 wr_idtr(&idtr);
558
559 #if defined(__amd64)
560 /* Catch any hypercalls. */
561 wrmsr(MSR_AMD_LSTAR, (uintptr_t)xpv_panic_hypercall);
562 wrmsr(MSR_AMD_CSTAR, (uintptr_t)xpv_panic_hypercall);
563 #endif
564 }
565
566 static void
567 xpv_apic_clkinit()
568 {
569 uint_t apic_ticks = 0;
570
571 /*
572 * Measure how many APIC ticks there are within a fixed time
573 * period. We're going to be fairly coarse here. This timer is
574 * just being used to detect a stalled panic, so as long as we have
575 * the right order of magnitude, everything should be fine.
576 */
577 xpv_apicadr[APIC_SPUR_INT_REG] = AV_UNIT_ENABLE | APIC_SPUR_INTR;
578 xpv_apicadr[APIC_LOCAL_TIMER] = AV_MASK;
579 xpv_apicadr[APIC_INT_VECT0] = AV_MASK; /* local intr reg 0 */
580
581 xpv_apicadr[APIC_DIVIDE_REG] = 0;
582 xpv_apicadr[APIC_INIT_COUNT] = APIC_MAXVAL;
583 drv_usecwait(XPV_TIMER_INTERVAL);
584 apic_ticks = APIC_MAXVAL - xpv_apicadr[APIC_CURR_COUNT];
585
586 /*
587 * apic_ticks now represents roughly how many apic ticks comprise
588 * one timeout interval. Program the timer to send us an interrupt
589 * every time that interval expires.
590 */
591 xpv_apicadr[APIC_LOCAL_TIMER] = T_XPV_TIMER | AV_PERIODIC;
592 xpv_apicadr[APIC_INIT_COUNT] = apic_ticks;
593 xpv_apicadr[APIC_EOI_REG] = 0;
594 }
595
596 void
597 xpv_timer_tick(void)
598 {
599 static int ticks = 0;
600
601 if (ticks++ >= MICROSEC / XPV_TIMER_INTERVAL) {
602 ticks = 0;
603 if (dump_timeleft && (--dump_timeleft == 0))
604 panic("Xen panic timeout\n");
605 }
606 xpv_apicadr[APIC_EOI_REG] = 0;
607 }
608
609 void
610 xpv_interrupt(void)
611 {
612 #ifdef DEBUG
613 static int cnt = 0;
614
615 if (cnt++ < 10)
616 xpv_panic_printf("Unexpected interrupt received.\n");
617 if ((cnt < 1000) && ((cnt % 100) == 0))
618 xpv_panic_printf("%d unexpected interrupts received.\n", cnt);
619 #endif
620
621 xpv_apicadr[APIC_EOI_REG] = 0;
622 }
623
624 /*
625 * Managing time in panic context is trivial. We only have a single CPU,
626 * we never get rescheduled, we never get suspended. We just need to
627 * convert clock ticks into nanoseconds.
628 */
629 static hrtime_t
630 xpv_panic_gethrtime(void)
631 {
632 hrtime_t tsc, hrt;
633 unsigned int *l = (unsigned int *)&(tsc);
634
635 tsc = __rdtsc_insn();
636 hrt = (mul32(l[1], nsec_scale) << NSEC_SHIFT) +
637 (mul32(l[0], nsec_scale) >> (32 - NSEC_SHIFT));
638
639 return (hrt);
640 }
641
642 static void
643 xpv_panic_time_init()
644 {
645 nsec_scale =
646 CPU->cpu_m.mcpu_vcpu_info->time.tsc_to_system_mul >> NSEC_SHIFT;
647
648 gethrtimef = xpv_panic_gethrtime;
649 }
650
/*
 * Variadic front end to panicsys(): bundle the arguments that follow
 * 'fmt' into a va_list and pass them on together with the saved
 * registers 'rp'.  The final argument to panicsys() is always 1.
 */
static void
xpv_panicsys(struct regs *rp, char *fmt, ...)
{
	extern void panicsys(const char *, va_list, struct regs *, int);
	va_list args;

	va_start(args, fmt);
	panicsys(fmt, args, rp, 1);
	va_end(args);
}
661
662 void
663 xpv_do_panic(void *arg)
664 {
665 struct panic_info *pip = (struct panic_info *)arg;
666 int l;
667 struct cregs creg;
668 #if defined(__amd64)
669 extern uintptr_t postbootkernelbase;
670 #endif
671
672 if (xpv_panicking++ > 0)
673 panic("multiple calls to xpv_do_panic()");
674
675 /*
676 * Indicate to the underlying panic framework that a panic has been
677 * initiated. This is ordinarily done as part of vpanic(). Since
678 * we already have all the register state saved by the hypervisor,
679 * we skip that and jump straight into the panic processing code.
680 *
681 * XXX If another thread grabs and wins the panic_quiesce trigger
682 * then we'll have two threads in panicsys believing they are in
683 * charge of the panic attempt!
684 */
685 (void) panic_trigger(&panic_quiesce);
686
687 #if defined(__amd64)
688 /*
689 * bzero() and bcopy() get unhappy when asked to operate on
690 * addresses outside of the kernel. At this point Xen is really a
691 * part of the kernel, so we update the routines' notion of where
692 * the kernel starts.
693 */
694 postbootkernelbase = xen_virt_start;
695 #endif
696
697 #if defined(HYPERVISOR_VIRT_END)
698 xpv_end = HYPERVISOR_VIRT_END;
699 #else
700 xpv_end = (uintptr_t)UINTPTR_MAX - sizeof (uintptr_t);
701 #endif
702
703 /*
704 * If we were redirecting console output to the hypervisor, we have
705 * to stop.
706 */
707 use_polledio = B_FALSE;
708 if (boot_console_type(NULL) == CONS_HYPERVISOR) {
709 bcons_device_change(CONS_HYPERVISOR);
710 } else if (cons_polledio != NULL &&
711 cons_polledio->cons_polledio_putchar != NULL) {
712 if (cons_polledio->cons_polledio_enter != NULL)
713 cons_polledio->cons_polledio_enter(
714 cons_polledio->cons_polledio_argument);
715 use_polledio = 1;
716 }
717
718 /* Make sure we handle all console output from here on. */
719 sysp->bsvc_putchar = xpv_panic_putc;
720
721 /*
722 * If we find an unsupported panic_info structure, there's not much
723 * we can do other than complain, plow on, and hope for the best.
724 */
725 if (pip->pi_version != PANIC_INFO_VERSION)
726 xpv_panic_printf("Warning: Xen is using an unsupported "
727 "version of the panic_info structure.\n");
728
729 xpv_panic_info = pip;
730
731 #if defined(__amd64)
732 kpm1_low = (uintptr_t)xpv_panic_info->pi_ram_start;
733 if (xpv_panic_info->pi_xen_start == NULL) {
734 kpm1_high = (uintptr_t)xpv_panic_info->pi_ram_end;
735 } else {
736 kpm1_high = (uintptr_t)xpv_panic_info->pi_xen_start;
737 kpm2_low = (uintptr_t)xpv_panic_info->pi_xen_end;
738 kpm2_high = (uintptr_t)xpv_panic_info->pi_ram_end;
739 }
740 #endif
741
742 /*
743 * Make sure we are running on the Solaris %gs. The Xen panic code
744 * should already have set up the GDT properly.
745 */
746 xpv_panic_resetgs();
747 #if defined(__amd64)
748 wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);
749 #endif
750
751 xpv_panic_time_init();
752
753 /*
754 * Switch to our own IDT, avoiding any accidental returns to Xen
755 * world.
756 */
757 switch_to_xpv_panic_idt();
758
759 /*
760 * Initialize the APIC timer, which is used to detect a hung dump
761 * attempt.
762 */
763 xpv_apicadr = pip->pi_apic;
764 xpv_apic_clkinit();
765
766 /*
767 * Set up a few values that we'll need repeatedly.
768 */
769 getcregs(&creg);
770 xpv_panic_cr3 = creg.cr_cr3;
771 for (l = mmu.max_level; l >= 0; l--)
772 xpv_panic_nptes[l] = mmu.ptes_per_table;
773 #ifdef __i386
774 if (mmu.pae_hat)
775 xpv_panic_nptes[mmu.max_level] = 4;
776 #endif
777
778 /* Add the fake Xen module to the module list */
779 if (xpv_module != NULL) {
780 extern int last_module_id;
781
782 xpv_modctl->mod_id = last_module_id++;
783 xpv_modctl->mod_next = &modules;
784 xpv_modctl->mod_prev = modules.mod_prev;
785 modules.mod_prev->mod_next = xpv_modctl;
786 modules.mod_prev = xpv_modctl;
787 }
788
789 if (pip->pi_mca.mpd_magic == MCA_PANICDATA_MAGIC)
790 xpv_mca_panic_data = &pip->pi_mca;
791
792 xpv_panic_printf = printf;
793 xpv_panicsys((struct regs *)pip->pi_regs, pip->pi_panicstr);
794 xpv_panic_printf("Failed to reboot following panic.\n");
795 for (;;)
796 ;
797 }
798
799 /*
800 * Set up the necessary data structures to pretend that the Xen hypervisor
801 * is a loadable module, allowing mdb to find the Xen symbols in a crash
802 * dump. Since these symbols all map to VA space Solaris doesn't normally
803 * have access to, we don't link these structures into the kernel's lists
804 * until/unless we hit a Xen panic.
805 *
806 * The observant reader will note a striking amount of overlap between this
807 * code and that found in krtld. While it would be handy if we could just
808 * ask krtld to do this work for us, it's not that simple. Among the
809 * complications: we're not actually loading the text here (grub did it at
810 * boot), the .text section is writable, there are no relocations to do,
811 * none of the module text/data is in readable memory, etc. Training krtld
812 * to deal with this weird module is as complicated, and more risky, than
813 * reimplementing the necessary subset of it here.
814 */
815 static void
816 init_xen_module()
817 {
818 struct _buf *file = NULL;
819 struct module *mp;
820 struct modctl *mcp;
821 int i, shn;
822 Shdr *shp, *ctf_shp;
823 char *names = NULL;
824 size_t n, namesize, text_align, data_align;
825 #if defined(__amd64)
826 const char machine = EM_AMD64;
827 #else
828 const char machine = EM_386;
829 #endif
830
831 /* Allocate and init the module structure */
832 mp = kmem_zalloc(sizeof (*mp), KM_SLEEP);
833 mp->filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP);
834 (void) strcpy(mp->filename, XPV_FILENAME);
835
836 /* Allocate and init the modctl structure */
837 mcp = kmem_zalloc(sizeof (*mcp), KM_SLEEP);
838 mcp->mod_modname = kobj_zalloc(strlen(XPV_MODNAME) + 1, KM_SLEEP);
839 (void) strcpy(mcp->mod_modname, XPV_MODNAME);
840 mcp->mod_filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP);
841 (void) strcpy(mcp->mod_filename, XPV_FILENAME);
842 mcp->mod_inprogress_thread = (kthread_id_t)-1;
843 mcp->mod_ref = 1;
844 mcp->mod_loaded = 1;
845 mcp->mod_loadcnt = 1;
846 mcp->mod_mp = mp;
847
848 /*
849 * Try to open a Xen image that hasn't had its symbol and CTF
850 * information stripped off.
851 */
852 file = kobj_open_file(XPV_FILENAME);
853 if (file == (struct _buf *)-1) {
854 file = NULL;
855 goto err;
856 }
857
858 /*
859 * Read the header and ensure that this is an ELF file for the
860 * proper ISA. If it's not, somebody has done something very
861 * stupid. Why bother? See Mencken.
862 */
863 if (kobj_read_file(file, (char *)&mp->hdr, sizeof (mp->hdr), 0) < 0)
864 goto err;
865 for (i = 0; i < SELFMAG; i++)
866 if (mp->hdr.e_ident[i] != ELFMAG[i])
867 goto err;
868 if ((mp->hdr.e_ident[EI_DATA] != ELFDATA2LSB) ||
869 (mp->hdr.e_machine != machine))
870 goto err;
871
872 /* Read in the section headers */
873 n = mp->hdr.e_shentsize * mp->hdr.e_shnum;
874 mp->shdrs = kmem_zalloc(n, KM_SLEEP);
875 if (kobj_read_file(file, mp->shdrs, n, mp->hdr.e_shoff) < 0)
876 goto err;
877
878 /* Read the section names */
879 shp = (Shdr *)(mp->shdrs + mp->hdr.e_shstrndx * mp->hdr.e_shentsize);
880 namesize = shp->sh_size;
881 names = kmem_zalloc(shp->sh_size, KM_SLEEP);
882 if (kobj_read_file(file, names, shp->sh_size, shp->sh_offset) < 0)
883 goto err;
884
885 /*
886 * Fill in the text and data size fields.
887 */
888 ctf_shp = NULL;
889 text_align = data_align = 0;
890 for (shn = 1; shn < mp->hdr.e_shnum; shn++) {
891 shp = (Shdr *)(mp->shdrs + shn * mp->hdr.e_shentsize);
892
893 /* Sanity check the offset of the section name */
894 if (shp->sh_name >= namesize)
895 continue;
896
897 /* If we find the symtab section, remember it for later. */
898 if (shp->sh_type == SHT_SYMTAB) {
899 mp->symtbl_section = shn;
900 mp->symhdr = shp;
901 continue;
902 }
903
904 /* If we find the CTF section, remember it for later. */
905 if ((shp->sh_size != 0) &&
906 (strcmp(names + shp->sh_name, ".SUNW_ctf") == 0)) {
907 ctf_shp = shp;
908 continue;
909 }
910
911 if (!(shp->sh_flags & SHF_ALLOC))
912 continue;
913
914 /*
915 * Xen marks its text section as writable, so we need to
916 * look for the name - not just the flag.
917 */
918 if ((strcmp(&names[shp->sh_name], ".text") != 0) &&
919 (shp->sh_flags & SHF_WRITE) != 0) {
920 if (shp->sh_addralign > data_align)
921 data_align = shp->sh_addralign;
922 mp->data_size = ALIGN(mp->data_size, data_align);
923 mp->data_size += ALIGN(shp->sh_size, 8);
924 if (mp->data == NULL || mp->data > (char *)shp->sh_addr)
925 mp->data = (char *)shp->sh_addr;
926 } else {
927 if (shp->sh_addralign > text_align)
928 text_align = shp->sh_addralign;
929 mp->text_size = ALIGN(mp->text_size, text_align);
930 mp->text_size += ALIGN(shp->sh_size, 8);
931 if (mp->text == NULL || mp->text > (char *)shp->sh_addr)
932 mp->text = (char *)shp->sh_addr;
933 }
934 }
935 kmem_free(names, namesize);
936 names = NULL;
937 shp = NULL;
938 mcp->mod_text = mp->text;
939 mcp->mod_text_size = mp->text_size;
940
941 /*
942 * If we have symbol table and string table sections, read them in
943 * now. If we don't, we just plow on. We'll still get a valid
944 * core dump, but finding anything useful will be just a bit
945 * harder.
946 *
947 * Note: we don't bother with a hash table. We'll never do a
948 * symbol lookup unless we crash, and then mdb creates its own. We
949 * also don't try to perform any relocations. Xen should be loaded
950 * exactly where the ELF file indicates, and the symbol information
951 * in the file should be complete and correct already. Static
952 * linking ain't all bad.
953 */
954 if ((mp->symhdr != NULL) && (mp->symhdr->sh_link < mp->hdr.e_shnum)) {
955 mp->strhdr = (Shdr *)
956 (mp->shdrs + mp->symhdr->sh_link * mp->hdr.e_shentsize);
957 mp->nsyms = mp->symhdr->sh_size / mp->symhdr->sh_entsize;
958
959 /* Allocate space for the symbol table and strings. */
960 mp->symsize = mp->symhdr->sh_size +
961 mp->nsyms * sizeof (symid_t) + mp->strhdr->sh_size;
962 mp->symspace = kmem_zalloc(mp->symsize, KM_SLEEP);
963 mp->symtbl = mp->symspace;
964 mp->strings = (char *)(mp->symtbl + mp->symhdr->sh_size);
965
966 if ((kobj_read_file(file, mp->symtbl,
967 mp->symhdr->sh_size, mp->symhdr->sh_offset) < 0) ||
968 (kobj_read_file(file, mp->strings,
969 mp->strhdr->sh_size, mp->strhdr->sh_offset) < 0))
970 goto err;
971 }
972
973 /*
974 * Read in the CTF section
975 */
976 if ((ctf_shp != NULL) && ((moddebug & MODDEBUG_NOCTF) == 0)) {
977 mp->ctfdata = kmem_zalloc(ctf_shp->sh_size, KM_SLEEP);
978 mp->ctfsize = ctf_shp->sh_size;
979 if (kobj_read_file(file, mp->ctfdata, mp->ctfsize,
980 ctf_shp->sh_offset) < 0)
981 goto err;
982 }
983
984 kobj_close_file(file);
985
986 xpv_module = mp;
987 xpv_modctl = mcp;
988 return;
989
990 err:
991 cmn_err(CE_WARN, "Failed to initialize xpv module.");
992 if (file != NULL)
993 kobj_close_file(file);
994
995 kmem_free(mp->filename, strlen(XPV_FILENAME) + 1);
996 if (mp->shdrs != NULL)
997 kmem_free(mp->shdrs, mp->hdr.e_shentsize * mp->hdr.e_shnum);
998 if (mp->symspace != NULL)
999 kmem_free(mp->symspace, mp->symsize);
1000 if (mp->ctfdata != NULL)
1001 kmem_free(mp->ctfdata, mp->ctfsize);
1002 kmem_free(mp, sizeof (*mp));
1003 kmem_free(mcp->mod_filename, strlen(XPV_FILENAME) + 1);
1004 kmem_free(mcp->mod_modname, strlen(XPV_MODNAME) + 1);
1005 kmem_free(mcp, sizeof (*mcp));
1006 if (names != NULL)
1007 kmem_free(names, namesize);
1008 }
1009
1010 void
1011 xpv_panic_init()
1012 {
1013 xen_platform_op_t op;
1014 int i;
1015
1016 ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));
1017
1018 for (i = 0; i < mmu.num_level; i++)
1019 ptable_pfn[i] = PFN_INVALID;
1020
1021 /* Let Xen know where to jump if/when it panics. */
1022 op.cmd = XENPF_panic_init;
1023 op.interface_version = XENPF_INTERFACE_VERSION;
1024 op.u.panic_init.panic_addr = (unsigned long)xpv_panic_hdlr;
1025
1026 (void) HYPERVISOR_platform_op(&op);
1027
1028 init_xen_module();
1029 }