/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2012 Gary Mills
 * Copyright 2016 PALO, Richard.
 *
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/types.h>
#include <sys/clock.h>
#include <sys/psm.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/compress.h>
#include <sys/modctl.h>
#include <sys/trap.h>
#include <sys/panic.h>
#include <sys/regset.h>
#include <sys/frame.h>
#include <sys/kobj.h>
#include <sys/apic.h>
#include <sys/apic_timer.h>
#include <sys/dumphdr.h>
#include <sys/mem.h>
#include <sys/x86_archext.h>
#include <sys/xpv_panic.h>
#include <sys/boot_console.h>
#include <sys/bootsvcs.h>
#include <sys/consdev.h>
#include <vm/hat_pte.h>
#include <vm/hat_i86.h>

/* XXX: need to add a PAE version too, if we ever support both PAE and non */
#if defined(__i386)
#define XPV_FILENAME    "/boot/xen-syms"
#else
#define XPV_FILENAME    "/boot/amd64/xen-syms"
#endif
#define XPV_MODNAME     "xpv"

int xpv_panicking = 0;

struct module *xpv_module;
struct modctl *xpv_modctl;

#define ALIGN(x, a)     ((a) == 0 ? (uintptr_t)(x) : \
        (((uintptr_t)(x) + (uintptr_t)(a) - 1l) & ~((uintptr_t)(a) - 1l)))
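
/*
 * For example, ALIGN(0x1003, 0x10) == 0x1010 and ALIGN(0x1000, 0x10) ==
 * 0x1000; an alignment of 0 is treated as "no alignment" and returns x
 * unchanged.  Alignments are assumed to be powers of two.
 */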

/* Pointer to the xpv_panic_info structure handed to us by Xen.  */
static struct panic_info *xpv_panic_info = NULL;

/* Timer support */
#define NSEC_SHIFT 5
#define T_XPV_TIMER     0xd1
#define XPV_TIMER_INTERVAL      1000    /* 1000 microseconds */
static uint32_t *xpv_apicadr = NULL;
static uint_t   nsec_scale;

/* IDT support */
#pragma align   16(xpv_panic_idt)
static gate_desc_t      xpv_panic_idt[NIDT];    /* interrupt descriptor table */

/* Xen pagetables mapped into our HAT's ptable windows */
static pfn_t ptable_pfn[MAX_NUM_LEVEL];

/* Number of MMU_PAGESIZE pages we're adding to the Solaris dump */
static int xpv_dump_pages;

/*
 * There are up to two large swathes of RAM that we don't want to include
 * in the dump: those that comprise the Xen version of segkpm.  On 32-bit
 * systems there is no such region of memory.  On 64-bit systems, there
 * should be just a single contiguous region that corresponds to all of
 * physical memory.  The tricky bit is that Xen's heap sometimes lives in
 * the middle of its segkpm, and is mapped using only kpm-like addresses.
 * In that case, we need to skip the swathes before and after Xen's heap.
 */
uintptr_t kpm1_low = 0;
uintptr_t kpm1_high = 0;
uintptr_t kpm2_low = 0;
uintptr_t kpm2_high = 0;

/*
 * Some commonly used values that we don't want to recompute over and over.
 */
static int xpv_panic_nptes[MAX_NUM_LEVEL];
static ulong_t xpv_panic_cr3;
static uintptr_t xpv_end;

static void xpv_panic_console_print(const char *fmt, ...);
static void (*xpv_panic_printf)(const char *, ...) = xpv_panic_console_print;

#define CONSOLE_BUF_SIZE        256
static char console_buffer[CONSOLE_BUF_SIZE];
static boolean_t use_polledio;

/*
 * Pointer to machine check panic info (if any).
 */
xpv_mca_panic_data_t *xpv_mca_panic_data = NULL;

static void
xpv_panic_putc(int m)
{
        struct cons_polledio *c = cons_polledio;

        /* This really shouldn't happen */
        if (boot_console_type(NULL) == CONS_HYPERVISOR)
                return;

        if (use_polledio == B_TRUE)
                c->cons_polledio_putchar(c->cons_polledio_argument, m);
        else
                bcons_putchar(m);
}

static void
xpv_panic_puts(char *msg)
{
        char *m;

        dump_timeleft = dump_timeout;
        for (m = msg; *m; m++)
                xpv_panic_putc((int)*m);
}

static void
xpv_panic_console_print(const char *fmt, ...)
{
        va_list ap;

        va_start(ap, fmt);
        (void) vsnprintf(console_buffer, sizeof (console_buffer), fmt, ap);
        va_end(ap);

        xpv_panic_puts(console_buffer);
}

static void
xpv_panic_map(int level, pfn_t pfn)
{
        x86pte_t pte, *pteptr;

        /*
         * The provided pfn represents a level 'level' page table.  Map it
         * into the 'level' slot in the list of page table windows.
         */
        pteptr = (x86pte_t *)PWIN_PTE_VA(level);
        pte = pfn_to_pa(pfn) | PT_VALID;

        XPV_ALLOW_PAGETABLE_UPDATES();
        if (mmu.pae_hat)
                *pteptr = pte;
        else
                *(x86pte32_t *)pteptr = pte;
        XPV_DISALLOW_PAGETABLE_UPDATES();

        mmu_tlbflush_entry(PWIN_VA(level));
}
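
/*
 * Note that each pagetable level gets its own window above (the window
 * index is simply the level), so a walk can keep one table per level
 * mapped at once.  xpv_va_walk() below caches the pfn currently mapped
 * at each level in ptable_pfn[] to avoid redundant remapping and TLB
 * flushes while scanning consecutive pages.
 */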

/*
 * Walk the page tables to find the pfn mapped by the given va.
 */
static pfn_t
xpv_va_walk(uintptr_t *vaddr)
{
        int l, idx;
        pfn_t pfn;
        x86pte_t pte;
        x86pte_t *ptep;
        uintptr_t va = *vaddr;
        uintptr_t scan_va;
        caddr_t ptable_window;
        static pfn_t toplevel_pfn;
        static uintptr_t lastva;

        /*
         * If we do anything other than a simple scan through memory, don't
         * trust the mapped page tables.
         */
        if (va != lastva + MMU_PAGESIZE)
                for (l = mmu.max_level; l >= 0; l--)
                        ptable_pfn[l] = PFN_INVALID;

        toplevel_pfn = mmu_btop(xpv_panic_cr3);

        while (va < xpv_end && va >= *vaddr) {
                /* Find the lowest table with any entry for va */
                pfn = toplevel_pfn;
                for (l = mmu.max_level; l >= 0; l--) {
                        if (ptable_pfn[l] != pfn) {
                                xpv_panic_map(l, pfn);
                                ptable_pfn[l] = pfn;
                        }

                        /*
                         * Search this pagetable for any mapping to an
                         * address >= va.
                         */
                        ptable_window = PWIN_VA(l);
                        if (l == mmu.max_level && mmu.pae_hat)
                                ptable_window +=
                                    (xpv_panic_cr3 & MMU_PAGEOFFSET);

                        idx = (va >> LEVEL_SHIFT(l)) & (xpv_panic_nptes[l] - 1);
                        scan_va = va;
                        while (idx < xpv_panic_nptes[l] && scan_va < xpv_end &&
                            scan_va >= *vaddr) {
                                ptep = (x86pte_t *)(ptable_window +
                                    (idx << mmu.pte_size_shift));
                                pte = GET_PTE(ptep);
                                if (pte & PT_VALID)
                                        break;
                                idx++;
                                scan_va += mmu.level_size[l];
                        }

                        /*
                         * If there are no valid mappings in this table, we
                         * can skip to the end of the VA range it covers.
                         */
                        if (idx == xpv_panic_nptes[l]) {
                                va = NEXT_ENTRY_VA(va, l + 1);
                                break;
                        }

                        va = scan_va;
                        /*
                         * See if we've hit the end of the range.
                         */
                        if (va >= xpv_end || va < *vaddr)
                                break;

                        /*
                         * If this mapping is for a pagetable, we drop down
                         * to the next level in the hierarchy and look for
                         * a mapping in it.
                         */
                        pfn = PTE2MFN(pte, l);
                        if (!PTE_ISPAGE(pte, l))
                                continue;

                        /*
                         * The APIC page is magic.  Nothing to see here;
                         * move along.
                         */
                        if (((uintptr_t)xpv_apicadr & MMU_PAGEMASK) ==
                            (va & MMU_PAGEMASK)) {
                                va += MMU_PAGESIZE;
                                break;
                        }

                        /*
                         * See if the address is within one of the two
                         * kpm-like regions we want to skip.
                         */
                        if (va >= kpm1_low && va < kpm1_high) {
                                va = kpm1_high;
                                break;
                        }
                        if (va >= kpm2_low && va < kpm2_high) {
                                va = kpm2_high;
                                break;
                        }

                        /*
                         * The Xen panic code only handles small pages.  If
                         * this mapping is for a large page, we need to
                         * identify the constituent page that covers the
                         * specific VA we were looking for.
                         */
                        if (l > 0) {
                                if (l > 1)
                                        panic("Xen panic can't cope with "
                                            "giant pages.");
                                idx = (va >> LEVEL_SHIFT(0)) &
                                    (xpv_panic_nptes[0] - 1);
                                pfn += idx;
                        }

                        *vaddr = va;
                        lastva = va;
                        return (pfn | PFN_IS_FOREIGN_MFN);
                }
        }
        return (PFN_INVALID);
}
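
/*
 * A typical scan of the Xen VA space (see the dump_xpv_*() routines
 * below) looks like:
 *
 *      va = xen_virt_start;
 *      while (xpv_va_walk(&va) != PFN_INVALID) {
 *              (process the page at va)
 *              va += MMU_PAGESIZE;
 *      }
 */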

/*
 * Walk through the Xen VA space, finding pages that are mapped in.
 *
 * These pages all have MFNs rather than PFNs, meaning they may be outside
 * the physical address space the kernel knows about, or they may collide
 * with PFNs the kernel is using.
 *
 * The obvious trick of just adding the PFN_IS_FOREIGN_MFN bit to the MFNs
 * to avoid collisions doesn't work.  The pages need to be written to disk
 * in PFN-order or savecore gets confused.  We can't allocate memory to
 * construct a sorted pfn->VA reverse mapping, so we have to write the pages
 * to disk in VA order.
 *
 * To square this circle, we simply make up PFNs for each of Xen's pages.
 * We assign each mapped page a fake PFN in ascending order.  These fake
 * PFNs each have the FOREIGN bit set, ensuring that they fall outside the
 * range of Solaris PFNs written by the kernel.
 */
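/*
 * For example, the first mapped Xen page is assigned the fake PFN
 * (0 | PFN_IS_FOREIGN_MFN), the second (1 | PFN_IS_FOREIGN_MFN), and so
 * on; dump_xpv_pfn() regenerates exactly this sequence when the PFN
 * list is written out.
 */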
int
dump_xpv_addr()
{
        uintptr_t va;
        mem_vtop_t mem_vtop;

        xpv_dump_pages = 0;
        va = xen_virt_start;

        while (xpv_va_walk(&va) != PFN_INVALID) {
                mem_vtop.m_as = &kas;
                mem_vtop.m_va = (void *)va;
                mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN;

                dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
                xpv_dump_pages++;

                va += MMU_PAGESIZE;
        }

        /*
         * Add the shared_info page.  This page actually ends up in the
         * dump twice: once for the Xen va and once for the Solaris va.
         * This isn't ideal, but we don't know the address Xen is using for
         * the page, so we can't share it.
         */
        mem_vtop.m_as = &kas;
        mem_vtop.m_va = HYPERVISOR_shared_info;
        mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN;
        dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
        xpv_dump_pages++;

        return (xpv_dump_pages);
}
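
/*
 * dump_xpv_addr(), dump_xpv_pfn() and dump_xpv_data() all traverse the
 * Xen VA list in the same order, so the i'th mem_vtop_t record, the i'th
 * fake PFN and the i'th compressed data page all describe the same page.
 */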

void
dump_xpv_pfn()
{
        pfn_t pfn;
        int cnt;

        for (cnt = 0; cnt < xpv_dump_pages; cnt++) {
                pfn = (pfn_t)cnt | PFN_IS_FOREIGN_MFN;
                dumpvp_write(&pfn, sizeof (pfn));
        }
}

int
dump_xpv_data(void *dump_cbuf)
{
        uintptr_t va;
        uint32_t csize;
        int cnt = 0;

        /*
         * XXX: we should probably run this data through a UE check.  The
         * catch is that the UE code relies on on_trap() and getpfnum()
         * working.
         */
        va = xen_virt_start;

        while (xpv_va_walk(&va) != PFN_INVALID) {
                csize = (uint32_t)compress((void *)va, dump_cbuf, PAGESIZE);
                dumpvp_write(&csize, sizeof (uint32_t));
                dumpvp_write(dump_cbuf, csize);
                if (dump_ioerr) {
                        dumphdr->dump_flags &= ~DF_COMPLETE;
                        return (cnt);
                }
                cnt++;
                va += MMU_PAGESIZE;
        }

        /*
         * Finally, dump the shared_info page.
         */
        csize = (uint32_t)compress((void *)HYPERVISOR_shared_info, dump_cbuf,
            PAGESIZE);
        dumpvp_write(&csize, sizeof (uint32_t));
        dumpvp_write(dump_cbuf, csize);
        if (dump_ioerr)
                dumphdr->dump_flags &= ~DF_COMPLETE;
        cnt++;

        return (cnt);
}
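
/*
 * Note that each page above is written as a 4-byte compressed length
 * followed by the compressed bytes themselves, matching the per-page
 * (csize, data) records used by the rest of the dump path.
 */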

static void *
showstack(void *fpreg, int xpv_only)
{
        struct frame *fpp;
        ulong_t off;
        char *sym;
        uintptr_t pc, fp, lastfp;
        uintptr_t minaddr = min(KERNELBASE, xen_virt_start);

        fp = (uintptr_t)fpreg;
        if (fp < minaddr) {
                xpv_panic_printf("Bad frame ptr: 0x%p\n", fpreg);
                return (fpreg);
        }

        do {
                fpp = (struct frame *)fp;
                pc = fpp->fr_savpc;

                if ((xpv_only != 0) &&
                    (fp > xpv_end || fp < xen_virt_start))
                        break;
                if ((sym = kobj_getsymname(pc, &off)) != NULL)
                        xpv_panic_printf("%08lx %s:%s+%lx\n", fp,
                            mod_containing_pc((caddr_t)pc), sym, off);
                else if ((pc >= xen_virt_start) && (pc <= xpv_end))
                        xpv_panic_printf("%08lx 0x%lx (in Xen)\n", fp, pc);
                else
                        xpv_panic_printf("%08lx %lx\n", fp, pc);

                lastfp = fp;
                fp = fpp->fr_savfp;

                /*
                 * Xen marks an exception frame by inverting the frame
                 * pointer.  If inverting fp yields a plausible kernel
                 * address within a page of the previous frame, assume
                 * that's what happened and un-invert it.
                 */
                if (fp < lastfp) {
                        if ((~fp > minaddr) && ((~fp) ^ lastfp) < 0xfff)
                                fp = ~fp;
                }
        } while (fp > lastfp);
        return ((void *)fp);
}

void *
xpv_traceback(void *fpreg)
{
        return (showstack(fpreg, 1));
}

#if defined(__amd64)
static void
xpv_panic_hypercall(ulong_t call)
{
        panic("Illegally issued hypercall %d during panic!\n", (int)call);
}
#endif

void
xpv_die(struct regs *rp)
{
        struct panic_trap_info ti;
        struct cregs creg;

        ti.trap_regs = rp;
        ti.trap_type = rp->r_trapno;

        curthread->t_panic_trap = &ti;
        if (ti.trap_type == T_PGFLT) {
                getcregs(&creg);
                ti.trap_addr = (caddr_t)creg.cr_cr2;
                panic("Fatal pagefault at 0x%lx.  fault addr=0x%p  rp=0x%p",
                    rp->r_pc, (void *)ti.trap_addr, (void *)rp);
        } else {
                ti.trap_addr = (caddr_t)rp->r_pc;
                panic("Fatal trap %ld at 0x%lx.  rp=0x%p", rp->r_trapno,
                    rp->r_pc, (void *)rp);
        }
}

/*
 * Build IDT to handle a Xen panic
 */
static void
switch_to_xpv_panic_idt()
{
        int i;
        desctbr_t idtr;
        gate_desc_t *idt = xpv_panic_idt;
        selector_t cs = get_cs_register();

        for (i = 0; i < 32; i++)
                set_gatesegd(&idt[i], &xpv_invaltrap, cs, SDT_SYSIGT, TRP_XPL,
                    0);

        set_gatesegd(&idt[T_ZERODIV], &xpv_div0trap, cs, SDT_SYSIGT, TRP_XPL,
            0);
        set_gatesegd(&idt[T_SGLSTP], &xpv_dbgtrap, cs, SDT_SYSIGT, TRP_XPL, 0);
        set_gatesegd(&idt[T_NMIFLT], &xpv_nmiint, cs, SDT_SYSIGT, TRP_XPL, 0);
        set_gatesegd(&idt[T_BOUNDFLT], &xpv_boundstrap, cs, SDT_SYSIGT,
            TRP_XPL, 0);
        set_gatesegd(&idt[T_ILLINST], &xpv_invoptrap, cs, SDT_SYSIGT, TRP_XPL,
            0);
        set_gatesegd(&idt[T_NOEXTFLT], &xpv_ndptrap, cs, SDT_SYSIGT, TRP_XPL,
            0);
        set_gatesegd(&idt[T_TSSFLT], &xpv_invtsstrap, cs, SDT_SYSIGT, TRP_XPL,
            0);
        set_gatesegd(&idt[T_SEGFLT], &xpv_segnptrap, cs, SDT_SYSIGT, TRP_XPL,
            0);
        set_gatesegd(&idt[T_STKFLT], &xpv_stktrap, cs, SDT_SYSIGT, TRP_XPL, 0);
        set_gatesegd(&idt[T_GPFLT], &xpv_gptrap, cs, SDT_SYSIGT, TRP_XPL, 0);
        set_gatesegd(&idt[T_PGFLT], &xpv_pftrap, cs, SDT_SYSIGT, TRP_XPL, 0);
        set_gatesegd(&idt[T_EXTERRFLT], &xpv_ndperr, cs, SDT_SYSIGT, TRP_XPL,
            0);
        set_gatesegd(&idt[T_ALIGNMENT], &xpv_achktrap, cs, SDT_SYSIGT, TRP_XPL,
            0);
        set_gatesegd(&idt[T_MCE], &xpv_mcetrap, cs, SDT_SYSIGT, TRP_XPL, 0);
        set_gatesegd(&idt[T_SIMDFPE], &xpv_xmtrap, cs, SDT_SYSIGT, TRP_XPL, 0);

        /*
         * We have no double fault handler.  Any single fault represents a
         * catastrophic failure for us, so we make no attempt to handle
         * faults cleanly: we just print a message and reboot.  If we
         * encounter a second fault while doing that, there is nothing
         * else we can do.
         */

        /*
         * Be prepared to absorb any stray device interrupts received
         * while writing the core to disk.
         */
        for (i = 33; i < NIDT; i++)
                set_gatesegd(&idt[i], &xpv_surprise_intr, cs, SDT_SYSIGT,
                    TRP_XPL, 0);

        /* The one interrupt we expect to get is from the APIC timer.  */
        set_gatesegd(&idt[T_XPV_TIMER], &xpv_timer_trap, cs, SDT_SYSIGT,
            TRP_XPL, 0);

        idtr.dtr_base = (uintptr_t)xpv_panic_idt;
        idtr.dtr_limit = sizeof (xpv_panic_idt) - 1;
        wr_idtr(&idtr);

#if defined(__amd64)
        /*
         * Catch any hypercalls: on amd64 a hypercall is issued via
         * SYSCALL, which now vectors through these MSRs straight to a
         * routine that panics.
         */
        wrmsr(MSR_AMD_LSTAR, (uintptr_t)xpv_panic_hypercall);
        wrmsr(MSR_AMD_CSTAR, (uintptr_t)xpv_panic_hypercall);
#endif
}

static void
xpv_apic_clkinit()
{
        uint_t          apic_ticks = 0;

        /*
         * Measure how many APIC ticks there are within a fixed time
         * period.  We're going to be fairly coarse here.  This timer is
         * just being used to detect a stalled panic, so as long as we have
         * the right order of magnitude, everything should be fine.
         */
        xpv_apicadr[APIC_SPUR_INT_REG] = AV_UNIT_ENABLE | APIC_SPUR_INTR;
        xpv_apicadr[APIC_LOCAL_TIMER] = AV_MASK;
        xpv_apicadr[APIC_INT_VECT0] = AV_MASK;  /* local intr reg 0 */

        xpv_apicadr[APIC_DIVIDE_REG] = 0;
        xpv_apicadr[APIC_INIT_COUNT] = APIC_MAXVAL;
        drv_usecwait(XPV_TIMER_INTERVAL);
        apic_ticks = APIC_MAXVAL - xpv_apicadr[APIC_CURR_COUNT];

        /*
         * apic_ticks now represents roughly how many apic ticks comprise
         * one timeout interval.  Program the timer to send us an interrupt
         * every time that interval expires.
         */
        xpv_apicadr[APIC_LOCAL_TIMER] = T_XPV_TIMER | AV_PERIODIC;
        xpv_apicadr[APIC_INIT_COUNT] = apic_ticks;
        xpv_apicadr[APIC_EOI_REG] = 0;
}
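
/*
 * In effect, xpv_apic_clkinit() measures how far the APIC timer counts
 * down in XPV_TIMER_INTERVAL (1000) microseconds and then programs that
 * count as the periodic interval, yielding roughly one T_XPV_TIMER
 * interrupt per millisecond.  Only the order of magnitude matters here,
 * so the accuracy of drv_usecwait() is not a concern.
 */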

void
xpv_timer_tick(void)
{
        static int ticks = 0;

        if (ticks++ >= MICROSEC / XPV_TIMER_INTERVAL) {
                ticks = 0;
                if (dump_timeleft && (--dump_timeleft == 0))
                        panic("Xen panic timeout\n");
        }
        xpv_apicadr[APIC_EOI_REG] = 0;
}
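
/*
 * With one timer interrupt per XPV_TIMER_INTERVAL microseconds, the
 * MICROSEC / XPV_TIMER_INTERVAL (== 1000) test above fires roughly once
 * a second, so dump_timeleft is effectively the number of seconds left
 * before we declare the dump hung.
 */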

void
xpv_interrupt(void)
{
#ifdef  DEBUG
        static int cnt = 0;

        if (cnt++ < 10)
                xpv_panic_printf("Unexpected interrupt received.\n");
        if ((cnt < 1000) && ((cnt % 100) == 0))
                xpv_panic_printf("%d unexpected interrupts received.\n", cnt);
#endif

        xpv_apicadr[APIC_EOI_REG] = 0;
}

/*
 * Managing time in panic context is trivial.  We only have a single CPU,
 * we never get rescheduled, we never get suspended.  We just need to
 * convert clock ticks into nanoseconds.
 */
static hrtime_t
xpv_panic_gethrtime(void)
{
        hrtime_t tsc, hrt;
        unsigned int *l = (unsigned int *)&(tsc);

        tsc = __rdtsc_insn();
        hrt = (mul32(l[1], nsec_scale) << NSEC_SHIFT) +
            (mul32(l[0], nsec_scale) >> (32 - NSEC_SHIFT));

        return (hrt);
}
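
/*
 * To see why the arithmetic above works, write tsc = hi * 2^32 + lo.
 * nsec_scale is tsc_to_system_mul >> NSEC_SHIFT (see below), so the goal,
 * Xen's (tsc * tsc_to_system_mul) >> 32 conversion, is approximately
 * (tsc * nsec_scale) >> (32 - NSEC_SHIFT), which expands to
 *
 *      (hi * nsec_scale) << NSEC_SHIFT +
 *      (lo * nsec_scale) >> (32 - NSEC_SHIFT)
 *
 * i.e. exactly the two mul32() terms above, with no 64x64 multiply to
 * overflow.  (Xen's tsc_shift is ignored here; presumably the resulting
 * error is acceptable for a panic-time timeout.)
 */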

static void
xpv_panic_time_init()
{
        nsec_scale =
            CPU->cpu_m.mcpu_vcpu_info->time.tsc_to_system_mul >> NSEC_SHIFT;

        gethrtimef = xpv_panic_gethrtime;
}

static void
xpv_panicsys(struct regs *rp, char *fmt, ...)
{
        extern void panicsys(const char *, va_list, struct regs *, int);
        va_list alist;

        va_start(alist, fmt);
        panicsys(fmt, alist, rp, 1);
        va_end(alist);
}

void
xpv_do_panic(void *arg)
{
        struct panic_info *pip = (struct panic_info *)arg;
        int l;
        struct cregs creg;
#if defined(__amd64)
        extern uintptr_t postbootkernelbase;
#endif

        if (xpv_panicking++ > 0)
                panic("multiple calls to xpv_do_panic()");

        /*
         * Indicate to the underlying panic framework that a panic has been
         * initiated.  This is ordinarily done as part of vpanic().  Since
         * we already have all the register state saved by the hypervisor,
         * we skip that and jump straight into the panic processing code.
         *
         * XXX If another thread grabs and wins the panic_quiesce trigger
         * then we'll have two threads in panicsys believing they are in
         * charge of the panic attempt!
         */
        (void) panic_trigger(&panic_quiesce);

#if defined(__amd64)
        /*
         * bzero() and bcopy() get unhappy when asked to operate on
         * addresses outside of the kernel.  At this point Xen is really a
         * part of the kernel, so we update the routines' notion of where
         * the kernel starts.
         */
        postbootkernelbase = xen_virt_start;
#endif

#if defined(HYPERVISOR_VIRT_END)
        xpv_end = HYPERVISOR_VIRT_END;
#else
        xpv_end = (uintptr_t)UINTPTR_MAX - sizeof (uintptr_t);
#endif

        /*
         * If we were redirecting console output to the hypervisor, we have
         * to stop.
         */
        use_polledio = B_FALSE;
        if (boot_console_type(NULL) == CONS_HYPERVISOR) {
                bcons_device_change(CONS_HYPERVISOR);
        } else if (cons_polledio != NULL &&
            cons_polledio->cons_polledio_putchar != NULL) {
                if (cons_polledio->cons_polledio_enter != NULL)
                        cons_polledio->cons_polledio_enter(
                            cons_polledio->cons_polledio_argument);
                use_polledio = B_TRUE;
        }

        /* Make sure we handle all console output from here on. */
        sysp->bsvc_putchar = xpv_panic_putc;

        /*
         * If we find an unsupported panic_info structure, there's not much
         * we can do other than complain, plow on, and hope for the best.
         */
        if (pip->pi_version != PANIC_INFO_VERSION)
                xpv_panic_printf("Warning: Xen is using an unsupported "
                    "version of the panic_info structure.\n");

        xpv_panic_info = pip;

#if defined(__amd64)
        kpm1_low = (uintptr_t)xpv_panic_info->pi_ram_start;
        if (xpv_panic_info->pi_xen_start == NULL) {
                kpm1_high = (uintptr_t)xpv_panic_info->pi_ram_end;
        } else {
                kpm1_high = (uintptr_t)xpv_panic_info->pi_xen_start;
                kpm2_low = (uintptr_t)xpv_panic_info->pi_xen_end;
                kpm2_high = (uintptr_t)xpv_panic_info->pi_ram_end;
        }
#endif

        /*
         * Make sure we are running on the Solaris %gs.  The Xen panic code
         * should already have set up the GDT properly.
         */
        xpv_panic_resetgs();
#if defined(__amd64)
        wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);
#endif

        xpv_panic_time_init();

        /*
         * Switch to our own IDT, avoiding any accidental returns to Xen
         * world.
         */
        switch_to_xpv_panic_idt();

        /*
         * Initialize the APIC timer, which is used to detect a hung dump
         * attempt.
         */
        xpv_apicadr = pip->pi_apic;
        xpv_apic_clkinit();

        /*
         * Set up a few values that we'll need repeatedly.
         */
        getcregs(&creg);
        xpv_panic_cr3 = creg.cr_cr3;
        for (l = mmu.max_level; l >= 0; l--)
                xpv_panic_nptes[l] = mmu.ptes_per_table;
#ifdef __i386
        if (mmu.pae_hat)
                xpv_panic_nptes[mmu.max_level] = 4;
#endif

        /* Add the fake Xen module to the module list */
        if (xpv_module != NULL) {
                extern int last_module_id;

                xpv_modctl->mod_id = last_module_id++;
                xpv_modctl->mod_next = &modules;
                xpv_modctl->mod_prev = modules.mod_prev;
                modules.mod_prev->mod_next = xpv_modctl;
                modules.mod_prev = xpv_modctl;
        }

        if (pip->pi_mca.mpd_magic == MCA_PANICDATA_MAGIC)
                xpv_mca_panic_data = &pip->pi_mca;

        xpv_panic_printf = printf;
        xpv_panicsys((struct regs *)pip->pi_regs, pip->pi_panicstr);
        xpv_panic_printf("Failed to reboot following panic.\n");
        for (;;)
                ;
}

/*
 * Set up the necessary data structures to pretend that the Xen hypervisor
 * is a loadable module, allowing mdb to find the Xen symbols in a crash
 * dump.  Since these symbols all map to VA space Solaris doesn't normally
 * have access to, we don't link these structures into the kernel's lists
 * until/unless we hit a Xen panic.
 *
 * The observant reader will note a striking amount of overlap between this
 * code and that found in krtld.  While it would be handy if we could just
 * ask krtld to do this work for us, it's not that simple.  Among the
 * complications: we're not actually loading the text here (grub did it at
 * boot), the .text section is writable, there are no relocations to do,
 * none of the module text/data is in readable memory, etc.  Training krtld
 * to deal with this weird module would be as complicated as, and riskier
 * than, reimplementing the necessary subset of it here.
 */
static void
init_xen_module()
{
        struct _buf *file = NULL;
        struct module *mp;
        struct modctl *mcp;
        int i, shn;
        Shdr *shp, *ctf_shp;
        char *names = NULL;
        size_t n, namesize, text_align, data_align;
#if defined(__amd64)
        const char machine = EM_AMD64;
#else
        const char machine = EM_386;
#endif

        /* Allocate and init the module structure */
        mp = kmem_zalloc(sizeof (*mp), KM_SLEEP);
        mp->filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP);
        (void) strcpy(mp->filename, XPV_FILENAME);

        /* Allocate and init the modctl structure */
        mcp = kmem_zalloc(sizeof (*mcp), KM_SLEEP);
        mcp->mod_modname = kobj_zalloc(strlen(XPV_MODNAME) + 1, KM_SLEEP);
        (void) strcpy(mcp->mod_modname, XPV_MODNAME);
        mcp->mod_filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP);
        (void) strcpy(mcp->mod_filename, XPV_FILENAME);
        mcp->mod_inprogress_thread = (kthread_id_t)-1;
        mcp->mod_ref = 1;
        mcp->mod_loaded = 1;
        mcp->mod_loadcnt = 1;
        mcp->mod_mp = mp;

        /*
         * Try to open a Xen image that hasn't had its symbol and CTF
         * information stripped off.
         */
        file = kobj_open_file(XPV_FILENAME);
        if (file == (struct _buf *)-1) {
                file = NULL;
                goto err;
        }

        /*
         * Read the header and ensure that this is an ELF file for the
         * proper ISA.  If it's not, somebody has done something very
         * stupid.  Why bother?  See Mencken.
         */
        if (kobj_read_file(file, (char *)&mp->hdr, sizeof (mp->hdr), 0) < 0)
                goto err;
        for (i = 0; i < SELFMAG; i++)
                if (mp->hdr.e_ident[i] != ELFMAG[i])
                        goto err;
        if ((mp->hdr.e_ident[EI_DATA] != ELFDATA2LSB) ||
            (mp->hdr.e_machine != machine))
                goto err;

        /* Read in the section headers */
        n = mp->hdr.e_shentsize * mp->hdr.e_shnum;
        mp->shdrs = kmem_zalloc(n, KM_SLEEP);
        if (kobj_read_file(file, mp->shdrs, n, mp->hdr.e_shoff) < 0)
                goto err;

        /* Read the section names */
        shp = (Shdr *)(mp->shdrs + mp->hdr.e_shstrndx * mp->hdr.e_shentsize);
        namesize = shp->sh_size;
        names = kmem_zalloc(shp->sh_size, KM_SLEEP);
        if (kobj_read_file(file, names, shp->sh_size, shp->sh_offset) < 0)
                goto err;

        /*
         * Fill in the text and data size fields.
         */
        ctf_shp = NULL;
        text_align = data_align = 0;
        for (shn = 1; shn < mp->hdr.e_shnum; shn++) {
                shp = (Shdr *)(mp->shdrs + shn * mp->hdr.e_shentsize);

                /* Sanity check the offset of the section name */
                if (shp->sh_name >= namesize)
                        continue;

                /* If we find the symtab section, remember it for later. */
                if (shp->sh_type == SHT_SYMTAB) {
                        mp->symtbl_section = shn;
                        mp->symhdr = shp;
                        continue;
                }

                /* If we find the CTF section, remember it for later. */
                if ((shp->sh_size != 0) &&
                    (strcmp(names + shp->sh_name, ".SUNW_ctf") == 0)) {
                        ctf_shp = shp;
                        continue;
                }

                if (!(shp->sh_flags & SHF_ALLOC))
                        continue;

                /*
                 * Xen marks its text section as writable, so we need to
                 * look for the name - not just the flag.
                 */
                if ((strcmp(&names[shp->sh_name], ".text") != 0) &&
                    (shp->sh_flags & SHF_WRITE) != 0) {
                        if (shp->sh_addralign > data_align)
                                data_align = shp->sh_addralign;
                        mp->data_size = ALIGN(mp->data_size, data_align);
                        mp->data_size += ALIGN(shp->sh_size, 8);
                        if (mp->data == NULL || mp->data > (char *)shp->sh_addr)
                                mp->data = (char *)shp->sh_addr;
                } else {
                        if (shp->sh_addralign > text_align)
                                text_align = shp->sh_addralign;
                        mp->text_size = ALIGN(mp->text_size, text_align);
                        mp->text_size += ALIGN(shp->sh_size, 8);
                        if (mp->text == NULL || mp->text > (char *)shp->sh_addr)
                                mp->text = (char *)shp->sh_addr;
                }
        }
        kmem_free(names, namesize);
        names = NULL;
        shp = NULL;
        mcp->mod_text = mp->text;
        mcp->mod_text_size = mp->text_size;

        /*
         * If we have symbol table and string table sections, read them in
         * now.  If we don't, we just plow on.  We'll still get a valid
         * core dump, but finding anything useful will be just a bit
         * harder.
         *
         * Note: we don't bother with a hash table.  We'll never do a
         * symbol lookup unless we crash, and then mdb creates its own.  We
         * also don't try to perform any relocations.  Xen should be loaded
         * exactly where the ELF file indicates, and the symbol information
         * in the file should be complete and correct already.  Static
         * linking ain't all bad.
         */
        if ((mp->symhdr != NULL) && (mp->symhdr->sh_link < mp->hdr.e_shnum)) {
                mp->strhdr = (Shdr *)
                    (mp->shdrs + mp->symhdr->sh_link * mp->hdr.e_shentsize);
                mp->nsyms = mp->symhdr->sh_size / mp->symhdr->sh_entsize;

                /* Allocate space for the symbol table and strings.  */
                mp->symsize = mp->symhdr->sh_size +
                    mp->nsyms * sizeof (symid_t) + mp->strhdr->sh_size;
                mp->symspace = kmem_zalloc(mp->symsize, KM_SLEEP);
                mp->symtbl = mp->symspace;
                mp->strings = (char *)(mp->symtbl + mp->symhdr->sh_size);

                if ((kobj_read_file(file, mp->symtbl,
                    mp->symhdr->sh_size, mp->symhdr->sh_offset) < 0) ||
                    (kobj_read_file(file, mp->strings,
                    mp->strhdr->sh_size, mp->strhdr->sh_offset) < 0))
                        goto err;
        }
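
        /*
         * Note the layout of symspace above: the raw symbol table is
         * followed immediately by the string table.  The extra nsyms *
         * sizeof (symid_t) bytes presumably mirror the hash space krtld
         * would carve out of this buffer; since no hash table is built
         * here (see the comment above), that tail simply goes unused.
         */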

        /*
         * Read in the CTF section
         */
        if ((ctf_shp != NULL) && ((moddebug & MODDEBUG_NOCTF) == 0)) {
                mp->ctfdata = kmem_zalloc(ctf_shp->sh_size, KM_SLEEP);
                mp->ctfsize = ctf_shp->sh_size;
                if (kobj_read_file(file, mp->ctfdata, mp->ctfsize,
                    ctf_shp->sh_offset) < 0)
                        goto err;
        }

        kobj_close_file(file);

        xpv_module = mp;
        xpv_modctl = mcp;
        return;

err:
        cmn_err(CE_WARN, "Failed to initialize xpv module.");
        if (file != NULL)
                kobj_close_file(file);

        kmem_free(mp->filename, strlen(XPV_FILENAME) + 1);
        if (mp->shdrs != NULL)
                kmem_free(mp->shdrs, mp->hdr.e_shentsize * mp->hdr.e_shnum);
        if (mp->symspace != NULL)
                kmem_free(mp->symspace, mp->symsize);
        if (mp->ctfdata != NULL)
                kmem_free(mp->ctfdata, mp->ctfsize);
        kmem_free(mp, sizeof (*mp));
        kmem_free(mcp->mod_filename, strlen(XPV_FILENAME) + 1);
        kmem_free(mcp->mod_modname, strlen(XPV_MODNAME) + 1);
        kmem_free(mcp, sizeof (*mcp));
        if (names != NULL)
                kmem_free(names, namesize);
}

void
xpv_panic_init()
{
        xen_platform_op_t op;
        int i;

        ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));

        for (i = 0; i < mmu.num_level; i++)
                ptable_pfn[i] = PFN_INVALID;

        /* Let Xen know where to jump if/when it panics. */
        op.cmd = XENPF_panic_init;
        op.interface_version = XENPF_INTERFACE_VERSION;
        op.u.panic_init.panic_addr = (unsigned long)xpv_panic_hdlr;

        (void) HYPERVISOR_platform_op(&op);

        init_xen_module();
}