1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2012 Gary Mills
  23  * Copyright 2016 PALO, Richard.
  24  *
  25  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  26  *
  27  * Copyright 2018 Joyent, Inc.
  28  */
  29 
  30 #include <sys/types.h>
  31 #include <sys/clock.h>
  32 #include <sys/psm.h>
  33 #include <sys/archsystm.h>
  34 #include <sys/machsystm.h>
  35 #include <sys/compress.h>
  36 #include <sys/modctl.h>
  37 #include <sys/trap.h>
  38 #include <sys/panic.h>
  39 #include <sys/regset.h>
  40 #include <sys/frame.h>
  41 #include <sys/kobj.h>
  42 #include <sys/apic.h>
  43 #include <sys/apic_timer.h>
  44 #include <sys/dumphdr.h>
  45 #include <sys/mem.h>
  46 #include <sys/x86_archext.h>
  47 #include <sys/xpv_panic.h>
  48 #include <sys/boot_console.h>
  49 #include <sys/bootsvcs.h>
  50 #include <sys/consdev.h>
  51 #include <vm/hat_pte.h>
  52 #include <vm/hat_i86.h>
  53 
/* XXX: need to add a PAE version too, if we ever support both PAE and non */
/*
 * Path of the Xen image that init_xen_module() tries to open; it is also
 * recorded as the filename of the module and modctl built there.
 */
#if defined(__i386)
#define XPV_FILENAME    "/boot/xen-syms"
#else
#define XPV_FILENAME    "/boot/amd64/xen-syms"
#endif
/* Module name stored in the modctl built by init_xen_module(). */
#define XPV_MODNAME     "xpv"

/*
 * Incremented on every entry to xpv_do_panic(); a non-zero value on entry
 * means xpv_do_panic() was called more than once.
 */
int xpv_panicking = 0;

/*
 * Module and modctl describing Xen.  xpv_do_panic() links xpv_modctl onto
 * the tail of the kernel's module list when xpv_module is non-NULL.
 */
struct module *xpv_module;
struct modctl *xpv_modctl;
  67 #define ALIGN(x, a)     ((a) == 0 ? (uintptr_t)(x) : \
  68         (((uintptr_t)(x) + (uintptr_t)(a) - 1l) & ~((uintptr_t)(a) - 1l)))
  69 
/* Pointer to the xpv_panic_info structure handed to us by Xen.  */
static struct panic_info *xpv_panic_info = NULL;

/* Timer support */
/* Shift used when scaling TSC ticks to nanoseconds in xpv_panic_gethrtime() */
#define NSEC_SHIFT 5
/* IDT vector programmed into the APIC local timer by xpv_apic_clkinit() */
#define T_XPV_TIMER     0xd1
#define XPV_TIMER_INTERVAL      1000    /* 1000 microseconds */
/* APIC registers, indexed as 32-bit words; set from pi_apic in xpv_do_panic() */
static uint32_t *xpv_apicadr = NULL;
/* tsc_to_system_mul >> NSEC_SHIFT, computed by xpv_panic_time_init() */
static uint_t   nsec_scale;

/* IDT support */
#pragma align   16(xpv_panic_idt)
static gate_desc_t      xpv_panic_idt[NIDT];    /* interrupt descriptor table */

/* Xen pagetables mapped into our HAT's ptable windows */
static pfn_t ptable_pfn[MAX_NUM_LEVEL];

/* Number of MMU_PAGESIZE pages we're adding to the Solaris dump */
static int xpv_dump_pages;

/*
 * There are up to two large swathes of RAM that we don't want to include
 * in the dump: those that comprise the Xen version of segkpm.  On 32-bit
 * systems there is no such region of memory.  On 64-bit systems, there
 * should be just a single contiguous region that corresponds to all of
 * physical memory.  The tricky bit is that Xen's heap sometimes lives in
 * the middle of their segkpm, and is mapped using only kpm-like addresses.
 * In that case, we need to skip the swathes before and after Xen's heap.
 *
 * Each range is half-open: xpv_va_walk() skips va when low <= va < high.
 */
uintptr_t kpm1_low = 0;
uintptr_t kpm1_high = 0;
uintptr_t kpm2_low = 0;
uintptr_t kpm2_high = 0;

/*
 * Some commonly used values that we don't want to recompute over and over.
 * All three are filled in by xpv_do_panic():
 *   xpv_panic_nptes - number of entries scanned per pagetable, by level
 *   xpv_panic_cr3   - the %cr3 value read at panic time
 *   xpv_end         - upper bound of the VA range xpv_va_walk() searches
 */
static int xpv_panic_nptes[MAX_NUM_LEVEL];
static ulong_t xpv_panic_cr3;
static uintptr_t xpv_end;

static void xpv_panic_console_print(const char *fmt, ...);
/*
 * Output routine used by this file.  It starts out as the raw console
 * printer and is switched to printf by xpv_do_panic() just before it
 * calls into panicsys().
 */
static void (*xpv_panic_printf)(const char *, ...) = xpv_panic_console_print;

/*
 * Formatting buffer for xpv_panic_console_print(); messages are formatted
 * with vsnprintf() bounded by this size.
 */
#define CONSOLE_BUF_SIZE        256
static char console_buffer[CONSOLE_BUF_SIZE];
/*
 * B_TRUE when xpv_panic_putc() should emit characters through the
 * cons_polledio putchar entry point instead of bcons_putchar().
 */
static boolean_t use_polledio;

/*
 * Pointers to machine check panic info (if any).
 */
xpv_mca_panic_data_t *xpv_mca_panic_data = NULL;
 122 
 123 static void
 124 xpv_panic_putc(int m)
 125 {
 126         struct cons_polledio *c = cons_polledio;
 127 
 128         /* This really shouldn't happen */
 129         if (boot_console_type(NULL) == CONS_HYPERVISOR)
 130                 return;
 131 
 132         if (use_polledio == B_TRUE)
 133                 c->cons_polledio_putchar(c->cons_polledio_argument, m);
 134         else
 135                 bcons_putchar(m);
 136 }
 137 
 138 static void
 139 xpv_panic_puts(char *msg)
 140 {
 141         char *m;
 142 
 143         dump_timeleft = dump_timeout;
 144         for (m = msg; *m; m++)
 145                 xpv_panic_putc((int)*m);
 146 }
 147 
 148 static void
 149 xpv_panic_console_print(const char *fmt, ...)
 150 {
 151         va_list ap;
 152 
 153         va_start(ap, fmt);
 154         (void) vsnprintf(console_buffer, sizeof (console_buffer), fmt, ap);
 155         va_end(ap);
 156 
 157         xpv_panic_puts(console_buffer);
 158 }
 159 
 160 static void
 161 xpv_panic_map(int level, pfn_t pfn)
 162 {
 163         x86pte_t pte, *pteptr;
 164 
 165         /*
 166          * The provided pfn represents a level 'level' page table.  Map it
 167          * into the 'level' slot in the list of page table windows.
 168          */
 169         pteptr = (x86pte_t *)PWIN_PTE_VA(level);
 170         pte = pfn_to_pa(pfn) | PT_VALID;
 171 
 172         XPV_ALLOW_PAGETABLE_UPDATES();
 173         if (mmu.pae_hat)
 174                 *pteptr = pte;
 175         else
 176                 *(x86pte32_t *)pteptr = pte;
 177         XPV_DISALLOW_PAGETABLE_UPDATES();
 178 
 179         mmu_flush_tlb_page((uintptr_t)PWIN_VA(level));
 180 }
 181 
/*
 * Walk the page tables to find the pfn mapped by the given va.
 *
 * Starting at *vaddr, search upward for the next mapped page below
 * xpv_end, skipping the APIC page and the two kpm-like regions.  On
 * success *vaddr is updated to the VA that was found and the frame number
 * is returned with PFN_IS_FOREIGN_MFN set.  PFN_INVALID is returned once
 * the search reaches xpv_end or the address wraps below the starting
 * point.
 */
static pfn_t
xpv_va_walk(uintptr_t *vaddr)
{
        int l, idx;
        pfn_t pfn;
        x86pte_t pte;
        x86pte_t *ptep;
        uintptr_t va = *vaddr;
        uintptr_t scan_va;
        caddr_t ptable_window;
        static pfn_t toplevel_pfn;
        /* VA returned by the previous successful call */
        static uintptr_t lastva;

        /*
         * If we do anything other than a simple scan through memory, don't
         * trust the mapped page tables.
         */
        if (va != lastva + MMU_PAGESIZE)
                for (l = mmu.max_level; l >= 0; l--)
                        ptable_pfn[l] = PFN_INVALID;

        toplevel_pfn = mmu_btop(xpv_panic_cr3);

        /* The 'va >= *vaddr' half of the test catches address wraparound. */
        while (va < xpv_end && va >= *vaddr) {
                /* Find the lowest table with any entry for va */
                pfn = toplevel_pfn;
                for (l = mmu.max_level; l >= 0; l--) {
                        /* Only remap the window if it shows another table. */
                        if (ptable_pfn[l] != pfn) {
                                xpv_panic_map(l, pfn);
                                ptable_pfn[l] = pfn;
                        }

                        /*
                         * Search this pagetable for any mapping to an
                         * address >= va.
                         */
                        ptable_window = PWIN_VA(l);
                        /*
                         * For the PAE top-level table, add cr3's offset
                         * within its page to the window address.
                         */
                        if (l == mmu.max_level && mmu.pae_hat)
                                ptable_window +=
                                    (xpv_panic_cr3 & MMU_PAGEOFFSET);

                        idx = (va >> LEVEL_SHIFT(l)) & (xpv_panic_nptes[l] - 1);
                        scan_va = va;
                        while (idx < xpv_panic_nptes[l] && scan_va < xpv_end &&
                            scan_va >= *vaddr) {
                                ptep = (x86pte_t *)(ptable_window +
                                    (idx << mmu.pte_size_shift));
                                pte = GET_PTE(ptep);
                                if (pte & PTE_VALID)
                                        break;
                                idx++;
                                scan_va += mmu.level_size[l];
                        }

                        /*
                         * If there are no valid mappings in this table, we
                         * can skip to the end of the VA range it covers.
                         */
                        if (idx == xpv_panic_nptes[l]) {
                                va = NEXT_ENTRY_VA(va, l + 1);
                                break;
                        }

                        va = scan_va;
                        /*
                         * See if we've hit the end of the range.
                         */
                        if (va >= xpv_end || va < *vaddr)
                                break;

                        /*
                         * If this mapping is for a pagetable, we drop down
                         * to the next level in the hierarchy and look for
                         * a mapping in it.
                         */
                        pfn = PTE2MFN(pte, l);
                        if (!PTE_ISPAGE(pte, l))
                                continue;

                        /*
                         * The APIC page is magic.  Nothing to see here;
                         * move along.
                         */
                        if (((uintptr_t)xpv_apicadr & MMU_PAGEMASK) ==
                            (va & MMU_PAGEMASK)) {
                                va += MMU_PAGESIZE;
                                break;
                        }

                        /*
                         * See if the address is within one of the two
                         * kpm-like regions we want to skip.
                         */
                        if (va >= kpm1_low && va < kpm1_high) {
                                va = kpm1_high;
                                break;
                        }
                        if (va >= kpm2_low && va < kpm2_high) {
                                va = kpm2_high;
                                break;
                        }

                        /*
                         * The Xen panic code only handles small pages.  If
                         * this mapping is for a large page, we need to
                         * identify the constituent page that covers the
                         * specific VA we were looking for.
                         */
                        if (l > 0) {
                                if (l > 1)
                                        panic("Xen panic can't cope with "
                                            "giant pages.");
                                idx = (va >> LEVEL_SHIFT(0)) &
                                    (xpv_panic_nptes[0] - 1);
                                pfn += idx;
                        }

                        /* Found a page: report its VA and remember it. */
                        *vaddr = va;
                        lastva = va;
                        return (pfn | PFN_IS_FOREIGN_MFN);
                }
        }
        return (PFN_INVALID);
}
 309 
 310 /*
 311  * Walk through the Xen VA space, finding pages that are mapped in.
 312  *
 313  * These pages all have MFNs rather than PFNs, meaning they may be outside
 314  * the physical address space the kernel knows about, or they may collide
 315  * with PFNs the kernel is using.
 316  *
 317  * The obvious trick of just adding the PFN_IS_FOREIGN_MFN bit to the MFNs
 318  * to avoid collisions doesn't work.  The pages need to be written to disk
 319  * in PFN-order or savecore gets confused.  We can't allocate memory to
 320  * contruct a sorted pfn->VA reverse mapping, so we have to write the pages
 321  * to disk in VA order.
 322  *
 323  * To square this circle, we simply make up PFNs for each of Xen's pages.
 324  * We assign each mapped page a fake PFN in ascending order.  These fake
 325  * PFNs each have the FOREIGN bit set, ensuring that they fall outside the
 326  * range of Solaris PFNs written by the kernel.
 327  */
 328 int
 329 dump_xpv_addr()
 330 {
 331         uintptr_t va;
 332         mem_vtop_t mem_vtop;
 333 
 334         xpv_dump_pages = 0;
 335         va = xen_virt_start;
 336 
 337         while (xpv_va_walk(&va) != PFN_INVALID) {
 338                 mem_vtop.m_as = &kas;
 339                 mem_vtop.m_va = (void *)va;
 340                 mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN;
 341 
 342                 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
 343                 xpv_dump_pages++;
 344 
 345                 va += MMU_PAGESIZE;
 346         }
 347 
 348         /*
 349          * Add the shared_info page.  This page actually ends up in the
 350          * dump twice: once for the Xen va and once for the Solaris va.
 351          * This isn't ideal, but we don't know the address Xen is using for
 352          * the page, so we can't share it.
 353          */
 354         mem_vtop.m_as = &kas;
 355         mem_vtop.m_va = HYPERVISOR_shared_info;
 356         mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN;
 357         dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
 358         xpv_dump_pages++;
 359 
 360         return (xpv_dump_pages);
 361 }
 362 
 363 void
 364 dump_xpv_pfn()
 365 {
 366         pfn_t pfn;
 367         int cnt;
 368 
 369         for (cnt = 0; cnt < xpv_dump_pages; cnt++) {
 370                 pfn = (pfn_t)cnt | PFN_IS_FOREIGN_MFN;
 371                 dumpvp_write(&pfn, sizeof (pfn));
 372         }
 373 }
 374 
 375 int
 376 dump_xpv_data(void *dump_cbuf)
 377 {
 378         uintptr_t va;
 379         uint32_t csize;
 380         int cnt = 0;
 381 
 382         /*
 383          * XXX: we should probably run this data through a UE check.  The
 384          * catch is that the UE code relies on on_trap() and getpfnum()
 385          * working.
 386          */
 387         va = xen_virt_start;
 388 
 389         while (xpv_va_walk(&va) != PFN_INVALID) {
 390                 csize = (uint32_t)compress((void *)va, dump_cbuf, PAGESIZE);
 391                 dumpvp_write(&csize, sizeof (uint32_t));
 392                 dumpvp_write(dump_cbuf, csize);
 393                 if (dump_ioerr) {
 394                         dumphdr->dump_flags &= ~DF_COMPLETE;
 395                         return (cnt);
 396                 }
 397                 cnt++;
 398                 va += MMU_PAGESIZE;
 399         }
 400 
 401         /*
 402          * Finally, dump the shared_info page
 403          */
 404         csize = (uint32_t)compress((void *)HYPERVISOR_shared_info, dump_cbuf,
 405             PAGESIZE);
 406         dumpvp_write(&csize, sizeof (uint32_t));
 407         dumpvp_write(dump_cbuf, csize);
 408         if (dump_ioerr)
 409                 dumphdr->dump_flags &= ~DF_COMPLETE;
 410         cnt++;
 411 
 412         return (cnt);
 413 }
 414 
 415 static void *
 416 showstack(void *fpreg, int xpv_only)
 417 {
 418         struct frame *fpp;
 419         ulong_t off;
 420         char *sym;
 421         uintptr_t pc, fp, lastfp;
 422         uintptr_t minaddr = min(KERNELBASE, xen_virt_start);
 423 
 424         fp = (uintptr_t)fpreg;
 425         if (fp < minaddr) {
 426                 xpv_panic_printf("Bad frame ptr: 0x%p\n", fpreg);
 427                 return (fpreg);
 428         }
 429 
 430         do {
 431                 fpp = (struct frame *)fp;
 432                 pc = fpp->fr_savpc;
 433 
 434                 if ((xpv_only != 0) &&
 435                     (fp > xpv_end || fp < xen_virt_start))
 436                         break;
 437                 if ((sym = kobj_getsymname(pc, &off)) != NULL)
 438                         xpv_panic_printf("%08lx %s:%s+%lx\n", fp,
 439                             mod_containing_pc((caddr_t)pc), sym, off);
 440                 else if ((pc >= xen_virt_start) && (pc <= xpv_end))
 441                         xpv_panic_printf("%08lx 0x%lx (in Xen)\n", fp, pc);
 442                 else
 443                         xpv_panic_printf("%08lx %lx\n", fp, pc);
 444 
 445                 lastfp = fp;
 446                 fp = fpp->fr_savfp;
 447 
 448                 /*
 449                  * Xen marks an exception frame by inverting the frame
 450                  * pointer.
 451                  */
 452                 if (fp < lastfp) {
 453                         if ((~fp > minaddr) && ((~fp) ^ lastfp) < 0xfff)
 454                                 fp = ~fp;
 455                 }
 456         } while (fp > lastfp);
 457         return ((void *)fp);
 458 }
 459 
 460 void *
 461 xpv_traceback(void *fpreg)
 462 {
 463         return (showstack(fpreg, 1));
 464 }
 465 
 466 #if defined(__amd64)
 467 static void
 468 xpv_panic_hypercall(ulong_t call)
 469 {
 470         panic("Illegally issued hypercall %d during panic!\n", (int)call);
 471 }
 472 #endif
 473 
 474 void
 475 xpv_die(struct regs *rp)
 476 {
 477         struct panic_trap_info ti;
 478         struct cregs creg;
 479 
 480         ti.trap_regs = rp;
 481         ti.trap_type = rp->r_trapno;
 482 
 483         curthread->t_panic_trap = &ti;
 484         if (ti.trap_type == T_PGFLT) {
 485                 getcregs(&creg);
 486                 ti.trap_addr = (caddr_t)creg.cr_cr2;
 487                 panic("Fatal pagefault at 0x%lx.  fault addr=0x%p  rp=0x%p",
 488                     rp->r_pc, (void *)ti.trap_addr, (void *)rp);
 489         } else {
 490                 ti.trap_addr = (caddr_t)rp->r_pc;
 491                 panic("Fatal trap %ld at 0x%lx.  rp=0x%p", rp->r_trapno,
 492                     rp->r_pc, (void *)rp);
 493         }
 494 }
 495 
 496 /*
 497  * Build IDT to handle a Xen panic
 498  */
 499 static void
 500 switch_to_xpv_panic_idt()
 501 {
 502         int i;
 503         desctbr_t idtr;
 504         gate_desc_t *idt = xpv_panic_idt;
 505         selector_t cs = get_cs_register();
 506 
 507         for (i = 0; i < 32; i++)
 508                 set_gatesegd(&idt[i], &xpv_invaltrap, cs, SDT_SYSIGT, TRP_XPL,
 509                     0);
 510 
 511         set_gatesegd(&idt[T_ZERODIV], &xpv_div0trap, cs, SDT_SYSIGT, TRP_XPL,
 512             0);
 513         set_gatesegd(&idt[T_SGLSTP], &xpv_dbgtrap, cs, SDT_SYSIGT, TRP_XPL, 0);
 514         set_gatesegd(&idt[T_NMIFLT], &xpv_nmiint, cs, SDT_SYSIGT, TRP_XPL, 0);
 515         set_gatesegd(&idt[T_BOUNDFLT], &xpv_boundstrap, cs, SDT_SYSIGT,
 516             TRP_XPL, 0);
 517         set_gatesegd(&idt[T_ILLINST], &xpv_invoptrap, cs, SDT_SYSIGT, TRP_XPL,
 518             0);
 519         set_gatesegd(&idt[T_NOEXTFLT], &xpv_ndptrap, cs, SDT_SYSIGT, TRP_XPL,
 520             0);
 521         set_gatesegd(&idt[T_TSSFLT], &xpv_invtsstrap, cs, SDT_SYSIGT, TRP_XPL,
 522             0);
 523         set_gatesegd(&idt[T_SEGFLT], &xpv_segnptrap, cs, SDT_SYSIGT, TRP_XPL,
 524             0);
 525         set_gatesegd(&idt[T_STKFLT], &xpv_stktrap, cs, SDT_SYSIGT, TRP_XPL, 0);
 526         set_gatesegd(&idt[T_GPFLT], &xpv_gptrap, cs, SDT_SYSIGT, TRP_XPL, 0);
 527         set_gatesegd(&idt[T_PGFLT], &xpv_pftrap, cs, SDT_SYSIGT, TRP_XPL, 0);
 528         set_gatesegd(&idt[T_EXTERRFLT], &xpv_ndperr, cs, SDT_SYSIGT, TRP_XPL,
 529             0);
 530         set_gatesegd(&idt[T_ALIGNMENT], &xpv_achktrap, cs, SDT_SYSIGT, TRP_XPL,
 531             0);
 532         set_gatesegd(&idt[T_MCE], &xpv_mcetrap, cs, SDT_SYSIGT, TRP_XPL, 0);
 533         set_gatesegd(&idt[T_SIMDFPE], &xpv_xmtrap, cs, SDT_SYSIGT, TRP_XPL, 0);
 534 
 535         /*
 536          * We have no double fault handler.  Any single fault represents a
 537          * catastrophic failure for us, so there is no attempt to handle
 538          * them cleanly: we just print a message and reboot.  If we
 539          * encounter a second fault while doing that, there is nothing
 540          * else we can do.
 541          */
 542 
 543         /*
 544          * Be prepared to absorb any stray device interrupts received
 545          * while writing the core to disk.
 546          */
 547         for (i = 33; i < NIDT; i++)
 548                 set_gatesegd(&idt[i], &xpv_surprise_intr, cs, SDT_SYSIGT,
 549                     TRP_XPL, 0);
 550 
 551         /* The one interrupt we expect to get is from the APIC timer.  */
 552         set_gatesegd(&idt[T_XPV_TIMER], &xpv_timer_trap, cs, SDT_SYSIGT,
 553             TRP_XPL, 0);
 554 
 555         idtr.dtr_base = (uintptr_t)xpv_panic_idt;
 556         idtr.dtr_limit = sizeof (xpv_panic_idt) - 1;
 557         wr_idtr(&idtr);
 558 
 559 #if defined(__amd64)
 560         /* Catch any hypercalls. */
 561         wrmsr(MSR_AMD_LSTAR, (uintptr_t)xpv_panic_hypercall);
 562         wrmsr(MSR_AMD_CSTAR, (uintptr_t)xpv_panic_hypercall);
 563 #endif
 564 }
 565 
 566 static void
 567 xpv_apic_clkinit()
 568 {
 569         uint_t          apic_ticks = 0;
 570 
 571         /*
 572          * Measure how many APIC ticks there are within a fixed time
 573          * period.  We're going to be fairly coarse here.  This timer is
 574          * just being used to detect a stalled panic, so as long as we have
 575          * the right order of magnitude, everything should be fine.
 576          */
 577         xpv_apicadr[APIC_SPUR_INT_REG] = AV_UNIT_ENABLE | APIC_SPUR_INTR;
 578         xpv_apicadr[APIC_LOCAL_TIMER] = AV_MASK;
 579         xpv_apicadr[APIC_INT_VECT0] = AV_MASK;  /* local intr reg 0 */
 580 
 581         xpv_apicadr[APIC_DIVIDE_REG] = 0;
 582         xpv_apicadr[APIC_INIT_COUNT] = APIC_MAXVAL;
 583         drv_usecwait(XPV_TIMER_INTERVAL);
 584         apic_ticks = APIC_MAXVAL - xpv_apicadr[APIC_CURR_COUNT];
 585 
 586         /*
 587          * apic_ticks now represents roughly how many apic ticks comprise
 588          * one timeout interval.  Program the timer to send us an interrupt
 589          * every time that interval expires.
 590          */
 591         xpv_apicadr[APIC_LOCAL_TIMER] = T_XPV_TIMER | AV_PERIODIC;
 592         xpv_apicadr[APIC_INIT_COUNT] = apic_ticks;
 593         xpv_apicadr[APIC_EOI_REG] = 0;
 594 }
 595 
 596 void
 597 xpv_timer_tick(void)
 598 {
 599         static int ticks = 0;
 600 
 601         if (ticks++ >= MICROSEC / XPV_TIMER_INTERVAL) {
 602                 ticks = 0;
 603                 if (dump_timeleft && (--dump_timeleft == 0))
 604                         panic("Xen panic timeout\n");
 605         }
 606         xpv_apicadr[APIC_EOI_REG] = 0;
 607 }
 608 
 609 void
 610 xpv_interrupt(void)
 611 {
 612 #ifdef  DEBUG
 613         static int cnt = 0;
 614 
 615         if (cnt++ < 10)
 616                 xpv_panic_printf("Unexpected interrupt received.\n");
 617         if ((cnt < 1000) && ((cnt % 100) == 0))
 618                 xpv_panic_printf("%d unexpected interrupts received.\n", cnt);
 619 #endif
 620 
 621         xpv_apicadr[APIC_EOI_REG] = 0;
 622 }
 623 
 624 /*
 625  * Managing time in panic context is trivial.  We only have a single CPU,
 626  * we never get rescheduled, we never get suspended.  We just need to
 627  * convert clock ticks into nanoseconds.
 628  */
 629 static hrtime_t
 630 xpv_panic_gethrtime(void)
 631 {
 632         hrtime_t tsc, hrt;
 633         unsigned int *l = (unsigned int *)&(tsc);
 634 
 635         tsc = __rdtsc_insn();
 636         hrt = (mul32(l[1], nsec_scale) << NSEC_SHIFT) +
 637             (mul32(l[0], nsec_scale) >> (32 - NSEC_SHIFT));
 638 
 639         return (hrt);
 640 }
 641 
 642 static void
 643 xpv_panic_time_init()
 644 {
 645         nsec_scale =
 646             CPU->cpu_m.mcpu_vcpu_info->time.tsc_to_system_mul >> NSEC_SHIFT;
 647 
 648         gethrtimef = xpv_panic_gethrtime;
 649 }
 650 
 651 static void
 652 xpv_panicsys(struct regs *rp, char *fmt, ...)
 653 {
 654         extern void panicsys(const char *, va_list, struct regs *, int);
 655         va_list alist;
 656 
 657         va_start(alist, fmt);
 658         panicsys(fmt, alist, rp, 1);
 659         va_end(alist);
 660 }
 661 
 662 void
 663 xpv_do_panic(void *arg)
 664 {
 665         struct panic_info *pip = (struct panic_info *)arg;
 666         int l;
 667         struct cregs creg;
 668 #if defined(__amd64)
 669         extern uintptr_t postbootkernelbase;
 670 #endif
 671 
 672         if (xpv_panicking++ > 0)
 673                 panic("multiple calls to xpv_do_panic()");
 674 
 675         /*
 676          * Indicate to the underlying panic framework that a panic has been
 677          * initiated.  This is ordinarily done as part of vpanic().  Since
 678          * we already have all the register state saved by the hypervisor,
 679          * we skip that and jump straight into the panic processing code.
 680          *
 681          * XXX If another thread grabs and wins the panic_quiesce trigger
 682          * then we'll have two threads in panicsys believing they are in
 683          * charge of the panic attempt!
 684          */
 685         (void) panic_trigger(&panic_quiesce);
 686 
 687 #if defined(__amd64)
 688         /*
 689          * bzero() and bcopy() get unhappy when asked to operate on
 690          * addresses outside of the kernel.  At this point Xen is really a
 691          * part of the kernel, so we update the routines' notion of where
 692          * the kernel starts.
 693          */
 694         postbootkernelbase = xen_virt_start;
 695 #endif
 696 
 697 #if defined(HYPERVISOR_VIRT_END)
 698         xpv_end = HYPERVISOR_VIRT_END;
 699 #else
 700         xpv_end = (uintptr_t)UINTPTR_MAX - sizeof (uintptr_t);
 701 #endif
 702 
 703         /*
 704          * If we were redirecting console output to the hypervisor, we have
 705          * to stop.
 706          */
 707         use_polledio = B_FALSE;
 708         if (boot_console_type(NULL) == CONS_HYPERVISOR) {
 709                 bcons_device_change(CONS_HYPERVISOR);
 710         } else if (cons_polledio != NULL &&
 711             cons_polledio->cons_polledio_putchar != NULL)  {
 712                 if (cons_polledio->cons_polledio_enter != NULL)
 713                         cons_polledio->cons_polledio_enter(
 714                             cons_polledio->cons_polledio_argument);
 715                 use_polledio = 1;
 716         }
 717 
 718         /* Make sure we handle all console output from here on. */
 719         sysp->bsvc_putchar = xpv_panic_putc;
 720 
 721         /*
 722          * If we find an unsupported panic_info structure, there's not much
 723          * we can do other than complain, plow on, and hope for the best.
 724          */
 725         if (pip->pi_version != PANIC_INFO_VERSION)
 726                 xpv_panic_printf("Warning: Xen is using an unsupported "
 727                     "version of the panic_info structure.\n");
 728 
 729         xpv_panic_info = pip;
 730 
 731 #if defined(__amd64)
 732         kpm1_low = (uintptr_t)xpv_panic_info->pi_ram_start;
 733         if (xpv_panic_info->pi_xen_start == NULL) {
 734                 kpm1_high = (uintptr_t)xpv_panic_info->pi_ram_end;
 735         } else {
 736                 kpm1_high = (uintptr_t)xpv_panic_info->pi_xen_start;
 737                 kpm2_low = (uintptr_t)xpv_panic_info->pi_xen_end;
 738                 kpm2_high = (uintptr_t)xpv_panic_info->pi_ram_end;
 739         }
 740 #endif
 741 
 742         /*
 743          * Make sure we are running on the Solaris %gs.  The Xen panic code
 744          * should already have set up the GDT properly.
 745          */
 746         xpv_panic_resetgs();
 747 #if defined(__amd64)
 748         wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);
 749 #endif
 750 
 751         xpv_panic_time_init();
 752 
 753         /*
 754          * Switch to our own IDT, avoiding any accidental returns to Xen
 755          * world.
 756          */
 757         switch_to_xpv_panic_idt();
 758 
 759         /*
 760          * Initialize the APIC timer, which is used to detect a hung dump
 761          * attempt.
 762          */
 763         xpv_apicadr = pip->pi_apic;
 764         xpv_apic_clkinit();
 765 
 766         /*
 767          * Set up a few values that we'll need repeatedly.
 768          */
 769         getcregs(&creg);
 770         xpv_panic_cr3 = creg.cr_cr3;
 771         for (l = mmu.max_level; l >= 0; l--)
 772                 xpv_panic_nptes[l] = mmu.ptes_per_table;
 773 #ifdef __i386
 774         if (mmu.pae_hat)
 775                 xpv_panic_nptes[mmu.max_level] = 4;
 776 #endif
 777 
 778         /* Add the fake Xen module to the module list */
 779         if (xpv_module != NULL) {
 780                 extern int last_module_id;
 781 
 782                 xpv_modctl->mod_id = last_module_id++;
 783                 xpv_modctl->mod_next = &modules;
 784                 xpv_modctl->mod_prev = modules.mod_prev;
 785                 modules.mod_prev->mod_next = xpv_modctl;
 786                 modules.mod_prev = xpv_modctl;
 787         }
 788 
 789         if (pip->pi_mca.mpd_magic == MCA_PANICDATA_MAGIC)
 790                 xpv_mca_panic_data = &pip->pi_mca;
 791 
 792         xpv_panic_printf = printf;
 793         xpv_panicsys((struct regs *)pip->pi_regs, pip->pi_panicstr);
 794         xpv_panic_printf("Failed to reboot following panic.\n");
 795         for (;;)
 796                 ;
 797 }
 798 
 799 /*
 800  * Set up the necessary data structures to pretend that the Xen hypervisor
 801  * is a loadable module, allowing mdb to find the Xen symbols in a crash
 802  * dump.  Since these symbols all map to VA space Solaris doesn't normally
 803  * have access to, we don't link these structures into the kernel's lists
 804  * until/unless we hit a Xen panic.
 805  *
 806  * The observant reader will note a striking amount of overlap between this
 807  * code and that found in krtld.  While it would be handy if we could just
 808  * ask krtld to do this work for us, it's not that simple.  Among the
 809  * complications: we're not actually loading the text here (grub did it at
 810  * boot), the .text section is writable, there are no relocations to do,
 811  * none of the module text/data is in readable memory, etc.  Training krtld
 812  * to deal with this weird module is as complicated, and more risky, than
 813  * reimplementing the necessary subset of it here.
 814  */
 815 static void
 816 init_xen_module()
 817 {
 818         struct _buf *file = NULL;
 819         struct module *mp;
 820         struct modctl *mcp;
 821         int i, shn;
 822         Shdr *shp, *ctf_shp;
 823         char *names = NULL;
 824         size_t n, namesize, text_align, data_align;
 825 #if defined(__amd64)
 826         const char machine = EM_AMD64;
 827 #else
 828         const char machine = EM_386;
 829 #endif
 830 
 831         /* Allocate and init the module structure */
 832         mp = kmem_zalloc(sizeof (*mp), KM_SLEEP);
 833         mp->filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP);
 834         (void) strcpy(mp->filename, XPV_FILENAME);
 835 
 836         /* Allocate and init the modctl structure */
 837         mcp = kmem_zalloc(sizeof (*mcp), KM_SLEEP);
 838         mcp->mod_modname = kobj_zalloc(strlen(XPV_MODNAME) + 1, KM_SLEEP);
 839         (void) strcpy(mcp->mod_modname, XPV_MODNAME);
 840         mcp->mod_filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP);
 841         (void) strcpy(mcp->mod_filename, XPV_FILENAME);
 842         mcp->mod_inprogress_thread = (kthread_id_t)-1;
 843         mcp->mod_ref = 1;
 844         mcp->mod_loaded = 1;
 845         mcp->mod_loadcnt = 1;
 846         mcp->mod_mp = mp;
 847 
 848         /*
 849          * Try to open a Xen image that hasn't had its symbol and CTF
 850          * information stripped off.
 851          */
 852         file = kobj_open_file(XPV_FILENAME);
 853         if (file == (struct _buf *)-1) {
 854                 file = NULL;
 855                 goto err;
 856         }
 857 
 858         /*
 859          * Read the header and ensure that this is an ELF file for the
 860          * proper ISA.  If it's not, somebody has done something very
 861          * stupid.  Why bother?  See Mencken.
 862          */
 863         if (kobj_read_file(file, (char *)&mp->hdr, sizeof (mp->hdr), 0) < 0)
 864                 goto err;
 865         for (i = 0; i < SELFMAG; i++)
 866                 if (mp->hdr.e_ident[i] != ELFMAG[i])
 867                         goto err;
 868         if ((mp->hdr.e_ident[EI_DATA] != ELFDATA2LSB) ||
 869             (mp->hdr.e_machine != machine))
 870                 goto err;
 871 
 872         /* Read in the section headers */
 873         n = mp->hdr.e_shentsize * mp->hdr.e_shnum;
 874         mp->shdrs = kmem_zalloc(n, KM_SLEEP);
 875         if (kobj_read_file(file, mp->shdrs, n, mp->hdr.e_shoff) < 0)
 876                 goto err;
 877 
 878         /* Read the section names */
 879         shp = (Shdr *)(mp->shdrs + mp->hdr.e_shstrndx * mp->hdr.e_shentsize);
 880         namesize = shp->sh_size;
 881         names = kmem_zalloc(shp->sh_size, KM_SLEEP);
 882         if (kobj_read_file(file, names, shp->sh_size, shp->sh_offset) < 0)
 883                 goto err;
 884 
 885         /*
 886          * Fill in the text and data size fields.
 887          */
 888         ctf_shp = NULL;
 889         text_align = data_align = 0;
 890         for (shn = 1; shn < mp->hdr.e_shnum; shn++) {
 891                 shp = (Shdr *)(mp->shdrs + shn * mp->hdr.e_shentsize);
 892 
 893                 /* Sanity check the offset of the section name */
 894                 if (shp->sh_name >= namesize)
 895                         continue;
 896 
 897                 /* If we find the symtab section, remember it for later. */
 898                 if (shp->sh_type == SHT_SYMTAB) {
 899                         mp->symtbl_section = shn;
 900                         mp->symhdr = shp;
 901                         continue;
 902                 }
 903 
 904                 /* If we find the CTF section, remember it for later. */
 905                 if ((shp->sh_size != 0) &&
 906                     (strcmp(names + shp->sh_name, ".SUNW_ctf") == 0)) {
 907                         ctf_shp = shp;
 908                         continue;
 909                 }
 910 
 911                 if (!(shp->sh_flags & SHF_ALLOC))
 912                         continue;
 913 
 914                 /*
 915                  * Xen marks its text section as writable, so we need to
 916                  * look for the name - not just the flag.
 917                  */
 918                 if ((strcmp(&names[shp->sh_name], ".text") != 0) &&
 919                     (shp->sh_flags & SHF_WRITE) != 0) {
 920                         if (shp->sh_addralign > data_align)
 921                                 data_align = shp->sh_addralign;
 922                         mp->data_size = ALIGN(mp->data_size, data_align);
 923                         mp->data_size += ALIGN(shp->sh_size, 8);
 924                         if (mp->data == NULL || mp->data > (char *)shp->sh_addr)
 925                                 mp->data = (char *)shp->sh_addr;
 926                 } else {
 927                         if (shp->sh_addralign > text_align)
 928                                 text_align = shp->sh_addralign;
 929                         mp->text_size = ALIGN(mp->text_size, text_align);
 930                         mp->text_size += ALIGN(shp->sh_size, 8);
 931                         if (mp->text == NULL || mp->text > (char *)shp->sh_addr)
 932                                 mp->text = (char *)shp->sh_addr;
 933                 }
 934         }
 935         kmem_free(names, namesize);
 936         names = NULL;
 937         shp = NULL;
 938         mcp->mod_text = mp->text;
 939         mcp->mod_text_size = mp->text_size;
 940 
 941         /*
 942          * If we have symbol table and string table sections, read them in
 943          * now.  If we don't, we just plow on.  We'll still get a valid
 944          * core dump, but finding anything useful will be just a bit
 945          * harder.
 946          *
 947          * Note: we don't bother with a hash table.  We'll never do a
 948          * symbol lookup unless we crash, and then mdb creates its own.  We
 949          * also don't try to perform any relocations.  Xen should be loaded
 950          * exactly where the ELF file indicates, and the symbol information
 951          * in the file should be complete and correct already.  Static
 952          * linking ain't all bad.
 953          */
 954         if ((mp->symhdr != NULL) && (mp->symhdr->sh_link < mp->hdr.e_shnum)) {
 955                 mp->strhdr = (Shdr *)
 956                     (mp->shdrs + mp->symhdr->sh_link * mp->hdr.e_shentsize);
 957                 mp->nsyms = mp->symhdr->sh_size / mp->symhdr->sh_entsize;
 958 
 959                 /* Allocate space for the symbol table and strings.  */
 960                 mp->symsize = mp->symhdr->sh_size +
 961                     mp->nsyms * sizeof (symid_t) + mp->strhdr->sh_size;
 962                 mp->symspace = kmem_zalloc(mp->symsize, KM_SLEEP);
 963                 mp->symtbl = mp->symspace;
 964                 mp->strings = (char *)(mp->symtbl + mp->symhdr->sh_size);
 965 
 966                 if ((kobj_read_file(file, mp->symtbl,
 967                     mp->symhdr->sh_size, mp->symhdr->sh_offset) < 0) ||
 968                     (kobj_read_file(file, mp->strings,
 969                     mp->strhdr->sh_size, mp->strhdr->sh_offset) < 0))
 970                         goto err;
 971         }
 972 
 973         /*
 974          * Read in the CTF section
 975          */
 976         if ((ctf_shp != NULL) && ((moddebug & MODDEBUG_NOCTF) == 0)) {
 977                 mp->ctfdata = kmem_zalloc(ctf_shp->sh_size, KM_SLEEP);
 978                 mp->ctfsize = ctf_shp->sh_size;
 979                 if (kobj_read_file(file, mp->ctfdata, mp->ctfsize,
 980                     ctf_shp->sh_offset) < 0)
 981                         goto err;
 982         }
 983 
 984         kobj_close_file(file);
 985 
 986         xpv_module = mp;
 987         xpv_modctl = mcp;
 988         return;
 989 
 990 err:
 991         cmn_err(CE_WARN, "Failed to initialize xpv module.");
 992         if (file != NULL)
 993                 kobj_close_file(file);
 994 
 995         kmem_free(mp->filename, strlen(XPV_FILENAME) + 1);
 996         if (mp->shdrs != NULL)
 997                 kmem_free(mp->shdrs, mp->hdr.e_shentsize * mp->hdr.e_shnum);
 998         if (mp->symspace != NULL)
 999                 kmem_free(mp->symspace, mp->symsize);
1000         if (mp->ctfdata != NULL)
1001                 kmem_free(mp->ctfdata, mp->ctfsize);
1002         kmem_free(mp, sizeof (*mp));
1003         kmem_free(mcp->mod_filename, strlen(XPV_FILENAME) + 1);
1004         kmem_free(mcp->mod_modname, strlen(XPV_MODNAME) + 1);
1005         kmem_free(mcp, sizeof (*mcp));
1006         if (names != NULL)
1007                 kmem_free(names, namesize);
1008 }
1009 
1010 void
1011 xpv_panic_init()
1012 {
1013         xen_platform_op_t op;
1014         int i;
1015 
1016         ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));
1017 
1018         for (i = 0; i < mmu.num_level; i++)
1019                 ptable_pfn[i] = PFN_INVALID;
1020 
1021         /* Let Xen know where to jump if/when it panics. */
1022         op.cmd = XENPF_panic_init;
1023         op.interface_version = XENPF_INTERFACE_VERSION;
1024         op.u.panic_init.panic_addr = (unsigned long)xpv_panic_hdlr;
1025 
1026         (void) HYPERVISOR_platform_op(&op);
1027 
1028         init_xen_module();
1029 }