Print this page
    
8956 Implement KPTI
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
    
      
        | Split | Close | 
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/i86xpv/os/xpv_panic.c
          +++ new/usr/src/uts/i86xpv/os/xpv_panic.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  
    | ↓ open down ↓ | 15 lines elided | ↑ open up ↑ | 
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2012 Gary Mills
  23   23   * Copyright 2016 PALO, Richard.
  24   24   *
  25   25   * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
       26 + *
       27 + * Copyright 2018 Joyent, Inc.
  26   28   */
  27   29  
  28   30  #include <sys/types.h>
  29   31  #include <sys/clock.h>
  30   32  #include <sys/psm.h>
  31   33  #include <sys/archsystm.h>
  32   34  #include <sys/machsystm.h>
  33   35  #include <sys/compress.h>
  34   36  #include <sys/modctl.h>
  35   37  #include <sys/trap.h>
  36   38  #include <sys/panic.h>
  37   39  #include <sys/regset.h>
  38   40  #include <sys/frame.h>
  39   41  #include <sys/kobj.h>
  40   42  #include <sys/apic.h>
  41   43  #include <sys/apic_timer.h>
  42   44  #include <sys/dumphdr.h>
  43   45  #include <sys/mem.h>
  44   46  #include <sys/x86_archext.h>
  45   47  #include <sys/xpv_panic.h>
  46   48  #include <sys/boot_console.h>
  47   49  #include <sys/bootsvcs.h>
  48   50  #include <sys/consdev.h>
  49   51  #include <vm/hat_pte.h>
  50   52  #include <vm/hat_i86.h>
  51   53  
  52   54  /* XXX: need to add a PAE version too, if we ever support both PAE and non */
  53   55  #if defined(__i386)
  54   56  #define XPV_FILENAME    "/boot/xen-syms"
  55   57  #else
  56   58  #define XPV_FILENAME    "/boot/amd64/xen-syms"
  57   59  #endif
  58   60  #define XPV_MODNAME     "xpv"
  59   61  
  60   62  int xpv_panicking = 0;
  61   63  
  62   64  struct module *xpv_module;
  63   65  struct modctl *xpv_modctl;
  64   66  
  65   67  #define ALIGN(x, a)     ((a) == 0 ? (uintptr_t)(x) : \
  66   68          (((uintptr_t)(x) + (uintptr_t)(a) - 1l) & ~((uintptr_t)(a) - 1l)))
  67   69  
  68   70  /* Pointer to the xpv_panic_info structure handed to us by Xen.  */
  69   71  static struct panic_info *xpv_panic_info = NULL;
  70   72  
  71   73  /* Timer support */
  72   74  #define NSEC_SHIFT 5
  73   75  #define T_XPV_TIMER     0xd1
  74   76  #define XPV_TIMER_INTERVAL      1000    /* 1000 microseconds */
  75   77  static uint32_t *xpv_apicadr = NULL;
  76   78  static uint_t   nsec_scale;
  77   79  
  78   80  /* IDT support */
  79   81  #pragma align   16(xpv_panic_idt)
  80   82  static gate_desc_t      xpv_panic_idt[NIDT];    /* interrupt descriptor table */
  81   83  
  82   84  /* Xen pagetables mapped into our HAT's ptable windows */
  83   85  static pfn_t ptable_pfn[MAX_NUM_LEVEL];
  84   86  
  85   87  /* Number of MMU_PAGESIZE pages we're adding to the Solaris dump */
  86   88  static int xpv_dump_pages;
  87   89  
  88   90  /*
  89   91   * There are up to two large swathes of RAM that we don't want to include
  90   92   * in the dump: those that comprise the Xen version of segkpm.  On 32-bit
  91   93   * systems there is no such region of memory.  On 64-bit systems, there
  92   94   * should be just a single contiguous region that corresponds to all of
  93   95   * physical memory.  The tricky bit is that Xen's heap sometimes lives in
  94   96   * the middle of their segkpm, and is mapped using only kpm-like addresses.
  95   97   * In that case, we need to skip the swathes before and after Xen's heap.
  96   98   */
  97   99  uintptr_t kpm1_low = 0;
  98  100  uintptr_t kpm1_high = 0;
  99  101  uintptr_t kpm2_low = 0;
 100  102  uintptr_t kpm2_high = 0;
 101  103  
 102  104  /*
 103  105   * Some commonly used values that we don't want to recompute over and over.
 104  106   */
 105  107  static int xpv_panic_nptes[MAX_NUM_LEVEL];
 106  108  static ulong_t xpv_panic_cr3;
 107  109  static uintptr_t xpv_end;
 108  110  
 109  111  static void xpv_panic_console_print(const char *fmt, ...);
 110  112  static void (*xpv_panic_printf)(const char *, ...) = xpv_panic_console_print;
 111  113  
 112  114  #define CONSOLE_BUF_SIZE        256
 113  115  static char console_buffer[CONSOLE_BUF_SIZE];
 114  116  static boolean_t use_polledio;
 115  117  
 116  118  /*
 117  119   * Pointers to machine check panic info (if any).
 118  120   */
 119  121  xpv_mca_panic_data_t *xpv_mca_panic_data = NULL;
 120  122  
 121  123  static void
 122  124  xpv_panic_putc(int m)
 123  125  {
 124  126          struct cons_polledio *c = cons_polledio;
 125  127  
 126  128          /* This really shouldn't happen */
 127  129          if (boot_console_type(NULL) == CONS_HYPERVISOR)
 128  130                  return;
 129  131  
 130  132          if (use_polledio == B_TRUE)
 131  133                  c->cons_polledio_putchar(c->cons_polledio_argument, m);
 132  134          else
 133  135                  bcons_putchar(m);
 134  136  }
 135  137  
 136  138  static void
 137  139  xpv_panic_puts(char *msg)
 138  140  {
 139  141          char *m;
 140  142  
 141  143          dump_timeleft = dump_timeout;
 142  144          for (m = msg; *m; m++)
 143  145                  xpv_panic_putc((int)*m);
 144  146  }
 145  147  
 146  148  static void
 147  149  xpv_panic_console_print(const char *fmt, ...)
 148  150  {
 149  151          va_list ap;
 150  152  
 151  153          va_start(ap, fmt);
 152  154          (void) vsnprintf(console_buffer, sizeof (console_buffer), fmt, ap);
 153  155          va_end(ap);
 154  156  
 155  157          xpv_panic_puts(console_buffer);
 156  158  }
 157  159  
 158  160  static void
 159  161  xpv_panic_map(int level, pfn_t pfn)
 160  162  {
 161  163          x86pte_t pte, *pteptr;
 162  164  
 163  165          /*
 164  166           * The provided pfn represents a level 'level' page table.  Map it
 165  167           * into the 'level' slot in the list of page table windows.
 166  168           */
  
    | ↓ open down ↓ | 131 lines elided | ↑ open up ↑ | 
 167  169          pteptr = (x86pte_t *)PWIN_PTE_VA(level);
 168  170          pte = pfn_to_pa(pfn) | PT_VALID;
 169  171  
 170  172          XPV_ALLOW_PAGETABLE_UPDATES();
 171  173          if (mmu.pae_hat)
 172  174                  *pteptr = pte;
 173  175          else
 174  176                  *(x86pte32_t *)pteptr = pte;
 175  177          XPV_DISALLOW_PAGETABLE_UPDATES();
 176  178  
 177      -        mmu_tlbflush_entry(PWIN_VA(level));
      179 +        mmu_flush_tlb_page((uintptr_t)PWIN_VA(level));
 178  180  }
 179  181  
 180  182  /*
 181  183   * Walk the page tables to find the pfn mapped by the given va.
 182  184   */
 183  185  static pfn_t
 184  186  xpv_va_walk(uintptr_t *vaddr)
 185  187  {
 186  188          int l, idx;
 187  189          pfn_t pfn;
 188  190          x86pte_t pte;
 189  191          x86pte_t *ptep;
 190  192          uintptr_t va = *vaddr;
 191  193          uintptr_t scan_va;
 192  194          caddr_t ptable_window;
 193  195          static pfn_t toplevel_pfn;
 194  196          static uintptr_t lastva;
 195  197  
 196  198          /*
 197  199           * If we do anything other than a simple scan through memory, don't
 198  200           * trust the mapped page tables.
 199  201           */
 200  202          if (va != lastva + MMU_PAGESIZE)
 201  203                  for (l = mmu.max_level; l >= 0; l--)
 202  204                          ptable_pfn[l] = PFN_INVALID;
 203  205  
 204  206          toplevel_pfn = mmu_btop(xpv_panic_cr3);
 205  207  
 206  208          while (va < xpv_end && va >= *vaddr) {
 207  209                  /* Find the lowest table with any entry for va */
 208  210                  pfn = toplevel_pfn;
 209  211                  for (l = mmu.max_level; l >= 0; l--) {
 210  212                          if (ptable_pfn[l] != pfn) {
 211  213                                  xpv_panic_map(l, pfn);
 212  214                                  ptable_pfn[l] = pfn;
 213  215                          }
 214  216  
 215  217                          /*
 216  218                           * Search this pagetable for any mapping to an
 217  219                           * address >= va.
 218  220                           */
 219  221                          ptable_window = PWIN_VA(l);
 220  222                          if (l == mmu.max_level && mmu.pae_hat)
 221  223                                  ptable_window +=
 222  224                                      (xpv_panic_cr3 & MMU_PAGEOFFSET);
 223  225  
 224  226                          idx = (va >> LEVEL_SHIFT(l)) & (xpv_panic_nptes[l] - 1);
 225  227                          scan_va = va;
 226  228                          while (idx < xpv_panic_nptes[l] && scan_va < xpv_end &&
 227  229                              scan_va >= *vaddr) {
 228  230                                  ptep = (x86pte_t *)(ptable_window +
 229  231                                      (idx << mmu.pte_size_shift));
 230  232                                  pte = GET_PTE(ptep);
 231  233                                  if (pte & PTE_VALID)
 232  234                                          break;
 233  235                                  idx++;
 234  236                                  scan_va += mmu.level_size[l];
 235  237                          }
 236  238  
 237  239                          /*
 238  240                           * If there are no valid mappings in this table, we
 239  241                           * can skip to the end of the VA range it covers.
 240  242                           */
 241  243                          if (idx == xpv_panic_nptes[l]) {
 242  244                                  va = NEXT_ENTRY_VA(va, l + 1);
 243  245                                  break;
 244  246                          }
 245  247  
 246  248                          va = scan_va;
 247  249                          /*
 248  250                           * See if we've hit the end of the range.
 249  251                           */
 250  252                          if (va >= xpv_end || va < *vaddr)
 251  253                                  break;
 252  254  
 253  255                          /*
 254  256                           * If this mapping is for a pagetable, we drop down
 255  257                           * to the next level in the hierarchy and look for
 256  258                           * a mapping in it.
 257  259                           */
 258  260                          pfn = PTE2MFN(pte, l);
 259  261                          if (!PTE_ISPAGE(pte, l))
 260  262                                  continue;
 261  263  
 262  264                          /*
 263  265                           * The APIC page is magic.  Nothing to see here;
 264  266                           * move along.
 265  267                           */
 266  268                          if (((uintptr_t)xpv_apicadr & MMU_PAGEMASK) ==
 267  269                              (va & MMU_PAGEMASK)) {
 268  270                                  va += MMU_PAGESIZE;
 269  271                                  break;
 270  272                          }
 271  273  
 272  274                          /*
 273  275                           * See if the address is within one of the two
 274  276                           * kpm-like regions we want to skip.
 275  277                           */
 276  278                          if (va >= kpm1_low && va < kpm1_high) {
 277  279                                  va = kpm1_high;
 278  280                                  break;
 279  281                          }
 280  282                          if (va >= kpm2_low && va < kpm2_high) {
 281  283                                  va = kpm2_high;
 282  284                                  break;
 283  285                          }
 284  286  
 285  287                          /*
 286  288                           * The Xen panic code only handles small pages.  If
 287  289                           * this mapping is for a large page, we need to
 288  290                           * identify the consituent page that covers the
 289  291                           * specific VA we were looking for.
 290  292                           */
 291  293                          if (l > 0) {
 292  294                                  if (l > 1)
 293  295                                          panic("Xen panic can't cope with "
 294  296                                              "giant pages.");
 295  297                                  idx = (va >> LEVEL_SHIFT(0)) &
 296  298                                      (xpv_panic_nptes[0] - 1);
 297  299                                  pfn += idx;
 298  300                          }
 299  301  
 300  302                          *vaddr = va;
 301  303                          lastva = va;
 302  304                          return (pfn | PFN_IS_FOREIGN_MFN);
 303  305                  }
 304  306          }
 305  307          return (PFN_INVALID);
 306  308  }
 307  309  
 308  310  /*
 309  311   * Walk through the Xen VA space, finding pages that are mapped in.
 310  312   *
 311  313   * These pages all have MFNs rather than PFNs, meaning they may be outside
 312  314   * the physical address space the kernel knows about, or they may collide
 313  315   * with PFNs the kernel is using.
 314  316   *
 315  317   * The obvious trick of just adding the PFN_IS_FOREIGN_MFN bit to the MFNs
 316  318   * to avoid collisions doesn't work.  The pages need to be written to disk
 317  319   * in PFN-order or savecore gets confused.  We can't allocate memory to
 318  320   * construct a sorted pfn->VA reverse mapping, so we have to write the pages
 319  321   * to disk in VA order.
 320  322   *
 321  323   * To square this circle, we simply make up PFNs for each of Xen's pages.
 322  324   * We assign each mapped page a fake PFN in ascending order.  These fake
 323  325   * PFNs each have the FOREIGN bit set, ensuring that they fall outside the
 324  326   * range of Solaris PFNs written by the kernel.
 325  327   */
 326  328  int
 327  329  dump_xpv_addr()
 328  330  {
 329  331          uintptr_t va;
 330  332          mem_vtop_t mem_vtop;
 331  333  
 332  334          xpv_dump_pages = 0;
 333  335          va = xen_virt_start;
 334  336  
 335  337          while (xpv_va_walk(&va) != PFN_INVALID) {
 336  338                  mem_vtop.m_as = &kas;
 337  339                  mem_vtop.m_va = (void *)va;
 338  340                  mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN;
 339  341  
 340  342                  dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
 341  343                  xpv_dump_pages++;
 342  344  
 343  345                  va += MMU_PAGESIZE;
 344  346          }
 345  347  
 346  348          /*
 347  349           * Add the shared_info page.  This page actually ends up in the
 348  350           * dump twice: once for the Xen va and once for the Solaris va.
 349  351           * This isn't ideal, but we don't know the address Xen is using for
 350  352           * the page, so we can't share it.
 351  353           */
 352  354          mem_vtop.m_as = &kas;
 353  355          mem_vtop.m_va = HYPERVISOR_shared_info;
 354  356          mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN;
 355  357          dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
 356  358          xpv_dump_pages++;
 357  359  
 358  360          return (xpv_dump_pages);
 359  361  }
 360  362  
 361  363  void
 362  364  dump_xpv_pfn()
 363  365  {
 364  366          pfn_t pfn;
 365  367          int cnt;
 366  368  
 367  369          for (cnt = 0; cnt < xpv_dump_pages; cnt++) {
 368  370                  pfn = (pfn_t)cnt | PFN_IS_FOREIGN_MFN;
 369  371                  dumpvp_write(&pfn, sizeof (pfn));
 370  372          }
 371  373  }
 372  374  
 373  375  int
 374  376  dump_xpv_data(void *dump_cbuf)
 375  377  {
 376  378          uintptr_t va;
 377  379          uint32_t csize;
 378  380          int cnt = 0;
 379  381  
 380  382          /*
 381  383           * XXX: we should probably run this data through a UE check.  The
 382  384           * catch is that the UE code relies on on_trap() and getpfnum()
 383  385           * working.
 384  386           */
 385  387          va = xen_virt_start;
 386  388  
 387  389          while (xpv_va_walk(&va) != PFN_INVALID) {
 388  390                  csize = (uint32_t)compress((void *)va, dump_cbuf, PAGESIZE);
 389  391                  dumpvp_write(&csize, sizeof (uint32_t));
 390  392                  dumpvp_write(dump_cbuf, csize);
 391  393                  if (dump_ioerr) {
 392  394                          dumphdr->dump_flags &= ~DF_COMPLETE;
 393  395                          return (cnt);
 394  396                  }
 395  397                  cnt++;
 396  398                  va += MMU_PAGESIZE;
 397  399          }
 398  400  
 399  401          /*
 400  402           * Finally, dump the shared_info page
 401  403           */
 402  404          csize = (uint32_t)compress((void *)HYPERVISOR_shared_info, dump_cbuf,
 403  405              PAGESIZE);
 404  406          dumpvp_write(&csize, sizeof (uint32_t));
 405  407          dumpvp_write(dump_cbuf, csize);
 406  408          if (dump_ioerr)
 407  409                  dumphdr->dump_flags &= ~DF_COMPLETE;
 408  410          cnt++;
 409  411  
 410  412          return (cnt);
 411  413  }
 412  414  
 413  415  static void *
 414  416  showstack(void *fpreg, int xpv_only)
 415  417  {
 416  418          struct frame *fpp;
 417  419          ulong_t off;
 418  420          char *sym;
 419  421          uintptr_t pc, fp, lastfp;
 420  422          uintptr_t minaddr = min(KERNELBASE, xen_virt_start);
 421  423  
 422  424          fp = (uintptr_t)fpreg;
 423  425          if (fp < minaddr) {
 424  426                  xpv_panic_printf("Bad frame ptr: 0x%p\n", fpreg);
 425  427                  return (fpreg);
 426  428          }
 427  429  
 428  430          do {
 429  431                  fpp = (struct frame *)fp;
 430  432                  pc = fpp->fr_savpc;
 431  433  
 432  434                  if ((xpv_only != 0) &&
 433  435                      (fp > xpv_end || fp < xen_virt_start))
 434  436                          break;
 435  437                  if ((sym = kobj_getsymname(pc, &off)) != NULL)
 436  438                          xpv_panic_printf("%08lx %s:%s+%lx\n", fp,
 437  439                              mod_containing_pc((caddr_t)pc), sym, off);
 438  440                  else if ((pc >= xen_virt_start) && (pc <= xpv_end))
 439  441                          xpv_panic_printf("%08lx 0x%lx (in Xen)\n", fp, pc);
 440  442                  else
 441  443                          xpv_panic_printf("%08lx %lx\n", fp, pc);
 442  444  
 443  445                  lastfp = fp;
 444  446                  fp = fpp->fr_savfp;
 445  447  
 446  448                  /*
 447  449                   * Xen marks an exception frame by inverting the frame
 448  450                   * pointer.
 449  451                   */
 450  452                  if (fp < lastfp) {
 451  453                          if ((~fp > minaddr) && ((~fp) ^ lastfp) < 0xfff)
 452  454                                  fp = ~fp;
 453  455                  }
 454  456          } while (fp > lastfp);
 455  457          return ((void *)fp);
 456  458  }
 457  459  
 458  460  void *
 459  461  xpv_traceback(void *fpreg)
 460  462  {
 461  463          return (showstack(fpreg, 1));
 462  464  }
 463  465  
 464  466  #if defined(__amd64)
 465  467  static void
 466  468  xpv_panic_hypercall(ulong_t call)
 467  469  {
 468  470          panic("Illegally issued hypercall %d during panic!\n", (int)call);
 469  471  }
 470  472  #endif
 471  473  
 472  474  void
 473  475  xpv_die(struct regs *rp)
 474  476  {
 475  477          struct panic_trap_info ti;
 476  478          struct cregs creg;
 477  479  
 478  480          ti.trap_regs = rp;
 479  481          ti.trap_type = rp->r_trapno;
 480  482  
 481  483          curthread->t_panic_trap = &ti;
 482  484          if (ti.trap_type == T_PGFLT) {
 483  485                  getcregs(&creg);
 484  486                  ti.trap_addr = (caddr_t)creg.cr_cr2;
 485  487                  panic("Fatal pagefault at 0x%lx.  fault addr=0x%p  rp=0x%p",
 486  488                      rp->r_pc, (void *)ti.trap_addr, (void *)rp);
 487  489          } else {
 488  490                  ti.trap_addr = (caddr_t)rp->r_pc;
 489  491                  panic("Fatal trap %ld at 0x%lx.  rp=0x%p", rp->r_trapno,
 490  492                      rp->r_pc, (void *)rp);
 491  493          }
 492  494  }
 493  495  
 494  496  /*
 495  497   * Build IDT to handle a Xen panic
 496  498   */
 497  499  static void
 498  500  switch_to_xpv_panic_idt()
 499  501  {
 500  502          int i;
 501  503          desctbr_t idtr;
 502  504          gate_desc_t *idt = xpv_panic_idt;
 503  505          selector_t cs = get_cs_register();
 504  506  
 505  507          for (i = 0; i < 32; i++)
 506  508                  set_gatesegd(&idt[i], &xpv_invaltrap, cs, SDT_SYSIGT, TRP_XPL,
 507  509                      0);
 508  510  
 509  511          set_gatesegd(&idt[T_ZERODIV], &xpv_div0trap, cs, SDT_SYSIGT, TRP_XPL,
 510  512              0);
 511  513          set_gatesegd(&idt[T_SGLSTP], &xpv_dbgtrap, cs, SDT_SYSIGT, TRP_XPL, 0);
 512  514          set_gatesegd(&idt[T_NMIFLT], &xpv_nmiint, cs, SDT_SYSIGT, TRP_XPL, 0);
 513  515          set_gatesegd(&idt[T_BOUNDFLT], &xpv_boundstrap, cs, SDT_SYSIGT,
 514  516              TRP_XPL, 0);
 515  517          set_gatesegd(&idt[T_ILLINST], &xpv_invoptrap, cs, SDT_SYSIGT, TRP_XPL,
 516  518              0);
 517  519          set_gatesegd(&idt[T_NOEXTFLT], &xpv_ndptrap, cs, SDT_SYSIGT, TRP_XPL,
 518  520              0);
 519  521          set_gatesegd(&idt[T_TSSFLT], &xpv_invtsstrap, cs, SDT_SYSIGT, TRP_XPL,
 520  522              0);
 521  523          set_gatesegd(&idt[T_SEGFLT], &xpv_segnptrap, cs, SDT_SYSIGT, TRP_XPL,
 522  524              0);
 523  525          set_gatesegd(&idt[T_STKFLT], &xpv_stktrap, cs, SDT_SYSIGT, TRP_XPL, 0);
 524  526          set_gatesegd(&idt[T_GPFLT], &xpv_gptrap, cs, SDT_SYSIGT, TRP_XPL, 0);
 525  527          set_gatesegd(&idt[T_PGFLT], &xpv_pftrap, cs, SDT_SYSIGT, TRP_XPL, 0);
 526  528          set_gatesegd(&idt[T_EXTERRFLT], &xpv_ndperr, cs, SDT_SYSIGT, TRP_XPL,
 527  529              0);
 528  530          set_gatesegd(&idt[T_ALIGNMENT], &xpv_achktrap, cs, SDT_SYSIGT, TRP_XPL,
 529  531              0);
 530  532          set_gatesegd(&idt[T_MCE], &xpv_mcetrap, cs, SDT_SYSIGT, TRP_XPL, 0);
 531  533          set_gatesegd(&idt[T_SIMDFPE], &xpv_xmtrap, cs, SDT_SYSIGT, TRP_XPL, 0);
 532  534  
 533  535          /*
 534  536           * We have no double fault handler.  Any single fault represents a
 535  537           * catastrophic failure for us, so there is no attempt to handle
 536  538           * them cleanly: we just print a message and reboot.  If we
 537  539           * encounter a second fault while doing that, there is nothing
 538  540           * else we can do.
 539  541           */
 540  542  
 541  543          /*
 542  544           * Be prepared to absorb any stray device interrupts received
 543  545           * while writing the core to disk.
 544  546           */
 545  547          for (i = 33; i < NIDT; i++)
 546  548                  set_gatesegd(&idt[i], &xpv_surprise_intr, cs, SDT_SYSIGT,
 547  549                      TRP_XPL, 0);
 548  550  
 549  551          /* The one interrupt we expect to get is from the APIC timer.  */
 550  552          set_gatesegd(&idt[T_XPV_TIMER], &xpv_timer_trap, cs, SDT_SYSIGT,
 551  553              TRP_XPL, 0);
 552  554  
 553  555          idtr.dtr_base = (uintptr_t)xpv_panic_idt;
 554  556          idtr.dtr_limit = sizeof (xpv_panic_idt) - 1;
 555  557          wr_idtr(&idtr);
 556  558  
 557  559  #if defined(__amd64)
 558  560          /* Catch any hypercalls. */
 559  561          wrmsr(MSR_AMD_LSTAR, (uintptr_t)xpv_panic_hypercall);
 560  562          wrmsr(MSR_AMD_CSTAR, (uintptr_t)xpv_panic_hypercall);
 561  563  #endif
 562  564  }
 563  565  
 564  566  static void
 565  567  xpv_apic_clkinit()
 566  568  {
 567  569          uint_t          apic_ticks = 0;
 568  570  
 569  571          /*
 570  572           * Measure how many APIC ticks there are within a fixed time
 571  573           * period.  We're going to be fairly coarse here.  This timer is
 572  574           * just being used to detect a stalled panic, so as long as we have
 573  575           * the right order of magnitude, everything should be fine.
 574  576           */
 575  577          xpv_apicadr[APIC_SPUR_INT_REG] = AV_UNIT_ENABLE | APIC_SPUR_INTR;
 576  578          xpv_apicadr[APIC_LOCAL_TIMER] = AV_MASK;
 577  579          xpv_apicadr[APIC_INT_VECT0] = AV_MASK;  /* local intr reg 0 */
 578  580  
 579  581          xpv_apicadr[APIC_DIVIDE_REG] = 0;
 580  582          xpv_apicadr[APIC_INIT_COUNT] = APIC_MAXVAL;
 581  583          drv_usecwait(XPV_TIMER_INTERVAL);
 582  584          apic_ticks = APIC_MAXVAL - xpv_apicadr[APIC_CURR_COUNT];
 583  585  
 584  586          /*
 585  587           * apic_ticks now represents roughly how many apic ticks comprise
 586  588           * one timeout interval.  Program the timer to send us an interrupt
 587  589           * every time that interval expires.
 588  590           */
 589  591          xpv_apicadr[APIC_LOCAL_TIMER] = T_XPV_TIMER | AV_PERIODIC;
 590  592          xpv_apicadr[APIC_INIT_COUNT] = apic_ticks;
 591  593          xpv_apicadr[APIC_EOI_REG] = 0;
 592  594  }
 593  595  
 594  596  void
 595  597  xpv_timer_tick(void)
 596  598  {
 597  599          static int ticks = 0;
 598  600  
 599  601          if (ticks++ >= MICROSEC / XPV_TIMER_INTERVAL) {
 600  602                  ticks = 0;
 601  603                  if (dump_timeleft && (--dump_timeleft == 0))
 602  604                          panic("Xen panic timeout\n");
 603  605          }
 604  606          xpv_apicadr[APIC_EOI_REG] = 0;
 605  607  }
 606  608  
 607  609  void
 608  610  xpv_interrupt(void)
 609  611  {
 610  612  #ifdef  DEBUG
 611  613          static int cnt = 0;
 612  614  
 613  615          if (cnt++ < 10)
 614  616                  xpv_panic_printf("Unexpected interrupt received.\n");
 615  617          if ((cnt < 1000) && ((cnt % 100) == 0))
 616  618                  xpv_panic_printf("%d unexpected interrupts received.\n", cnt);
 617  619  #endif
 618  620  
 619  621          xpv_apicadr[APIC_EOI_REG] = 0;
 620  622  }
 621  623  
 622  624  /*
 623  625   * Managing time in panic context is trivial.  We only have a single CPU,
 624  626   * we never get rescheduled, we never get suspended.  We just need to
 625  627   * convert clock ticks into nanoseconds.
 626  628   */
 627  629  static hrtime_t
 628  630  xpv_panic_gethrtime(void)
 629  631  {
 630  632          hrtime_t tsc, hrt;
 631  633          unsigned int *l = (unsigned int *)&(tsc);
 632  634  
 633  635          tsc = __rdtsc_insn();
 634  636          hrt = (mul32(l[1], nsec_scale) << NSEC_SHIFT) +
 635  637              (mul32(l[0], nsec_scale) >> (32 - NSEC_SHIFT));
 636  638  
 637  639          return (hrt);
 638  640  }
 639  641  
 640  642  static void
 641  643  xpv_panic_time_init()
 642  644  {
 643  645          nsec_scale =
 644  646              CPU->cpu_m.mcpu_vcpu_info->time.tsc_to_system_mul >> NSEC_SHIFT;
 645  647  
 646  648          gethrtimef = xpv_panic_gethrtime;
 647  649  }
 648  650  
 649  651  static void
 650  652  xpv_panicsys(struct regs *rp, char *fmt, ...)
 651  653  {
 652  654          extern void panicsys(const char *, va_list, struct regs *, int);
 653  655          va_list alist;
 654  656  
 655  657          va_start(alist, fmt);
 656  658          panicsys(fmt, alist, rp, 1);
 657  659          va_end(alist);
 658  660  }
 659  661  
 660  662  void
 661  663  xpv_do_panic(void *arg)
 662  664  {
 663  665          struct panic_info *pip = (struct panic_info *)arg;
 664  666          int l;
 665  667          struct cregs creg;
 666  668  #if defined(__amd64)
 667  669          extern uintptr_t postbootkernelbase;
 668  670  #endif
 669  671  
 670  672          if (xpv_panicking++ > 0)
 671  673                  panic("multiple calls to xpv_do_panic()");
 672  674  
 673  675          /*
 674  676           * Indicate to the underlying panic framework that a panic has been
 675  677           * initiated.  This is ordinarily done as part of vpanic().  Since
 676  678           * we already have all the register state saved by the hypervisor,
 677  679           * we skip that and jump straight into the panic processing code.
 678  680           *
 679  681           * XXX If another thread grabs and wins the panic_quiesce trigger
 680  682           * then we'll have two threads in panicsys believing they are in
 681  683           * charge of the panic attempt!
 682  684           */
 683  685          (void) panic_trigger(&panic_quiesce);
 684  686  
 685  687  #if defined(__amd64)
 686  688          /*
 687  689           * bzero() and bcopy() get unhappy when asked to operate on
 688  690           * addresses outside of the kernel.  At this point Xen is really a
 689  691           * part of the kernel, so we update the routines' notion of where
 690  692           * the kernel starts.
 691  693           */
 692  694          postbootkernelbase = xen_virt_start;
 693  695  #endif
 694  696  
 695  697  #if defined(HYPERVISOR_VIRT_END)
 696  698          xpv_end = HYPERVISOR_VIRT_END;
 697  699  #else
 698  700          xpv_end = (uintptr_t)UINTPTR_MAX - sizeof (uintptr_t);
 699  701  #endif
 700  702  
 701  703          /*
 702  704           * If we were redirecting console output to the hypervisor, we have
 703  705           * to stop.
 704  706           */
 705  707          use_polledio = B_FALSE;
 706  708          if (boot_console_type(NULL) == CONS_HYPERVISOR) {
 707  709                  bcons_device_change(CONS_HYPERVISOR);
 708  710          } else if (cons_polledio != NULL &&
 709  711              cons_polledio->cons_polledio_putchar != NULL)  {
 710  712                  if (cons_polledio->cons_polledio_enter != NULL)
 711  713                          cons_polledio->cons_polledio_enter(
 712  714                              cons_polledio->cons_polledio_argument);
 713  715                  use_polledio = 1;
 714  716          }
 715  717  
 716  718          /* Make sure we handle all console output from here on. */
 717  719          sysp->bsvc_putchar = xpv_panic_putc;
 718  720  
 719  721          /*
 720  722           * If we find an unsupported panic_info structure, there's not much
 721  723           * we can do other than complain, plow on, and hope for the best.
 722  724           */
 723  725          if (pip->pi_version != PANIC_INFO_VERSION)
 724  726                  xpv_panic_printf("Warning: Xen is using an unsupported "
 725  727                      "version of the panic_info structure.\n");
 726  728  
 727  729          xpv_panic_info = pip;
 728  730  
 729  731  #if defined(__amd64)
 730  732          kpm1_low = (uintptr_t)xpv_panic_info->pi_ram_start;
 731  733          if (xpv_panic_info->pi_xen_start == NULL) {
 732  734                  kpm1_high = (uintptr_t)xpv_panic_info->pi_ram_end;
 733  735          } else {
 734  736                  kpm1_high = (uintptr_t)xpv_panic_info->pi_xen_start;
 735  737                  kpm2_low = (uintptr_t)xpv_panic_info->pi_xen_end;
 736  738                  kpm2_high = (uintptr_t)xpv_panic_info->pi_ram_end;
 737  739          }
 738  740  #endif
 739  741  
 740  742          /*
 741  743           * Make sure we are running on the Solaris %gs.  The Xen panic code
 742  744           * should already have set up the GDT properly.
 743  745           */
 744  746          xpv_panic_resetgs();
 745  747  #if defined(__amd64)
 746  748          wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);
 747  749  #endif
 748  750  
 749  751          xpv_panic_time_init();
 750  752  
 751  753          /*
 752  754           * Switch to our own IDT, avoiding any accidental returns to Xen
 753  755           * world.
 754  756           */
 755  757          switch_to_xpv_panic_idt();
 756  758  
 757  759          /*
 758  760           * Initialize the APIC timer, which is used to detect a hung dump
 759  761           * attempt.
 760  762           */
 761  763          xpv_apicadr = pip->pi_apic;
 762  764          xpv_apic_clkinit();
 763  765  
 764  766          /*
 765  767           * Set up a few values that we'll need repeatedly.
 766  768           */
 767  769          getcregs(&creg);
 768  770          xpv_panic_cr3 = creg.cr_cr3;
 769  771          for (l = mmu.max_level; l >= 0; l--)
 770  772                  xpv_panic_nptes[l] = mmu.ptes_per_table;
 771  773  #ifdef __i386
 772  774          if (mmu.pae_hat)
 773  775                  xpv_panic_nptes[mmu.max_level] = 4;
 774  776  #endif
 775  777  
 776  778          /* Add the fake Xen module to the module list */
 777  779          if (xpv_module != NULL) {
 778  780                  extern int last_module_id;
 779  781  
 780  782                  xpv_modctl->mod_id = last_module_id++;
 781  783                  xpv_modctl->mod_next = &modules;
 782  784                  xpv_modctl->mod_prev = modules.mod_prev;
 783  785                  modules.mod_prev->mod_next = xpv_modctl;
 784  786                  modules.mod_prev = xpv_modctl;
 785  787          }
 786  788  
 787  789          if (pip->pi_mca.mpd_magic == MCA_PANICDATA_MAGIC)
 788  790                  xpv_mca_panic_data = &pip->pi_mca;
 789  791  
 790  792          xpv_panic_printf = printf;
 791  793          xpv_panicsys((struct regs *)pip->pi_regs, pip->pi_panicstr);
 792  794          xpv_panic_printf("Failed to reboot following panic.\n");
 793  795          for (;;)
 794  796                  ;
 795  797  }
 796  798  
 797  799  /*
 798  800   * Set up the necessary data structures to pretend that the Xen hypervisor
 799  801   * is a loadable module, allowing mdb to find the Xen symbols in a crash
 800  802   * dump.  Since these symbols all map to VA space Solaris doesn't normally
 801  803   * have access to, we don't link these structures into the kernel's lists
 802  804   * until/unless we hit a Xen panic.
 803  805   *
 804  806   * The observant reader will note a striking amount of overlap between this
 805  807   * code and that found in krtld.  While it would be handy if we could just
 806  808   * ask krtld to do this work for us, it's not that simple.  Among the
 807  809   * complications: we're not actually loading the text here (grub did it at
 808  810   * boot), the .text section is writable, there are no relocations to do,
 809  811   * none of the module text/data is in readable memory, etc.  Training krtld
 810  812   * to deal with this weird module is as complicated, and more risky, than
 811  813   * reimplementing the necessary subset of it here.
 812  814   */
 813  815  static void
 814  816  init_xen_module()
 815  817  {
 816  818          struct _buf *file = NULL;
 817  819          struct module *mp;
 818  820          struct modctl *mcp;
 819  821          int i, shn;
 820  822          Shdr *shp, *ctf_shp;
 821  823          char *names = NULL;
 822  824          size_t n, namesize, text_align, data_align;
 823  825  #if defined(__amd64)
 824  826          const char machine = EM_AMD64;
 825  827  #else
 826  828          const char machine = EM_386;
 827  829  #endif
 828  830  
 829  831          /* Allocate and init the module structure */
 830  832          mp = kmem_zalloc(sizeof (*mp), KM_SLEEP);
 831  833          mp->filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP);
 832  834          (void) strcpy(mp->filename, XPV_FILENAME);
 833  835  
 834  836          /* Allocate and init the modctl structure */
 835  837          mcp = kmem_zalloc(sizeof (*mcp), KM_SLEEP);
 836  838          mcp->mod_modname = kobj_zalloc(strlen(XPV_MODNAME) + 1, KM_SLEEP);
 837  839          (void) strcpy(mcp->mod_modname, XPV_MODNAME);
 838  840          mcp->mod_filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP);
 839  841          (void) strcpy(mcp->mod_filename, XPV_FILENAME);
 840  842          mcp->mod_inprogress_thread = (kthread_id_t)-1;
 841  843          mcp->mod_ref = 1;
 842  844          mcp->mod_loaded = 1;
 843  845          mcp->mod_loadcnt = 1;
 844  846          mcp->mod_mp = mp;
 845  847  
 846  848          /*
 847  849           * Try to open a Xen image that hasn't had its symbol and CTF
 848  850           * information stripped off.
 849  851           */
 850  852          file = kobj_open_file(XPV_FILENAME);
 851  853          if (file == (struct _buf *)-1) {
 852  854                  file = NULL;
 853  855                  goto err;
 854  856          }
 855  857  
 856  858          /*
 857  859           * Read the header and ensure that this is an ELF file for the
 858  860           * proper ISA.  If it's not, somebody has done something very
 859  861           * stupid.  Why bother?  See Mencken.
 860  862           */
 861  863          if (kobj_read_file(file, (char *)&mp->hdr, sizeof (mp->hdr), 0) < 0)
 862  864                  goto err;
 863  865          for (i = 0; i < SELFMAG; i++)
 864  866                  if (mp->hdr.e_ident[i] != ELFMAG[i])
 865  867                          goto err;
 866  868          if ((mp->hdr.e_ident[EI_DATA] != ELFDATA2LSB) ||
 867  869              (mp->hdr.e_machine != machine))
 868  870                  goto err;
 869  871  
 870  872          /* Read in the section headers */
 871  873          n = mp->hdr.e_shentsize * mp->hdr.e_shnum;
 872  874          mp->shdrs = kmem_zalloc(n, KM_SLEEP);
 873  875          if (kobj_read_file(file, mp->shdrs, n, mp->hdr.e_shoff) < 0)
 874  876                  goto err;
 875  877  
 876  878          /* Read the section names */
 877  879          shp = (Shdr *)(mp->shdrs + mp->hdr.e_shstrndx * mp->hdr.e_shentsize);
 878  880          namesize = shp->sh_size;
 879  881          names = kmem_zalloc(shp->sh_size, KM_SLEEP);
 880  882          if (kobj_read_file(file, names, shp->sh_size, shp->sh_offset) < 0)
 881  883                  goto err;
 882  884  
 883  885          /*
 884  886           * Fill in the text and data size fields.
 885  887           */
 886  888          ctf_shp = NULL;
 887  889          text_align = data_align = 0;
 888  890          for (shn = 1; shn < mp->hdr.e_shnum; shn++) {
 889  891                  shp = (Shdr *)(mp->shdrs + shn * mp->hdr.e_shentsize);
 890  892  
 891  893                  /* Sanity check the offset of the section name */
 892  894                  if (shp->sh_name >= namesize)
 893  895                          continue;
 894  896  
 895  897                  /* If we find the symtab section, remember it for later. */
 896  898                  if (shp->sh_type == SHT_SYMTAB) {
 897  899                          mp->symtbl_section = shn;
 898  900                          mp->symhdr = shp;
 899  901                          continue;
 900  902                  }
 901  903  
 902  904                  /* If we find the CTF section, remember it for later. */
 903  905                  if ((shp->sh_size != 0) &&
 904  906                      (strcmp(names + shp->sh_name, ".SUNW_ctf") == 0)) {
 905  907                          ctf_shp = shp;
 906  908                          continue;
 907  909                  }
 908  910  
 909  911                  if (!(shp->sh_flags & SHF_ALLOC))
 910  912                          continue;
 911  913  
 912  914                  /*
 913  915                   * Xen marks its text section as writable, so we need to
 914  916                   * look for the name - not just the flag.
 915  917                   */
 916  918                  if ((strcmp(&names[shp->sh_name], ".text") != 0) &&
 917  919                      (shp->sh_flags & SHF_WRITE) != 0) {
 918  920                          if (shp->sh_addralign > data_align)
 919  921                                  data_align = shp->sh_addralign;
 920  922                          mp->data_size = ALIGN(mp->data_size, data_align);
 921  923                          mp->data_size += ALIGN(shp->sh_size, 8);
 922  924                          if (mp->data == NULL || mp->data > (char *)shp->sh_addr)
 923  925                                  mp->data = (char *)shp->sh_addr;
 924  926                  } else {
 925  927                          if (shp->sh_addralign > text_align)
 926  928                                  text_align = shp->sh_addralign;
 927  929                          mp->text_size = ALIGN(mp->text_size, text_align);
 928  930                          mp->text_size += ALIGN(shp->sh_size, 8);
 929  931                          if (mp->text == NULL || mp->text > (char *)shp->sh_addr)
 930  932                                  mp->text = (char *)shp->sh_addr;
 931  933                  }
 932  934          }
 933  935          kmem_free(names, namesize);
 934  936          names = NULL;
 935  937          shp = NULL;
 936  938          mcp->mod_text = mp->text;
 937  939          mcp->mod_text_size = mp->text_size;
 938  940  
 939  941          /*
 940  942           * If we have symbol table and string table sections, read them in
 941  943           * now.  If we don't, we just plow on.  We'll still get a valid
 942  944           * core dump, but finding anything useful will be just a bit
 943  945           * harder.
 944  946           *
 945  947           * Note: we don't bother with a hash table.  We'll never do a
 946  948           * symbol lookup unless we crash, and then mdb creates its own.  We
 947  949           * also don't try to perform any relocations.  Xen should be loaded
 948  950           * exactly where the ELF file indicates, and the symbol information
 949  951           * in the file should be complete and correct already.  Static
 950  952           * linking ain't all bad.
 951  953           */
 952  954          if ((mp->symhdr != NULL) && (mp->symhdr->sh_link < mp->hdr.e_shnum)) {
 953  955                  mp->strhdr = (Shdr *)
 954  956                      (mp->shdrs + mp->symhdr->sh_link * mp->hdr.e_shentsize);
 955  957                  mp->nsyms = mp->symhdr->sh_size / mp->symhdr->sh_entsize;
 956  958  
 957  959                  /* Allocate space for the symbol table and strings.  */
 958  960                  mp->symsize = mp->symhdr->sh_size +
 959  961                      mp->nsyms * sizeof (symid_t) + mp->strhdr->sh_size;
 960  962                  mp->symspace = kmem_zalloc(mp->symsize, KM_SLEEP);
 961  963                  mp->symtbl = mp->symspace;
 962  964                  mp->strings = (char *)(mp->symtbl + mp->symhdr->sh_size);
 963  965  
 964  966                  if ((kobj_read_file(file, mp->symtbl,
 965  967                      mp->symhdr->sh_size, mp->symhdr->sh_offset) < 0) ||
 966  968                      (kobj_read_file(file, mp->strings,
 967  969                      mp->strhdr->sh_size, mp->strhdr->sh_offset) < 0))
 968  970                          goto err;
 969  971          }
 970  972  
 971  973          /*
 972  974           * Read in the CTF section
 973  975           */
 974  976          if ((ctf_shp != NULL) && ((moddebug & MODDEBUG_NOCTF) == 0)) {
 975  977                  mp->ctfdata = kmem_zalloc(ctf_shp->sh_size, KM_SLEEP);
 976  978                  mp->ctfsize = ctf_shp->sh_size;
 977  979                  if (kobj_read_file(file, mp->ctfdata, mp->ctfsize,
 978  980                      ctf_shp->sh_offset) < 0)
 979  981                          goto err;
 980  982          }
 981  983  
 982  984          kobj_close_file(file);
 983  985  
 984  986          xpv_module = mp;
 985  987          xpv_modctl = mcp;
 986  988          return;
 987  989  
 988  990  err:
 989  991          cmn_err(CE_WARN, "Failed to initialize xpv module.");
 990  992          if (file != NULL)
 991  993                  kobj_close_file(file);
 992  994  
 993  995          kmem_free(mp->filename, strlen(XPV_FILENAME) + 1);
 994  996          if (mp->shdrs != NULL)
 995  997                  kmem_free(mp->shdrs, mp->hdr.e_shentsize * mp->hdr.e_shnum);
 996  998          if (mp->symspace != NULL)
 997  999                  kmem_free(mp->symspace, mp->symsize);
 998 1000          if (mp->ctfdata != NULL)
 999 1001                  kmem_free(mp->ctfdata, mp->ctfsize);
1000 1002          kmem_free(mp, sizeof (*mp));
1001 1003          kmem_free(mcp->mod_filename, strlen(XPV_FILENAME) + 1);
1002 1004          kmem_free(mcp->mod_modname, strlen(XPV_MODNAME) + 1);
1003 1005          kmem_free(mcp, sizeof (*mcp));
1004 1006          if (names != NULL)
1005 1007                  kmem_free(names, namesize);
1006 1008  }
1007 1009  
1008 1010  void
1009 1011  xpv_panic_init()
1010 1012  {
1011 1013          xen_platform_op_t op;
1012 1014          int i;
1013 1015  
1014 1016          ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));
1015 1017  
1016 1018          for (i = 0; i < mmu.num_level; i++)
1017 1019                  ptable_pfn[i] = PFN_INVALID;
1018 1020  
1019 1021          /* Let Xen know where to jump if/when it panics. */
1020 1022          op.cmd = XENPF_panic_init;
1021 1023          op.interface_version = XENPF_INTERFACE_VERSION;
1022 1024          op.u.panic_init.panic_addr = (unsigned long)xpv_panic_hdlr;
1023 1025  
1024 1026          (void) HYPERVISOR_platform_op(&op);
1025 1027  
1026 1028          init_xen_module();
1027 1029  }
  
    | ↓ open down ↓ | 840 lines elided | ↑ open up ↑ | 
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX