Print this page
8956 Implement KPTI
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/i86xpv/os/xpv_panic.c
+++ new/usr/src/uts/i86xpv/os/xpv_panic.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
↓ open down ↓ |
15 lines elided |
↑ open up ↑ |
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2012 Gary Mills
23 23 * Copyright 2016 PALO, Richard.
24 24 *
25 25 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
26 + *
27 + * Copyright 2018 Joyent, Inc.
26 28 */
27 29
28 30 #include <sys/types.h>
29 31 #include <sys/clock.h>
30 32 #include <sys/psm.h>
31 33 #include <sys/archsystm.h>
32 34 #include <sys/machsystm.h>
33 35 #include <sys/compress.h>
34 36 #include <sys/modctl.h>
35 37 #include <sys/trap.h>
36 38 #include <sys/panic.h>
37 39 #include <sys/regset.h>
38 40 #include <sys/frame.h>
39 41 #include <sys/kobj.h>
40 42 #include <sys/apic.h>
41 43 #include <sys/apic_timer.h>
42 44 #include <sys/dumphdr.h>
43 45 #include <sys/mem.h>
44 46 #include <sys/x86_archext.h>
45 47 #include <sys/xpv_panic.h>
46 48 #include <sys/boot_console.h>
47 49 #include <sys/bootsvcs.h>
48 50 #include <sys/consdev.h>
49 51 #include <vm/hat_pte.h>
50 52 #include <vm/hat_i86.h>
51 53
/* XXX: need to add a PAE version too, if we ever support both PAE and non */
/* Path of the Xen image that init_xen_module() reads symbols and CTF from. */
#if defined(__i386)
#define	XPV_FILENAME	"/boot/xen-syms"
#else
#define	XPV_FILENAME	"/boot/amd64/xen-syms"
#endif
/* Module name given to the fake modctl built by init_xen_module(). */
#define	XPV_MODNAME	"xpv"

/* Incremented on entry to xpv_do_panic(); a second entry panics. */
int xpv_panicking = 0;

/*
 * Fake module/modctl describing Xen.  Filled in by init_xen_module() and
 * only linked onto the modules list by xpv_do_panic().
 */
struct module *xpv_module;
struct modctl *xpv_modctl;

/*
 * Round x up to a multiple of a.  An alignment of 0 leaves x unchanged.
 * The mask arithmetic is only correct when a is a power of two.
 */
#define	ALIGN(x, a)	((a) == 0 ? (uintptr_t)(x) : \
	(((uintptr_t)(x) + (uintptr_t)(a) - 1l) & ~((uintptr_t)(a) - 1l)))

/* Pointer to the xpv_panic_info structure handed to us by Xen. */
static struct panic_info *xpv_panic_info = NULL;

/* Timer support */
/* Shift applied to the TSC multiplier; see xpv_panic_gethrtime(). */
#define	NSEC_SHIFT 5
/* IDT vector used for the panic-time APIC timer interrupt. */
#define	T_XPV_TIMER	0xd1
#define	XPV_TIMER_INTERVAL	1000	/* 1000 microseconds */
/* APIC register window; set from pi_apic in xpv_do_panic(). */
static uint32_t *xpv_apicadr = NULL;
/* TSC-to-nanosecond multiplier, set by xpv_panic_time_init(). */
static uint_t	nsec_scale;

/* IDT support */
#pragma	align	16(xpv_panic_idt)
static gate_desc_t	xpv_panic_idt[NIDT];	/* interrupt descriptor table */

/* Xen pagetables mapped into our HAT's ptable windows */
static pfn_t ptable_pfn[MAX_NUM_LEVEL];

/* Number of MMU_PAGESIZE pages we're adding to the Solaris dump */
static int xpv_dump_pages;

/*
 * There are up to two large swathes of RAM that we don't want to include
 * in the dump: those that comprise the Xen version of segkpm.  On 32-bit
 * systems there is no such region of memory.  On 64-bit systems, there
 * should be just a single contiguous region that corresponds to all of
 * physical memory.  The tricky bit is that Xen's heap sometimes lives in
 * the middle of their segkpm, and is mapped using only kpm-like addresses.
 * In that case, we need to skip the swathes before and after Xen's heap.
 */
uintptr_t kpm1_low = 0;
uintptr_t kpm1_high = 0;
uintptr_t kpm2_low = 0;
uintptr_t kpm2_high = 0;

/*
 * Some commonly used values that we don't want to recompute over and over.
 * All three are filled in by xpv_do_panic(): the number of entries per
 * page table at each level, the %cr3 value in effect at panic time, and
 * the upper bound of the Xen VA range that xpv_va_walk() scans.
 */
static int xpv_panic_nptes[MAX_NUM_LEVEL];
static ulong_t xpv_panic_cr3;
static uintptr_t xpv_end;

/*
 * Output routine used by this file.  It starts out as the simple console
 * printer below and is switched to printf() by xpv_do_panic() just before
 * panicsys() is entered.
 */
static void xpv_panic_console_print(const char *fmt, ...);
static void (*xpv_panic_printf)(const char *, ...) = xpv_panic_console_print;

/* Scratch buffer that xpv_panic_console_print() formats into. */
#define	CONSOLE_BUF_SIZE	256
static char console_buffer[CONSOLE_BUF_SIZE];
/* B_TRUE when xpv_panic_putc() should write through cons_polledio. */
static boolean_t use_polledio;

/*
 * Pointers to machine check panic info (if any).
 */
xpv_mca_panic_data_t *xpv_mca_panic_data = NULL;
120 122
121 123 static void
122 124 xpv_panic_putc(int m)
123 125 {
124 126 struct cons_polledio *c = cons_polledio;
125 127
126 128 /* This really shouldn't happen */
127 129 if (boot_console_type(NULL) == CONS_HYPERVISOR)
128 130 return;
129 131
130 132 if (use_polledio == B_TRUE)
131 133 c->cons_polledio_putchar(c->cons_polledio_argument, m);
132 134 else
133 135 bcons_putchar(m);
134 136 }
135 137
136 138 static void
137 139 xpv_panic_puts(char *msg)
138 140 {
139 141 char *m;
140 142
141 143 dump_timeleft = dump_timeout;
142 144 for (m = msg; *m; m++)
143 145 xpv_panic_putc((int)*m);
144 146 }
145 147
146 148 static void
147 149 xpv_panic_console_print(const char *fmt, ...)
148 150 {
149 151 va_list ap;
150 152
151 153 va_start(ap, fmt);
152 154 (void) vsnprintf(console_buffer, sizeof (console_buffer), fmt, ap);
153 155 va_end(ap);
154 156
155 157 xpv_panic_puts(console_buffer);
156 158 }
157 159
158 160 static void
159 161 xpv_panic_map(int level, pfn_t pfn)
160 162 {
161 163 x86pte_t pte, *pteptr;
162 164
163 165 /*
164 166 * The provided pfn represents a level 'level' page table. Map it
165 167 * into the 'level' slot in the list of page table windows.
166 168 */
↓ open down ↓ |
131 lines elided |
↑ open up ↑ |
167 169 pteptr = (x86pte_t *)PWIN_PTE_VA(level);
168 170 pte = pfn_to_pa(pfn) | PT_VALID;
169 171
170 172 XPV_ALLOW_PAGETABLE_UPDATES();
171 173 if (mmu.pae_hat)
172 174 *pteptr = pte;
173 175 else
174 176 *(x86pte32_t *)pteptr = pte;
175 177 XPV_DISALLOW_PAGETABLE_UPDATES();
176 178
177 - mmu_tlbflush_entry(PWIN_VA(level));
179 + mmu_flush_tlb_page((uintptr_t)PWIN_VA(level));
178 180 }
179 181
/*
 * Walk the page tables to find the pfn mapped by the given va.
 *
 * vaddr is an in/out parameter.  On entry it holds the address at which
 * the search starts.  On success it is updated to the address of the
 * mapped page that was found, which may be higher than the starting
 * address because unmapped ranges, the APIC page and the two kpm-like
 * regions are skipped.
 *
 * Returns the MFN taken from the page table entry with PFN_IS_FOREIGN_MFN
 * or'ed in, or PFN_INVALID when the scan reaches xpv_end (or va drops
 * below the starting address) without finding a mapping.
 */
static pfn_t
xpv_va_walk(uintptr_t *vaddr)
{
	int l, idx;
	pfn_t pfn;
	x86pte_t pte;
	x86pte_t *ptep;
	uintptr_t va = *vaddr;
	uintptr_t scan_va;
	caddr_t ptable_window;
	static pfn_t toplevel_pfn;
	/* Address returned by the previous successful call. */
	static uintptr_t lastva;

	/*
	 * If we do anything other than a simple scan through memory, don't
	 * trust the mapped page tables.
	 */
	if (va != lastva + MMU_PAGESIZE)
		for (l = mmu.max_level; l >= 0; l--)
			ptable_pfn[l] = PFN_INVALID;

	toplevel_pfn = mmu_btop(xpv_panic_cr3);

	/*
	 * NOTE(review): the "va >= *vaddr" tests here and below presumably
	 * catch va wrapping around the top of the address space - confirm.
	 */
	while (va < xpv_end && va >= *vaddr) {
		/* Find the lowest table with any entry for va */
		pfn = toplevel_pfn;
		for (l = mmu.max_level; l >= 0; l--) {
			/* Remap the window only if it shows a different table. */
			if (ptable_pfn[l] != pfn) {
				xpv_panic_map(l, pfn);
				ptable_pfn[l] = pfn;
			}

			/*
			 * Search this pagetable for any mapping to an
			 * address >= va.
			 */
			ptable_window = PWIN_VA(l);
			/*
			 * With PAE the top-level table is located by adding
			 * the page offset of %cr3 to the window address.
			 */
			if (l == mmu.max_level && mmu.pae_hat)
				ptable_window +=
				    (xpv_panic_cr3 & MMU_PAGEOFFSET);

			idx = (va >> LEVEL_SHIFT(l)) & (xpv_panic_nptes[l] - 1);
			scan_va = va;
			while (idx < xpv_panic_nptes[l] && scan_va < xpv_end &&
			    scan_va >= *vaddr) {
				ptep = (x86pte_t *)(ptable_window +
				    (idx << mmu.pte_size_shift));
				pte = GET_PTE(ptep);
				if (pte & PTE_VALID)
					break;
				idx++;
				scan_va += mmu.level_size[l];
			}

			/*
			 * If there are no valid mappings in this table, we
			 * can skip to the end of the VA range it covers.
			 */
			if (idx == xpv_panic_nptes[l]) {
				va = NEXT_ENTRY_VA(va, l + 1);
				break;
			}

			va = scan_va;
			/*
			 * See if we've hit the end of the range.
			 */
			if (va >= xpv_end || va < *vaddr)
				break;

			/*
			 * If this mapping is for a pagetable, we drop down
			 * to the next level in the hierarchy and look for
			 * a mapping in it.
			 */
			pfn = PTE2MFN(pte, l);
			if (!PTE_ISPAGE(pte, l))
				continue;

			/*
			 * The APIC page is magic.  Nothing to see here;
			 * move along.
			 */
			if (((uintptr_t)xpv_apicadr & MMU_PAGEMASK) ==
			    (va & MMU_PAGEMASK)) {
				va += MMU_PAGESIZE;
				break;
			}

			/*
			 * See if the address is within one of the two
			 * kpm-like regions we want to skip.
			 */
			if (va >= kpm1_low && va < kpm1_high) {
				va = kpm1_high;
				break;
			}
			if (va >= kpm2_low && va < kpm2_high) {
				va = kpm2_high;
				break;
			}

			/*
			 * The Xen panic code only handles small pages.  If
			 * this mapping is for a large page, we need to
			 * identify the constituent page that covers the
			 * specific VA we were looking for.
			 */
			if (l > 0) {
				if (l > 1)
					panic("Xen panic can't cope with "
					    "giant pages.");
				idx = (va >> LEVEL_SHIFT(0)) &
				    (xpv_panic_nptes[0] - 1);
				pfn += idx;
			}

			/* Report the page found and remember it for next time. */
			*vaddr = va;
			lastva = va;
			return (pfn | PFN_IS_FOREIGN_MFN);
		}
	}
	return (PFN_INVALID);
}
307 309
308 310 /*
309 311 * Walk through the Xen VA space, finding pages that are mapped in.
310 312 *
311 313 * These pages all have MFNs rather than PFNs, meaning they may be outside
312 314 * the physical address space the kernel knows about, or they may collide
313 315 * with PFNs the kernel is using.
314 316 *
315 317 * The obvious trick of just adding the PFN_IS_FOREIGN_MFN bit to the MFNs
316 318 * to avoid collisions doesn't work. The pages need to be written to disk
317 319 * in PFN-order or savecore gets confused. We can't allocate memory to
318 320 * contruct a sorted pfn->VA reverse mapping, so we have to write the pages
319 321 * to disk in VA order.
320 322 *
321 323 * To square this circle, we simply make up PFNs for each of Xen's pages.
322 324 * We assign each mapped page a fake PFN in ascending order. These fake
323 325 * PFNs each have the FOREIGN bit set, ensuring that they fall outside the
324 326 * range of Solaris PFNs written by the kernel.
325 327 */
326 328 int
327 329 dump_xpv_addr()
328 330 {
329 331 uintptr_t va;
330 332 mem_vtop_t mem_vtop;
331 333
332 334 xpv_dump_pages = 0;
333 335 va = xen_virt_start;
334 336
335 337 while (xpv_va_walk(&va) != PFN_INVALID) {
336 338 mem_vtop.m_as = &kas;
337 339 mem_vtop.m_va = (void *)va;
338 340 mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN;
339 341
340 342 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
341 343 xpv_dump_pages++;
342 344
343 345 va += MMU_PAGESIZE;
344 346 }
345 347
346 348 /*
347 349 * Add the shared_info page. This page actually ends up in the
348 350 * dump twice: once for the Xen va and once for the Solaris va.
349 351 * This isn't ideal, but we don't know the address Xen is using for
350 352 * the page, so we can't share it.
351 353 */
352 354 mem_vtop.m_as = &kas;
353 355 mem_vtop.m_va = HYPERVISOR_shared_info;
354 356 mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN;
355 357 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
356 358 xpv_dump_pages++;
357 359
358 360 return (xpv_dump_pages);
359 361 }
360 362
361 363 void
362 364 dump_xpv_pfn()
363 365 {
364 366 pfn_t pfn;
365 367 int cnt;
366 368
367 369 for (cnt = 0; cnt < xpv_dump_pages; cnt++) {
368 370 pfn = (pfn_t)cnt | PFN_IS_FOREIGN_MFN;
369 371 dumpvp_write(&pfn, sizeof (pfn));
370 372 }
371 373 }
372 374
373 375 int
374 376 dump_xpv_data(void *dump_cbuf)
375 377 {
376 378 uintptr_t va;
377 379 uint32_t csize;
378 380 int cnt = 0;
379 381
380 382 /*
381 383 * XXX: we should probably run this data through a UE check. The
382 384 * catch is that the UE code relies on on_trap() and getpfnum()
383 385 * working.
384 386 */
385 387 va = xen_virt_start;
386 388
387 389 while (xpv_va_walk(&va) != PFN_INVALID) {
388 390 csize = (uint32_t)compress((void *)va, dump_cbuf, PAGESIZE);
389 391 dumpvp_write(&csize, sizeof (uint32_t));
390 392 dumpvp_write(dump_cbuf, csize);
391 393 if (dump_ioerr) {
392 394 dumphdr->dump_flags &= ~DF_COMPLETE;
393 395 return (cnt);
394 396 }
395 397 cnt++;
396 398 va += MMU_PAGESIZE;
397 399 }
398 400
399 401 /*
400 402 * Finally, dump the shared_info page
401 403 */
402 404 csize = (uint32_t)compress((void *)HYPERVISOR_shared_info, dump_cbuf,
403 405 PAGESIZE);
404 406 dumpvp_write(&csize, sizeof (uint32_t));
405 407 dumpvp_write(dump_cbuf, csize);
406 408 if (dump_ioerr)
407 409 dumphdr->dump_flags &= ~DF_COMPLETE;
408 410 cnt++;
409 411
410 412 return (cnt);
411 413 }
412 414
/*
 * Print a stack trace, one line per frame, starting at frame pointer
 * 'fpreg'.
 *
 * If xpv_only is non-zero the walk stops at the first frame pointer that
 * lies outside [xen_virt_start, xpv_end].  Otherwise it continues until
 * the saved frame pointer no longer increases.
 *
 * Returns the frame pointer at which the walk stopped, or fpreg itself if
 * it was below both KERNELBASE and xen_virt_start.
 */
static void *
showstack(void *fpreg, int xpv_only)
{
	struct frame *fpp;
	ulong_t off;
	char *sym;
	uintptr_t pc, fp, lastfp;
	uintptr_t minaddr = min(KERNELBASE, xen_virt_start);

	fp = (uintptr_t)fpreg;
	if (fp < minaddr) {
		xpv_panic_printf("Bad frame ptr: 0x%p\n", fpreg);
		return (fpreg);
	}

	do {
		fpp = (struct frame *)fp;
		pc = fpp->fr_savpc;

		if ((xpv_only != 0) &&
		    (fp > xpv_end || fp < xen_virt_start))
			break;
		/*
		 * Prefer a symbolic name; otherwise note whether the pc
		 * falls inside Xen's address range.
		 */
		if ((sym = kobj_getsymname(pc, &off)) != NULL)
			xpv_panic_printf("%08lx %s:%s+%lx\n", fp,
			    mod_containing_pc((caddr_t)pc), sym, off);
		else if ((pc >= xen_virt_start) && (pc <= xpv_end))
			xpv_panic_printf("%08lx 0x%lx (in Xen)\n", fp, pc);
		else
			xpv_panic_printf("%08lx %lx\n", fp, pc);

		lastfp = fp;
		fp = fpp->fr_savfp;

		/*
		 * Xen marks an exception frame by inverting the frame
		 * pointer.
		 */
		if (fp < lastfp) {
			/*
			 * Only accept the un-inverted value if it is a
			 * plausible address that differs from the previous
			 * frame pointer by less than 0xfff in its bit pattern.
			 */
			if ((~fp > minaddr) && ((~fp) ^ lastfp) < 0xfff)
				fp = ~fp;
		}
	} while (fp > lastfp);
	return ((void *)fp);
}
457 459
/*
 * Print the Xen portion of a stack trace starting at frame pointer
 * 'fpreg'.  This is showstack() with xpv_only set, so the walk stops at
 * the first frame outside Xen's address range.  Returns the frame pointer
 * at which the walk stopped.
 */
void *
xpv_traceback(void *fpreg)
{
	return (showstack(fpreg, 1));
}
463 465
#if defined(__amd64)
/*
 * Target installed in the LSTAR and CSTAR MSRs by
 * switch_to_xpv_panic_idt().  Panics, reporting 'call' truncated to an
 * int.
 */
static void
xpv_panic_hypercall(ulong_t call)
{
	panic("Illegally issued hypercall %d during panic!\n", (int)call);
}
#endif
471 473
472 474 void
473 475 xpv_die(struct regs *rp)
474 476 {
475 477 struct panic_trap_info ti;
476 478 struct cregs creg;
477 479
478 480 ti.trap_regs = rp;
479 481 ti.trap_type = rp->r_trapno;
480 482
481 483 curthread->t_panic_trap = &ti;
482 484 if (ti.trap_type == T_PGFLT) {
483 485 getcregs(&creg);
484 486 ti.trap_addr = (caddr_t)creg.cr_cr2;
485 487 panic("Fatal pagefault at 0x%lx. fault addr=0x%p rp=0x%p",
486 488 rp->r_pc, (void *)ti.trap_addr, (void *)rp);
487 489 } else {
488 490 ti.trap_addr = (caddr_t)rp->r_pc;
489 491 panic("Fatal trap %ld at 0x%lx. rp=0x%p", rp->r_trapno,
490 492 rp->r_pc, (void *)rp);
491 493 }
492 494 }
493 495
/*
 * Build IDT to handle a Xen panic
 *
 * Fills in xpv_panic_idt using the current %cs selector, loads it with
 * wr_idtr() and, on amd64, points the LSTAR and CSTAR MSRs at
 * xpv_panic_hypercall().
 */
static void
switch_to_xpv_panic_idt()
{
	int i;
	desctbr_t idtr;
	gate_desc_t *idt = xpv_panic_idt;
	selector_t cs = get_cs_register();

	/* Default every vector below 32, then override the ones we know. */
	for (i = 0; i < 32; i++)
		set_gatesegd(&idt[i], &xpv_invaltrap, cs, SDT_SYSIGT, TRP_XPL,
		    0);

	set_gatesegd(&idt[T_ZERODIV], &xpv_div0trap, cs, SDT_SYSIGT, TRP_XPL,
	    0);
	set_gatesegd(&idt[T_SGLSTP], &xpv_dbgtrap, cs, SDT_SYSIGT, TRP_XPL, 0);
	set_gatesegd(&idt[T_NMIFLT], &xpv_nmiint, cs, SDT_SYSIGT, TRP_XPL, 0);
	set_gatesegd(&idt[T_BOUNDFLT], &xpv_boundstrap, cs, SDT_SYSIGT,
	    TRP_XPL, 0);
	set_gatesegd(&idt[T_ILLINST], &xpv_invoptrap, cs, SDT_SYSIGT, TRP_XPL,
	    0);
	set_gatesegd(&idt[T_NOEXTFLT], &xpv_ndptrap, cs, SDT_SYSIGT, TRP_XPL,
	    0);
	set_gatesegd(&idt[T_TSSFLT], &xpv_invtsstrap, cs, SDT_SYSIGT, TRP_XPL,
	    0);
	set_gatesegd(&idt[T_SEGFLT], &xpv_segnptrap, cs, SDT_SYSIGT, TRP_XPL,
	    0);
	set_gatesegd(&idt[T_STKFLT], &xpv_stktrap, cs, SDT_SYSIGT, TRP_XPL, 0);
	set_gatesegd(&idt[T_GPFLT], &xpv_gptrap, cs, SDT_SYSIGT, TRP_XPL, 0);
	set_gatesegd(&idt[T_PGFLT], &xpv_pftrap, cs, SDT_SYSIGT, TRP_XPL, 0);
	set_gatesegd(&idt[T_EXTERRFLT], &xpv_ndperr, cs, SDT_SYSIGT, TRP_XPL,
	    0);
	set_gatesegd(&idt[T_ALIGNMENT], &xpv_achktrap, cs, SDT_SYSIGT, TRP_XPL,
	    0);
	set_gatesegd(&idt[T_MCE], &xpv_mcetrap, cs, SDT_SYSIGT, TRP_XPL, 0);
	set_gatesegd(&idt[T_SIMDFPE], &xpv_xmtrap, cs, SDT_SYSIGT, TRP_XPL, 0);

	/*
	 * We have no double fault handler.  Any single fault represents a
	 * catastrophic failure for us, so there is no attempt to handle
	 * them cleanly: we just print a message and reboot.  If we
	 * encounter a second fault while doing that, there is nothing
	 * else we can do.
	 */

	/*
	 * Be prepared to absorb any stray device interrupts received
	 * while writing the core to disk.
	 *
	 * NOTE(review): the loop above ends at vector 31 and this one
	 * starts at 33, so vector 32 is never given a handler here -
	 * confirm that this is intentional.
	 */
	for (i = 33; i < NIDT; i++)
		set_gatesegd(&idt[i], &xpv_surprise_intr, cs, SDT_SYSIGT,
		    TRP_XPL, 0);

	/* The one interrupt we expect to get is from the APIC timer. */
	set_gatesegd(&idt[T_XPV_TIMER], &xpv_timer_trap, cs, SDT_SYSIGT,
	    TRP_XPL, 0);

	idtr.dtr_base = (uintptr_t)xpv_panic_idt;
	idtr.dtr_limit = sizeof (xpv_panic_idt) - 1;
	wr_idtr(&idtr);

#if defined(__amd64)
	/* Catch any hypercalls. */
	wrmsr(MSR_AMD_LSTAR, (uintptr_t)xpv_panic_hypercall);
	wrmsr(MSR_AMD_CSTAR, (uintptr_t)xpv_panic_hypercall);
#endif
}
563 565
/*
 * Program the local APIC timer, through the register window at
 * xpv_apicadr, to deliver vector T_XPV_TIMER periodically.  The period is
 * calibrated by counting APIC ticks across a busy-wait of
 * XPV_TIMER_INTERVAL microseconds.
 */
static void
xpv_apic_clkinit()
{
	uint_t		apic_ticks = 0;

	/*
	 * Measure how many APIC ticks there are within a fixed time
	 * period.  We're going to be fairly coarse here.  This timer is
	 * just being used to detect a stalled panic, so as long as we have
	 * the right order of magnitude, everything should be fine.
	 */
	xpv_apicadr[APIC_SPUR_INT_REG] = AV_UNIT_ENABLE | APIC_SPUR_INTR;
	xpv_apicadr[APIC_LOCAL_TIMER] = AV_MASK;
	xpv_apicadr[APIC_INT_VECT0] = AV_MASK;	/* local intr reg 0 */

	/* Start the counter at its maximum and see how far it gets. */
	xpv_apicadr[APIC_DIVIDE_REG] = 0;
	xpv_apicadr[APIC_INIT_COUNT] = APIC_MAXVAL;
	drv_usecwait(XPV_TIMER_INTERVAL);
	apic_ticks = APIC_MAXVAL - xpv_apicadr[APIC_CURR_COUNT];

	/*
	 * apic_ticks now represents roughly how many apic ticks comprise
	 * one timeout interval.  Program the timer to send us an interrupt
	 * every time that interval expires.
	 */
	xpv_apicadr[APIC_LOCAL_TIMER] = T_XPV_TIMER | AV_PERIODIC;
	xpv_apicadr[APIC_INIT_COUNT] = apic_ticks;
	xpv_apicadr[APIC_EOI_REG] = 0;
}
593 595
594 596 void
595 597 xpv_timer_tick(void)
596 598 {
597 599 static int ticks = 0;
598 600
599 601 if (ticks++ >= MICROSEC / XPV_TIMER_INTERVAL) {
600 602 ticks = 0;
601 603 if (dump_timeleft && (--dump_timeleft == 0))
602 604 panic("Xen panic timeout\n");
603 605 }
604 606 xpv_apicadr[APIC_EOI_REG] = 0;
605 607 }
606 608
607 609 void
608 610 xpv_interrupt(void)
609 611 {
610 612 #ifdef DEBUG
611 613 static int cnt = 0;
612 614
613 615 if (cnt++ < 10)
614 616 xpv_panic_printf("Unexpected interrupt received.\n");
615 617 if ((cnt < 1000) && ((cnt % 100) == 0))
616 618 xpv_panic_printf("%d unexpected interrupts received.\n", cnt);
617 619 #endif
618 620
619 621 xpv_apicadr[APIC_EOI_REG] = 0;
620 622 }
621 623
622 624 /*
623 625 * Managing time in panic context is trivial. We only have a single CPU,
624 626 * we never get rescheduled, we never get suspended. We just need to
625 627 * convert clock ticks into nanoseconds.
626 628 */
627 629 static hrtime_t
628 630 xpv_panic_gethrtime(void)
629 631 {
630 632 hrtime_t tsc, hrt;
631 633 unsigned int *l = (unsigned int *)&(tsc);
632 634
633 635 tsc = __rdtsc_insn();
634 636 hrt = (mul32(l[1], nsec_scale) << NSEC_SHIFT) +
635 637 (mul32(l[0], nsec_scale) >> (32 - NSEC_SHIFT));
636 638
637 639 return (hrt);
638 640 }
639 641
/*
 * Set up panic-time timekeeping: derive nsec_scale from this CPU's
 * vcpu_info tsc_to_system_mul value, shifted right by NSEC_SHIFT, and
 * point gethrtimef at xpv_panic_gethrtime().
 */
static void
xpv_panic_time_init()
{
	nsec_scale =
	    CPU->cpu_m.mcpu_vcpu_info->time.tsc_to_system_mul >> NSEC_SHIFT;

	gethrtimef = xpv_panic_gethrtime;
}
648 650
/*
 * Varargs front end to panicsys(): packages the arguments that follow
 * 'fmt' into a va_list and passes them on together with the register set
 * 'rp'.  The final argument to panicsys() is always 1 (its meaning is
 * defined by panicsys(), which is not visible here).
 */
static void
xpv_panicsys(struct regs *rp, char *fmt, ...)
{
	extern void panicsys(const char *, va_list, struct regs *, int);
	va_list alist;

	va_start(alist, fmt);
	panicsys(fmt, alist, rp, 1);
	va_end(alist);
}
659 661
660 662 void
661 663 xpv_do_panic(void *arg)
662 664 {
663 665 struct panic_info *pip = (struct panic_info *)arg;
664 666 int l;
665 667 struct cregs creg;
666 668 #if defined(__amd64)
667 669 extern uintptr_t postbootkernelbase;
668 670 #endif
669 671
670 672 if (xpv_panicking++ > 0)
671 673 panic("multiple calls to xpv_do_panic()");
672 674
673 675 /*
674 676 * Indicate to the underlying panic framework that a panic has been
675 677 * initiated. This is ordinarily done as part of vpanic(). Since
676 678 * we already have all the register state saved by the hypervisor,
677 679 * we skip that and jump straight into the panic processing code.
678 680 *
679 681 * XXX If another thread grabs and wins the panic_quiesce trigger
680 682 * then we'll have two threads in panicsys believing they are in
681 683 * charge of the panic attempt!
682 684 */
683 685 (void) panic_trigger(&panic_quiesce);
684 686
685 687 #if defined(__amd64)
686 688 /*
687 689 * bzero() and bcopy() get unhappy when asked to operate on
688 690 * addresses outside of the kernel. At this point Xen is really a
689 691 * part of the kernel, so we update the routines' notion of where
690 692 * the kernel starts.
691 693 */
692 694 postbootkernelbase = xen_virt_start;
693 695 #endif
694 696
695 697 #if defined(HYPERVISOR_VIRT_END)
696 698 xpv_end = HYPERVISOR_VIRT_END;
697 699 #else
698 700 xpv_end = (uintptr_t)UINTPTR_MAX - sizeof (uintptr_t);
699 701 #endif
700 702
701 703 /*
702 704 * If we were redirecting console output to the hypervisor, we have
703 705 * to stop.
704 706 */
705 707 use_polledio = B_FALSE;
706 708 if (boot_console_type(NULL) == CONS_HYPERVISOR) {
707 709 bcons_device_change(CONS_HYPERVISOR);
708 710 } else if (cons_polledio != NULL &&
709 711 cons_polledio->cons_polledio_putchar != NULL) {
710 712 if (cons_polledio->cons_polledio_enter != NULL)
711 713 cons_polledio->cons_polledio_enter(
712 714 cons_polledio->cons_polledio_argument);
713 715 use_polledio = 1;
714 716 }
715 717
716 718 /* Make sure we handle all console output from here on. */
717 719 sysp->bsvc_putchar = xpv_panic_putc;
718 720
719 721 /*
720 722 * If we find an unsupported panic_info structure, there's not much
721 723 * we can do other than complain, plow on, and hope for the best.
722 724 */
723 725 if (pip->pi_version != PANIC_INFO_VERSION)
724 726 xpv_panic_printf("Warning: Xen is using an unsupported "
725 727 "version of the panic_info structure.\n");
726 728
727 729 xpv_panic_info = pip;
728 730
729 731 #if defined(__amd64)
730 732 kpm1_low = (uintptr_t)xpv_panic_info->pi_ram_start;
731 733 if (xpv_panic_info->pi_xen_start == NULL) {
732 734 kpm1_high = (uintptr_t)xpv_panic_info->pi_ram_end;
733 735 } else {
734 736 kpm1_high = (uintptr_t)xpv_panic_info->pi_xen_start;
735 737 kpm2_low = (uintptr_t)xpv_panic_info->pi_xen_end;
736 738 kpm2_high = (uintptr_t)xpv_panic_info->pi_ram_end;
737 739 }
738 740 #endif
739 741
740 742 /*
741 743 * Make sure we are running on the Solaris %gs. The Xen panic code
742 744 * should already have set up the GDT properly.
743 745 */
744 746 xpv_panic_resetgs();
745 747 #if defined(__amd64)
746 748 wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);
747 749 #endif
748 750
749 751 xpv_panic_time_init();
750 752
751 753 /*
752 754 * Switch to our own IDT, avoiding any accidental returns to Xen
753 755 * world.
754 756 */
755 757 switch_to_xpv_panic_idt();
756 758
757 759 /*
758 760 * Initialize the APIC timer, which is used to detect a hung dump
759 761 * attempt.
760 762 */
761 763 xpv_apicadr = pip->pi_apic;
762 764 xpv_apic_clkinit();
763 765
764 766 /*
765 767 * Set up a few values that we'll need repeatedly.
766 768 */
767 769 getcregs(&creg);
768 770 xpv_panic_cr3 = creg.cr_cr3;
769 771 for (l = mmu.max_level; l >= 0; l--)
770 772 xpv_panic_nptes[l] = mmu.ptes_per_table;
771 773 #ifdef __i386
772 774 if (mmu.pae_hat)
773 775 xpv_panic_nptes[mmu.max_level] = 4;
774 776 #endif
775 777
776 778 /* Add the fake Xen module to the module list */
777 779 if (xpv_module != NULL) {
778 780 extern int last_module_id;
779 781
780 782 xpv_modctl->mod_id = last_module_id++;
781 783 xpv_modctl->mod_next = &modules;
782 784 xpv_modctl->mod_prev = modules.mod_prev;
783 785 modules.mod_prev->mod_next = xpv_modctl;
784 786 modules.mod_prev = xpv_modctl;
785 787 }
786 788
787 789 if (pip->pi_mca.mpd_magic == MCA_PANICDATA_MAGIC)
788 790 xpv_mca_panic_data = &pip->pi_mca;
789 791
790 792 xpv_panic_printf = printf;
791 793 xpv_panicsys((struct regs *)pip->pi_regs, pip->pi_panicstr);
792 794 xpv_panic_printf("Failed to reboot following panic.\n");
793 795 for (;;)
794 796 ;
795 797 }
796 798
797 799 /*
798 800 * Set up the necessary data structures to pretend that the Xen hypervisor
799 801 * is a loadable module, allowing mdb to find the Xen symbols in a crash
800 802 * dump. Since these symbols all map to VA space Solaris doesn't normally
801 803 * have access to, we don't link these structures into the kernel's lists
802 804 * until/unless we hit a Xen panic.
803 805 *
804 806 * The observant reader will note a striking amount of overlap between this
805 807 * code and that found in krtld. While it would be handy if we could just
806 808 * ask krtld to do this work for us, it's not that simple. Among the
807 809 * complications: we're not actually loading the text here (grub did it at
808 810 * boot), the .text section is writable, there are no relocations to do,
809 811 * none of the module text/data is in readable memory, etc. Training krtld
810 812 * to deal with this weird module is as complicated, and more risky, than
811 813 * reimplementing the necessary subset of it here.
812 814 */
813 815 static void
814 816 init_xen_module()
815 817 {
816 818 struct _buf *file = NULL;
817 819 struct module *mp;
818 820 struct modctl *mcp;
819 821 int i, shn;
820 822 Shdr *shp, *ctf_shp;
821 823 char *names = NULL;
822 824 size_t n, namesize, text_align, data_align;
823 825 #if defined(__amd64)
824 826 const char machine = EM_AMD64;
825 827 #else
826 828 const char machine = EM_386;
827 829 #endif
828 830
829 831 /* Allocate and init the module structure */
830 832 mp = kmem_zalloc(sizeof (*mp), KM_SLEEP);
831 833 mp->filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP);
832 834 (void) strcpy(mp->filename, XPV_FILENAME);
833 835
834 836 /* Allocate and init the modctl structure */
835 837 mcp = kmem_zalloc(sizeof (*mcp), KM_SLEEP);
836 838 mcp->mod_modname = kobj_zalloc(strlen(XPV_MODNAME) + 1, KM_SLEEP);
837 839 (void) strcpy(mcp->mod_modname, XPV_MODNAME);
838 840 mcp->mod_filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP);
839 841 (void) strcpy(mcp->mod_filename, XPV_FILENAME);
840 842 mcp->mod_inprogress_thread = (kthread_id_t)-1;
841 843 mcp->mod_ref = 1;
842 844 mcp->mod_loaded = 1;
843 845 mcp->mod_loadcnt = 1;
844 846 mcp->mod_mp = mp;
845 847
846 848 /*
847 849 * Try to open a Xen image that hasn't had its symbol and CTF
848 850 * information stripped off.
849 851 */
850 852 file = kobj_open_file(XPV_FILENAME);
851 853 if (file == (struct _buf *)-1) {
852 854 file = NULL;
853 855 goto err;
854 856 }
855 857
856 858 /*
857 859 * Read the header and ensure that this is an ELF file for the
858 860 * proper ISA. If it's not, somebody has done something very
859 861 * stupid. Why bother? See Mencken.
860 862 */
861 863 if (kobj_read_file(file, (char *)&mp->hdr, sizeof (mp->hdr), 0) < 0)
862 864 goto err;
863 865 for (i = 0; i < SELFMAG; i++)
864 866 if (mp->hdr.e_ident[i] != ELFMAG[i])
865 867 goto err;
866 868 if ((mp->hdr.e_ident[EI_DATA] != ELFDATA2LSB) ||
867 869 (mp->hdr.e_machine != machine))
868 870 goto err;
869 871
870 872 /* Read in the section headers */
871 873 n = mp->hdr.e_shentsize * mp->hdr.e_shnum;
872 874 mp->shdrs = kmem_zalloc(n, KM_SLEEP);
873 875 if (kobj_read_file(file, mp->shdrs, n, mp->hdr.e_shoff) < 0)
874 876 goto err;
875 877
876 878 /* Read the section names */
877 879 shp = (Shdr *)(mp->shdrs + mp->hdr.e_shstrndx * mp->hdr.e_shentsize);
878 880 namesize = shp->sh_size;
879 881 names = kmem_zalloc(shp->sh_size, KM_SLEEP);
880 882 if (kobj_read_file(file, names, shp->sh_size, shp->sh_offset) < 0)
881 883 goto err;
882 884
883 885 /*
884 886 * Fill in the text and data size fields.
885 887 */
886 888 ctf_shp = NULL;
887 889 text_align = data_align = 0;
888 890 for (shn = 1; shn < mp->hdr.e_shnum; shn++) {
889 891 shp = (Shdr *)(mp->shdrs + shn * mp->hdr.e_shentsize);
890 892
891 893 /* Sanity check the offset of the section name */
892 894 if (shp->sh_name >= namesize)
893 895 continue;
894 896
895 897 /* If we find the symtab section, remember it for later. */
896 898 if (shp->sh_type == SHT_SYMTAB) {
897 899 mp->symtbl_section = shn;
898 900 mp->symhdr = shp;
899 901 continue;
900 902 }
901 903
902 904 /* If we find the CTF section, remember it for later. */
903 905 if ((shp->sh_size != 0) &&
904 906 (strcmp(names + shp->sh_name, ".SUNW_ctf") == 0)) {
905 907 ctf_shp = shp;
906 908 continue;
907 909 }
908 910
909 911 if (!(shp->sh_flags & SHF_ALLOC))
910 912 continue;
911 913
912 914 /*
913 915 * Xen marks its text section as writable, so we need to
914 916 * look for the name - not just the flag.
915 917 */
916 918 if ((strcmp(&names[shp->sh_name], ".text") != 0) &&
917 919 (shp->sh_flags & SHF_WRITE) != 0) {
918 920 if (shp->sh_addralign > data_align)
919 921 data_align = shp->sh_addralign;
920 922 mp->data_size = ALIGN(mp->data_size, data_align);
921 923 mp->data_size += ALIGN(shp->sh_size, 8);
922 924 if (mp->data == NULL || mp->data > (char *)shp->sh_addr)
923 925 mp->data = (char *)shp->sh_addr;
924 926 } else {
925 927 if (shp->sh_addralign > text_align)
926 928 text_align = shp->sh_addralign;
927 929 mp->text_size = ALIGN(mp->text_size, text_align);
928 930 mp->text_size += ALIGN(shp->sh_size, 8);
929 931 if (mp->text == NULL || mp->text > (char *)shp->sh_addr)
930 932 mp->text = (char *)shp->sh_addr;
931 933 }
932 934 }
933 935 kmem_free(names, namesize);
934 936 names = NULL;
935 937 shp = NULL;
936 938 mcp->mod_text = mp->text;
937 939 mcp->mod_text_size = mp->text_size;
938 940
939 941 /*
940 942 * If we have symbol table and string table sections, read them in
941 943 * now. If we don't, we just plow on. We'll still get a valid
942 944 * core dump, but finding anything useful will be just a bit
943 945 * harder.
944 946 *
945 947 * Note: we don't bother with a hash table. We'll never do a
946 948 * symbol lookup unless we crash, and then mdb creates its own. We
947 949 * also don't try to perform any relocations. Xen should be loaded
948 950 * exactly where the ELF file indicates, and the symbol information
949 951 * in the file should be complete and correct already. Static
950 952 * linking ain't all bad.
951 953 */
952 954 if ((mp->symhdr != NULL) && (mp->symhdr->sh_link < mp->hdr.e_shnum)) {
953 955 mp->strhdr = (Shdr *)
954 956 (mp->shdrs + mp->symhdr->sh_link * mp->hdr.e_shentsize);
955 957 mp->nsyms = mp->symhdr->sh_size / mp->symhdr->sh_entsize;
956 958
957 959 /* Allocate space for the symbol table and strings. */
958 960 mp->symsize = mp->symhdr->sh_size +
959 961 mp->nsyms * sizeof (symid_t) + mp->strhdr->sh_size;
960 962 mp->symspace = kmem_zalloc(mp->symsize, KM_SLEEP);
961 963 mp->symtbl = mp->symspace;
962 964 mp->strings = (char *)(mp->symtbl + mp->symhdr->sh_size);
963 965
964 966 if ((kobj_read_file(file, mp->symtbl,
965 967 mp->symhdr->sh_size, mp->symhdr->sh_offset) < 0) ||
966 968 (kobj_read_file(file, mp->strings,
967 969 mp->strhdr->sh_size, mp->strhdr->sh_offset) < 0))
968 970 goto err;
969 971 }
970 972
971 973 /*
972 974 * Read in the CTF section
973 975 */
974 976 if ((ctf_shp != NULL) && ((moddebug & MODDEBUG_NOCTF) == 0)) {
975 977 mp->ctfdata = kmem_zalloc(ctf_shp->sh_size, KM_SLEEP);
976 978 mp->ctfsize = ctf_shp->sh_size;
977 979 if (kobj_read_file(file, mp->ctfdata, mp->ctfsize,
978 980 ctf_shp->sh_offset) < 0)
979 981 goto err;
980 982 }
981 983
982 984 kobj_close_file(file);
983 985
984 986 xpv_module = mp;
985 987 xpv_modctl = mcp;
986 988 return;
987 989
988 990 err:
989 991 cmn_err(CE_WARN, "Failed to initialize xpv module.");
990 992 if (file != NULL)
991 993 kobj_close_file(file);
992 994
993 995 kmem_free(mp->filename, strlen(XPV_FILENAME) + 1);
994 996 if (mp->shdrs != NULL)
995 997 kmem_free(mp->shdrs, mp->hdr.e_shentsize * mp->hdr.e_shnum);
996 998 if (mp->symspace != NULL)
997 999 kmem_free(mp->symspace, mp->symsize);
998 1000 if (mp->ctfdata != NULL)
999 1001 kmem_free(mp->ctfdata, mp->ctfsize);
1000 1002 kmem_free(mp, sizeof (*mp));
1001 1003 kmem_free(mcp->mod_filename, strlen(XPV_FILENAME) + 1);
1002 1004 kmem_free(mcp->mod_modname, strlen(XPV_MODNAME) + 1);
1003 1005 kmem_free(mcp, sizeof (*mcp));
1004 1006 if (names != NULL)
1005 1007 kmem_free(names, namesize);
1006 1008 }
1007 1009
1008 1010 void
1009 1011 xpv_panic_init()
1010 1012 {
1011 1013 xen_platform_op_t op;
1012 1014 int i;
1013 1015
1014 1016 ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));
1015 1017
1016 1018 for (i = 0; i < mmu.num_level; i++)
1017 1019 ptable_pfn[i] = PFN_INVALID;
1018 1020
1019 1021 /* Let Xen know where to jump if/when it panics. */
1020 1022 op.cmd = XENPF_panic_init;
1021 1023 op.interface_version = XENPF_INTERFACE_VERSION;
1022 1024 op.u.panic_init.panic_addr = (unsigned long)xpv_panic_hdlr;
1023 1025
1024 1026 (void) HYPERVISOR_platform_op(&op);
1025 1027
1026 1028 init_xen_module();
1027 1029 }
↓ open down ↓ |
840 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX