8956 Implement KPTI
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
--- old/usr/src/uts/i86pc/os/trap.c
+++ new/usr/src/uts/i86pc/os/trap.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 */
25 25
26 26 /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
27 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
28 28 /* All Rights Reserved */
29 29 /* */
30 30 /* Copyright (c) 1987, 1988 Microsoft Corporation */
31 31 /* All Rights Reserved */
32 32 /* */
33 33
34 34 /*
35 - * Copyright 2017 Joyent, Inc.
35 + * Copyright 2018 Joyent, Inc.
36 36 */
37 37
38 38 #include <sys/types.h>
39 39 #include <sys/sysmacros.h>
40 40 #include <sys/param.h>
41 41 #include <sys/signal.h>
42 42 #include <sys/systm.h>
43 43 #include <sys/user.h>
44 44 #include <sys/proc.h>
45 45 #include <sys/disp.h>
46 46 #include <sys/class.h>
47 47 #include <sys/core.h>
48 48 #include <sys/syscall.h>
49 49 #include <sys/cpuvar.h>
50 50 #include <sys/vm.h>
51 51 #include <sys/sysinfo.h>
52 52 #include <sys/fault.h>
53 53 #include <sys/stack.h>
54 54 #include <sys/psw.h>
55 55 #include <sys/regset.h>
56 56 #include <sys/fp.h>
57 57 #include <sys/trap.h>
58 58 #include <sys/kmem.h>
59 59 #include <sys/vtrace.h>
60 60 #include <sys/cmn_err.h>
61 61 #include <sys/prsystm.h>
62 62 #include <sys/mutex_impl.h>
63 63 #include <sys/machsystm.h>
64 64 #include <sys/archsystm.h>
65 65 #include <sys/sdt.h>
66 66 #include <sys/avintr.h>
67 67 #include <sys/kobj.h>
68 68
69 69 #include <vm/hat.h>
70 70
71 71 #include <vm/seg_kmem.h>
72 72 #include <vm/as.h>
73 73 #include <vm/seg.h>
74 74 #include <vm/hat_pte.h>
75 75 #include <vm/hat_i86.h>
76 76
77 77 #include <sys/procfs.h>
78 78
79 79 #include <sys/reboot.h>
80 80 #include <sys/debug.h>
81 81 #include <sys/debugreg.h>
82 82 #include <sys/modctl.h>
83 83 #include <sys/aio_impl.h>
84 84 #include <sys/tnf.h>
85 85 #include <sys/tnf_probe.h>
86 86 #include <sys/cred.h>
87 87 #include <sys/mman.h>
88 88 #include <sys/x86_archext.h>
89 89 #include <sys/copyops.h>
90 90 #include <c2/audit.h>
91 91 #include <sys/ftrace.h>
92 92 #include <sys/panic.h>
93 93 #include <sys/traptrace.h>
94 94 #include <sys/ontrap.h>
95 95 #include <sys/cpc_impl.h>
96 96 #include <sys/bootconf.h>
97 97 #include <sys/bootinfo.h>
98 98 #include <sys/promif.h>
99 99 #include <sys/mach_mmu.h>
100 100 #if defined(__xpv)
101 101 #include <sys/hypervisor.h>
102 102 #endif
103 103 #include <sys/contract/process_impl.h>
104 104
105 105 #define USER 0x10000 /* user-mode flag added to trap type */
106 106
107 107 static const char *trap_type_mnemonic[] = {
108 108 "de", "db", "2", "bp",
109 109 "of", "br", "ud", "nm",
110 110 "df", "9", "ts", "np",
111 111 "ss", "gp", "pf", "15",
112 112 "mf", "ac", "mc", "xf"
113 113 };
114 114
115 115 static const char *trap_type[] = {
116 116 "Divide error", /* trap id 0 */
117 117 "Debug", /* trap id 1 */
118 118 "NMI interrupt", /* trap id 2 */
119 119 "Breakpoint", /* trap id 3 */
120 120 "Overflow", /* trap id 4 */
121 121 "BOUND range exceeded", /* trap id 5 */
122 122 "Invalid opcode", /* trap id 6 */
123 123 "Device not available", /* trap id 7 */
124 124 "Double fault", /* trap id 8 */
125 125 "Coprocessor segment overrun", /* trap id 9 */
126 126 "Invalid TSS", /* trap id 10 */
127 127 "Segment not present", /* trap id 11 */
128 128 "Stack segment fault", /* trap id 12 */
129 129 "General protection", /* trap id 13 */
130 130 "Page fault", /* trap id 14 */
131 131 "Reserved", /* trap id 15 */
132 132 "x87 floating point error", /* trap id 16 */
133 133 "Alignment check", /* trap id 17 */
134 134 "Machine check", /* trap id 18 */
135 135 "SIMD floating point exception", /* trap id 19 */
136 136 };
137 137
138 138 #define TRAP_TYPES (sizeof (trap_type) / sizeof (trap_type[0]))
139 139
140 140 #define SLOW_SCALL_SIZE 2
141 141 #define FAST_SCALL_SIZE 2
142 142
143 143 int tudebug = 0;
144 144 int tudebugbpt = 0;
145 145 int tudebugfpe = 0;
146 146 int tudebugsse = 0;
147 147
148 148 #if defined(TRAPDEBUG) || defined(lint)
149 149 int tdebug = 0;
150 150 int lodebug = 0;
151 151 int faultdebug = 0;
152 152 #else
153 153 #define tdebug 0
154 154 #define lodebug 0
155 155 #define faultdebug 0
156 156 #endif /* defined(TRAPDEBUG) || defined(lint) */
157 157
158 158 #if defined(TRAPTRACE)
159 159 /*
160 160 * trap trace record for cpu0 is allocated here.
161 161 * trap trace records for non-boot cpus are allocated in mp_startup_init().
162 162 */
163 163 static trap_trace_rec_t trap_tr0[TRAPTR_NENT];
164 164 trap_trace_ctl_t trap_trace_ctl[NCPU] = {
165 165 {
166 166 (uintptr_t)trap_tr0, /* next record */
167 167 (uintptr_t)trap_tr0, /* first record */
168 168 (uintptr_t)(trap_tr0 + TRAPTR_NENT), /* limit */
169 169 (uintptr_t)0 /* current */
170 170 },
171 171 };
172 172
173 173 /*
174 174 * default trap buffer size
175 175 */
176 176 size_t trap_trace_bufsize = TRAPTR_NENT * sizeof (trap_trace_rec_t);
177 177 int trap_trace_freeze = 0;
178 178 int trap_trace_off = 0;
179 179
180 180 /*
181 181 * A dummy TRAPTRACE entry to use after death.
182 182 */
183 183 trap_trace_rec_t trap_trace_postmort;
184 184
185 185 static void dump_ttrace(void);
186 186 #endif /* TRAPTRACE */
187 187 static void dumpregs(struct regs *);
188 188 static void showregs(uint_t, struct regs *, caddr_t);
189 189 static int kern_gpfault(struct regs *);
190 190
191 191 /*ARGSUSED*/
192 192 static int
193 193 die(uint_t type, struct regs *rp, caddr_t addr, processorid_t cpuid)
194 194 {
195 195 struct panic_trap_info ti;
196 196 const char *trap_name, *trap_mnemonic;
197 197
198 198 if (type < TRAP_TYPES) {
199 199 trap_name = trap_type[type];
200 200 trap_mnemonic = trap_type_mnemonic[type];
201 201 } else {
202 202 trap_name = "trap";
203 203 trap_mnemonic = "-";
204 204 }
205 205
206 206 #ifdef TRAPTRACE
207 207 TRAPTRACE_FREEZE;
208 208 #endif
209 209
210 210 ti.trap_regs = rp;
211 211 ti.trap_type = type & ~USER;
212 212 ti.trap_addr = addr;
213 213
214 214 curthread->t_panic_trap = &ti;
215 215
216 216 if (type == T_PGFLT && addr < (caddr_t)kernelbase) {
217 217 panic("BAD TRAP: type=%x (#%s %s) rp=%p addr=%p "
218 218 "occurred in module \"%s\" due to %s",
219 219 type, trap_mnemonic, trap_name, (void *)rp, (void *)addr,
220 220 mod_containing_pc((caddr_t)rp->r_pc),
221 221 addr < (caddr_t)PAGESIZE ?
222 222 "a NULL pointer dereference" :
223 223 "an illegal access to a user address");
224 224 } else
225 225 panic("BAD TRAP: type=%x (#%s %s) rp=%p addr=%p",
226 226 type, trap_mnemonic, trap_name, (void *)rp, (void *)addr);
227 227 return (0);
228 228 }
229 229
230 230 /*
231 231 * Rewrite the instruction at pc to be an int $T_SYSCALLINT instruction.
232 232 *
233 233 * int <vector> is two bytes: 0xCD <vector>
234 234 */
235 235
236 236 static int
237 237 rewrite_syscall(caddr_t pc)
238 238 {
239 239 uchar_t instr[SLOW_SCALL_SIZE] = { 0xCD, T_SYSCALLINT };
240 240
241 241 if (uwrite(curthread->t_procp, instr, SLOW_SCALL_SIZE,
242 242 (uintptr_t)pc) != 0)
243 243 return (1);
244 244
245 245 return (0);
246 246 }
247 247
248 248 /*
249 249 * Test to see if the instruction at pc is sysenter or syscall. The second
250 250 * argument should be the x86 feature flag corresponding to the expected
251 251 * instruction.
252 252 *
253 253 * sysenter is two bytes: 0x0F 0x34
254 254 * syscall is two bytes: 0x0F 0x05
255 255 * int $T_SYSCALLINT is two bytes: 0xCD 0x91
256 256 */
257 257
258 258 static int
259 259 instr_is_other_syscall(caddr_t pc, int which)
260 260 {
261 261 uchar_t instr[FAST_SCALL_SIZE];
262 262
263 263 ASSERT(which == X86FSET_SEP || which == X86FSET_ASYSC || which == 0xCD);
264 264
265 265 if (copyin_nowatch(pc, (caddr_t)instr, FAST_SCALL_SIZE) != 0)
266 266 return (0);
267 267
268 268 switch (which) {
269 269 case X86FSET_SEP:
270 270 if (instr[0] == 0x0F && instr[1] == 0x34)
271 271 return (1);
272 272 break;
273 273 case X86FSET_ASYSC:
274 274 if (instr[0] == 0x0F && instr[1] == 0x05)
275 275 return (1);
276 276 break;
277 277 case 0xCD:
278 278 if (instr[0] == 0xCD && instr[1] == T_SYSCALLINT)
279 279 return (1);
280 280 break;
281 281 }
282 282
283 283 return (0);
284 284 }
285 285
286 286 static const char *
287 287 syscall_insn_string(int syscall_insn)
288 288 {
289 289 switch (syscall_insn) {
290 290 case X86FSET_SEP:
291 291 return ("sysenter");
292 292 case X86FSET_ASYSC:
293 293 return ("syscall");
294 294 case 0xCD:
295 295 return ("int");
296 296 default:
297 297 return ("Unknown");
298 298 }
299 299 }
300 300
301 301 static int
302 302 ldt_rewrite_syscall(struct regs *rp, proc_t *p, int syscall_insn)
303 303 {
304 304 caddr_t linearpc;
305 305 int return_code = 0;
306 306
307 307 mutex_enter(&p->p_ldtlock); /* Must be held across linear_pc() */
308 308
309 309 if (linear_pc(rp, p, &linearpc) == 0) {
310 310
311 311 /*
312 312 * If another thread beat us here, it already changed
313 313 * this site to the slower (int) syscall instruction.
314 314 */
315 315 if (instr_is_other_syscall(linearpc, 0xCD)) {
316 316 return_code = 1;
317 317 } else if (instr_is_other_syscall(linearpc, syscall_insn)) {
318 318
319 319 if (rewrite_syscall(linearpc) == 0) {
320 320 return_code = 1;
321 321 }
322 322 #ifdef DEBUG
323 323 else
324 324 cmn_err(CE_WARN, "failed to rewrite %s "
325 325 "instruction in process %d",
326 326 syscall_insn_string(syscall_insn),
327 327 p->p_pid);
328 328 #endif /* DEBUG */
329 329 }
330 330 }
331 331
332 332 mutex_exit(&p->p_ldtlock); /* Must be held across linear_pc() */
333 333
334 334 return (return_code);
335 335 }
336 336
337 337 /*
338 338 * Test to see if the instruction at pc is a system call instruction.
339 339 *
340 340 * The bytes of an lcall instruction used for the syscall trap.
341 341 * static uchar_t lcall[7] = { 0x9a, 0, 0, 0, 0, 0x7, 0 };
342 342 * static uchar_t lcallalt[7] = { 0x9a, 0, 0, 0, 0, 0x27, 0 };
343 343 */
344 344
345 345 #define LCALLSIZE 7
346 346
347 347 static int
348 348 instr_is_lcall_syscall(caddr_t pc)
349 349 {
350 350 uchar_t instr[LCALLSIZE];
351 351
352 352 if (copyin_nowatch(pc, (caddr_t)instr, LCALLSIZE) == 0 &&
353 353 instr[0] == 0x9a &&
354 354 instr[1] == 0 &&
355 355 instr[2] == 0 &&
356 356 instr[3] == 0 &&
357 357 instr[4] == 0 &&
358 358 (instr[5] == 0x7 || instr[5] == 0x27) &&
359 359 instr[6] == 0)
360 360 return (1);
361 361
362 362 return (0);
363 363 }
364 364
365 365 #ifdef __amd64
366 366
367 367 /*
368 368 * In the first revisions of amd64 CPUs produced by AMD, the LAHF and
369 369 * SAHF instructions were not implemented in 64-bit mode. Later revisions
370 370 * did implement these instructions. An extension to the cpuid instruction
371 371 * was added to check for the capability of executing these instructions
372 372 * in 64-bit mode.
373 373 *
374 374 * Intel originally did not implement these instructions in EM64T either,
375 375 * but added them in later revisions.
376 376 *
377 377 * So, there are different chip revisions by both vendors out there that
378 378 * may or may not implement these instructions. The easy solution is to
379 379 * just always emulate these instructions on demand.
380 380 *
381 381 * SAHF == store %ah in the lower 8 bits of %rflags (opcode 0x9e)
382 382 * LAHF == load the lower 8 bits of %rflags into %ah (opcode 0x9f)
383 383 */
384 384
385 385 #define LSAHFSIZE 1
386 386
387 387 static int
388 388 instr_is_lsahf(caddr_t pc, uchar_t *instr)
389 389 {
390 390 if (copyin_nowatch(pc, (caddr_t)instr, LSAHFSIZE) == 0 &&
391 391 (*instr == 0x9e || *instr == 0x9f))
392 392 return (1);
393 393 return (0);
394 394 }
395 395
396 396 /*
397 397 * Emulate the LAHF and SAHF instructions. The reference manuals define
398 398 * these instructions to always load/store bit 1 as a 1, and bits 3 and 5
399 399 * as a 0. The other, defined, bits are copied (the PS_ICC bits and PS_P).
400 400 *
401 401 * Note that %ah is bits 8-15 of %rax.
402 402 */
403 403 static void
404 404 emulate_lsahf(struct regs *rp, uchar_t instr)
405 405 {
406 406 if (instr == 0x9e) {
407 407 /* sahf. Copy bits from %ah to flags. */
408 408 rp->r_ps = (rp->r_ps & ~0xff) |
409 409 ((rp->r_rax >> 8) & PSL_LSAHFMASK) | PS_MB1;
410 410 } else {
411 411 /* lahf. Copy bits from flags to %ah. */
412 412 rp->r_rax = (rp->r_rax & ~0xff00) |
413 413 (((rp->r_ps & PSL_LSAHFMASK) | PS_MB1) << 8);
414 414 }
415 415 rp->r_pc += LSAHFSIZE;
416 416 }
417 417 #endif /* __amd64 */
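
A quick way to see the masking rules described above is to run the emulate_lsahf() arithmetic on a sample value in user space. The sketch below is illustrative only and is not part of this change; LSAHF_MASK and MB1 are assumed stand-ins for PSL_LSAHFMASK (SF|ZF|AF|PF|CF) and PS_MB1 (the always-one flags bit 1), so consult <sys/psw.h> for the authoritative values.

/* Hypothetical user-space sketch of the lahf/sahf emulation rules above. */
#include <stdio.h>
#include <stdint.h>

#define	LSAHF_MASK	0xd5U	/* assumed: SF|ZF|AF|PF|CF, standing in for PSL_LSAHFMASK */
#define	MB1		0x02U	/* assumed: always-one flags bit 1, standing in for PS_MB1 */

int
main(void)
{
	uint64_t rax = 0xffULL << 8;	/* %ah = 0xff: try to set every flag bit */
	uint64_t ps = 0;

	/* sahf: only the defined low flag bits are copied; bit 1 is forced on */
	ps = (ps & ~0xffULL) | ((rax >> 8) & LSAHF_MASK) | MB1;
	printf("sahf: flags = 0x%llx\n", (unsigned long long)ps);	/* 0xd7 */

	/* lahf: the same masked value, plus bit 1, is loaded back into %ah */
	rax = (rax & ~0xff00ULL) | (((ps & LSAHF_MASK) | MB1) << 8);
	printf("lahf: ah = 0x%llx\n", (unsigned long long)((rax >> 8) & 0xff));
	return (0);
}
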
418 418
419 419 #ifdef OPTERON_ERRATUM_91
420 420
421 421 /*
422 422 * Test to see if the instruction at pc is a prefetch instruction.
423 423 *
424 424 * The first byte of prefetch instructions is always 0x0F.
425 425 * The second byte is 0x18 for regular prefetch or 0x0D for AMD 3dnow prefetch.
426 426 * The third byte (ModRM) contains the register field bits (bits 3-5).
427 427 * These bits must be between 0 and 3 inclusive for regular prefetch and
428 428 * 0 and 1 inclusive for AMD 3dnow prefetch.
429 429 *
430 430   * In 64-bit mode, there may be a one-byte REX prefix (0x40-0x4F).
431 431 */
432 432
433 433 static int
434 434 cmp_to_prefetch(uchar_t *p)
435 435 {
436 436 #ifdef _LP64
437 437 if ((p[0] & 0xF0) == 0x40) /* 64-bit REX prefix */
438 438 p++;
439 439 #endif
440 440 return ((p[0] == 0x0F && p[1] == 0x18 && ((p[2] >> 3) & 7) <= 3) ||
441 441 (p[0] == 0x0F && p[1] == 0x0D && ((p[2] >> 3) & 7) <= 1));
442 442 }
443 443
444 444 static int
445 445 instr_is_prefetch(caddr_t pc)
446 446 {
447 447 uchar_t instr[4]; /* optional REX prefix plus 3-byte opcode */
448 448
449 449 return (copyin_nowatch(pc, instr, sizeof (instr)) == 0 &&
450 450 cmp_to_prefetch(instr));
451 451 }
452 452
453 453 #endif /* OPTERON_ERRATUM_91 */
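
The opcode layout spelled out in the comment above is easier to follow against concrete byte sequences. The following stand-alone sketch (illustrative only, not part of this change) duplicates the cmp_to_prefetch() matching logic and applies it to a prefetcht0, a 3DNow prefetchw, and a plain nop.

/* Hypothetical user-space copy of the prefetch-matching logic above. */
#include <stdio.h>

static int
looks_like_prefetch(const unsigned char *p)
{
	if ((p[0] & 0xF0) == 0x40)	/* skip an optional 64-bit REX prefix */
		p++;
	return ((p[0] == 0x0F && p[1] == 0x18 && ((p[2] >> 3) & 7) <= 3) ||
	    (p[0] == 0x0F && p[1] == 0x0D && ((p[2] >> 3) & 7) <= 1));
}

int
main(void)
{
	unsigned char t0[] = { 0x0F, 0x18, 0x08 };	/* prefetcht0 (%rax) */
	unsigned char pw[] = { 0x0F, 0x0D, 0x09 };	/* 3dnow prefetchw (%rcx) */
	unsigned char nop[] = { 0x90, 0x90, 0x90 };	/* not a prefetch */

	printf("%d %d %d\n", looks_like_prefetch(t0),
	    looks_like_prefetch(pw), looks_like_prefetch(nop));	/* 1 1 0 */
	return (0);
}
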
454 454
455 455 /*
456 456 * Called from the trap handler when a processor trap occurs.
457 457 *
458 458 * Note: All user-level traps that might call stop() must exit
459 459 * trap() by 'goto out' or by falling through.
460 460   * Note Also: trap() is usually called with interrupts enabled (PS_IE == 1);
461 461 * however, there are paths that arrive here with PS_IE == 0 so special care
462 462 * must be taken in those cases.
463 463 */
464 464 void
465 465 trap(struct regs *rp, caddr_t addr, processorid_t cpuid)
466 466 {
467 467 kthread_t *ct = curthread;
468 468 enum seg_rw rw;
469 469 unsigned type;
470 470 proc_t *p = ttoproc(ct);
471 471 klwp_t *lwp = ttolwp(ct);
472 472 uintptr_t lofault;
473 473 label_t *onfault;
474 474 faultcode_t pagefault(), res, errcode;
475 475 enum fault_type fault_type;
476 476 k_siginfo_t siginfo;
477 477 uint_t fault = 0;
478 478 int mstate;
479 479 int sicode = 0;
480 480 int watchcode;
481 481 int watchpage;
482 482 caddr_t vaddr;
483 - int singlestep_twiddle;
484 483 size_t sz;
485 484 int ta;
486 485 #ifdef __amd64
487 486 uchar_t instr;
488 487 #endif
489 488
490 489 ASSERT_STACK_ALIGNED();
491 490
492 491 type = rp->r_trapno;
493 492 CPU_STATS_ADDQ(CPU, sys, trap, 1);
494 493 ASSERT(ct->t_schedflag & TS_DONT_SWAP);
495 494
496 495 if (type == T_PGFLT) {
497 496
498 497 errcode = rp->r_err;
499 498 if (errcode & PF_ERR_WRITE)
500 499 rw = S_WRITE;
501 500 else if ((caddr_t)rp->r_pc == addr ||
502 501 (mmu.pt_nx != 0 && (errcode & PF_ERR_EXEC)))
503 502 rw = S_EXEC;
504 503 else
505 504 rw = S_READ;
506 505
507 506 #if defined(__i386)
508 507 /*
509 508 * Pentium Pro work-around
510 509 */
511 510 if ((errcode & PF_ERR_PROT) && pentiumpro_bug4046376) {
512 511 uint_t attr;
513 512 uint_t priv_violation;
514 513 uint_t access_violation;
515 514
516 515 if (hat_getattr(addr < (caddr_t)kernelbase ?
517 516 curproc->p_as->a_hat : kas.a_hat, addr, &attr)
518 517 == -1) {
519 518 errcode &= ~PF_ERR_PROT;
520 519 } else {
521 520 priv_violation = (errcode & PF_ERR_USER) &&
522 521 !(attr & PROT_USER);
523 522 access_violation = (errcode & PF_ERR_WRITE) &&
524 523 !(attr & PROT_WRITE);
525 524 if (!priv_violation && !access_violation)
526 525 goto cleanup;
527 526 }
528 527 }
529 528 #endif /* __i386 */
530 529
531 530 } else if (type == T_SGLSTP && lwp != NULL)
532 531 lwp->lwp_pcb.pcb_drstat = (uintptr_t)addr;
533 532
534 533 if (tdebug)
535 534 showregs(type, rp, addr);
536 535
537 536 if (USERMODE(rp->r_cs)) {
538 537 /*
539 538 * Set up the current cred to use during this trap. u_cred
540 539 * no longer exists. t_cred is used instead.
541 540 * The current process credential applies to the thread for
542 541 * the entire trap. If trapping from the kernel, this
543 542 * should already be set up.
544 543 */
545 544 if (ct->t_cred != p->p_cred) {
546 545 cred_t *oldcred = ct->t_cred;
547 546 /*
548 547 * DTrace accesses t_cred in probe context. t_cred
549 548 * must always be either NULL, or point to a valid,
550 549 * allocated cred structure.
551 550 */
552 551 ct->t_cred = crgetcred();
553 552 crfree(oldcred);
554 553 }
555 554 ASSERT(lwp != NULL);
556 555 type |= USER;
557 556 ASSERT(lwptoregs(lwp) == rp);
558 557 lwp->lwp_state = LWP_SYS;
559 558
560 559 switch (type) {
561 560 case T_PGFLT + USER:
562 561 if ((caddr_t)rp->r_pc == addr)
563 562 mstate = LMS_TFAULT;
564 563 else
565 564 mstate = LMS_DFAULT;
566 565 break;
567 566 default:
568 567 mstate = LMS_TRAP;
569 568 break;
570 569 }
571 570 /* Kernel probe */
572 571 TNF_PROBE_1(thread_state, "thread", /* CSTYLED */,
573 572 tnf_microstate, state, mstate);
574 573 mstate = new_mstate(ct, mstate);
575 574
576 575 bzero(&siginfo, sizeof (siginfo));
577 576 }
578 577
579 578 switch (type) {
580 579 case T_PGFLT + USER:
581 580 case T_SGLSTP:
582 581 case T_SGLSTP + USER:
583 582 case T_BPTFLT + USER:
584 583 break;
585 584
586 585 default:
587 586 FTRACE_2("trap(): type=0x%lx, regs=0x%lx",
588 587 (ulong_t)type, (ulong_t)rp);
589 588 break;
590 589 }
591 590
592 591 switch (type) {
593 592 case T_SIMDFPE:
594 593 /* Make sure we enable interrupts before die()ing */
595 594 sti(); /* The SIMD exception comes in via cmninttrap */
596 595 /*FALLTHROUGH*/
597 596 default:
598 597 if (type & USER) {
599 598 if (tudebug)
600 599 showregs(type, rp, (caddr_t)0);
601 600 printf("trap: Unknown trap type %d in user mode\n",
602 601 type & ~USER);
603 602 siginfo.si_signo = SIGILL;
604 603 siginfo.si_code = ILL_ILLTRP;
605 604 siginfo.si_addr = (caddr_t)rp->r_pc;
606 605 siginfo.si_trapno = type & ~USER;
607 606 fault = FLTILL;
608 607 break;
609 608 } else {
610 609 (void) die(type, rp, addr, cpuid);
611 610 /*NOTREACHED*/
612 611 }
613 612
614 613 case T_PGFLT: /* system page fault */
615 614 /*
616 615 * If we're under on_trap() protection (see <sys/ontrap.h>),
617 616 * set ot_trap and bounce back to the on_trap() call site
618 617 * via the installed trampoline.
619 618 */
620 619 if ((ct->t_ontrap != NULL) &&
621 620 (ct->t_ontrap->ot_prot & OT_DATA_ACCESS)) {
622 621 ct->t_ontrap->ot_trap |= OT_DATA_ACCESS;
623 622 rp->r_pc = ct->t_ontrap->ot_trampoline;
624 623 goto cleanup;
625 624 }
626 625
627 626 /*
628 627 * If we have an Instruction fault in kernel mode, then that
629 628 * means we've tried to execute a user page (SMEP) or both of
630 629 * PAE and NXE are enabled. In either case, given that it's a
631 630 * kernel fault, we should panic immediately and not try to make
632 631 * any more forward progress. This indicates a bug in the
633 632 * kernel, which if execution continued, could be exploited to
634 633 * wreak havoc on the system.
635 634 */
636 635 if (errcode & PF_ERR_EXEC) {
637 636 (void) die(type, rp, addr, cpuid);
638 637 }
639 638
640 639 /*
641 640 * We need to check if SMAP is in play. If SMAP is in play, then
642 641 * any access to a user page will show up as a protection
643 642 * violation. To see if SMAP is enabled we first check if it's a
644 643 * user address and whether we have the feature flag set. If we
645 644 * do and the interrupted registers do not allow for user
646 645 * accesses (PS_ACHK is not enabled), then we need to die
647 646 * immediately.
648 647 */
649 648 if (addr < (caddr_t)kernelbase &&
650 649 is_x86_feature(x86_featureset, X86FSET_SMAP) == B_TRUE &&
651 650 (rp->r_ps & PS_ACHK) == 0) {
652 651 (void) die(type, rp, addr, cpuid);
653 652 }
654 653
655 654 /*
656 655 * See if we can handle as pagefault. Save lofault and onfault
657 656 * across this. Here we assume that an address less than
658 657 * KERNELBASE is a user fault. We can do this as copy.s
659 658 * routines verify that the starting address is less than
660 659 * KERNELBASE before starting and because we know that we
661 660 * always have KERNELBASE mapped as invalid to serve as a
662 661 * "barrier".
663 662 */
664 663 lofault = ct->t_lofault;
665 664 onfault = ct->t_onfault;
666 665 ct->t_lofault = 0;
667 666
668 667 mstate = new_mstate(ct, LMS_KFAULT);
669 668
670 669 if (addr < (caddr_t)kernelbase) {
671 670 res = pagefault(addr,
672 671 (errcode & PF_ERR_PROT)? F_PROT: F_INVAL, rw, 0);
673 672 if (res == FC_NOMAP &&
674 673 addr < p->p_usrstack &&
675 674 grow(addr))
676 675 res = 0;
677 676 } else {
678 677 res = pagefault(addr,
679 678 (errcode & PF_ERR_PROT)? F_PROT: F_INVAL, rw, 1);
680 679 }
681 680 (void) new_mstate(ct, mstate);
682 681
683 682 /*
684 683 * Restore lofault and onfault. If we resolved the fault, exit.
685 684 * If we didn't and lofault wasn't set, die.
686 685 */
687 686 ct->t_lofault = lofault;
688 687 ct->t_onfault = onfault;
689 688 if (res == 0)
690 689 goto cleanup;
691 690
692 691 #if defined(OPTERON_ERRATUM_93) && defined(_LP64)
693 692 if (lofault == 0 && opteron_erratum_93) {
694 693 /*
695 694 * Workaround for Opteron Erratum 93. On return from
696 695   * a System Management Interrupt at a HLT instruction
697 696 * the %rip might be truncated to a 32 bit value.
698 697 * BIOS is supposed to fix this, but some don't.
699 698 * If this occurs we simply restore the high order bits.
700 699 * The HLT instruction is 1 byte of 0xf4.
701 700 */
702 701 uintptr_t rip = rp->r_pc;
703 702
704 703 if ((rip & 0xfffffffful) == rip) {
705 704 rip |= 0xfffffffful << 32;
706 705 if (hat_getpfnum(kas.a_hat, (caddr_t)rip) !=
707 706 PFN_INVALID &&
708 707 (*(uchar_t *)rip == 0xf4 ||
709 708 *(uchar_t *)(rip - 1) == 0xf4)) {
710 709 rp->r_pc = rip;
711 710 goto cleanup;
712 711 }
713 712 }
714 713 }
715 714 #endif /* OPTERON_ERRATUM_93 && _LP64 */
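
The fix-up above only restores high-order bits: a %rip truncated by the erratum has bits 32-63 clear, and the workaround simply ORs them back in before checking that the target is a mapped HLT. A minimal sketch of that arithmetic follows (illustrative only; the sample address is assumed, not taken from a real system).

/* Hypothetical illustration of the erratum-93 %rip fix-up arithmetic. */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t rip = 0xfffffffffb800123ULL;	/* assumed kernel text address */
	uint64_t trunc = rip & 0xffffffffULL;	/* what the erratum leaves behind */

	if ((trunc & 0xffffffffULL) == trunc)
		trunc |= 0xffffffffULL << 32;	/* restore the high-order bits */

	printf("restored correctly: %d\n", trunc == rip);	/* prints 1 */
	return (0);
}
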
716 715
717 716 #ifdef OPTERON_ERRATUM_91
718 717 if (lofault == 0 && opteron_erratum_91) {
719 718 /*
720 719 * Workaround for Opteron Erratum 91. Prefetches may
721 720 * generate a page fault (they're not supposed to do
722 721 * that!). If this occurs we simply return back to the
723 722 * instruction.
724 723 */
725 724 caddr_t pc = (caddr_t)rp->r_pc;
726 725
727 726 /*
728 727 * If the faulting PC is not mapped, this is a
729 728 * legitimate kernel page fault that must result in a
730 729 * panic. If the faulting PC is mapped, it could contain
731 730 * a prefetch instruction. Check for that here.
732 731 */
733 732 if (hat_getpfnum(kas.a_hat, pc) != PFN_INVALID) {
734 733 if (cmp_to_prefetch((uchar_t *)pc)) {
735 734 #ifdef DEBUG
736 735 cmn_err(CE_WARN, "Opteron erratum 91 "
737 736 "occurred: kernel prefetch"
738 737 " at %p generated a page fault!",
739 738 (void *)rp->r_pc);
740 739 #endif /* DEBUG */
741 740 goto cleanup;
742 741 }
743 742 }
744 743 (void) die(type, rp, addr, cpuid);
745 744 }
746 745 #endif /* OPTERON_ERRATUM_91 */
747 746
748 747 if (lofault == 0)
749 748 (void) die(type, rp, addr, cpuid);
750 749
751 750 /*
752 751 * Cannot resolve fault. Return to lofault.
753 752 */
754 753 if (lodebug) {
755 754 showregs(type, rp, addr);
756 755 traceregs(rp);
757 756 }
758 757 if (FC_CODE(res) == FC_OBJERR)
759 758 res = FC_ERRNO(res);
760 759 else
761 760 res = EFAULT;
762 761 rp->r_r0 = res;
763 762 rp->r_pc = ct->t_lofault;
764 763 goto cleanup;
765 764
766 765 case T_PGFLT + USER: /* user page fault */
767 766 if (faultdebug) {
768 767 char *fault_str;
769 768
770 769 switch (rw) {
771 770 case S_READ:
772 771 fault_str = "read";
773 772 break;
774 773 case S_WRITE:
775 774 fault_str = "write";
776 775 break;
777 776 case S_EXEC:
778 777 fault_str = "exec";
779 778 break;
780 779 default:
781 780 fault_str = "";
782 781 break;
783 782 }
784 783 printf("user %s fault: addr=0x%lx errcode=0x%x\n",
785 784 fault_str, (uintptr_t)addr, errcode);
786 785 }
787 786
788 787 #if defined(OPTERON_ERRATUM_100) && defined(_LP64)
789 788 /*
790 789 * Workaround for AMD erratum 100
791 790 *
792 791 * A 32-bit process may receive a page fault on a non
793 792 * 32-bit address by mistake. The range of the faulting
794 793 * address will be
795 794 *
796 795 * 0xffffffff80000000 .. 0xffffffffffffffff or
797 796 * 0x0000000100000000 .. 0x000000017fffffff
798 797 *
799 798   * The fault is always due to an instruction fetch; however,
800 799 * the value of r_pc should be correct (in 32 bit range),
801 800 * so we ignore the page fault on the bogus address.
802 801 */
803 802 if (p->p_model == DATAMODEL_ILP32 &&
804 803 (0xffffffff80000000 <= (uintptr_t)addr ||
805 804 (0x100000000 <= (uintptr_t)addr &&
806 805 (uintptr_t)addr <= 0x17fffffff))) {
807 806 if (!opteron_erratum_100)
808 807 panic("unexpected erratum #100");
809 808 if (rp->r_pc <= 0xffffffff)
810 809 goto out;
811 810 }
812 811 #endif /* OPTERON_ERRATUM_100 && _LP64 */
813 812
814 813 ASSERT(!(curthread->t_flag & T_WATCHPT));
815 814 watchpage = (pr_watch_active(p) && pr_is_watchpage(addr, rw));
816 815 #ifdef __i386
817 816 /*
818 817 * In 32-bit mode, the lcall (system call) instruction fetches
819 818 * one word from the stack, at the stack pointer, because of the
820 819 * way the call gate is constructed. This is a bogus
821 820 * read and should not be counted as a read watchpoint.
822 821 * We work around the problem here by testing to see if
823 822 * this situation applies and, if so, simply jumping to
824 823 * the code in locore.s that fields the system call trap.
825 824 * The registers on the stack are already set up properly
826 825 * due to the match between the call gate sequence and the
827 826 * trap gate sequence. We just have to adjust the pc.
828 827 */
829 828 if (watchpage && addr == (caddr_t)rp->r_sp &&
830 829 rw == S_READ && instr_is_lcall_syscall((caddr_t)rp->r_pc)) {
831 830 extern void watch_syscall(void);
832 831
833 832 rp->r_pc += LCALLSIZE;
834 833 watch_syscall(); /* never returns */
835 834 /* NOTREACHED */
836 835 }
837 836 #endif /* __i386 */
838 837 vaddr = addr;
839 838 if (!watchpage || (sz = instr_size(rp, &vaddr, rw)) <= 0)
840 839 fault_type = (errcode & PF_ERR_PROT)? F_PROT: F_INVAL;
841 840 else if ((watchcode = pr_is_watchpoint(&vaddr, &ta,
842 841 sz, NULL, rw)) != 0) {
843 842 if (ta) {
844 843 do_watch_step(vaddr, sz, rw,
845 844 watchcode, rp->r_pc);
846 845 fault_type = F_INVAL;
847 846 } else {
848 847 bzero(&siginfo, sizeof (siginfo));
849 848 siginfo.si_signo = SIGTRAP;
850 849 siginfo.si_code = watchcode;
851 850 siginfo.si_addr = vaddr;
852 851 siginfo.si_trapafter = 0;
853 852 siginfo.si_pc = (caddr_t)rp->r_pc;
854 853 fault = FLTWATCH;
855 854 break;
856 855 }
857 856 } else {
858 857 /* XXX pr_watch_emul() never succeeds (for now) */
859 858 if (rw != S_EXEC && pr_watch_emul(rp, vaddr, rw))
860 859 goto out;
861 860 do_watch_step(vaddr, sz, rw, 0, 0);
862 861 fault_type = F_INVAL;
863 862 }
864 863
865 864 res = pagefault(addr, fault_type, rw, 0);
866 865
867 866 /*
868 867 * If pagefault() succeeded, ok.
869 868 * Otherwise attempt to grow the stack.
870 869 */
871 870 if (res == 0 ||
872 871 (res == FC_NOMAP &&
873 872 addr < p->p_usrstack &&
874 873 grow(addr))) {
875 874 lwp->lwp_lastfault = FLTPAGE;
876 875 lwp->lwp_lastfaddr = addr;
877 876 if (prismember(&p->p_fltmask, FLTPAGE)) {
878 877 bzero(&siginfo, sizeof (siginfo));
879 878 siginfo.si_addr = addr;
880 879 (void) stop_on_fault(FLTPAGE, &siginfo);
881 880 }
882 881 goto out;
883 882 } else if (res == FC_PROT && addr < p->p_usrstack &&
884 883 (mmu.pt_nx != 0 && (errcode & PF_ERR_EXEC))) {
885 884 report_stack_exec(p, addr);
886 885 }
887 886
888 887 #ifdef OPTERON_ERRATUM_91
889 888 /*
890 889 * Workaround for Opteron Erratum 91. Prefetches may generate a
891 890 * page fault (they're not supposed to do that!). If this
892 891 * occurs we simply return back to the instruction.
893 892 *
894 893 * We rely on copyin to properly fault in the page with r_pc.
895 894 */
896 895 if (opteron_erratum_91 &&
897 896 addr != (caddr_t)rp->r_pc &&
898 897 instr_is_prefetch((caddr_t)rp->r_pc)) {
899 898 #ifdef DEBUG
900 899 cmn_err(CE_WARN, "Opteron erratum 91 occurred: "
901 900 "prefetch at %p in pid %d generated a trap!",
902 901 (void *)rp->r_pc, p->p_pid);
903 902 #endif /* DEBUG */
904 903 goto out;
905 904 }
906 905 #endif /* OPTERON_ERRATUM_91 */
907 906
908 907 if (tudebug)
909 908 showregs(type, rp, addr);
910 909 /*
911 910 * In the case where both pagefault and grow fail,
912 911 * set the code to the value provided by pagefault.
913 912 * We map all errors returned from pagefault() to SIGSEGV.
914 913 */
915 914 bzero(&siginfo, sizeof (siginfo));
916 915 siginfo.si_addr = addr;
917 916 switch (FC_CODE(res)) {
918 917 case FC_HWERR:
919 918 case FC_NOSUPPORT:
920 919 siginfo.si_signo = SIGBUS;
921 920 siginfo.si_code = BUS_ADRERR;
922 921 fault = FLTACCESS;
923 922 break;
924 923 case FC_ALIGN:
925 924 siginfo.si_signo = SIGBUS;
926 925 siginfo.si_code = BUS_ADRALN;
927 926 fault = FLTACCESS;
928 927 break;
929 928 case FC_OBJERR:
930 929 if ((siginfo.si_errno = FC_ERRNO(res)) != EINTR) {
931 930 siginfo.si_signo = SIGBUS;
932 931 siginfo.si_code = BUS_OBJERR;
933 932 fault = FLTACCESS;
934 933 }
935 934 break;
936 935 default: /* FC_NOMAP or FC_PROT */
937 936 siginfo.si_signo = SIGSEGV;
938 937 siginfo.si_code =
939 938 (res == FC_NOMAP)? SEGV_MAPERR : SEGV_ACCERR;
940 939 fault = FLTBOUNDS;
941 940 break;
942 941 }
943 942 break;
944 943
945 944 case T_ILLINST + USER: /* invalid opcode fault */
946 945 /*
947 946 * If the syscall instruction is disabled due to LDT usage, a
948 947 * user program that attempts to execute it will trigger a #ud
949 948 * trap. Check for that case here. If this occurs on a CPU which
950 949 * doesn't even support syscall, the result of all of this will
951 950 * be to emulate that particular instruction.
952 951 */
953 952 if (p->p_ldt != NULL &&
954 953 ldt_rewrite_syscall(rp, p, X86FSET_ASYSC))
955 954 goto out;
956 955
957 956 #ifdef __amd64
958 957 /*
959 958 * Emulate the LAHF and SAHF instructions if needed.
960 959 * See the instr_is_lsahf function for details.
961 960 */
962 961 if (p->p_model == DATAMODEL_LP64 &&
963 962 instr_is_lsahf((caddr_t)rp->r_pc, &instr)) {
964 963 emulate_lsahf(rp, instr);
965 964 goto out;
966 965 }
967 966 #endif
968 967
969 968 /*FALLTHROUGH*/
970 969
971 970 if (tudebug)
972 971 showregs(type, rp, (caddr_t)0);
973 972 siginfo.si_signo = SIGILL;
974 973 siginfo.si_code = ILL_ILLOPC;
975 974 siginfo.si_addr = (caddr_t)rp->r_pc;
976 975 fault = FLTILL;
977 976 break;
978 977
979 978 case T_ZERODIV + USER: /* integer divide by zero */
980 979 if (tudebug && tudebugfpe)
981 980 showregs(type, rp, (caddr_t)0);
982 981 siginfo.si_signo = SIGFPE;
983 982 siginfo.si_code = FPE_INTDIV;
984 983 siginfo.si_addr = (caddr_t)rp->r_pc;
985 984 fault = FLTIZDIV;
986 985 break;
987 986
988 987 case T_OVFLW + USER: /* integer overflow */
989 988 if (tudebug && tudebugfpe)
990 989 showregs(type, rp, (caddr_t)0);
991 990 siginfo.si_signo = SIGFPE;
992 991 siginfo.si_code = FPE_INTOVF;
993 992 siginfo.si_addr = (caddr_t)rp->r_pc;
994 993 fault = FLTIOVF;
995 994 break;
996 995
997 996 case T_NOEXTFLT + USER: /* math coprocessor not available */
998 997 if (tudebug && tudebugfpe)
999 998 showregs(type, rp, addr);
1000 999 if (fpnoextflt(rp)) {
1001 1000 siginfo.si_signo = SIGILL;
1002 1001 siginfo.si_code = ILL_ILLOPC;
1003 1002 siginfo.si_addr = (caddr_t)rp->r_pc;
1004 1003 fault = FLTILL;
1005 1004 }
1006 1005 break;
1007 1006
1008 1007 case T_EXTOVRFLT: /* extension overrun fault */
1009 1008 /* check if we took a kernel trap on behalf of user */
1010 1009 {
1011 1010 extern void ndptrap_frstor(void);
1012 1011 if (rp->r_pc != (uintptr_t)ndptrap_frstor) {
1013 1012 sti(); /* T_EXTOVRFLT comes in via cmninttrap */
1014 1013 (void) die(type, rp, addr, cpuid);
1015 1014 }
1016 1015 type |= USER;
1017 1016 }
1018 1017 /*FALLTHROUGH*/
1019 1018 case T_EXTOVRFLT + USER: /* extension overrun fault */
1020 1019 if (tudebug && tudebugfpe)
1021 1020 showregs(type, rp, addr);
1022 1021 if (fpextovrflt(rp)) {
1023 1022 siginfo.si_signo = SIGSEGV;
1024 1023 siginfo.si_code = SEGV_MAPERR;
1025 1024 siginfo.si_addr = (caddr_t)rp->r_pc;
1026 1025 fault = FLTBOUNDS;
1027 1026 }
1028 1027 break;
1029 1028
1030 1029 case T_EXTERRFLT: /* x87 floating point exception pending */
1031 1030 /* check if we took a kernel trap on behalf of user */
1032 1031 {
1033 1032 extern void ndptrap_frstor(void);
1034 1033 if (rp->r_pc != (uintptr_t)ndptrap_frstor) {
1035 1034 sti(); /* T_EXTERRFLT comes in via cmninttrap */
1036 1035 (void) die(type, rp, addr, cpuid);
1037 1036 }
1038 1037 type |= USER;
1039 1038 }
1040 1039 /*FALLTHROUGH*/
1041 1040
1042 1041 case T_EXTERRFLT + USER: /* x87 floating point exception pending */
1043 1042 if (tudebug && tudebugfpe)
1044 1043 showregs(type, rp, addr);
1045 1044 if (sicode = fpexterrflt(rp)) {
1046 1045 siginfo.si_signo = SIGFPE;
1047 1046 siginfo.si_code = sicode;
1048 1047 siginfo.si_addr = (caddr_t)rp->r_pc;
1049 1048 fault = FLTFPE;
1050 1049 }
1051 1050 break;
1052 1051
1053 1052 case T_SIMDFPE + USER: /* SSE and SSE2 exceptions */
1054 1053 if (tudebug && tudebugsse)
1055 1054 showregs(type, rp, addr);
1056 1055 if (!is_x86_feature(x86_featureset, X86FSET_SSE) &&
1057 1056 !is_x86_feature(x86_featureset, X86FSET_SSE2)) {
1058 1057 /*
1059 1058 * There are rumours that some user instructions
1060 1059 * on older CPUs can cause this trap to occur; in
1061 1060 * which case send a SIGILL instead of a SIGFPE.
1062 1061 */
1063 1062 siginfo.si_signo = SIGILL;
1064 1063 siginfo.si_code = ILL_ILLTRP;
1065 1064 siginfo.si_addr = (caddr_t)rp->r_pc;
1066 1065 siginfo.si_trapno = type & ~USER;
1067 1066 fault = FLTILL;
1068 1067 } else if ((sicode = fpsimderrflt(rp)) != 0) {
1069 1068 siginfo.si_signo = SIGFPE;
1070 1069 siginfo.si_code = sicode;
1071 1070 siginfo.si_addr = (caddr_t)rp->r_pc;
1072 1071 fault = FLTFPE;
1073 1072 }
1074 1073
1075 1074 sti(); /* The SIMD exception comes in via cmninttrap */
1076 1075 break;
1077 1076
1078 1077 case T_BPTFLT: /* breakpoint trap */
1079 1078 /*
1080 1079 * Kernel breakpoint traps should only happen when kmdb is
1081 1080 * active, and even then, it'll have interposed on the IDT, so
1082 1081 * control won't get here. If it does, we've hit a breakpoint
1083 1082 * without the debugger, which is very strange, and very
1084 1083 * fatal.
1085 1084 */
1086 1085 if (tudebug && tudebugbpt)
1087 1086 showregs(type, rp, (caddr_t)0);
1088 1087
1089 1088 (void) die(type, rp, addr, cpuid);
1090 1089 break;
1091 1090
1092 1091 case T_SGLSTP: /* single step/hw breakpoint exception */
1093 1092
1094 - /* Now evaluate how we got here */
1093 +#if !defined(__xpv)
1094 + /*
1095 + * We'd never normally get here, as kmdb handles its own single
1096 + * step traps. There is one nasty exception though, as
1097 + * described in more detail in sys_sysenter(). Note that
1098 + * checking for all four locations covers both the KPTI and the
1099 + * non-KPTI cases correctly: the former will never be found at
1100 + * (brand_)sys_sysenter, and vice versa.
1101 + */
1095 1102 if (lwp != NULL && (lwp->lwp_pcb.pcb_drstat & DR_SINGLESTEP)) {
1096 - /*
1097 - * i386 single-steps even through lcalls which
1098 - * change the privilege level. So we take a trap at
1099 - * the first instruction in privileged mode.
1100 - *
1101 - * Set a flag to indicate that upon completion of
1102 - * the system call, deal with the single-step trap.
1103 - *
1104 - * The same thing happens for sysenter, too.
1105 - */
1106 - singlestep_twiddle = 0;
1107 - if (rp->r_pc == (uintptr_t)sys_sysenter ||
1108 - rp->r_pc == (uintptr_t)brand_sys_sysenter) {
1109 - singlestep_twiddle = 1;
1110 -#if defined(__amd64)
1111 - /*
1112 - * Since we are already on the kernel's
1113 - * %gs, on 64-bit systems the sysenter case
1114 - * needs to adjust the pc to avoid
1115 - * executing the swapgs instruction at the
1116 - * top of the handler.
1117 - */
1118 - if (rp->r_pc == (uintptr_t)sys_sysenter)
1119 - rp->r_pc = (uintptr_t)
1120 - _sys_sysenter_post_swapgs;
1121 - else
1122 - rp->r_pc = (uintptr_t)
1123 - _brand_sys_sysenter_post_swapgs;
1124 -#endif
1125 - }
1126 -#if defined(__i386)
1127 - else if (rp->r_pc == (uintptr_t)sys_call ||
1128 - rp->r_pc == (uintptr_t)brand_sys_call) {
1129 - singlestep_twiddle = 1;
1130 - }
1131 -#endif
1132 - else {
1133 - /* not on sysenter/syscall; uregs available */
1134 - if (tudebug && tudebugbpt)
1135 - showregs(type, rp, (caddr_t)0);
1136 - }
1137 - if (singlestep_twiddle) {
1103 + if (rp->r_pc == (greg_t)brand_sys_sysenter ||
1104 + rp->r_pc == (greg_t)sys_sysenter ||
1105 + rp->r_pc == (greg_t)tr_brand_sys_sysenter ||
1106 + rp->r_pc == (greg_t)tr_sys_sysenter) {
1107 +
1108 + rp->r_pc += 0x3; /* sizeof (swapgs) */
1109 +
1138 1110 rp->r_ps &= ~PS_T; /* turn off trace */
1139 1111 lwp->lwp_pcb.pcb_flags |= DEBUG_PENDING;
1140 1112 ct->t_post_sys = 1;
1141 1113 aston(curthread);
1142 1114 goto cleanup;
1115 + } else {
1116 + if (tudebug && tudebugbpt)
1117 + showregs(type, rp, (caddr_t)0);
1143 1118 }
1144 1119 }
1145 - /* XXX - needs review on debugger interface? */
1120 +#endif /* !__xpv */
1121 +
1146 1122 if (boothowto & RB_DEBUG)
1147 1123 debug_enter((char *)NULL);
1148 1124 else
1149 1125 (void) die(type, rp, addr, cpuid);
1150 1126 break;
1151 1127
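The literal 0x3 added to r_pc above is the length of the swapgs instruction that the removed comment describes sitting at the top of the sysenter handlers. As an aside (illustrative only, not part of the diff), swapgs has a fixed three-byte encoding, 0F 01 F8, which is what that literal accounts for:

/* Hypothetical stand-alone note: swapgs always encodes as 0F 01 F8. */
#include <stdio.h>

int
main(void)
{
	const unsigned char swapgs[] = { 0x0F, 0x01, 0xF8 };

	printf("sizeof (swapgs) = %zu\n", sizeof (swapgs));	/* prints 3 */
	return (0);
}
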
1152 1128 case T_NMIFLT: /* NMI interrupt */
1153 1129 printf("Unexpected NMI in system mode\n");
1154 1130 goto cleanup;
1155 1131
1156 1132 case T_NMIFLT + USER: /* NMI interrupt */
1157 1133 printf("Unexpected NMI in user mode\n");
1158 1134 break;
1159 1135
1160 1136 case T_GPFLT: /* general protection violation */
1161 1137 /*
1162 1138 * Any #GP that occurs during an on_trap .. no_trap bracket
1163 1139 * with OT_DATA_ACCESS or OT_SEGMENT_ACCESS protection,
1164 1140   * or in an on_fault .. no_fault bracket, is forgiven
1165 1141 * and we trampoline. This protection is given regardless
1166 1142 * of whether we are 32/64 bit etc - if a distinction is
1167 1143 * required then define new on_trap protection types.
1168 1144 *
1169 1145 * On amd64, we can get a #gp from referencing addresses
1170 1146 * in the virtual address hole e.g. from a copyin or in
1171 1147 * update_sregs while updating user segment registers.
1172 1148 *
1173 1149 * On the 32-bit hypervisor we could also generate one in
1174 1150 * mfn_to_pfn by reaching around or into where the hypervisor
1175 1151 * lives which is protected by segmentation.
1176 1152 */
1177 1153
1178 1154 /*
1179 1155 * If we're under on_trap() protection (see <sys/ontrap.h>),
1180 1156 * set ot_trap and trampoline back to the on_trap() call site
1181 1157 * for OT_DATA_ACCESS or OT_SEGMENT_ACCESS.
1182 1158 */
1183 1159 if (ct->t_ontrap != NULL) {
1184 1160 int ttype = ct->t_ontrap->ot_prot &
1185 1161 (OT_DATA_ACCESS | OT_SEGMENT_ACCESS);
1186 1162
1187 1163 if (ttype != 0) {
1188 1164 ct->t_ontrap->ot_trap |= ttype;
1189 1165 if (tudebug)
1190 1166 showregs(type, rp, (caddr_t)0);
1191 1167 rp->r_pc = ct->t_ontrap->ot_trampoline;
1192 1168 goto cleanup;
1193 1169 }
1194 1170 }
1195 1171
1196 1172 /*
1197 1173 * If we're under lofault protection (copyin etc.),
1198 1174 * longjmp back to lofault with an EFAULT.
1199 1175 */
1200 1176 if (ct->t_lofault) {
1201 1177 /*
1202 1178 * Fault is not resolvable, so just return to lofault
1203 1179 */
1204 1180 if (lodebug) {
1205 1181 showregs(type, rp, addr);
1206 1182 traceregs(rp);
1207 1183 }
1208 1184 rp->r_r0 = EFAULT;
1209 1185 rp->r_pc = ct->t_lofault;
1210 1186 goto cleanup;
1211 1187 }
1212 1188
1213 1189 /*
1214 1190 * We fall through to the next case, which repeats
1215 1191 * the OT_SEGMENT_ACCESS check which we've already
1216 1192 * done, so we'll always fall through to the
1217 1193 * T_STKFLT case.
1218 1194 */
1219 1195 /*FALLTHROUGH*/
1220 1196 case T_SEGFLT: /* segment not present fault */
1221 1197 /*
1222 1198 * One example of this is #NP in update_sregs while
1223 1199 * attempting to update a user segment register
1224 1200 * that points to a descriptor that is marked not
1225 1201 * present.
1226 1202 */
1227 1203 if (ct->t_ontrap != NULL &&
1228 1204 ct->t_ontrap->ot_prot & OT_SEGMENT_ACCESS) {
1229 1205 ct->t_ontrap->ot_trap |= OT_SEGMENT_ACCESS;
1230 1206 if (tudebug)
1231 1207 showregs(type, rp, (caddr_t)0);
1232 1208 rp->r_pc = ct->t_ontrap->ot_trampoline;
1233 1209 goto cleanup;
1234 1210 }
1235 1211 /*FALLTHROUGH*/
1236 1212 case T_STKFLT: /* stack fault */
1237 1213 case T_TSSFLT: /* invalid TSS fault */
1238 1214 if (tudebug)
1239 1215 showregs(type, rp, (caddr_t)0);
1240 1216 if (kern_gpfault(rp))
1241 1217 (void) die(type, rp, addr, cpuid);
1242 1218 goto cleanup;
1243 1219
1244 1220 /*
1245 1221 * ONLY 32-bit PROCESSES can USE a PRIVATE LDT! 64-bit apps
1246 1222 * should have no need for them, so we put a stop to it here.
1247 1223 *
1248 1224 * So: not-present fault is ONLY valid for 32-bit processes with
1249 1225 * a private LDT trying to do a system call. Emulate it.
1250 1226 *
1251 1227 * #gp fault is ONLY valid for 32-bit processes also, which DO NOT
1252 1228 * have a private LDT, and are trying to do a system call. Emulate it.
1253 1229 */
1254 1230
1255 1231 case T_SEGFLT + USER: /* segment not present fault */
1256 1232 case T_GPFLT + USER: /* general protection violation */
1257 1233 #ifdef _SYSCALL32_IMPL
1258 1234 if (p->p_model != DATAMODEL_NATIVE) {
1259 1235 #endif /* _SYSCALL32_IMPL */
1260 1236 if (instr_is_lcall_syscall((caddr_t)rp->r_pc)) {
1261 1237 if (type == T_SEGFLT + USER)
1262 1238 ASSERT(p->p_ldt != NULL);
1263 1239
1264 1240 if ((p->p_ldt == NULL && type == T_GPFLT + USER) ||
1265 1241 type == T_SEGFLT + USER) {
1266 1242
1267 1243 /*
1268 1244 * The user attempted a system call via the obsolete
1269 1245 * call gate mechanism. Because the process doesn't have
1270 1246 * an LDT (i.e. the ldtr contains 0), a #gp results.
1271 1247 * Emulate the syscall here, just as we do above for a
1272 1248 * #np trap.
1273 1249 */
1274 1250
1275 1251 /*
1276 1252 * Since this is a not-present trap, rp->r_pc points to
1277 1253 * the trapping lcall instruction. We need to bump it
1278 1254 * to the next insn so the app can continue on.
1279 1255 */
1280 1256 rp->r_pc += LCALLSIZE;
1281 1257 lwp->lwp_regs = rp;
1282 1258
1283 1259 /*
1284 1260 * Normally the microstate of the LWP is forced back to
1285 1261 * LMS_USER by the syscall handlers. Emulate that
1286 1262 * behavior here.
1287 1263 */
1288 1264 mstate = LMS_USER;
1289 1265
1290 1266 dosyscall();
1291 1267 goto out;
1292 1268 }
1293 1269 }
1294 1270 #ifdef _SYSCALL32_IMPL
1295 1271 }
1296 1272 #endif /* _SYSCALL32_IMPL */
1297 1273 /*
1298 1274 * If the current process is using a private LDT and the
1299 1275 * trapping instruction is sysenter, the sysenter instruction
1300 1276 * has been disabled on the CPU because it destroys segment
1301 1277 * registers. If this is the case, rewrite the instruction to
1302 1278 * be a safe system call and retry it. If this occurs on a CPU
1303 1279 * which doesn't even support sysenter, the result of all of
1304 1280 * this will be to emulate that particular instruction.
1305 1281 */
1306 1282 if (p->p_ldt != NULL &&
1307 1283 ldt_rewrite_syscall(rp, p, X86FSET_SEP))
1308 1284 goto out;
1309 1285
1310 1286 /*FALLTHROUGH*/
1311 1287
1312 1288 case T_BOUNDFLT + USER: /* bound fault */
1313 1289 case T_STKFLT + USER: /* stack fault */
1314 1290 case T_TSSFLT + USER: /* invalid TSS fault */
1315 1291 if (tudebug)
1316 1292 showregs(type, rp, (caddr_t)0);
1317 1293 siginfo.si_signo = SIGSEGV;
1318 1294 siginfo.si_code = SEGV_MAPERR;
1319 1295 siginfo.si_addr = (caddr_t)rp->r_pc;
1320 1296 fault = FLTBOUNDS;
1321 1297 break;
1322 1298
1323 1299 case T_ALIGNMENT + USER: /* user alignment error (486) */
1324 1300 if (tudebug)
1325 1301 showregs(type, rp, (caddr_t)0);
1326 1302 bzero(&siginfo, sizeof (siginfo));
1327 1303 siginfo.si_signo = SIGBUS;
1328 1304 siginfo.si_code = BUS_ADRALN;
1329 1305 siginfo.si_addr = (caddr_t)rp->r_pc;
1330 1306 fault = FLTACCESS;
1331 1307 break;
1332 1308
1333 1309 case T_SGLSTP + USER: /* single step/hw breakpoint exception */
1334 1310 if (tudebug && tudebugbpt)
1335 1311 showregs(type, rp, (caddr_t)0);
1336 1312
1337 1313 /* Was it single-stepping? */
1338 1314 if (lwp->lwp_pcb.pcb_drstat & DR_SINGLESTEP) {
1339 1315 pcb_t *pcb = &lwp->lwp_pcb;
1340 1316
1341 1317 rp->r_ps &= ~PS_T;
1342 1318 /*
1343 1319 * If both NORMAL_STEP and WATCH_STEP are in effect,
1344 1320 * give precedence to WATCH_STEP. If neither is set,
1345 1321 * user must have set the PS_T bit in %efl; treat this
1346 1322 * as NORMAL_STEP.
1347 1323 */
1348 1324 if ((fault = undo_watch_step(&siginfo)) == 0 &&
1349 1325 ((pcb->pcb_flags & NORMAL_STEP) ||
1350 1326 !(pcb->pcb_flags & WATCH_STEP))) {
1351 1327 siginfo.si_signo = SIGTRAP;
1352 1328 siginfo.si_code = TRAP_TRACE;
1353 1329 siginfo.si_addr = (caddr_t)rp->r_pc;
1354 1330 fault = FLTTRACE;
1355 1331 }
1356 1332 pcb->pcb_flags &= ~(NORMAL_STEP|WATCH_STEP);
1357 1333 }
1358 1334 break;
1359 1335
1360 1336 case T_BPTFLT + USER: /* breakpoint trap */
1361 1337 if (tudebug && tudebugbpt)
1362 1338 showregs(type, rp, (caddr_t)0);
1363 1339 /*
1364 1340 * int 3 (the breakpoint instruction) leaves the pc referring
1365 1341 * to the address one byte after the breakpointed address.
1366 1342   * If the P_PR_BPTADJ flag has been set via /proc, we adjust
1367 1343 * it back so it refers to the breakpointed address.
1368 1344 */
1369 1345 if (p->p_proc_flag & P_PR_BPTADJ)
1370 1346 rp->r_pc--;
1371 1347 siginfo.si_signo = SIGTRAP;
1372 1348 siginfo.si_code = TRAP_BRKPT;
1373 1349 siginfo.si_addr = (caddr_t)rp->r_pc;
1374 1350 fault = FLTBPT;
1375 1351 break;
1376 1352
1377 1353 case T_AST:
1378 1354 /*
1379 1355 * This occurs only after the cs register has been made to
1380 1356 * look like a kernel selector, either through debugging or
1381 1357 * possibly by functions like setcontext(). The thread is
1382 1358 * about to cause a general protection fault at common_iret()
1383 1359 * in locore. We let that happen immediately instead of
1384 1360 * doing the T_AST processing.
1385 1361 */
1386 1362 goto cleanup;
1387 1363
1388 1364 case T_AST + USER: /* profiling, resched, h/w error pseudo trap */
1389 1365 if (lwp->lwp_pcb.pcb_flags & ASYNC_HWERR) {
1390 1366 proc_t *p = ttoproc(curthread);
1391 1367 extern void print_msg_hwerr(ctid_t ct_id, proc_t *p);
1392 1368
1393 1369 lwp->lwp_pcb.pcb_flags &= ~ASYNC_HWERR;
1394 1370 print_msg_hwerr(p->p_ct_process->conp_contract.ct_id,
1395 1371 p);
1396 1372 contract_process_hwerr(p->p_ct_process, p);
1397 1373 siginfo.si_signo = SIGKILL;
1398 1374 siginfo.si_code = SI_NOINFO;
1399 1375 } else if (lwp->lwp_pcb.pcb_flags & CPC_OVERFLOW) {
1400 1376 lwp->lwp_pcb.pcb_flags &= ~CPC_OVERFLOW;
1401 1377 if (kcpc_overflow_ast()) {
1402 1378 /*
1403 1379 * Signal performance counter overflow
1404 1380 */
1405 1381 if (tudebug)
1406 1382 showregs(type, rp, (caddr_t)0);
1407 1383 bzero(&siginfo, sizeof (siginfo));
1408 1384 siginfo.si_signo = SIGEMT;
1409 1385 siginfo.si_code = EMT_CPCOVF;
1410 1386 siginfo.si_addr = (caddr_t)rp->r_pc;
1411 1387 fault = FLTCPCOVF;
1412 1388 }
1413 1389 }
1414 1390
1415 1391 break;
1416 1392 }
1417 1393
1418 1394 /*
1419 1395 * We can't get here from a system trap
1420 1396 */
1421 1397 ASSERT(type & USER);
1422 1398
1423 1399 if (fault) {
1424 1400 /* We took a fault so abort single step. */
1425 1401 lwp->lwp_pcb.pcb_flags &= ~(NORMAL_STEP|WATCH_STEP);
1426 1402 /*
1427 1403   * Remember the fault and fault address
1428 1404 * for real-time (SIGPROF) profiling.
1429 1405 */
1430 1406 lwp->lwp_lastfault = fault;
1431 1407 lwp->lwp_lastfaddr = siginfo.si_addr;
1432 1408
1433 1409 DTRACE_PROC2(fault, int, fault, ksiginfo_t *, &siginfo);
1434 1410
1435 1411 /*
1436 1412 * If a debugger has declared this fault to be an
1437 1413 * event of interest, stop the lwp. Otherwise just
1438 1414 * deliver the associated signal.
1439 1415 */
1440 1416 if (siginfo.si_signo != SIGKILL &&
1441 1417 prismember(&p->p_fltmask, fault) &&
1442 1418 stop_on_fault(fault, &siginfo) == 0)
1443 1419 siginfo.si_signo = 0;
1444 1420 }
1445 1421
1446 1422 if (siginfo.si_signo)
1447 1423 trapsig(&siginfo, (fault != FLTFPE && fault != FLTCPCOVF));
1448 1424
1449 1425 if (lwp->lwp_oweupc)
1450 1426 profil_tick(rp->r_pc);
1451 1427
1452 1428 if (ct->t_astflag | ct->t_sig_check) {
1453 1429 /*
1454 1430 * Turn off the AST flag before checking all the conditions that
1455 1431 * may have caused an AST. This flag is on whenever a signal or
1456 1432 * unusual condition should be handled after the next trap or
1457 1433 * syscall.
1458 1434 */
1459 1435 astoff(ct);
1460 1436 /*
1461 1437 * If a single-step trap occurred on a syscall (see above)
1462 1438 * recognize it now. Do this before checking for signals
1463 1439 * because deferred_singlestep_trap() may generate a SIGTRAP to
1464 1440 * the LWP or may otherwise mark the LWP to call issig(FORREAL).
1465 1441 */
1466 1442 if (lwp->lwp_pcb.pcb_flags & DEBUG_PENDING)
1467 1443 deferred_singlestep_trap((caddr_t)rp->r_pc);
1468 1444
1469 1445 ct->t_sig_check = 0;
1470 1446
1471 1447 /*
1472 1448 * As in other code paths that check against TP_CHANGEBIND,
1473 1449 * we perform the check first without p_lock held -- only
1474 1450 * acquiring p_lock in the unlikely event that it is indeed
1475 1451 * set. This is safe because we are doing this after the
1476 1452 * astoff(); if we are racing another thread setting
1477 1453 * TP_CHANGEBIND on us, we will pick it up on a subsequent
1478 1454 * lap through.
1479 1455 */
1480 1456 if (curthread->t_proc_flag & TP_CHANGEBIND) {
1481 1457 mutex_enter(&p->p_lock);
1482 1458 if (curthread->t_proc_flag & TP_CHANGEBIND) {
1483 1459 timer_lwpbind();
1484 1460 curthread->t_proc_flag &= ~TP_CHANGEBIND;
1485 1461 }
1486 1462 mutex_exit(&p->p_lock);
1487 1463 }
1488 1464
1489 1465 /*
1490 1466 * for kaio requests that are on the per-process poll queue,
1491 1467   * aiop->aio_pollq, their AIO_POLL bit is set, the kernel
1492 1468   * should copyout their result_t to user memory. By copying
1493 1469 * out the result_t, the user can poll on memory waiting
1494 1470 * for the kaio request to complete.
1495 1471 */
1496 1472 if (p->p_aio)
1497 1473 aio_cleanup(0);
1498 1474 /*
1499 1475 * If this LWP was asked to hold, call holdlwp(), which will
1500 1476 * stop. holdlwps() sets this up and calls pokelwps() which
1501 1477 * sets the AST flag.
1502 1478 *
1503 1479 * Also check TP_EXITLWP, since this is used by fresh new LWPs
1504 1480 * through lwp_rtt(). That flag is set if the lwp_create(2)
1505 1481 * syscall failed after creating the LWP.
1506 1482 */
1507 1483 if (ISHOLD(p))
1508 1484 holdlwp();
1509 1485
1510 1486 /*
1511 1487 * All code that sets signals and makes ISSIG evaluate true must
1512 1488 * set t_astflag afterwards.
1513 1489 */
1514 1490 if (ISSIG_PENDING(ct, lwp, p)) {
1515 1491 if (issig(FORREAL))
1516 1492 psig();
1517 1493 ct->t_sig_check = 1;
1518 1494 }
1519 1495
1520 1496 if (ct->t_rprof != NULL) {
1521 1497 realsigprof(0, 0, 0);
1522 1498 ct->t_sig_check = 1;
1523 1499 }
1524 1500
1525 1501 /*
1526 1502 * /proc can't enable/disable the trace bit itself
1527 1503 * because that could race with the call gate used by
1528 1504 * system calls via "lcall". If that happened, an
1529 1505 * invalid EFLAGS would result. prstep()/prnostep()
1530 1506 * therefore schedule an AST for the purpose.
1531 1507 */
1532 1508 if (lwp->lwp_pcb.pcb_flags & REQUEST_STEP) {
1533 1509 lwp->lwp_pcb.pcb_flags &= ~REQUEST_STEP;
1534 1510 rp->r_ps |= PS_T;
1535 1511 }
1536 1512 if (lwp->lwp_pcb.pcb_flags & REQUEST_NOSTEP) {
1537 1513 lwp->lwp_pcb.pcb_flags &= ~REQUEST_NOSTEP;
1538 1514 rp->r_ps &= ~PS_T;
1539 1515 }
1540 1516 }
1541 1517
1542 1518 out: /* We can't get here from a system trap */
1543 1519 ASSERT(type & USER);
1544 1520
1545 1521 if (ISHOLD(p))
1546 1522 holdlwp();
1547 1523
1548 1524 /*
1549 1525 * Set state to LWP_USER here so preempt won't give us a kernel
1550 1526 * priority if it occurs after this point. Call CL_TRAPRET() to
1551 1527 * restore the user-level priority.
1552 1528 *
1553 1529 * It is important that no locks (other than spinlocks) be entered
1554 1530 * after this point before returning to user mode (unless lwp_state
1555 1531 * is set back to LWP_SYS).
1556 1532 */
1557 1533 lwp->lwp_state = LWP_USER;
1558 1534
1559 1535 if (ct->t_trapret) {
1560 1536 ct->t_trapret = 0;
1561 1537 thread_lock(ct);
1562 1538 CL_TRAPRET(ct);
1563 1539 thread_unlock(ct);
1564 1540 }
1565 1541 if (CPU->cpu_runrun || curthread->t_schedflag & TS_ANYWAITQ)
1566 1542 preempt();
1567 1543 prunstop();
1568 1544 (void) new_mstate(ct, mstate);
1569 1545
1570 1546 /* Kernel probe */
1571 1547 TNF_PROBE_1(thread_state, "thread", /* CSTYLED */,
1572 1548 tnf_microstate, state, LMS_USER);
1573 1549
1574 1550 return;
1575 1551
1576 1552 cleanup: /* system traps end up here */
1577 1553 ASSERT(!(type & USER));
1578 1554 }
1579 1555
1580 1556 /*
1581 1557 * Patch non-zero to disable preemption of threads in the kernel.
1582 1558 */
1583 1559 int IGNORE_KERNEL_PREEMPTION = 0; /* XXX - delete this someday */
1584 1560
1585 1561 struct kpreempt_cnts { /* kernel preemption statistics */
1586 1562 int kpc_idle; /* executing idle thread */
1587 1563 int kpc_intr; /* executing interrupt thread */
1588 1564 int kpc_clock; /* executing clock thread */
1589 1565 int kpc_blocked; /* thread has blocked preemption (t_preempt) */
1590 1566 int kpc_notonproc; /* thread is surrendering processor */
1591 1567 int kpc_inswtch; /* thread has ratified scheduling decision */
1592 1568 int kpc_prilevel; /* processor interrupt level is too high */
1593 1569 int kpc_apreempt; /* asynchronous preemption */
1594 1570 int kpc_spreempt; /* synchronous preemption */
1595 1571 } kpreempt_cnts;
1596 1572
1597 1573 /*
1598 1574 * kernel preemption: forced rescheduling, preempt the running kernel thread.
1599 1575 * the argument is old PIL for an interrupt,
1600 1576   * or the distinguished value KPREEMPT_SYNC.
1601 1577 */
1602 1578 void
1603 1579 kpreempt(int asyncspl)
1604 1580 {
1605 1581 kthread_t *ct = curthread;
1606 1582
1607 1583 if (IGNORE_KERNEL_PREEMPTION) {
1608 1584 aston(CPU->cpu_dispthread);
1609 1585 return;
1610 1586 }
1611 1587
1612 1588 /*
1613 1589 * Check that conditions are right for kernel preemption
1614 1590 */
1615 1591 do {
1616 1592 if (ct->t_preempt) {
1617 1593 /*
1618 1594 * either a privileged thread (idle, panic, interrupt)
1619 1595 * or will check when t_preempt is lowered
1620 1596 * We need to specifically handle the case where
1621 1597 * the thread is in the middle of swtch (resume has
1622 1598 * been called) and has its t_preempt set
1623 1599 * [idle thread and a thread which is in kpreempt
1624 1600 * already] and then a high priority thread is
1625 1601 * available in the local dispatch queue.
1626 1602 * In this case the resumed thread needs to take a
1627 1603 * trap so that it can call kpreempt. We achieve
1628 1604 * this by using siron().
1629 1605 * How do we detect this condition:
1630 1606 * idle thread is running and is in the midst of
1631 1607 * resume: curthread->t_pri == -1 && CPU->dispthread
1632 1608 * != CPU->thread
1633 1609 * Need to ensure that this happens only at high pil
1634 1610 * resume is called at high pil
1635 1611 * Only in resume_from_idle is the pil changed.
1636 1612 */
1637 1613 if (ct->t_pri < 0) {
1638 1614 kpreempt_cnts.kpc_idle++;
1639 1615 if (CPU->cpu_dispthread != CPU->cpu_thread)
1640 1616 siron();
1641 1617 } else if (ct->t_flag & T_INTR_THREAD) {
1642 1618 kpreempt_cnts.kpc_intr++;
1643 1619 if (ct->t_pil == CLOCK_LEVEL)
1644 1620 kpreempt_cnts.kpc_clock++;
1645 1621 } else {
1646 1622 kpreempt_cnts.kpc_blocked++;
1647 1623 if (CPU->cpu_dispthread != CPU->cpu_thread)
1648 1624 siron();
1649 1625 }
1650 1626 aston(CPU->cpu_dispthread);
1651 1627 return;
1652 1628 }
1653 1629 if (ct->t_state != TS_ONPROC ||
1654 1630 ct->t_disp_queue != CPU->cpu_disp) {
1655 1631 /* this thread will be calling swtch() shortly */
1656 1632 kpreempt_cnts.kpc_notonproc++;
1657 1633 if (CPU->cpu_thread != CPU->cpu_dispthread) {
1658 1634 /* already in swtch(), force another */
1659 1635 kpreempt_cnts.kpc_inswtch++;
1660 1636 siron();
1661 1637 }
1662 1638 return;
1663 1639 }
1664 1640 if (getpil() >= DISP_LEVEL) {
1665 1641 /*
1666 1642 * We can't preempt this thread if it is at
1667 1643 * a PIL >= DISP_LEVEL since it may be holding
1668 1644 * a spin lock (like sched_lock).
1669 1645 */
1670 1646 siron(); /* check back later */
1671 1647 kpreempt_cnts.kpc_prilevel++;
1672 1648 return;
1673 1649 }
1674 1650 if (!interrupts_enabled()) {
1675 1651 /*
1676 1652 * Can't preempt while running with ints disabled
1677 1653 */
1678 1654 kpreempt_cnts.kpc_prilevel++;
1679 1655 return;
1680 1656 }
1681 1657 if (asyncspl != KPREEMPT_SYNC)
1682 1658 kpreempt_cnts.kpc_apreempt++;
1683 1659 else
1684 1660 kpreempt_cnts.kpc_spreempt++;
1685 1661
1686 1662 ct->t_preempt++;
1687 1663 preempt();
1688 1664 ct->t_preempt--;
1689 1665 } while (CPU->cpu_kprunrun);
1690 1666 }
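[Editor's note, not part of the reviewed change.] The kpc_blocked counter and the KPREEMPT_SYNC case above are normally reached through the standard kpreempt_disable()/kpreempt_enable() pair used throughout the kernel: disable bumps t_preempt, and enable, on dropping it to zero with cpu_kprunrun set, should end up in kpreempt(KPREEMPT_SYNC), which is what kpc_spreempt counts. A minimal sketch of such a caller follows; do_percpu_work() is a hypothetical helper and the exact headers are assumptions for illustration.

#include <sys/disp.h>		/* assumed home of kpreempt_disable()/enable() */
#include <sys/cpuvar.h>		/* CPU macro, struct cpu */

/* Hypothetical helper; stands in for any per-CPU work. */
extern void do_percpu_work(struct cpu *);

static void
touch_this_cpu(void)
{
	kpreempt_disable();	/* raises curthread->t_preempt; no migration */
	do_percpu_work(CPU);	/* CPU pointer stays valid meanwhile */
	kpreempt_enable();	/* may call kpreempt(KPREEMPT_SYNC) on the way out */
}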
1691 1667
1692 1668 /*
1693 1669 * Print out debugging info.
1694 1670 */
1695 1671 static void
1696 1672 showregs(uint_t type, struct regs *rp, caddr_t addr)
1697 1673 {
1698 1674 int s;
1699 1675
1700 1676 s = spl7();
1701 1677 type &= ~USER;
1702 1678 if (PTOU(curproc)->u_comm[0])
1703 1679 printf("%s: ", PTOU(curproc)->u_comm);
1704 1680 if (type < TRAP_TYPES)
1705 1681 printf("#%s %s\n", trap_type_mnemonic[type], trap_type[type]);
1706 1682 else
1707 1683 switch (type) {
1708 1684 case T_SYSCALL:
1709 1685 printf("Syscall Trap:\n");
1710 1686 break;
1711 1687 case T_AST:
1712 1688 printf("AST\n");
1713 1689 break;
1714 1690 default:
1715 1691 printf("Bad Trap = %d\n", type);
1716 1692 break;
1717 1693 }
1718 1694 if (type == T_PGFLT) {
1719 1695 printf("Bad %s fault at addr=0x%lx\n",
1720 1696 USERMODE(rp->r_cs) ? "user": "kernel", (uintptr_t)addr);
1721 1697 } else if (addr) {
1722 1698 printf("addr=0x%lx\n", (uintptr_t)addr);
1723 1699 }
1724 1700
1725 1701 printf("pid=%d, pc=0x%lx, sp=0x%lx, eflags=0x%lx\n",
1726 1702 (ttoproc(curthread) && ttoproc(curthread)->p_pidp) ?
1727 1703 ttoproc(curthread)->p_pid : 0, rp->r_pc, rp->r_sp, rp->r_ps);
1728 1704
1729 1705 #if defined(__lint)
1730 1706 /*
1731 1707 * this clause can be deleted when lint bug 4870403 is fixed
1732 1708 * (lint thinks that bit 32 is illegal in a %b format string)
1733 1709 */
1734 - printf("cr0: %x cr4: %b\n",
1710 + printf("cr0: %x cr4: %b\n",
1735 1711 (uint_t)getcr0(), (uint_t)getcr4(), FMT_CR4);
1736 1712 #else
1737 - printf("cr0: %b cr4: %b\n",
1713 + printf("cr0: %b cr4: %b\n",
1738 1714 (uint_t)getcr0(), FMT_CR0, (uint_t)getcr4(), FMT_CR4);
1739 1715 #endif /* __lint */
1740 1716
1741 - printf("cr2: %lx", getcr2());
1717 + printf("cr2: %lx ", getcr2());
1742 1718 #if !defined(__xpv)
1743 - printf("cr3: %lx", getcr3());
1719 + printf("cr3: %lx ", getcr3());
1744 1720 #if defined(__amd64)
1745 1721 printf("cr8: %lx\n", getcr8());
1746 1722 #endif
1747 1723 #endif
1748 1724 printf("\n");
1749 1725
1750 1726 dumpregs(rp);
1751 1727 splx(s);
1752 1728 }
1753 1729
1754 1730 static void
1755 1731 dumpregs(struct regs *rp)
1756 1732 {
1757 1733 #if defined(__amd64)
1758 1734 const char fmt[] = "\t%3s: %16lx %3s: %16lx %3s: %16lx\n";
1759 1735
1760 1736 printf(fmt, "rdi", rp->r_rdi, "rsi", rp->r_rsi, "rdx", rp->r_rdx);
1761 1737 printf(fmt, "rcx", rp->r_rcx, " r8", rp->r_r8, " r9", rp->r_r9);
1762 1738 printf(fmt, "rax", rp->r_rax, "rbx", rp->r_rbx, "rbp", rp->r_rbp);
1763 1739 printf(fmt, "r10", rp->r_r10, "r11", rp->r_r11, "r12", rp->r_r12);
1764 1740 printf(fmt, "r13", rp->r_r13, "r14", rp->r_r14, "r15", rp->r_r15);
1765 1741
1766 1742 printf(fmt, "fsb", rdmsr(MSR_AMD_FSBASE), "gsb", rdmsr(MSR_AMD_GSBASE),
1767 1743 " ds", rp->r_ds);
1768 1744 printf(fmt, " es", rp->r_es, " fs", rp->r_fs, " gs", rp->r_gs);
1769 1745
1770 1746 printf(fmt, "trp", rp->r_trapno, "err", rp->r_err, "rip", rp->r_rip);
1771 1747 printf(fmt, " cs", rp->r_cs, "rfl", rp->r_rfl, "rsp", rp->r_rsp);
1772 1748
1773 1749 printf("\t%3s: %16lx\n", " ss", rp->r_ss);
1774 1750
1775 1751 #elif defined(__i386)
1776 1752 const char fmt[] = "\t%3s: %8lx %3s: %8lx %3s: %8lx %3s: %8lx\n";
1777 1753
1778 1754 printf(fmt, " gs", rp->r_gs, " fs", rp->r_fs,
1779 1755 " es", rp->r_es, " ds", rp->r_ds);
1780 1756 printf(fmt, "edi", rp->r_edi, "esi", rp->r_esi,
1781 1757 "ebp", rp->r_ebp, "esp", rp->r_esp);
1782 1758 printf(fmt, "ebx", rp->r_ebx, "edx", rp->r_edx,
1783 1759 "ecx", rp->r_ecx, "eax", rp->r_eax);
1784 1760 printf(fmt, "trp", rp->r_trapno, "err", rp->r_err,
1785 1761 "eip", rp->r_eip, " cs", rp->r_cs);
1786 1762 printf("\t%3s: %8lx %3s: %8lx %3s: %8lx\n",
1787 1763 "efl", rp->r_efl, "usp", rp->r_uesp, " ss", rp->r_ss);
1788 1764
1789 1765 #endif /* __i386 */
1790 1766 }
1791 1767
1792 1768 /*
1793 1769 * Test to see if the instruction is iret on i386 or iretq on amd64.
1794 1770 *
1795 1771 * On the hypervisor we can only test for nopop_sys_rtt_syscall. If true
1796 1772 * then we are in the context of hypervisor's failsafe handler because it
1797 1773 * tried to iret and failed due to a bad selector. See xen_failsafe_callback.
1798 1774 */
1799 1775 static int
1800 1776 instr_is_iret(caddr_t pc)
1801 1777 {
1802 1778
1803 1779 #if defined(__xpv)
1804 1780 extern void nopop_sys_rtt_syscall(void);
1805 1781 return ((pc == (caddr_t)nopop_sys_rtt_syscall) ? 1 : 0);
1806 1782
1807 1783 #else
1808 1784
1809 1785 #if defined(__amd64)
1810 1786 static const uint8_t iret_insn[2] = { 0x48, 0xcf }; /* iretq */
1811 1787
1812 1788 #elif defined(__i386)
1813 1789 static const uint8_t iret_insn[1] = { 0xcf }; /* iret */
1814 1790 #endif /* __i386 */
1815 1791 return (bcmp(pc, iret_insn, sizeof (iret_insn)) == 0);
1816 1792
1817 1793 #endif /* __xpv */
1818 1794 }
1819 1795
1820 1796 #if defined(__i386)
1821 1797
1822 1798 /*
1823 1799 * Test to see if the instruction is part of __SEGREGS_POP
1824 1800 *
1825 1801 * Note carefully the appallingly awful dependency between
1826 1802 * the instruction sequence used in __SEGREGS_POP and these
1827 1803 * instructions encoded here.
1828 1804 */
1829 1805 static int
1830 1806 instr_is_segregs_pop(caddr_t pc)
1831 1807 {
1832 1808 static const uint8_t movw_0_esp_gs[4] = { 0x8e, 0x6c, 0x24, 0x0 };
1833 1809 static const uint8_t movw_4_esp_fs[4] = { 0x8e, 0x64, 0x24, 0x4 };
1834 1810 static const uint8_t movw_8_esp_es[4] = { 0x8e, 0x44, 0x24, 0x8 };
1835 1811 static const uint8_t movw_c_esp_ds[4] = { 0x8e, 0x5c, 0x24, 0xc };
1836 1812
1837 1813 if (bcmp(pc, movw_0_esp_gs, sizeof (movw_0_esp_gs)) == 0 ||
1838 1814 bcmp(pc, movw_4_esp_fs, sizeof (movw_4_esp_fs)) == 0 ||
1839 1815 bcmp(pc, movw_8_esp_es, sizeof (movw_8_esp_es)) == 0 ||
1840 1816 bcmp(pc, movw_c_esp_ds, sizeof (movw_c_esp_ds)) == 0)
1841 1817 return (1);
1842 1818
1843 1819 return (0);
1844 1820 }
1845 1821
1846 1822 #endif /* __i386 */
1847 1823
1848 1824 /*
1849 - * Test to see if the instruction is part of _sys_rtt.
1825 + * Test to see if the instruction is part of _sys_rtt (or the KPTI trampolines
1826 + * which are used by _sys_rtt).
1850 1827 *
1851 1828 * Again on the hypervisor if we try to IRET to user land with a bad code
1852 1829 * or stack selector we will get vectored through xen_failsafe_callback.
1853 1830 * In which case we assume we got here via _sys_rtt since we only allow
1854 1831 * IRET to user land to take place in _sys_rtt.
1855 1832 */
1856 1833 static int
1857 1834 instr_is_sys_rtt(caddr_t pc)
1858 1835 {
1859 1836 extern void _sys_rtt(), _sys_rtt_end();
1860 1837
1838 +#if !defined(__xpv)
1839 + extern void tr_sysc_ret_start(), tr_sysc_ret_end();
1840 + extern void tr_intr_ret_start(), tr_intr_ret_end();
1841 +
1842 + if ((uintptr_t)pc >= (uintptr_t)tr_sysc_ret_start &&
1843 + (uintptr_t)pc <= (uintptr_t)tr_sysc_ret_end)
1844 + return (1);
1845 +
1846 + if ((uintptr_t)pc >= (uintptr_t)tr_intr_ret_start &&
1847 + (uintptr_t)pc <= (uintptr_t)tr_intr_ret_end)
1848 + return (1);
1849 +#endif
1850 +
1861 1851 if ((uintptr_t)pc < (uintptr_t)_sys_rtt ||
1862 1852 (uintptr_t)pc > (uintptr_t)_sys_rtt_end)
1863 1853 return (0);
1864 1854
1865 1855 return (1);
1866 1856 }
1867 1857
1868 1858 /*
1869 1859 * Handle #gp faults in kernel mode.
1870 1860 *
1871 1861 * One legitimate way this can happen is if we attempt to update segment
1872 1862 * registers to naughty values on the way out of the kernel.
1873 1863 *
1874 1864 * This can happen in a couple of ways: someone - either accidentally or
1875 1865 * on purpose - creates (setcontext(2), lwp_create(2)) or modifies
1876 1866 * (signal(2)) a ucontext that contains silly segment register values.
1877 1867 * Or someone - either accidentally or on purpose - modifies the prgregset_t
1878 1868 * of a subject process via /proc to contain silly segment register values.
1879 1869 *
1880 1870 * (The unfortunate part is that we can end up discovering the bad segment
1881 1871 * register value in the middle of an 'iret' after we've popped most of the
1882 1872 * stack. So it becomes quite difficult to associate an accurate ucontext
1883 1873 * with the lwp, because the act of taking the #gp trap overwrites most of
1884 1874 * what we were going to send the lwp.)
1885 1875 *
1886 1876 * OTOH if it turns out that's -not- the problem, and we're -not- an lwp
1887 1877 * trying to return to user mode and we get a #gp fault, then we need
1888 1878 * to die() -- which will happen if we return non-zero from this routine.
1889 1879 */
1890 1880 static int
1891 1881 kern_gpfault(struct regs *rp)
1892 1882 {
1893 1883 kthread_t *t = curthread;
1894 1884 proc_t *p = ttoproc(t);
1895 1885 klwp_t *lwp = ttolwp(t);
1896 1886 struct regs tmpregs, *trp = NULL;
1897 1887 caddr_t pc = (caddr_t)rp->r_pc;
1898 1888 int v;
1899 1889 uint32_t auditing = AU_AUDITING();
1900 1890
1901 1891 /*
1902 1892 * if we're not an lwp, or in the case of running native the
1903 1893 * pc range is outside _sys_rtt, then we should immediately
1904 1894 * be die()ing horribly.
1905 1895 */
1906 1896 if (lwp == NULL || !instr_is_sys_rtt(pc))
1907 1897 return (1);
1908 1898
1909 1899 /*
1910 1900 * So at least we're in the right part of the kernel.
1911 1901 *
1912 1902 * Disassemble the instruction at the faulting pc.
1913 1903 * Once we know what it is, we carefully reconstruct the stack
1914 1904 * based on the order in which the stack is deconstructed in
1915 1905 * _sys_rtt. Ew.
1916 1906 */
1917 1907 if (instr_is_iret(pc)) {
1918 1908 /*
1919 1909 * We took the #gp while trying to perform the IRET.
1920 1910 * This means that either %cs or %ss are bad.
1921 1911 * All we know for sure is that most of the general
1922 1912 * registers have been restored, including the
1923 1913 * segment registers, and all we have left on the
1924 1914 * topmost part of the lwp's stack are the
1925 1915 * registers that the iretq was unable to consume.
1926 1916 *
1927 1917 * All the rest of the state was crushed by the #gp
1928 1918 * which pushed -its- registers atop our old save area
1929 1919 * (because we had to decrement the stack pointer, sigh) so
1930 1920 * all that we can try and do is to reconstruct the
1931 1921 * crushed frame from the #gp trap frame itself.
1932 1922 */
1933 1923 trp = &tmpregs;
1934 1924 trp->r_ss = lwptoregs(lwp)->r_ss;
1935 1925 trp->r_sp = lwptoregs(lwp)->r_sp;
1936 1926 trp->r_ps = lwptoregs(lwp)->r_ps;
1937 1927 trp->r_cs = lwptoregs(lwp)->r_cs;
1938 1928 trp->r_pc = lwptoregs(lwp)->r_pc;
1939 1929 bcopy(rp, trp, offsetof(struct regs, r_pc));
1940 1930
1941 1931 /*
1942 1932 * Validate simple math
1943 1933 */
1944 1934 ASSERT(trp->r_pc == lwptoregs(lwp)->r_pc);
1945 1935 ASSERT(trp->r_err == rp->r_err);
1946 1936
1947 1937
1948 1938
1949 1939 }
1950 1940
1951 1941 #if defined(__amd64)
1952 1942 if (trp == NULL && lwp->lwp_pcb.pcb_rupdate != 0) {
1953 1943
1954 1944 /*
1955 1945 * This is the common case -- we're trying to load
1956 1946 * a bad segment register value in the only section
1957 1947 * of kernel code that ever loads segment registers.
1958 1948 *
1959 1949 * We don't need to do anything at this point because
1960 1950 * the pcb contains all the pending segment register
1961 1951 * state, and the regs are still intact because we
1962 1952 * didn't adjust the stack pointer yet. Given the fidelity
1963 1953 * of all this, we could conceivably send a signal
1964 1954 * to the lwp, rather than core-ing.
1965 1955 */
1966 1956 trp = lwptoregs(lwp);
1967 1957 ASSERT((caddr_t)trp == (caddr_t)rp->r_sp);
1968 1958 }
1969 1959
1970 1960 #elif defined(__i386)
1971 1961
1972 1962 if (trp == NULL && instr_is_segregs_pop(pc))
1973 1963 trp = lwptoregs(lwp);
1974 1964
1975 1965 #endif /* __i386 */
1976 1966
1977 1967 if (trp == NULL)
1978 1968 return (1);
1979 1969
1980 1970 /*
1981 1971 * If we get to here, we're reasonably confident that we've
1982 1972 * correctly decoded what happened on the way out of the kernel.
1983 1973 * Rewrite the lwp's registers so that we can create a core dump
1984 1974 * that (at least vaguely) represents the mcontext we were
1985 1975 * being asked to restore when things went so terribly wrong.
1986 1976 */
1987 1977
1988 1978 /*
1989 1979 * Make sure that we have a meaningful %trapno and %err.
1990 1980 */
1991 1981 trp->r_trapno = rp->r_trapno;
1992 1982 trp->r_err = rp->r_err;
1993 1983
1994 1984 if ((caddr_t)trp != (caddr_t)lwptoregs(lwp))
1995 1985 bcopy(trp, lwptoregs(lwp), sizeof (*trp));
1996 1986
1997 1987
1998 1988 mutex_enter(&p->p_lock);
1999 1989 lwp->lwp_cursig = SIGSEGV;
2000 1990 mutex_exit(&p->p_lock);
2001 1991
2002 1992 /*
2003 1993 * Terminate all LWPs but don't discard them. If another lwp beat
2004 1994 * us to the punch by calling exit(), evaporate now.
2005 1995 */
2006 1996 proc_is_exiting(p);
2007 1997 if (exitlwps(1) != 0) {
2008 1998 mutex_enter(&p->p_lock);
2009 1999 lwp_exit();
2010 2000 }
2011 2001
2012 2002 if (auditing) /* audit core dump */
2013 2003 audit_core_start(SIGSEGV);
2014 2004 v = core(SIGSEGV, B_FALSE);
2015 2005 if (auditing) /* audit core dump */
2016 2006 audit_core_finish(v ? CLD_KILLED : CLD_DUMPED);
2017 2007 exit(v ? CLD_KILLED : CLD_DUMPED, SIGSEGV);
2018 2008 return (0);
2019 2009 }
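[Editor's note, not part of the reviewed change.] For context on the scenario the block comment above kern_gpfault() describes, here is a hedged user-level sketch of a program that hands the kernel a "silly" segment selector via setcontext(2). The bad %cs survives into the lwp's saved registers and is only discovered during the kernel's iret back to user mode, which is exactly the path that lands in kern_gpfault() and ends in the SIGSEGV core dump seen in the code. REG_CS is the amd64 gregset index name; the 0x48 selector is an arbitrary bogus value chosen for illustration.

#include <ucontext.h>
#include <sys/regset.h>

int
main(void)
{
	ucontext_t uc;

	(void) getcontext(&uc);
	/* Poison %cs with an arbitrary bogus selector. */
	uc.uc_mcontext.gregs[REG_CS] = 0x48;
	(void) setcontext(&uc);		/* kernel #gp on iret; process core dumps */
	return (0);
}

Which of the two recovery branches in kern_gpfault() applies (the iret reconstruction or the pcb_rupdate case) depends on which segment register is poisoned and where the fault is actually taken.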
2020 2010
2021 2011 /*
2022 2012 * dump_tss() - Display the TSS structure
2023 2013 */
2024 2014
2025 2015 #if !defined(__xpv)
2026 2016 #if defined(__amd64)
2027 2017
2028 2018 static void
2029 2019 dump_tss(void)
2030 2020 {
2031 2021 const char tss_fmt[] = "tss.%s:\t0x%p\n"; /* Format string */
2032 2022 tss_t *tss = CPU->cpu_tss;
2033 2023
2034 2024 printf(tss_fmt, "tss_rsp0", (void *)tss->tss_rsp0);
2035 2025 printf(tss_fmt, "tss_rsp1", (void *)tss->tss_rsp1);
2036 2026 printf(tss_fmt, "tss_rsp2", (void *)tss->tss_rsp2);
2037 2027
2038 2028 printf(tss_fmt, "tss_ist1", (void *)tss->tss_ist1);
2039 2029 printf(tss_fmt, "tss_ist2", (void *)tss->tss_ist2);
2040 2030 printf(tss_fmt, "tss_ist3", (void *)tss->tss_ist3);
2041 2031 printf(tss_fmt, "tss_ist4", (void *)tss->tss_ist4);
2042 2032 printf(tss_fmt, "tss_ist5", (void *)tss->tss_ist5);
2043 2033 printf(tss_fmt, "tss_ist6", (void *)tss->tss_ist6);
2044 2034 printf(tss_fmt, "tss_ist7", (void *)tss->tss_ist7);
2045 2035 }
2046 2036
2047 2037 #elif defined(__i386)
2048 2038
2049 2039 static void
2050 2040 dump_tss(void)
2051 2041 {
2052 2042 const char tss_fmt[] = "tss.%s:\t0x%p\n"; /* Format string */
2053 2043 tss_t *tss = CPU->cpu_tss;
2054 2044
2055 2045 printf(tss_fmt, "tss_link", (void *)(uintptr_t)tss->tss_link);
2056 2046 printf(tss_fmt, "tss_esp0", (void *)(uintptr_t)tss->tss_esp0);
2057 2047 printf(tss_fmt, "tss_ss0", (void *)(uintptr_t)tss->tss_ss0);
2058 2048 printf(tss_fmt, "tss_esp1", (void *)(uintptr_t)tss->tss_esp1);
2059 2049 printf(tss_fmt, "tss_ss1", (void *)(uintptr_t)tss->tss_ss1);
2060 2050 printf(tss_fmt, "tss_esp2", (void *)(uintptr_t)tss->tss_esp2);
2061 2051 printf(tss_fmt, "tss_ss2", (void *)(uintptr_t)tss->tss_ss2);
2062 2052 printf(tss_fmt, "tss_cr3", (void *)(uintptr_t)tss->tss_cr3);
2063 2053 printf(tss_fmt, "tss_eip", (void *)(uintptr_t)tss->tss_eip);
2064 2054 printf(tss_fmt, "tss_eflags", (void *)(uintptr_t)tss->tss_eflags);
2065 2055 printf(tss_fmt, "tss_eax", (void *)(uintptr_t)tss->tss_eax);
2066 2056 printf(tss_fmt, "tss_ebx", (void *)(uintptr_t)tss->tss_ebx);
2067 2057 printf(tss_fmt, "tss_ecx", (void *)(uintptr_t)tss->tss_ecx);
2068 2058 printf(tss_fmt, "tss_edx", (void *)(uintptr_t)tss->tss_edx);
2069 2059 printf(tss_fmt, "tss_esp", (void *)(uintptr_t)tss->tss_esp);
2070 2060 }
2071 2061
2072 2062 #endif /* __amd64 */
2073 2063 #endif /* !__xpv */
2074 2064
2075 2065 #if defined(TRAPTRACE)
2076 2066
2077 2067 int ttrace_nrec = 10; /* number of records to dump out */
2078 2068 int ttrace_dump_nregs = 0; /* dump out this many records with regs too */
2079 2069
2080 2070 /*
2081 2071 * Dump out the last ttrace_nrec traptrace records on each CPU
2082 2072 */
2083 2073 static void
2084 2074 dump_ttrace(void)
2085 2075 {
2086 2076 trap_trace_ctl_t *ttc;
2087 2077 trap_trace_rec_t *rec;
2088 2078 uintptr_t current;
2089 2079 int i, j, k;
2090 2080 int n = NCPU;
2091 2081 #if defined(__amd64)
2092 2082 const char banner[] =
2093 2083 "CPU ADDRESS TIMESTAMP TYPE VC HANDLER PC\n";
2094 2084 /* Define format for the CPU, ADDRESS, and TIMESTAMP fields */
2095 2085 const char fmt1[] = "%3d %016lx %12llx";
2096 2086 char data1[34]; /* length of string formatted by fmt1 + 1 */
2097 2087 #elif defined(__i386)
2098 2088 const char banner[] =
2099 2089 "CPU ADDRESS TIMESTAMP TYPE VC HANDLER PC\n";
2100 2090 /* Define format for the CPU, ADDRESS, and TIMESTAMP fields */
2101 2091 const char fmt1[] = "%3d %08lx %12llx";
2102 2092 char data1[26]; /* length of string formatted by fmt1 + 1 */
2103 2093 #endif
2104 2094 /* Define format for the TYPE and VC fields */
2105 2095 const char fmt2[] = "%4s %3x";
2106 2096 char data2[9]; /* length of string formatted by fmt2 + 1 */
2107 2097 /*
2108 2098 * Define format for the HANDLER field. Width is arbitrary, but should
2109 2099 * be enough for common handlers' names, and leave enough space for
2110 2100 * the PC field, especially when we are in kmdb.
2111 2101 */
2112 2102 const char fmt3h[] = "#%-15s";
2113 2103 const char fmt3p[] = "%-16p";
2114 2104 const char fmt3s[] = "%-16s";
2115 2105 char data3[17]; /* length of string formatted by fmt3* + 1 */
2116 2106
2117 2107 if (ttrace_nrec == 0)
2118 2108 return;
2119 2109
2120 2110 printf("\n");
2121 2111 printf(banner);
2122 2112
2123 2113 for (i = 0; i < n; i++) {
2124 2114 ttc = &trap_trace_ctl[i];
2125 2115 if (ttc->ttc_first == NULL)
2126 2116 continue;
2127 2117
2128 2118 current = ttc->ttc_next - sizeof (trap_trace_rec_t);
2129 2119 for (j = 0; j < ttrace_nrec; j++) {
2130 2120 struct sysent *sys;
2131 2121 struct autovec *vec;
2132 2122 extern struct av_head autovect[];
2133 2123 int type;
2134 2124 ulong_t off;
2135 2125 char *sym, *stype;
2136 2126
2137 2127 if (current < ttc->ttc_first)
2138 2128 current =
2139 2129 ttc->ttc_limit - sizeof (trap_trace_rec_t);
2140 2130
2141 2131 if (current == NULL)
2142 2132 continue;
2143 2133
2144 2134 rec = (trap_trace_rec_t *)current;
2145 2135
2146 2136 if (rec->ttr_stamp == 0)
2147 2137 break;
2148 2138
2149 2139 (void) snprintf(data1, sizeof (data1), fmt1, i,
2150 2140 (uintptr_t)rec, rec->ttr_stamp);
2151 2141
2152 2142 switch (rec->ttr_marker) {
2153 2143 case TT_SYSCALL:
2154 2144 case TT_SYSENTER:
2155 2145 case TT_SYSC:
2156 2146 case TT_SYSC64:
2157 2147 #if defined(__amd64)
2158 2148 sys = &sysent32[rec->ttr_sysnum];
2159 2149 switch (rec->ttr_marker) {
2160 2150 case TT_SYSC64:
2161 2151 sys = &sysent[rec->ttr_sysnum];
2162 2152 /*FALLTHROUGH*/
2163 2153 #elif defined(__i386)
2164 2154 sys = &sysent[rec->ttr_sysnum];
2165 2155 switch (rec->ttr_marker) {
2166 2156 case TT_SYSC64:
2167 2157 #endif
2168 2158 case TT_SYSC:
2169 2159 stype = "sysc"; /* syscall */
2170 2160 break;
2171 2161 case TT_SYSCALL:
2172 2162 stype = "lcal"; /* lcall */
2173 2163 break;
2174 2164 case TT_SYSENTER:
2175 2165 stype = "syse"; /* sysenter */
2176 2166 break;
2177 2167 default:
2178 2168 break;
2179 2169 }
2180 2170 (void) snprintf(data2, sizeof (data2), fmt2,
2181 2171 stype, rec->ttr_sysnum);
2182 2172 if (sys != NULL) {
2183 2173 sym = kobj_getsymname(
2184 2174 (uintptr_t)sys->sy_callc,
2185 2175 &off);
2186 2176 if (sym != NULL) {
2187 2177 (void) snprintf(data3,
2188 2178 sizeof (data3), fmt3s, sym);
2189 2179 } else {
2190 2180 (void) snprintf(data3,
2191 2181 sizeof (data3), fmt3p,
2192 2182 sys->sy_callc);
2193 2183 }
2194 2184 } else {
2195 2185 (void) snprintf(data3, sizeof (data3),
2196 2186 fmt3s, "unknown");
2197 2187 }
2198 2188 break;
2199 2189
2200 2190 case TT_INTERRUPT:
2201 2191 (void) snprintf(data2, sizeof (data2), fmt2,
2202 2192 "intr", rec->ttr_vector);
2203 2193 if (get_intr_handler != NULL)
2204 2194 vec = (struct autovec *)
2205 2195 (*get_intr_handler)
2206 2196 (rec->ttr_cpuid, rec->ttr_vector);
2207 2197 else
2208 2198 vec =
2209 2199 autovect[rec->ttr_vector].avh_link;
2210 2200
2211 2201 if (vec != NULL) {
2212 2202 sym = kobj_getsymname(
2213 2203 (uintptr_t)vec->av_vector, &off);
2214 2204 if (sym != NULL) {
2215 2205 (void) snprintf(data3,
2216 2206 sizeof (data3), fmt3s, sym);
2217 2207 } else {
2218 2208 (void) snprintf(data3,
2219 2209 sizeof (data3), fmt3p,
2220 2210 vec->av_vector);
2221 2211 }
2222 2212 } else {
2223 2213 (void) snprintf(data3, sizeof (data3),
2224 2214 fmt3s, "unknown");
2225 2215 }
2226 2216 break;
2227 2217
2228 2218 case TT_TRAP:
2229 2219 case TT_EVENT:
2230 2220 type = rec->ttr_regs.r_trapno;
2231 2221 (void) snprintf(data2, sizeof (data2), fmt2,
2232 2222 "trap", type);
2233 2223 if (type < TRAP_TYPES) {
2234 2224 (void) snprintf(data3, sizeof (data3),
2235 2225 fmt3h, trap_type_mnemonic[type]);
2236 2226 } else {
2237 2227 switch (type) {
2238 2228 case T_AST:
2239 2229 (void) snprintf(data3,
2240 2230 sizeof (data3), fmt3s,
2241 2231 "ast");
2242 2232 break;
2243 2233 default:
2244 2234 (void) snprintf(data3,
2245 2235 sizeof (data3), fmt3s, "");
2246 2236 break;
2247 2237 }
2248 2238 }
2249 2239 break;
2250 2240
2251 2241 default:
2252 2242 break;
2253 2243 }
2254 2244
2255 2245 sym = kobj_getsymname(rec->ttr_regs.r_pc, &off);
2256 2246 if (sym != NULL) {
2257 2247 printf("%s %s %s %s+%lx\n", data1, data2, data3,
2258 2248 sym, off);
2259 2249 } else {
2260 2250 printf("%s %s %s %lx\n", data1, data2, data3,
2261 2251 rec->ttr_regs.r_pc);
2262 2252 }
2263 2253
2264 2254 if (ttrace_dump_nregs-- > 0) {
2265 2255 int s;
2266 2256
2267 2257 if (rec->ttr_marker == TT_INTERRUPT)
2268 2258 printf(
2269 2259 "\t\tipl %x spl %x pri %x\n",
2270 2260 rec->ttr_ipl,
2271 2261 rec->ttr_spl,
2272 2262 rec->ttr_pri);
2273 2263
2274 2264 dumpregs(&rec->ttr_regs);
2275 2265
2276 2266 printf("\t%3s: %p\n\n", " ct",
2277 2267 (void *)rec->ttr_curthread);
2278 2268
2279 2269 /*
2280 2270 * print out the pc stack that we recorded
2281 2271 * at trap time (if any)
2282 2272 */
2283 2273 for (s = 0; s < rec->ttr_sdepth; s++) {
2284 2274 uintptr_t fullpc;
2285 2275
2286 2276 if (s >= TTR_STACK_DEPTH) {
2287 2277 printf("ttr_sdepth corrupt\n");
2288 2278 break;
2289 2279 }
2290 2280
2291 2281 fullpc = (uintptr_t)rec->ttr_stack[s];
2292 2282
2293 2283 sym = kobj_getsymname(fullpc, &off);
2294 2284 if (sym != NULL)
2295 2285 printf("-> %s+0x%lx()\n",
2296 2286 sym, off);
2297 2287 else
2298 2288 printf("-> 0x%lx()\n", fullpc);
2299 2289 }
2300 2290 printf("\n");
2301 2291 }
2302 2292 current -= sizeof (trap_trace_rec_t);
2303 2293 }
2304 2294 }
2305 2295 }
2306 2296
2307 2297 #endif /* TRAPTRACE */
2308 2298
2309 2299 void
2310 2300 panic_showtrap(struct panic_trap_info *tip)
2311 2301 {
2312 2302 showregs(tip->trap_type, tip->trap_regs, tip->trap_addr);
2313 2303
2314 2304 #if defined(TRAPTRACE)
2315 2305 dump_ttrace();
2316 2306 #endif
2317 2307
2318 2308 #if !defined(__xpv)
2319 2309 if (tip->trap_type == T_DBLFLT)
2320 2310 dump_tss();
2321 2311 #endif
2322 2312 }
2323 2313
2324 2314 void
2325 2315 panic_savetrap(panic_data_t *pdp, struct panic_trap_info *tip)
2326 2316 {
2327 2317 panic_saveregs(pdp, tip->trap_regs);
2328 2318 }