9723 provide support for VMM's GDT handling
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
--- old/usr/src/uts/intel/ia32/os/desctbls.c
+++ new/usr/src/uts/intel/ia32/os/desctbls.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 */
25 25
26 26 /*
27 27 * Copyright 2018 Joyent, Inc. All rights reserved.
28 28 */
29 29
30 30 /*
31 31 * Copyright (c) 1992 Terrence R. Lambert.
32 32 * Copyright (c) 1990 The Regents of the University of California.
33 33 * All rights reserved.
34 34 *
35 35 * This code is derived from software contributed to Berkeley by
36 36 * William Jolitz.
37 37 *
38 38 * Redistribution and use in source and binary forms, with or without
39 39 * modification, are permitted provided that the following conditions
40 40 * are met:
41 41 * 1. Redistributions of source code must retain the above copyright
42 42 * notice, this list of conditions and the following disclaimer.
43 43 * 2. Redistributions in binary form must reproduce the above copyright
44 44 * notice, this list of conditions and the following disclaimer in the
45 45 * documentation and/or other materials provided with the distribution.
46 46 * 3. All advertising materials mentioning features or use of this software
47 47 * must display the following acknowledgement:
48 48 * This product includes software developed by the University of
49 49 * California, Berkeley and its contributors.
50 50 * 4. Neither the name of the University nor the names of its contributors
51 51 * may be used to endorse or promote products derived from this software
52 52 * without specific prior written permission.
53 53 *
54 54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 64 * SUCH DAMAGE.
65 65 *
66 66 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
67 67 */
68 68
69 69 #include <sys/types.h>
70 70 #include <sys/sysmacros.h>
71 71 #include <sys/tss.h>
72 72 #include <sys/segments.h>
73 73 #include <sys/trap.h>
74 74 #include <sys/cpuvar.h>
75 75 #include <sys/bootconf.h>
76 76 #include <sys/x86_archext.h>
77 77 #include <sys/controlregs.h>
78 78 #include <sys/archsystm.h>
79 79 #include <sys/machsystm.h>
80 80 #include <sys/kobj.h>
81 81 #include <sys/cmn_err.h>
82 82 #include <sys/reboot.h>
83 83 #include <sys/kdi.h>
84 84 #include <sys/mach_mmu.h>
85 85 #include <sys/systm.h>
86 86 #include <sys/note.h>
87 87
88 88 #ifdef __xpv
89 89 #include <sys/hypervisor.h>
90 90 #include <vm/as.h>
91 91 #endif
92 92
93 93 #include <sys/promif.h>
94 94 #include <sys/bootinfo.h>
95 95 #include <vm/kboot_mmu.h>
96 96 #include <vm/hat_pte.h>
97 97
98 98 /*
99 99 * cpu0 and default tables and structures.
100 100 */
101 101 user_desc_t *gdt0;
102 102 #if !defined(__xpv)
103 103 desctbr_t gdt0_default_r;
104 104 #endif
105 105
106 106 gate_desc_t *idt0; /* interrupt descriptor table */
107 107 #if defined(__i386)
108 108 desctbr_t idt0_default_r; /* describes idt0 in IDTR format */
109 109 #endif
110 110
111 111 tss_t *ktss0; /* kernel task state structure */
112 112
113 113 #if defined(__i386)
114 114 tss_t *dftss0; /* #DF double-fault exception */
115 115 #endif /* __i386 */
116 116
117 117 user_desc_t zero_udesc; /* base zero user desc native procs */
118 118 user_desc_t null_udesc; /* null user descriptor */
119 119 system_desc_t null_sdesc; /* null system descriptor */
120 120
121 121 #if defined(__amd64)
122 122 user_desc_t zero_u32desc; /* 32-bit compatibility procs */
123 123 #endif /* __amd64 */
124 124
125 125 #if defined(__amd64)
126 126 user_desc_t ucs_on;
127 127 user_desc_t ucs_off;
128 128 user_desc_t ucs32_on;
129 129 user_desc_t ucs32_off;
130 130 #endif /* __amd64 */
131 131
132 132 /*
133 133 * If the size of this is changed, you must update hat_pcp_setup() and the
134 134 * definitions in exception.s
135 135 */
136 136 extern char dblfault_stack0[DEFAULTSTKSZ];
137 137 extern char nmi_stack0[DEFAULTSTKSZ];
138 138 extern char mce_stack0[DEFAULTSTKSZ];
139 139
140 140 extern void fast_null(void);
141 141 extern hrtime_t get_hrtime(void);
142 142 extern hrtime_t gethrvtime(void);
143 143 extern hrtime_t get_hrestime(void);
144 144 extern uint64_t getlgrp(void);
145 145
146 146 void (*(fasttable[]))(void) = {
147 147 fast_null, /* T_FNULL routine */
148 148 fast_null, /* T_FGETFP routine (initially null) */
149 149 fast_null, /* T_FSETFP routine (initially null) */
150 150 (void (*)())get_hrtime, /* T_GETHRTIME */
151 151 (void (*)())gethrvtime, /* T_GETHRVTIME */
152 152 (void (*)())get_hrestime, /* T_GETHRESTIME */
153 153 (void (*)())getlgrp /* T_GETLGRP */
154 154 };
155 155
156 156 /*
157 157 * Structure containing pre-computed descriptors to allow us to temporarily
158 158 * interpose on a standard handler.
159 159 */
160 160 struct interposing_handler {
161 161 int ih_inum;
162 162 gate_desc_t ih_interp_desc;
163 163 gate_desc_t ih_default_desc;
164 164 };
165 165
166 166 /*
167 167 * The brand infrastructure interposes on two handlers, and we use one as a
168 168 * NULL signpost.
169 169 */
170 170 static struct interposing_handler brand_tbl[2];
171 171
172 172 /*
173 173 * software prototypes for default local descriptor table
174 174 */
175 175
176 176 /*
177 177 * Routines for loading segment descriptors in format the hardware
178 178 * can understand.
179 179 */
180 180
181 181 /*
182 182 * In long mode we have the new L or long mode attribute bit
183 183 * for code segments. Only the conforming bit in type is used along
184 184 * with descriptor priority and present bits. Default operand size must
185 185 * be zero when in long mode. In 32-bit compatibility mode all fields
186 186 * are treated as in legacy mode. For data segments while in long mode
187 187 * only the present bit is loaded.
188 188 */
189 189 void
190 190 set_usegd(user_desc_t *dp, uint_t lmode, void *base, size_t size,
191 191 uint_t type, uint_t dpl, uint_t gran, uint_t defopsz)
192 192 {
193 193 ASSERT(lmode == SDP_SHORT || lmode == SDP_LONG);
194 194 /* This should never be a "system" segment. */
195 195 ASSERT3U(type & SDT_S, !=, 0);
196 196
197 197 /*
198 198 * 64-bit long mode.
199 199 */
200 200 if (lmode == SDP_LONG)
201 201 dp->usd_def32 = 0; /* 32-bit operands only */
202 202 else
203 203 /*
204 204 * 32-bit compatibility mode.
205 205 */
206 206 dp->usd_def32 = defopsz; /* 0 = 16, 1 = 32-bit ops */
207 207
208 208 /*
209 209 * We should always set the "accessed" bit (SDT_A), otherwise the CPU
210 210 * will write to the GDT whenever we change segment registers around.
211 211 * With KPTI on, the GDT is read-only in the user page table, which
212 212 * causes crashes if we don't set this.
213 213 */
214 214 ASSERT3U(type & SDT_A, !=, 0);
215 215
216 216 dp->usd_long = lmode; /* 64-bit mode */
217 217 dp->usd_type = type;
218 218 dp->usd_dpl = dpl;
219 219 dp->usd_p = 1;
220 220 dp->usd_gran = gran; /* 0 = bytes, 1 = pages */
221 221
222 222 dp->usd_lobase = (uintptr_t)base;
223 223 dp->usd_midbase = (uintptr_t)base >> 16;
224 224 dp->usd_hibase = (uintptr_t)base >> (16 + 8);
225 225 dp->usd_lolimit = size;
226 226 dp->usd_hilimit = (uintptr_t)size >> 16;
227 227 }
228 228
229 229 /*
230 230 * Install system segment descriptor for LDT and TSS segments.
231 231 */
232 232
233 233 void
234 234 set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type,
235 235 uint_t dpl)
236 236 {
237 237 dp->ssd_lolimit = size;
238 238 dp->ssd_hilimit = (uintptr_t)size >> 16;
239 239
240 240 dp->ssd_lobase = (uintptr_t)base;
241 241 dp->ssd_midbase = (uintptr_t)base >> 16;
242 242 dp->ssd_hibase = (uintptr_t)base >> (16 + 8);
243 243 dp->ssd_hi64base = (uintptr_t)base >> (16 + 8 + 8);
244 244
245 245 dp->ssd_type = type;
246 246 dp->ssd_zero1 = 0; /* must be zero */
247 247 dp->ssd_zero2 = 0;
248 248 dp->ssd_dpl = dpl;
249 249 dp->ssd_p = 1;
250 250 dp->ssd_gran = 0; /* force byte units */
251 251 }
252 252
253 253 void *
254 254 get_ssd_base(system_desc_t *dp)
255 255 {
256 256 uintptr_t base;
257 257
258 258 base = (uintptr_t)dp->ssd_lobase |
259 259 (uintptr_t)dp->ssd_midbase << 16 |
260 260 (uintptr_t)dp->ssd_hibase << (16 + 8) |
261 261 (uintptr_t)dp->ssd_hi64base << (16 + 8 + 8);
262 262 return ((void *)base);
263 263 }
264 264
265 265 /*
266 266 * Install gate segment descriptor for interrupt, trap, call and task gates.
267 267 *
268 268 * For 64 bit native if we have KPTI enabled, we use the IST stack mechanism on
269 269 * all interrupts. We have different ISTs for each class of exceptions that are
270 270 * most likely to occur while handling an existing exception; while many of
271 271 * these are just going to panic, it's nice not to trample on the existing
272 272 * exception state for debugging purposes.
273 273 *
274 274 * Normal interrupts are all redirected unconditionally to the KPTI trampoline
275 275 * stack space. This unifies the trampoline handling between user and kernel
276 276 * space (and avoids the need to touch %gs).
277 277 *
278 278 * The KDI IDT *all* uses the DBG IST: consider single stepping tr_pftrap, when
279 279 * we do a read from KMDB that causes another #PF. Without its own IST, this
280 280 * would stomp on the kernel's mcpu_kpti_flt frame.
281 281 */
282 282 uint_t
283 283 idt_vector_to_ist(uint_t vector)
284 284 {
285 285 #if defined(__xpv)
286 286 _NOTE(ARGUNUSED(vector));
287 287 return (IST_NONE);
288 288 #else
289 289 switch (vector) {
290 290 /* These should always use IST even without KPTI enabled. */
291 291 case T_DBLFLT:
292 292 return (IST_DF);
293 293 case T_NMIFLT:
294 294 return (IST_NMI);
295 295 case T_MCE:
296 296 return (IST_MCE);
297 297
298 298 case T_BPTFLT:
299 299 case T_SGLSTP:
300 300 if (kpti_enable == 1) {
301 301 return (IST_DBG);
302 302 }
303 303 return (IST_NONE);
304 304 case T_STKFLT:
305 305 case T_GPFLT:
306 306 case T_PGFLT:
307 307 if (kpti_enable == 1) {
308 308 return (IST_NESTABLE);
309 309 }
310 310 return (IST_NONE);
311 311 default:
312 312 if (kpti_enable == 1) {
313 313 return (IST_DEFAULT);
314 314 }
315 315 return (IST_NONE);
316 316 }
317 317 #endif
318 318 }
319 319
320 320 void
321 321 set_gatesegd(gate_desc_t *dp, void (*func)(void), selector_t sel,
322 322 uint_t type, uint_t dpl, uint_t ist)
323 323 {
324 324 dp->sgd_looffset = (uintptr_t)func;
325 325 dp->sgd_hioffset = (uintptr_t)func >> 16;
326 326 dp->sgd_hi64offset = (uintptr_t)func >> (16 + 16);
327 327 dp->sgd_selector = (uint16_t)sel;
328 328 dp->sgd_ist = ist;
329 329 dp->sgd_type = type;
330 330 dp->sgd_dpl = dpl;
331 331 dp->sgd_p = 1;
332 332 }
333 333
334 334 /*
335 335 * Updates a single user descriptor in the GDT of the current cpu.
336 336 * Caller is responsible for preventing cpu migration.
337 337 */
338 338
339 339 void
340 340 gdt_update_usegd(uint_t sidx, user_desc_t *udp)
341 341 {
342 342 #if defined(DEBUG)
343 343 /* This should never be a "system" segment, but it might be null. */
344 344 if (udp->usd_p != 0 || udp->usd_type != 0) {
345 345 ASSERT3U(udp->usd_type & SDT_S, !=, 0);
346 346 }
347 347 /*
348 348 * We should always set the "accessed" bit (SDT_A), otherwise the CPU
349 349 * will write to the GDT whenever we change segment registers around.
350 350 * With KPTI on, the GDT is read-only in the user page table, which
351 351 * causes crashes if we don't set this.
352 352 */
353 353 if (udp->usd_p != 0 || udp->usd_type != 0) {
354 354 ASSERT3U(udp->usd_type & SDT_A, !=, 0);
355 355 }
356 356 #endif
357 357
358 358 #if defined(__xpv)
359 359 uint64_t dpa = CPU->cpu_m.mcpu_gdtpa + sizeof (*udp) * sidx;
360 360
361 361 if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp))
362 362 panic("gdt_update_usegd: HYPERVISOR_update_descriptor");
363 363
364 364 #else /* __xpv */
365 365 CPU->cpu_gdt[sidx] = *udp;
366 366 #endif /* __xpv */
367 367 }
368 368
369 369 /*
370 370 * Writes the single descriptor pointed to by udp into a process's
371 371 * LDT entry pointed to by ldp.
372 372 */
373 373 int
374 374 ldt_update_segd(user_desc_t *ldp, user_desc_t *udp)
375 375 {
376 376 #if defined(DEBUG)
377 377 /* This should never be a "system" segment, but it might be null. */
378 378 if (udp->usd_p != 0 || udp->usd_type != 0) {
379 379 ASSERT3U(udp->usd_type & SDT_S, !=, 0);
380 380 }
381 381 /*
382 382 * We should always set the "accessed" bit (SDT_A), otherwise the CPU
383 383 * will write to the LDT whenever we change segment registers around.
384 384 * With KPTI on, the LDT is read-only in the user page table, which
385 385 * causes crashes if we don't set this.
386 386 */
387 387 if (udp->usd_p != 0 || udp->usd_type != 0) {
388 388 ASSERT3U(udp->usd_type & SDT_A, !=, 0);
389 389 }
390 390 #endif
391 391
392 392 #if defined(__xpv)
393 393 uint64_t dpa;
394 394
395 395 dpa = mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)ldp)) |
396 396 ((uintptr_t)ldp & PAGEOFFSET);
397 397
398 398 /*
399 399 * The hypervisor is a little more restrictive about what it
400 400 * supports in the LDT.
401 401 */
402 402 if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp) != 0)
403 403 return (EINVAL);
404 404
405 405 #else /* __xpv */
406 406 *ldp = *udp;
407 407
408 408 #endif /* __xpv */
409 409 return (0);
410 410 }
411 411
412 412 #if defined(__xpv)
413 413
414 414 /*
415 415 * Converts hw format gate descriptor into pseudo-IDT format for the hypervisor.
416 416 * Returns true if a valid entry was written.
417 417 */
418 418 int
419 419 xen_idt_to_trap_info(uint_t vec, gate_desc_t *sgd, void *ti_arg)
420 420 {
421 421 trap_info_t *ti = ti_arg; /* XXPV Aargh - segments.h comment */
422 422
423 423 /*
424 424 * skip holes in the IDT
425 425 */
426 426 if (GATESEG_GETOFFSET(sgd) == 0)
427 427 return (0);
428 428
429 429 ASSERT(sgd->sgd_type == SDT_SYSIGT);
430 430 ti->vector = vec;
431 431 TI_SET_DPL(ti, sgd->sgd_dpl);
432 432
433 433 /*
434 434 * Is this an interrupt gate?
435 435 */
436 436 if (sgd->sgd_type == SDT_SYSIGT) {
437 437 /* LINTED */
438 438 TI_SET_IF(ti, 1);
439 439 }
440 440 ti->cs = sgd->sgd_selector;
441 441 #if defined(__amd64)
442 442 ti->cs |= SEL_KPL; /* force into ring 3. see KCS_SEL */
443 443 #endif
444 444 ti->address = GATESEG_GETOFFSET(sgd);
445 445 return (1);
446 446 }
447 447
448 448 /*
449 449 * Convert a single hw format gate descriptor and write it into our virtual IDT.
450 450 */
451 451 void
452 452 xen_idt_write(gate_desc_t *sgd, uint_t vec)
453 453 {
454 454 trap_info_t trapinfo[2];
455 455
456 456 bzero(trapinfo, sizeof (trapinfo));
457 457 if (xen_idt_to_trap_info(vec, sgd, &trapinfo[0]) == 0)
458 458 return;
459 459 if (xen_set_trap_table(trapinfo) != 0)
460 460 panic("xen_idt_write: xen_set_trap_table() failed");
461 461 }
462 462
463 463 #endif /* __xpv */
464 464
465 465 #if defined(__amd64)
466 466
467 467 /*
468 468 * Build kernel GDT.
469 469 */
470 470
471 471 static void
472 472 init_gdt_common(user_desc_t *gdt)
473 473 {
474 474 int i;
475 475
476 476 /*
477 477 * 64-bit kernel code segment.
478 478 */
479 479 set_usegd(&gdt[GDT_KCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_KPL,
480 480 SDP_PAGES, SDP_OP32);
481 481
482 482 /*
483 483 * 64-bit kernel data segment. The limit attribute is ignored in 64-bit
484 484 * mode, but we set it here to 0xFFFF so that we can use the SYSRET
485 485 * instruction to return from system calls back to 32-bit applications.
486 486 * SYSRET doesn't update the base, limit, or attributes of %ss or %ds
487 487 * descriptors. We therefore must ensure that the kernel uses something,
488 488 * though it will be ignored by hardware, that is compatible with 32-bit
489 489 * apps. For the same reason we must set the default op size of this
490 490 * descriptor to 32-bit operands.
491 491 */
492 492 set_usegd(&gdt[GDT_KDATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
493 493 SEL_KPL, SDP_PAGES, SDP_OP32);
494 494 gdt[GDT_KDATA].usd_def32 = 1;
495 495
496 496 /*
497 497 * 64-bit user code segment.
498 498 */
499 499 set_usegd(&gdt[GDT_UCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_UPL,
500 500 SDP_PAGES, SDP_OP32);
501 501
502 502 /*
503 503 * 32-bit user code segment.
504 504 */
505 505 set_usegd(&gdt[GDT_U32CODE], SDP_SHORT, NULL, -1, SDT_MEMERA,
506 506 SEL_UPL, SDP_PAGES, SDP_OP32);
507 507
508 508 /*
509 509 * See gdt_ucode32() and gdt_ucode_native().
510 510 */
511 511 ucs_on = ucs_off = gdt[GDT_UCODE];
512 512 ucs_off.usd_p = 0; /* forces #np fault */
513 513
514 514 ucs32_on = ucs32_off = gdt[GDT_U32CODE];
515 515 ucs32_off.usd_p = 0; /* forces #np fault */
516 516
517 517 /*
518 518 * 32 and 64 bit data segments can actually share the same descriptor.
519 519 * In long mode only the present bit is checked but all other fields
520 520 * are loaded. But in compatibility mode all fields are interpreted
521 521 * as in legacy mode so they must be set correctly for a 32-bit data
522 522 * segment.
523 523 */
524 524 set_usegd(&gdt[GDT_UDATA], SDP_SHORT, NULL, -1, SDT_MEMRWA, SEL_UPL,
525 525 SDP_PAGES, SDP_OP32);
526 526
527 527 #if !defined(__xpv)
528 528
529 529 /*
530 530 * The 64-bit kernel has no default LDT. By default, the LDT descriptor
531 531 * in the GDT is 0.
532 532 */
533 533
534 534 /*
535 535 * Kernel TSS
536 536 */
537 537 set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
538 538 sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);
539 539
540 540 #endif /* !__xpv */
541 541
542 542 /*
543 543 * Initialize fs and gs descriptors for 32 bit processes.
544 544 * Only attributes and limits are initialized, the effective
545 545 * base address is programmed via fsbase/gsbase.
546 546 */
547 547 set_usegd(&gdt[GDT_LWPFS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
548 548 SEL_UPL, SDP_PAGES, SDP_OP32);
549 549 set_usegd(&gdt[GDT_LWPGS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
550 550 SEL_UPL, SDP_PAGES, SDP_OP32);
551 551
552 552 /*
553 553 * Initialize the descriptors set aside for brand usage.
554 554 * Only attributes and limits are initialized.
555 555 */
556 556 for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
557 557 set_usegd(&gdt0[i], SDP_SHORT, NULL, -1, SDT_MEMRWA,
558 558 SEL_UPL, SDP_PAGES, SDP_OP32);
559 559
560 560 /*
561 561 * Initialize convenient zero base user descriptors for clearing
562 562 * lwp private %fs and %gs descriptors in GDT. See setregs() for
563 563 * an example.
564 564 */
565 565 set_usegd(&zero_udesc, SDP_LONG, 0, 0, SDT_MEMRWA, SEL_UPL,
566 566 SDP_BYTES, SDP_OP32);
567 567 set_usegd(&zero_u32desc, SDP_SHORT, 0, -1, SDT_MEMRWA, SEL_UPL,
568 568 SDP_PAGES, SDP_OP32);
569 569 }
570 570
571 571 #if defined(__xpv)
572 572
573 573 static user_desc_t *
574 574 init_gdt(void)
575 575 {
576 576 uint64_t gdtpa;
577 577 ulong_t ma[1]; /* XXPV should be a memory_t */
578 578 ulong_t addr;
579 579
580 580 #if !defined(__lint)
581 581 /*
582 582 * Our gdt is never larger than a single page.
583 583 */
584 584 ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
585 585 #endif
586 586 gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
587 587 PAGESIZE, PAGESIZE);
588 588 bzero(gdt0, PAGESIZE);
589 589
590 590 init_gdt_common(gdt0);
591 591
592 592 /*
593 593 * XXX Since we never invoke kmdb until after the kernel takes
594 594 * over the descriptor tables why not have it use the kernel's
595 595 * selectors?
596 596 */
597 597 if (boothowto & RB_DEBUG) {
598 598 set_usegd(&gdt0[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
599 599 SEL_KPL, SDP_PAGES, SDP_OP32);
600 600 set_usegd(&gdt0[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA,
601 601 SEL_KPL, SDP_PAGES, SDP_OP32);
602 602 }
603 603
604 604 /*
605 605 * Clear write permission for page containing the gdt and install it.
606 606 */
607 607 gdtpa = pfn_to_pa(va_to_pfn(gdt0));
608 608 ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
609 609 kbm_read_only((uintptr_t)gdt0, gdtpa);
610 610 xen_set_gdt(ma, NGDT);
611 611
612 612 /*
613 613 * Reload the segment registers to use the new GDT.
614 614 * On 64-bit, fixup KCS_SEL to be in ring 3.
615 615 * See KCS_SEL in segments.h.
616 616 */
617 617 load_segment_registers((KCS_SEL | SEL_KPL), KFS_SEL, KGS_SEL, KDS_SEL);
618 618
619 619 /*
620 620 * setup %gs for kernel
621 621 */
622 622 xen_set_segment_base(SEGBASE_GS_KERNEL, (ulong_t)&cpus[0]);
623 623
624 624 /*
625 625 * XX64 We should never dereference off "other gsbase" or
626 626 * "fsbase". So, we should arrange to point FSBASE and
627 627 * KGSBASE somewhere truly awful e.g. point it at the last
628 628 * valid address below the hole so that any attempts to index
629 629 * off them cause an exception.
630 630 *
631 631 * For now, point it at 8G -- at least it should be unmapped
632 632 * until some 64-bit processes run.
633 633 */
634 634 addr = 0x200000000ul;
635 635 xen_set_segment_base(SEGBASE_FS, addr);
636 636 xen_set_segment_base(SEGBASE_GS_USER, addr);
637 637 xen_set_segment_base(SEGBASE_GS_USER_SEL, 0);
638 638
639 639 return (gdt0);
640 640 }
641 641
642 642 #else /* __xpv */
643 643
644 644 static user_desc_t *
645 645 init_gdt(void)
646 646 {
647 647 desctbr_t r_bgdt, r_gdt;
648 648 user_desc_t *bgdt;
649 649
650 650 #if !defined(__lint)
651 651 /*
652 652 * Our gdt is never larger than a single page.
653 653 */
654 654 ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
655 655 #endif
656 656 gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
657 657 PAGESIZE, PAGESIZE);
658 658 bzero(gdt0, PAGESIZE);
659 659
660 660 init_gdt_common(gdt0);
661 661
662 662 /*
663 663 * Copy in from boot's gdt to our gdt.
664 664 * Entry 0 is the null descriptor by definition.
665 665 */
666 666 rd_gdtr(&r_bgdt);
667 667 bgdt = (user_desc_t *)r_bgdt.dtr_base;
668 668 if (bgdt == NULL)
669 669 panic("null boot gdt");
670 670
671 671 gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
672 672 gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
673 673 gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
674 674 gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];
675 675 gdt0[GDT_B64CODE] = bgdt[GDT_B64CODE];
676 676
677 677 /*
678 678 * Install our new GDT
679 679 */
680 680 r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
681 681 r_gdt.dtr_base = (uintptr_t)gdt0;
682 682 wr_gdtr(&r_gdt);
683 683
684 684 /*
685 685 * Reload the segment registers to use the new GDT
686 686 */
687 687 load_segment_registers(KCS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);
688 688
689 689 /*
690 690 * setup %gs for kernel
691 691 */
692 692 wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);
693 693
694 694 /*
695 695 * XX64 We should never dereference off "other gsbase" or
696 696 * "fsbase". So, we should arrange to point FSBASE and
697 697 * KGSBASE somewhere truly awful e.g. point it at the last
698 698 * valid address below the hole so that any attempts to index
699 699 * off them cause an exception.
700 700 *
701 701 * For now, point it at 8G -- at least it should be unmapped
702 702 * until some 64-bit processes run.
703 703 */
704 704 wrmsr(MSR_AMD_FSBASE, 0x200000000ul);
705 705 wrmsr(MSR_AMD_KGSBASE, 0x200000000ul);
706 706 return (gdt0);
707 707 }
708 708
709 709 #endif /* __xpv */
710 710
711 711 #elif defined(__i386)
712 712
713 713 static void
714 714 init_gdt_common(user_desc_t *gdt)
715 715 {
716 716 int i;
717 717
718 718 /*
719 719 * Text and data for both kernel and user span entire 32 bit
720 720 * address space.
721 721 */
722 722
723 723 /*
724 724 * kernel code segment.
725 725 */
726 726 set_usegd(&gdt[GDT_KCODE], NULL, -1, SDT_MEMERA, SEL_KPL, SDP_PAGES,
727 727 SDP_OP32);
728 728
729 729 /*
730 730 * kernel data segment.
731 731 */
732 732 set_usegd(&gdt[GDT_KDATA], NULL, -1, SDT_MEMRWA, SEL_KPL, SDP_PAGES,
733 733 SDP_OP32);
734 734
735 735 /*
736 736 * user code segment.
737 737 */
738 738 set_usegd(&gdt[GDT_UCODE], NULL, -1, SDT_MEMERA, SEL_UPL, SDP_PAGES,
739 739 SDP_OP32);
740 740
741 741 /*
742 742 * user data segment.
743 743 */
744 744 set_usegd(&gdt[GDT_UDATA], NULL, -1, SDT_MEMRWA, SEL_UPL, SDP_PAGES,
745 745 SDP_OP32);
746 746
747 747 #if !defined(__xpv)
748 748
749 749 /*
750 750 * TSS for T_DBLFLT (double fault) handler
751 751 */
752 752 set_syssegd((system_desc_t *)&gdt[GDT_DBFLT], dftss0,
753 753 sizeof (*dftss0) - 1, SDT_SYSTSS, SEL_KPL);
754 754
755 755 /*
756 756 * TSS for kernel
757 757 */
758 758 set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
759 759 sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);
760 760
761 761 #endif /* !__xpv */
762 762
763 763 /*
764 764 * %gs selector for kernel
765 765 */
766 766 set_usegd(&gdt[GDT_GS], &cpus[0], sizeof (struct cpu) -1, SDT_MEMRWA,
767 767 SEL_KPL, SDP_BYTES, SDP_OP32);
768 768
769 769 /*
770 770 * Initialize lwp private descriptors.
771 771 * Only attributes and limits are initialized, the effective
772 772 * base address is programmed via fsbase/gsbase.
773 773 */
774 774 set_usegd(&gdt[GDT_LWPFS], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
775 775 SDP_PAGES, SDP_OP32);
776 776 set_usegd(&gdt[GDT_LWPGS], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
777 777 SDP_PAGES, SDP_OP32);
778 778
779 779 /*
780 780 * Initialize the descriptors set aside for brand usage.
781 781 * Only attributes and limits are initialized.
782 782 */
783 783 for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
784 784 set_usegd(&gdt0[i], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
785 785 SDP_PAGES, SDP_OP32);
786 786 /*
787 787 * Initialize convenient zero base user descriptor for clearing
788 788 * lwp private %fs and %gs descriptors in GDT. See setregs() for
789 789 * an example.
790 790 */
791 791 set_usegd(&zero_udesc, NULL, -1, SDT_MEMRWA, SEL_UPL,
792 792 SDP_BYTES, SDP_OP32);
793 793 }
794 794
795 795 #if defined(__xpv)
796 796
797 797 static user_desc_t *
798 798 init_gdt(void)
799 799 {
800 800 uint64_t gdtpa;
801 801 ulong_t ma[1]; /* XXPV should be a memory_t */
802 802
803 803 #if !defined(__lint)
804 804 /*
805 805 * Our gdt is never larger than a single page.
806 806 */
807 807 ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
808 808 #endif
809 809 gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
810 810 PAGESIZE, PAGESIZE);
811 811 bzero(gdt0, PAGESIZE);
812 812
813 813 init_gdt_common(gdt0);
814 814 gdtpa = pfn_to_pa(va_to_pfn(gdt0));
815 815
816 816 /*
817 817 * XXX Since we never invoke kmdb until after the kernel takes
818 818 * over the descriptor tables why not have it use the kernel's
819 819 * selectors?
820 820 */
821 821 if (boothowto & RB_DEBUG) {
822 822 set_usegd(&gdt0[GDT_B32DATA], NULL, -1, SDT_MEMRWA, SEL_KPL,
823 823 SDP_PAGES, SDP_OP32);
824 824 set_usegd(&gdt0[GDT_B32CODE], NULL, -1, SDT_MEMERA, SEL_KPL,
825 825 SDP_PAGES, SDP_OP32);
826 826 }
827 827
828 828 /*
829 829 * Clear write permission for page containing the gdt and install it.
830 830 */
831 831 ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
832 832 kbm_read_only((uintptr_t)gdt0, gdtpa);
833 833 xen_set_gdt(ma, NGDT);
834 834
835 835 /*
836 836 * Reload the segment registers to use the new GDT
837 837 */
838 838 load_segment_registers(
839 839 KCS_SEL, KDS_SEL, KDS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);
840 840
841 841 return (gdt0);
842 842 }
843 843
844 844 #else /* __xpv */
845 845
846 846 static user_desc_t *
847 847 init_gdt(void)
848 848 {
849 849 desctbr_t r_bgdt, r_gdt;
850 850 user_desc_t *bgdt;
851 851
852 852 #if !defined(__lint)
853 853 /*
854 854 * Our gdt is never larger than a single page.
855 855 */
856 856 ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
857 857 #endif
858 858 /*
859 859 * XXX this allocation belongs in our caller, not here.
860 860 */
861 861 gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
862 862 PAGESIZE, PAGESIZE);
863 863 bzero(gdt0, PAGESIZE);
864 864
865 865 init_gdt_common(gdt0);
866 866
867 867 /*
868 868 * Copy in from boot's gdt to our gdt entries.
869 869 * Entry 0 is null descriptor by definition.
870 870 */
871 871 rd_gdtr(&r_bgdt);
872 872 bgdt = (user_desc_t *)r_bgdt.dtr_base;
873 873 if (bgdt == NULL)
874 874 panic("null boot gdt");
875 875
876 876 gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
877 877 gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
878 878 gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
879 879 gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];
880 880
881 881 /*
882 882 * Install our new GDT
883 883 */
884 884 r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
885 885 r_gdt.dtr_base = (uintptr_t)gdt0;
886 886 wr_gdtr(&r_gdt);
887 887
888 888 /*
889 889 * Reload the segment registers to use the new GDT
890 890 */
891 891 load_segment_registers(
892 892 KCS_SEL, KDS_SEL, KDS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);
893 893
894 894 return (gdt0);
895 895 }
896 896
897 897 #endif /* __xpv */
898 898 #endif /* __i386 */
899 899
900 900 /*
901 901 * Build kernel IDT.
902 902 *
903 903 * Note that for amd64 we pretty much require every gate to be an interrupt
904 904 * gate which blocks interrupts atomically on entry; that's because of our
905 905 * dependency on using 'swapgs' every time we come into the kernel to find
906 906 * the cpu structure. If we get interrupted just before doing that, %cs could
907 907 * be in kernel mode (so that the trap prolog doesn't do a swapgs), but
908 908 * %gsbase is really still pointing at something in userland. Bad things will
909 909 * ensue. We also use interrupt gates for i386 even though this is not
910 910 * required for some traps.
911 911 *
912 912 * Perhaps they should have invented a trap gate that does an atomic swapgs?
913 913 */
914 914 static void
915 915 init_idt_common(gate_desc_t *idt)
916 916 {
917 917 set_gatesegd(&idt[T_ZERODIV],
918 918 (kpti_enable == 1) ? &tr_div0trap : &div0trap,
919 919 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ZERODIV));
920 920 set_gatesegd(&idt[T_SGLSTP],
921 921 (kpti_enable == 1) ? &tr_dbgtrap : &dbgtrap,
922 922 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SGLSTP));
923 923 set_gatesegd(&idt[T_NMIFLT],
924 924 (kpti_enable == 1) ? &tr_nmiint : &nmiint,
925 925 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NMIFLT));
926 926 set_gatesegd(&idt[T_BPTFLT],
927 927 (kpti_enable == 1) ? &tr_brktrap : &brktrap,
928 928 KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_BPTFLT));
929 929 set_gatesegd(&idt[T_OVFLW],
930 930 (kpti_enable == 1) ? &tr_ovflotrap : &ovflotrap,
931 931 KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_OVFLW));
932 932 set_gatesegd(&idt[T_BOUNDFLT],
933 933 (kpti_enable == 1) ? &tr_boundstrap : &boundstrap,
934 934 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_BOUNDFLT));
935 935 set_gatesegd(&idt[T_ILLINST],
936 936 (kpti_enable == 1) ? &tr_invoptrap : &invoptrap,
937 937 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ILLINST));
938 938 set_gatesegd(&idt[T_NOEXTFLT],
939 939 (kpti_enable == 1) ? &tr_ndptrap : &ndptrap,
940 940 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NOEXTFLT));
941 941
942 942 /*
943 943 * double fault handler.
944 944 *
945 945 * Note that on the hypervisor a guest does not receive #df faults.
946 946 * Instead a failsafe event is injected into the guest if its selectors
947 947 * and/or stack is in a broken state. See xen_failsafe_callback.
948 948 */
949 949 #if !defined(__xpv)
950 950 set_gatesegd(&idt[T_DBLFLT],
951 951 (kpti_enable == 1) ? &tr_syserrtrap : &syserrtrap,
952 952 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_DBLFLT));
953 953 #endif /* !__xpv */
954 954
955 955 /*
956 956 * T_EXTOVRFLT coprocessor-segment-overrun not supported.
957 957 */
958 958 set_gatesegd(&idt[T_TSSFLT],
959 959 (kpti_enable == 1) ? &tr_invtsstrap : &invtsstrap,
960 960 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_TSSFLT));
961 961 set_gatesegd(&idt[T_SEGFLT],
962 962 (kpti_enable == 1) ? &tr_segnptrap : &segnptrap,
963 963 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SEGFLT));
964 964 set_gatesegd(&idt[T_STKFLT],
965 965 (kpti_enable == 1) ? &tr_stktrap : &stktrap,
966 966 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_STKFLT));
967 967 set_gatesegd(&idt[T_GPFLT],
968 968 (kpti_enable == 1) ? &tr_gptrap : &gptrap,
969 969 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_GPFLT));
970 970 set_gatesegd(&idt[T_PGFLT],
971 971 (kpti_enable == 1) ? &tr_pftrap : &pftrap,
972 972 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_PGFLT));
973 973 set_gatesegd(&idt[T_EXTERRFLT],
974 974 (kpti_enable == 1) ? &tr_ndperr : &ndperr,
975 975 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_EXTERRFLT));
976 976 set_gatesegd(&idt[T_ALIGNMENT],
977 977 (kpti_enable == 1) ? &tr_achktrap : &achktrap,
978 978 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ALIGNMENT));
979 979 set_gatesegd(&idt[T_MCE],
980 980 (kpti_enable == 1) ? &tr_mcetrap : &mcetrap,
981 981 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_MCE));
982 982 set_gatesegd(&idt[T_SIMDFPE],
983 983 (kpti_enable == 1) ? &tr_xmtrap : &xmtrap,
984 984 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SIMDFPE));
985 985
986 986 /*
987 987 * install fast trap handler at 210.
988 988 */
989 989 set_gatesegd(&idt[T_FASTTRAP],
990 990 (kpti_enable == 1) ? &tr_fasttrap : &fasttrap,
991 991 KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_FASTTRAP));
992 992
993 993 /*
994 994 * System call handler.
995 995 */
996 996 set_gatesegd(&idt[T_SYSCALLINT],
997 997 (kpti_enable == 1) ? &tr_sys_syscall_int : &sys_syscall_int,
998 998 KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_SYSCALLINT));
999 999
1000 1000 /*
1001 1001 * Install the DTrace interrupt handler for the pid provider.
1002 1002 */
1003 1003 set_gatesegd(&idt[T_DTRACE_RET],
1004 1004 (kpti_enable == 1) ? &tr_dtrace_ret : &dtrace_ret,
1005 1005 KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_DTRACE_RET));
1006 1006
1007 1007 /*
1008 1008 * Prepare interposing descriptor for the syscall handler
1009 1009 * and cache copy of the default descriptor.
1010 1010 */
1011 1011 brand_tbl[0].ih_inum = T_SYSCALLINT;
1012 1012 brand_tbl[0].ih_default_desc = idt0[T_SYSCALLINT];
1013 1013
1014 1014 set_gatesegd(&(brand_tbl[0].ih_interp_desc),
1015 1015 (kpti_enable == 1) ? &tr_brand_sys_syscall_int :
1016 1016 &brand_sys_syscall_int, KCS_SEL, SDT_SYSIGT, TRP_UPL,
1017 1017 idt_vector_to_ist(T_SYSCALLINT));
1018 1018
1019 1019 brand_tbl[1].ih_inum = 0;
1020 1020 }
1021 1021
1022 1022 #if defined(__xpv)
1023 1023
1024 1024 static void
1025 1025 init_idt(gate_desc_t *idt)
1026 1026 {
1027 1027 init_idt_common(idt);
1028 1028 }
1029 1029
1030 1030 #else /* __xpv */
1031 1031
1032 1032 static void
1033 1033 init_idt(gate_desc_t *idt)
1034 1034 {
1035 1035 char ivctname[80];
1036 1036 void (*ivctptr)(void);
1037 1037 int i;
1038 1038
1039 1039 /*
1040 1040 * Initialize entire table with 'reserved' trap and then overwrite
1041 1041 * specific entries. T_EXTOVRFLT (9) is unsupported and reserved
1042 1042 * since it can only be generated on a 386 processor. 15 is also
1043 1043 * unsupported and reserved.
1044 1044 */
1045 1045 #if !defined(__xpv)
1046 1046 for (i = 0; i < NIDT; i++) {
1047 1047 set_gatesegd(&idt[i],
1048 1048 (kpti_enable == 1) ? &tr_resvtrap : &resvtrap,
1049 1049 KCS_SEL, SDT_SYSIGT, TRP_KPL,
1050 1050 idt_vector_to_ist(T_RESVTRAP));
1051 1051 }
1052 1052 #else
1053 1053 for (i = 0; i < NIDT; i++) {
1054 1054 set_gatesegd(&idt[i], &resvtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
1055 1055 IST_NONE);
1056 1056 }
1057 1057 #endif
1058 1058
1059 1059 /*
1060 1060 * 20-31 reserved
1061 1061 */
1062 1062 #if !defined(__xpv)
1063 1063 for (i = 20; i < 32; i++) {
1064 1064 set_gatesegd(&idt[i],
1065 1065 (kpti_enable == 1) ? &tr_invaltrap : &invaltrap,
1066 1066 KCS_SEL, SDT_SYSIGT, TRP_KPL,
1067 1067 idt_vector_to_ist(T_INVALTRAP));
1068 1068 }
1069 1069 #else
1070 1070 for (i = 20; i < 32; i++) {
1071 1071 set_gatesegd(&idt[i], &invaltrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
1072 1072 IST_NONE);
1073 1073 }
1074 1074 #endif
1075 1075
1076 1076 /*
1077 1077 * interrupts 32 - 255
1078 1078 */
1079 1079 for (i = 32; i < 256; i++) {
1080 1080 #if !defined(__xpv)
1081 1081 (void) snprintf(ivctname, sizeof (ivctname),
1082 1082 (kpti_enable == 1) ? "tr_ivct%d" : "ivct%d", i);
1083 1083 #else
1084 1084 (void) snprintf(ivctname, sizeof (ivctname), "ivct%d", i);
1085 1085 #endif
1086 1086 ivctptr = (void (*)(void))kobj_getsymvalue(ivctname, 0);
1087 1087 if (ivctptr == NULL)
1088 1088 panic("kobj_getsymvalue(%s) failed", ivctname);
1089 1089
1090 1090 set_gatesegd(&idt[i], ivctptr, KCS_SEL, SDT_SYSIGT, TRP_KPL,
1091 1091 idt_vector_to_ist(i));
1092 1092 }
1093 1093
1094 1094 /*
1095 1095 * Now install the common ones. Note that it will overlay some
1096 1096 * entries installed above like T_SYSCALLINT, T_FASTTRAP etc.
1097 1097 */
1098 1098 init_idt_common(idt);
1099 1099 }
1100 1100
1101 1101 #endif /* __xpv */
1102 1102
1103 1103 /*
1104 1104 * The kernel does not deal with LDTs unless a user explicitly creates
1105 1105 * one. Under normal circumstances, the LDTR contains 0. Any process attempting
1106 1106 * to reference the LDT will therefore cause a #gp. System calls made via the
1107 1107 * obsolete lcall mechanism are emulated by the #gp fault handler.
1108 1108 */
1109 1109 static void
1110 1110 init_ldt(void)
1111 1111 {
1112 1112 #if defined(__xpv)
1113 1113 xen_set_ldt(NULL, 0);
1114 1114 #else
1115 1115 wr_ldtr(0);
1116 1116 #endif
1117 1117 }
1118 1118
1119 1119 #if !defined(__xpv)
1120 1120
1121 1121 static void
1122 1122 init_tss(void)
1123 1123 {
1124 1124 extern struct cpu cpus[];
1125 1125
1126 1126 /*
1127 1127 * tss_rsp0 is dynamically filled in by resume() (in swtch.s) on each
1128 1128 * context switch but it'll be overwritten with this same value anyway.
1129 1129 */
1130 1130 if (kpti_enable == 1) {
1131 1131 ktss0->tss_rsp0 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
1132 1132 }
1133 1133
1134 1134 /* Set up the IST stacks for double fault, NMI, MCE. */
1135 1135 ktss0->tss_ist1 = (uintptr_t)&dblfault_stack0[sizeof (dblfault_stack0)];
1136 1136 ktss0->tss_ist2 = (uintptr_t)&nmi_stack0[sizeof (nmi_stack0)];
1137 1137 ktss0->tss_ist3 = (uintptr_t)&mce_stack0[sizeof (mce_stack0)];
1138 1138
1139 1139 /*
1140 1140 * This IST stack is used for #DB,#BP (debug) interrupts (when KPTI is
1141 1141 * enabled), and also for KDI (always).
1142 1142 */
1143 1143 ktss0->tss_ist4 = (uint64_t)&cpus->cpu_m.mcpu_kpti_dbg.kf_tr_rsp;
1144 1144
1145 1145 if (kpti_enable == 1) {
1146 1146 /* This IST stack is used for #GP,#PF,#SS (fault) interrupts. */
1147 1147 ktss0->tss_ist5 =
1148 1148 (uint64_t)&cpus->cpu_m.mcpu_kpti_flt.kf_tr_rsp;
1149 1149
1150 1150 /* This IST stack is used for all other intrs (for KPTI). */
1151 1151 ktss0->tss_ist6 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
1152 1152 }
1153 1153
1154 1154 /*
1155 1155 * Set I/O bit map offset equal to size of TSS segment limit
1156 1156 * for no I/O permission map. This will force all user I/O
1157 1157 * instructions to generate #gp fault.
1158 1158 */
1159 1159 ktss0->tss_bitmapbase = sizeof (*ktss0);
1160 1160
1161 1161 /*
1162 1162 * Point %tr to descriptor for ktss0 in gdt.
1163 1163 */
1164 1164 wr_tsr(KTSS_SEL);
1165 1165 }
1166 1166
1167 1167 #endif /* !__xpv */
1168 1168
1169 1169 #if defined(__xpv)
1170 1170
1171 1171 void
1172 1172 init_desctbls(void)
1173 1173 {
1174 1174 uint_t vec;
1175 1175 user_desc_t *gdt;
1176 1176
1177 1177 /*
1178 1178 * Setup and install our GDT.
1179 1179 */
1180 1180 gdt = init_gdt();
1181 1181
1182 1182 /*
1183 1183 * Store static pa of gdt to speed up pa_to_ma() translations
1184 1184 * on lwp context switches.
1185 1185 */
1186 1186 ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
1187 1187 CPU->cpu_gdt = gdt;
1188 1188 CPU->cpu_m.mcpu_gdtpa = pfn_to_pa(va_to_pfn(gdt));
1189 1189
1190 1190 /*
1191 1191 * Setup and install our IDT.
1192 1192 */
1193 1193 #if !defined(__lint)
1194 1194 ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
1195 1195 #endif
1196 1196 idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
1197 1197 PAGESIZE, PAGESIZE);
1198 1198 bzero(idt0, PAGESIZE);
1199 1199 init_idt(idt0);
1200 1200 for (vec = 0; vec < NIDT; vec++)
1201 1201 xen_idt_write(&idt0[vec], vec);
1202 1202
1203 1203 CPU->cpu_idt = idt0;
1204 1204
1205 1205 /*
1206 1206 * set default kernel stack
1207 1207 */
1208 1208 xen_stack_switch(KDS_SEL,
1209 1209 (ulong_t)&dblfault_stack0[sizeof (dblfault_stack0)]);
1210 1210
1211 1211 xen_init_callbacks();
1212 1212
1213 1213 init_ldt();
1214 1214 }
1215 1215
1216 1216 #else /* __xpv */
1217 1217
1218 1218 void
1219 1219 init_desctbls(void)
1220 1220 {
1221 1221 user_desc_t *gdt;
1222 1222 desctbr_t idtr;
1223 1223
1224 1224 /*
1225 1225 * Allocate IDT and TSS structures on unique pages for better
1226 1226 * performance in virtual machines.
1227 1227 */
1228 1228 #if !defined(__lint)
1229 1229 ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
1230 1230 #endif
1231 1231 idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
1232 1232 PAGESIZE, PAGESIZE);
1233 1233 bzero(idt0, PAGESIZE);
1234 1234 #if !defined(__lint)
1235 1235 ASSERT(sizeof (*ktss0) <= PAGESIZE);
1236 1236 #endif
1237 1237 ktss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)KTSS_VA,
1238 1238 PAGESIZE, PAGESIZE);
1239 1239 bzero(ktss0, PAGESIZE);
1240 1240
1241 1241 #if defined(__i386)
1242 1242 #if !defined(__lint)
1243 1243 ASSERT(sizeof (*dftss0) <= PAGESIZE);
1244 1244 #endif
1245 1245 dftss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)DFTSS_VA,
1246 1246 PAGESIZE, PAGESIZE);
1247 1247 bzero(dftss0, PAGESIZE);
1248 1248 #endif
1249 1249
1250 1250 /*
1251 1251 * Setup and install our GDT.
1252 1252 */
1253 1253 gdt = init_gdt();
1254 1254 ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
1255 1255 CPU->cpu_gdt = gdt;
1256 1256
1257 1257 /*
1258 1258 * Initialize this CPU's LDT.
1259 1259 */
1260 1260 CPU->cpu_m.mcpu_ldt = BOP_ALLOC(bootops, (caddr_t)LDT_VA,
1261 1261 LDT_CPU_SIZE, PAGESIZE);
1262 1262 bzero(CPU->cpu_m.mcpu_ldt, LDT_CPU_SIZE);
1263 1263 CPU->cpu_m.mcpu_ldt_len = 0;
1264 1264
1265 1265 /*
1266 1266 * Setup and install our IDT.
1267 1267 */
1268 1268 init_idt(idt0);
1269 1269
1270 1270 idtr.dtr_base = (uintptr_t)idt0;
1271 1271 idtr.dtr_limit = (NIDT * sizeof (*idt0)) - 1;
1272 1272 wr_idtr(&idtr);
1273 1273 CPU->cpu_idt = idt0;
1274 1274
1275 1275 #if defined(__i386)
1276 1276 /*
1277 1277 * We maintain a description of idt0 in convenient IDTR format
1278 1278 * for #pf's on some older pentium processors. See pentium_pftrap().
1279 1279 */
1280 1280 idt0_default_r = idtr;
1281 1281 #endif /* __i386 */
1282 1282
1283 1283 init_tss();
1284 1284 CPU->cpu_tss = ktss0;
1285 1285 init_ldt();
1286 1286
1287 1287 /* Stash this so that the NMI,MCE,#DF and KDI handlers can use it. */
1288 1288 kpti_safe_cr3 = (uint64_t)getcr3();
1289 1289 }
1290 1290
1291 1291 #endif /* __xpv */
1292 1292
1293 +#ifndef __xpv
1293 1294 /*
1295 + * As per Intel Vol 3 27.5.2, the GDTR limit is reset to 64KB on a VM exit, so
1296 + * we have to fix it up ourselves.
1297 + *
1298 + * The caller may still need to ensure that it cannot go off-CPU with the
1299 + * incorrect limit before calling this (for example, by disabling preemption).
1300 + */
1301 +void
1302 +reset_gdtr_limit(void)
1303 +{
1304 + ulong_t flags = intr_clear();
1305 + desctbr_t gdtr;
1306 +
1307 + rd_gdtr(&gdtr);
1308 + gdtr.dtr_limit = (sizeof (user_desc_t) * NGDT) - 1;
1309 + wr_gdtr(&gdtr);
1310 +
1311 + intr_restore(flags);
1312 +}
1313 +#endif /* __xpv */
1314 +
1315 +/*
1294 1316 * In the early kernel, we need to set up a simple GDT to run on.
1295 1317 *
1296 1318 * XXPV Can dboot use this too? See dboot_gdt.s
1297 1319 */
1298 1320 void
1299 1321 init_boot_gdt(user_desc_t *bgdt)
1300 1322 {
1301 1323 #if defined(__amd64)
1302 1324 set_usegd(&bgdt[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA, SEL_KPL,
1303 1325 SDP_PAGES, SDP_OP32);
1304 1326 set_usegd(&bgdt[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA, SEL_KPL,
1305 1327 SDP_PAGES, SDP_OP32);
1306 1328 #elif defined(__i386)
1307 1329 set_usegd(&bgdt[GDT_B32DATA], NULL, -1, SDT_MEMRWA, SEL_KPL,
1308 1330 SDP_PAGES, SDP_OP32);
1309 1331 set_usegd(&bgdt[GDT_B32CODE], NULL, -1, SDT_MEMERA, SEL_KPL,
1310 1332 SDP_PAGES, SDP_OP32);
1311 1333 #endif /* __i386 */
1312 1334 }
1313 1335
1314 1336 /*
1315 1337 * Enable interpositioning on the system call path by rewriting the
1316 1338 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
1317 1339 * the branded entry points.
1318 1340 */
1319 1341 void
1320 1342 brand_interpositioning_enable(void)
1321 1343 {
1322 1344 gate_desc_t *idt = CPU->cpu_idt;
1323 1345 int i;
1324 1346
1325 1347 ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);
1326 1348
1327 1349 for (i = 0; brand_tbl[i].ih_inum; i++) {
1328 1350 idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_interp_desc;
1329 1351 #if defined(__xpv)
1330 1352 xen_idt_write(&idt[brand_tbl[i].ih_inum],
1331 1353 brand_tbl[i].ih_inum);
1332 1354 #endif
1333 1355 }
1334 1356
1335 1357 #if defined(__amd64)
1336 1358 #if defined(__xpv)
1337 1359
1338 1360 /*
1339 1361 * Currently the hypervisor only supports 64-bit syscalls via
1340 1362 * syscall instruction. The 32-bit syscalls are handled by
1341 1363 * interrupt gate above.
1342 1364 */
1343 1365 xen_set_callback(brand_sys_syscall, CALLBACKTYPE_syscall,
1344 1366 CALLBACKF_mask_events);
1345 1367
1346 1368 #else
1347 1369
1348 1370 if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
1349 1371 if (kpti_enable == 1) {
1350 1372 wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_brand_sys_syscall);
1351 1373 wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_brand_sys_syscall32);
1352 1374 } else {
1353 1375 wrmsr(MSR_AMD_LSTAR, (uintptr_t)brand_sys_syscall);
1354 1376 wrmsr(MSR_AMD_CSTAR, (uintptr_t)brand_sys_syscall32);
1355 1377 }
1356 1378 }
1357 1379
1358 1380 #endif
1359 1381 #endif /* __amd64 */
1360 1382
1361 1383 if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
1362 1384 if (kpti_enable == 1) {
1363 1385 wrmsr(MSR_INTC_SEP_EIP,
1364 1386 (uintptr_t)tr_brand_sys_sysenter);
1365 1387 } else {
1366 1388 wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)brand_sys_sysenter);
1367 1389 }
1368 1390 }
1369 1391 }
1370 1392
1371 1393 /*
1372 1394 * Disable interpositioning on the system call path by rewriting the
1373 1395 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
1374 1396 * the standard entry points, which bypass the interpositioning hooks.
1375 1397 */
1376 1398 void
1377 1399 brand_interpositioning_disable(void)
1378 1400 {
1379 1401 gate_desc_t *idt = CPU->cpu_idt;
1380 1402 int i;
1381 1403
1382 1404 ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);
1383 1405
1384 1406 for (i = 0; brand_tbl[i].ih_inum; i++) {
1385 1407 idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_default_desc;
1386 1408 #if defined(__xpv)
1387 1409 xen_idt_write(&idt[brand_tbl[i].ih_inum],
1388 1410 brand_tbl[i].ih_inum);
1389 1411 #endif
1390 1412 }
1391 1413
1392 1414 #if defined(__amd64)
1393 1415 #if defined(__xpv)
1394 1416
1395 1417 /*
1396 1418 * See comment above in brand_interpositioning_enable.
1397 1419 */
1398 1420 xen_set_callback(sys_syscall, CALLBACKTYPE_syscall,
1399 1421 CALLBACKF_mask_events);
1400 1422
1401 1423 #else
1402 1424
1403 1425 if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
1404 1426 if (kpti_enable == 1) {
1405 1427 wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_sys_syscall);
1406 1428 wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_sys_syscall32);
1407 1429 } else {
1408 1430 wrmsr(MSR_AMD_LSTAR, (uintptr_t)sys_syscall);
1409 1431 wrmsr(MSR_AMD_CSTAR, (uintptr_t)sys_syscall32);
1410 1432 }
1411 1433 }
1412 1434
1413 1435 #endif
1414 1436 #endif /* __amd64 */
1415 1437
1416 1438 if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
1417 1439 if (kpti_enable == 1) {
1418 1440 wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)tr_sys_sysenter);
1419 1441 } else {
1420 1442 wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)sys_sysenter);
1421 1443 }
1422 1444 }
1423 1445 }
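
For context on how the new reset_gdtr_limit() routine is intended to be consumed, here is a minimal caller sketch. It is not part of this webrev: the function name example_handle_vmexit(), the extern prototype placement, and the kpreempt_disable()/kpreempt_enable() bracketing are assumptions about what a typical VM-exit path might look like, not code under review.

/*
 * Hypothetical caller (illustrative only): after a VM exit the hardware has
 * reset the GDTR limit, so restore it before this thread can migrate
 * off-CPU while the limit is still wrong.
 */
extern void reset_gdtr_limit(void);	/* assumed visible via a header */

static void
example_handle_vmexit(void)
{
	kpreempt_disable();	/* stay on this CPU while the limit is stale */

	/* ... save guest state, decode the exit reason, etc. ... */

	reset_gdtr_limit();	/* restore the full host GDTR limit */

	kpreempt_enable();
}

Note that reset_gdtr_limit() already blocks interrupts internally via intr_clear()/intr_restore(), so the caller only needs to cover the window between the VM exit itself and the call, as the comment in the new code points out.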