8956 Implement KPTI
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
--- old/usr/src/uts/intel/ia32/os/sysi86.c
+++ new/usr/src/uts/intel/ia32/os/sysi86.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
23 + * Copyright 2018 Joyent, Inc.
23 24 */
24 25
25 26 /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
26 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
27 28 /* All Rights Reserved */
28 29
29 30 /* Copyright (c) 1987, 1988 Microsoft Corporation */
30 31 /* All Rights Reserved */
31 32
32 33 #include <sys/param.h>
33 34 #include <sys/types.h>
34 35 #include <sys/sysmacros.h>
35 36 #include <sys/systm.h>
36 37 #include <sys/signal.h>
37 38 #include <sys/errno.h>
38 39 #include <sys/fault.h>
39 40 #include <sys/syscall.h>
40 41 #include <sys/cpuvar.h>
41 42 #include <sys/sysi86.h>
42 43 #include <sys/psw.h>
43 44 #include <sys/cred.h>
44 45 #include <sys/policy.h>
45 46 #include <sys/thread.h>
46 47 #include <sys/debug.h>
47 48 #include <sys/ontrap.h>
48 49 #include <sys/privregs.h>
49 50 #include <sys/x86_archext.h>
50 51 #include <sys/vmem.h>
51 52 #include <sys/kmem.h>
52 53 #include <sys/mman.h>
53 54 #include <sys/archsystm.h>
54 55 #include <vm/hat.h>
55 56 #include <vm/as.h>
56 57 #include <vm/seg.h>
57 58 #include <vm/seg_kmem.h>
58 59 #include <vm/faultcode.h>
59 60 #include <sys/fp.h>
60 61 #include <sys/cmn_err.h>
61 62 #include <sys/segments.h>
62 63 #include <sys/clock.h>
64 +#include <vm/hat_i86.h>
63 65 #if defined(__xpv)
64 66 #include <sys/hypervisor.h>
65 67 #include <sys/note.h>
66 68 #endif
67 69
68 70 static void ldt_alloc(proc_t *, uint_t);
69 71 static void ldt_free(proc_t *);
70 72 static void ldt_dup(proc_t *, proc_t *);
71 73 static void ldt_grow(proc_t *, uint_t);
72 74
73 75 /*
74 76 * sysi86 System Call
75 77 */
76 78
77 79 /* ARGSUSED */
78 80 int
79 81 sysi86(short cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
80 82 {
81 83 struct ssd ssd;
82 84 int error = 0;
83 85 int c;
84 86 proc_t *pp = curproc;
85 87
86 88 switch (cmd) {
87 89
88 90 /*
89 91 * The SI86V86 subsystem call of the SYSI86 system call
90 92 * supports only one subcode -- V86SC_IOPL.
91 93 */
92 94 case SI86V86:
93 95 if (arg1 == V86SC_IOPL) {
94 96 struct regs *rp = lwptoregs(ttolwp(curthread));
95 97 greg_t oldpl = rp->r_ps & PS_IOPL;
96 98 greg_t newpl = arg2 & PS_IOPL;
97 99
98 100 /*
99 101 * Must be privileged to run this system call
100 102 * if giving more io privilege.
101 103 */
102 104 if (newpl > oldpl && (error =
103 105 secpolicy_sys_config(CRED(), B_FALSE)) != 0)
104 106 return (set_errno(error));
105 107 #if defined(__xpv)
106 108 kpreempt_disable();
107 109 installctx(curthread, NULL, xen_disable_user_iopl,
108 110 xen_enable_user_iopl, NULL, NULL,
109 111 xen_disable_user_iopl, NULL);
110 112 xen_enable_user_iopl();
111 113 kpreempt_enable();
112 114 #else
113 115 rp->r_ps ^= oldpl ^ newpl;
114 116 #endif
115 117 } else
116 118 error = EINVAL;
117 119 break;
118 120
119 121 /*
120 122 * Set a segment descriptor
121 123 */
122 124 case SI86DSCR:
123 125 /*
124 126 * There are considerable problems here manipulating
125 127 * resources shared by many running lwps. Get everyone
126 128 * into a safe state before changing the LDT.
127 129 */
128 130 if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK1)) {
129 131 error = EINTR;
130 132 break;
131 133 }
132 134
133 135 if (get_udatamodel() == DATAMODEL_LP64) {
134 136 error = EINVAL;
135 137 break;
136 138 }
137 139
138 140 if (copyin((caddr_t)arg1, &ssd, sizeof (ssd)) < 0) {
139 141 error = EFAULT;
140 142 break;
141 143 }
142 144
143 145 error = setdscr(&ssd);
144 146
145 147 mutex_enter(&pp->p_lock);
146 148 if (curthread != pp->p_agenttp)
147 149 continuelwps(pp);
148 150 mutex_exit(&pp->p_lock);
149 151 break;
150 152
151 153 case SI86FPHW:
152 154 c = fp_kind & 0xff;
153 155 if (suword32((void *)arg1, c) == -1)
154 156 error = EFAULT;
155 157 break;
156 158
157 159 case SI86FPSTART:
158 160 /*
159 161 * arg1 is the address of _fp_hw
160 162 * arg2 is the desired x87 FCW value
161 163 * arg3 is the desired SSE MXCSR value
162 164 * a return value of one means SSE hardware, else none.
163 165 */
164 166 c = fp_kind & 0xff;
165 167 if (suword32((void *)arg1, c) == -1) {
166 168 error = EFAULT;
167 169 break;
168 170 }
169 171 fpsetcw((uint16_t)arg2, (uint32_t)arg3);
170 172 return ((fp_kind & __FP_SSE) ? 1 : 0);
171 173
172 174 /* real time clock management commands */
173 175
174 176 case WTODC:
175 177 if ((error = secpolicy_settime(CRED())) == 0) {
176 178 timestruc_t ts;
177 179 mutex_enter(&tod_lock);
178 180 gethrestime(&ts);
179 181 tod_set(ts);
180 182 mutex_exit(&tod_lock);
181 183 }
182 184 break;
183 185
184 186 /* Give some timezone playing room */
185 187 #define ONEWEEK (7 * 24 * 60 * 60)
186 188
187 189 case SGMTL:
188 190 /*
189 191 * Called from 32 bit land, negative values
190 192 * are not sign extended, so we do that here
191 193 * by casting it to an int and back. We also
192 194 * clamp the value to within reason and detect
193 195 * when a 64 bit call overflows an int.
194 196 */
195 197 if ((error = secpolicy_settime(CRED())) == 0) {
196 198 int newlag = (int)arg1;
197 199
198 200 #ifdef _SYSCALL32_IMPL
199 201 if (get_udatamodel() == DATAMODEL_NATIVE &&
200 202 (long)newlag != (long)arg1) {
201 203 error = EOVERFLOW;
202 204 } else
203 205 #endif
204 206 if (newlag >= -ONEWEEK && newlag <= ONEWEEK)
205 207 sgmtl(newlag);
206 208 else
207 209 error = EOVERFLOW;
208 210 }
209 211 break;
210 212
211 213 case GGMTL:
212 214 if (get_udatamodel() == DATAMODEL_NATIVE) {
213 215 if (sulword((void *)arg1, ggmtl()) == -1)
214 216 error = EFAULT;
215 217 #ifdef _SYSCALL32_IMPL
216 218 } else {
217 219 time_t gmtl;
218 220
219 221 if ((gmtl = ggmtl()) > INT32_MAX) {
220 222 /*
221 223 * Since gmt_lag can at most be
222 224 * +/- 12 hours, something is
223 225 * *seriously* messed up here.
224 226 */
225 227 error = EOVERFLOW;
226 228 } else if (suword32((void *)arg1, (int32_t)gmtl) == -1)
227 229 error = EFAULT;
228 230 #endif
229 231 }
230 232 break;
231 233
232 234 case RTCSYNC:
233 235 if ((error = secpolicy_settime(CRED())) == 0)
234 236 rtcsync();
235 237 break;
236 238
237 239 /* END OF real time clock management commands */
238 240
239 241 default:
240 242 error = EINVAL;
241 243 break;
242 244 }
243 245 return (error == 0 ? 0 : set_errno(error));
244 246 }
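
For illustration of the SI86DSCR path above, a minimal userland sketch follows. It is not part of this change: the helper name and values are hypothetical, and it assumes the sysi86(2) prototype, struct ssd, and the SDT_/SEL_ constants are visible via <sys/sysi86.h> and <sys/segments.h>.

#include <sys/types.h>
#include <sys/sysi86.h>
#include <sys/segments.h>

/*
 * Hypothetical 32-bit caller: install a present, ring-3, read/write data
 * segment at LDT index 'seli' (setdscr() requires LDT_UDBASE <= seli and
 * seli < MAXNLDT).  The acc1/acc2 packing mirrors ssd_to_usd():
 * type | dpl<<5 | p<<7, and avl | db<<2 | gran<<3.
 */
static int
install_ldt_data_seg(uint_t seli, uint_t base, uint_t limit)
{
	struct ssd ssd;

	ssd.sel = (seli << 3) | 0x4 | 0x3;	/* TI = LDT, RPL = 3 */
	ssd.bo = base;				/* segment base */
	ssd.ls = limit;				/* segment limit */
	ssd.acc1 = SDT_MEMRWA | (SEL_UPL << 5) | (1 << 7);
	ssd.acc2 = 1 << 2;			/* 32-bit default operand size */

	return (sysi86(SI86DSCR, &ssd));	/* 0, or -1 with errno set */
}
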
245 247
246 248 void
247 249 usd_to_ssd(user_desc_t *usd, struct ssd *ssd, selector_t sel)
248 250 {
249 251 ssd->bo = USEGD_GETBASE(usd);
250 252 ssd->ls = USEGD_GETLIMIT(usd);
251 253 ssd->sel = sel;
252 254
253 255 /*
254 256 * set type, dpl and present bits.
255 257 */
256 258 ssd->acc1 = usd->usd_type;
257 259 ssd->acc1 |= usd->usd_dpl << 5;
258 260 ssd->acc1 |= usd->usd_p << (5 + 2);
259 261
260 262 /*
261 263 * set avl, DB and granularity bits.
262 264 */
263 265 ssd->acc2 = usd->usd_avl;
264 266
265 267 #if defined(__amd64)
266 268 ssd->acc2 |= usd->usd_long << 1;
267 269 #else
268 270 ssd->acc2 |= usd->usd_reserved << 1;
269 271 #endif
270 272
271 273 ssd->acc2 |= usd->usd_def32 << (1 + 1);
272 274 ssd->acc2 |= usd->usd_gran << (1 + 1 + 1);
273 275 }
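
As a worked example of the packing above (standard x86 descriptor encodings, shown here only for reference): a present, DPL-3, read/write data segment (SDT_MEMRWA == 0x13) with a 32-bit default operand size and page granularity would carry the following access words:

/*
 *   acc1 = type | (dpl << 5) | (p << 7)
 *        = 0x13 | (3 << 5)   | (1 << 7)    = 0xf3
 *
 *   acc2 = avl | (long << 1) | (def32 << 2) | (gran << 3)
 *        = 0   | (0 << 1)    | (1 << 2)     | (1 << 3)     = 0x0c
 */
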
274 276
275 277 static void
276 278 ssd_to_usd(struct ssd *ssd, user_desc_t *usd)
277 279 {
278 280
279 281 ASSERT(bcmp(usd, &null_udesc, sizeof (*usd)) == 0);
280 282
281 283 USEGD_SETBASE(usd, ssd->bo);
282 284 USEGD_SETLIMIT(usd, ssd->ls);
283 285
284 286 /*
285 287 * set type, dpl and present bits.
286 288 */
287 289 usd->usd_type = ssd->acc1;
288 290 usd->usd_dpl = ssd->acc1 >> 5;
289 291 usd->usd_p = ssd->acc1 >> (5 + 2);
290 292
291 293 ASSERT(usd->usd_type >= SDT_MEMRO);
292 294 ASSERT(usd->usd_dpl == SEL_UPL);
293 295
294 296 /*
295 297 * 64-bit code selectors are never allowed in the LDT.
296 298 * Reserved bit is always 0 on 32-bit systems.
297 299 */
298 300 #if defined(__amd64)
299 301 usd->usd_long = 0;
300 302 #else
301 303 usd->usd_reserved = 0;
302 304 #endif
303 305
304 306 /*
305 307 * set avl, DB and granularity bits.
306 308 */
307 309 usd->usd_avl = ssd->acc2;
308 310 usd->usd_def32 = ssd->acc2 >> (1 + 1);
309 311 usd->usd_gran = ssd->acc2 >> (1 + 1 + 1);
310 312 }
311 313
312 314
313 315 #if defined(__i386)
314 316
315 317 static void
316 318 ssd_to_sgd(struct ssd *ssd, gate_desc_t *sgd)
317 319 {
318 320
319 321 ASSERT(bcmp(sgd, &null_sdesc, sizeof (*sgd)) == 0);
320 322
321 323 sgd->sgd_looffset = ssd->bo;
322 324 sgd->sgd_hioffset = ssd->bo >> 16;
323 325
324 326 sgd->sgd_selector = ssd->ls;
325 327
326 328 /*
327 329 * set type, dpl and present bits.
328 330 */
329 331 sgd->sgd_type = ssd->acc1;
330 332 sgd->sgd_dpl = ssd->acc1 >> 5;
331 333 sgd->sgd_p = ssd->acc1 >> 7;
332 334 ASSERT(sgd->sgd_type == SDT_SYSCGT);
333 335 ASSERT(sgd->sgd_dpl == SEL_UPL);
334 336 sgd->sgd_stkcpy = 0;
335 337 }
336 338
337 339 #endif /* __i386 */
338 340
339 341 /*
340 342 * Load LDT register with the current process's LDT.
341 343 */
342 344 static void
343 345 ldt_load(void)
344 346 {
345 347 #if defined(__xpv)
346 348 xen_set_ldt(get_ssd_base(&curproc->p_ldt_desc),
347 349 curproc->p_ldtlimit + 1);
348 350 #else
349 - *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = curproc->p_ldt_desc;
351 + size_t len;
352 + system_desc_t desc;
353 +
354 + /*
355 + * Before we can use the LDT on this CPU, we must install the LDT in the
356 + * user mapping table.
357 + */
358 + len = (curproc->p_ldtlimit + 1) * sizeof (user_desc_t);
359 + bcopy(curproc->p_ldt, CPU->cpu_m.mcpu_ldt, len);
360 + CPU->cpu_m.mcpu_ldt_len = len;
361 + set_syssegd(&desc, CPU->cpu_m.mcpu_ldt, len - 1, SDT_SYSLDT, SEL_KPL);
362 + *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = desc;
363 +
350 364 wr_ldtr(ULDT_SEL);
351 365 #endif
352 366 }
353 367
354 368 /*
355 369 * Store a NULL selector in the LDTR. All subsequent illegal references to
356 370 * the LDT will result in a #gp.
357 371 */
358 372 void
359 373 ldt_unload(void)
360 374 {
361 375 #if defined(__xpv)
362 376 xen_set_ldt(NULL, 0);
363 377 #else
364 378 *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = null_sdesc;
365 379 wr_ldtr(0);
380 +
381 + bzero(CPU->cpu_m.mcpu_ldt, CPU->cpu_m.mcpu_ldt_len);
382 + CPU->cpu_m.mcpu_ldt_len = 0;
366 383 #endif
367 384 }
368 385
369 386 /*ARGSUSED*/
370 387 static void
371 388 ldt_savectx(proc_t *p)
372 389 {
373 390 ASSERT(p->p_ldt != NULL);
374 391 ASSERT(p == curproc);
375 392
376 393 #if defined(__amd64)
377 394 /*
378 395 * The 64-bit kernel must be sure to clear any stale ldt
379 396 * selectors when context switching away from a process that
380 397 * has a private ldt. Consider the following example:
381 398 *
382 399 * Wine creates an ldt descriptor and points a segment register
383 400 * to it.
384 401 *
385 402 * We then context switch away from wine lwp to kernel
386 403 * thread and hit breakpoint in kernel with kmdb
387 404 *
388 405 * When we continue and resume from kmdb we will #gp
389 406 * fault since kmdb will have saved the stale ldt selector
390 407 * from wine and will try to restore it but we are no longer in
391 408 * the context of the wine process and do not have our
392 409 * ldtr register pointing to the private ldt.
393 410 */
394 411 reset_sregs();
395 412 #endif
396 413
397 414 ldt_unload();
398 415 cpu_fast_syscall_enable(NULL);
399 416 }
400 417
401 418 static void
402 419 ldt_restorectx(proc_t *p)
403 420 {
404 421 ASSERT(p->p_ldt != NULL);
405 422 ASSERT(p == curproc);
406 423
407 424 ldt_load();
408 425 cpu_fast_syscall_disable(NULL);
409 426 }
410 427
411 428 /*
412 429 * When a process with a private LDT execs, fast syscalls must be enabled for
413 430 * the new process image.
414 431 */
415 432 /* ARGSUSED */
416 433 static void
417 434 ldt_freectx(proc_t *p, int isexec)
418 435 {
419 436 ASSERT(p->p_ldt);
420 437
421 438 if (isexec) {
422 439 kpreempt_disable();
423 440 cpu_fast_syscall_enable(NULL);
424 441 kpreempt_enable();
425 442 }
426 443
427 444 /*
428 445 * ldt_free() will free the memory used by the private LDT, reset the
429 446 * process's descriptor, and re-program the LDTR.
430 447 */
431 448 ldt_free(p);
432 449 }
433 450
434 451 /*
435 452 * Install ctx op that ensures syscall/sysenter are disabled.
436 453 * See comments below.
437 454 *
438 455 * When a thread with a private LDT forks, the new process
439 456 * must have the LDT context ops installed.
440 457 */
441 458 /* ARGSUSED */
442 459 static void
443 460 ldt_installctx(proc_t *p, proc_t *cp)
444 461 {
445 462 proc_t *targ = p;
446 463 kthread_t *t;
447 464
448 465 /*
449 466 * If this is a fork, operate on the child process.
450 467 */
451 468 if (cp != NULL) {
452 469 targ = cp;
453 470 ldt_dup(p, cp);
454 471 }
455 472
456 473 /*
457 474 * The process context ops expect the target process as their argument.
458 475 */
459 476 ASSERT(removepctx(targ, targ, ldt_savectx, ldt_restorectx,
460 477 ldt_installctx, ldt_savectx, ldt_freectx) == 0);
461 478
462 479 installpctx(targ, targ, ldt_savectx, ldt_restorectx,
463 480 ldt_installctx, ldt_savectx, ldt_freectx);
464 481
465 482 /*
466 483 * We've just disabled fast system call and return instructions; take
467 484 * the slow path out to make sure we don't try to use one to return
468 485 * back to user. We must set t_post_sys for every thread in the
469 486 * process to make sure none of them escape out via fast return.
470 487 */
471 488
472 489 mutex_enter(&targ->p_lock);
473 490 t = targ->p_tlist;
474 491 do {
475 492 t->t_post_sys = 1;
476 493 } while ((t = t->t_forw) != targ->p_tlist);
477 494 mutex_exit(&targ->p_lock);
478 495 }
479 496
480 497 int
481 498 setdscr(struct ssd *ssd)
482 499 {
483 500 ushort_t seli; /* selector index */
484 501 user_desc_t *ldp; /* descriptor pointer */
485 502 user_desc_t ndesc; /* new descriptor */
486 503 proc_t *pp = ttoproc(curthread);
487 504 int rc = 0;
488 505
489 506 /*
490 507 * LDT segments: executable and data at DPL 3 only.
491 508 */
492 509 if (!SELISLDT(ssd->sel) || !SELISUPL(ssd->sel))
493 510 return (EINVAL);
494 511
495 512 /*
496 513 * check the selector index.
497 514 */
498 515 seli = SELTOIDX(ssd->sel);
499 516 if (seli >= MAXNLDT || seli < LDT_UDBASE)
500 517 return (EINVAL);
501 518
502 519 ndesc = null_udesc;
503 520 mutex_enter(&pp->p_ldtlock);
504 521
505 522 /*
506 523 * If this is the first time for this process then setup a
507 524 * private LDT for it.
508 525 */
509 526 if (pp->p_ldt == NULL) {
510 527 ldt_alloc(pp, seli);
511 528
512 529 /*
513 530 * Now that this process has a private LDT, the use of
514 531 * the syscall/sysret and sysenter/sysexit instructions
515 532 * is forbidden for this process because they destroy
516 533 * the contents of %cs and %ss segment registers.
517 534 *
518 535 * Explicitly disable them here and add a context handler
519 536 * to the process. Note that disabling
520 537 * them here means we can't use sysret or sysexit on
521 538 * the way out of this system call - so we force this
522 539 * thread to take the slow path (which doesn't make use
523 540 * of sysenter or sysexit) back out.
524 541 */
525 542 kpreempt_disable();
526 543 ldt_installctx(pp, NULL);
527 544 cpu_fast_syscall_disable(NULL);
528 545 ASSERT(curthread->t_post_sys != 0);
529 546 kpreempt_enable();
530 547
531 548 } else if (seli > pp->p_ldtlimit) {
532 549
533 550 /*
534 551 * Increase size of ldt to include seli.
535 552 */
536 553 ldt_grow(pp, seli);
537 554 }
538 555
539 556 ASSERT(seli <= pp->p_ldtlimit);
540 557 ldp = &pp->p_ldt[seli];
541 558
542 559 /*
543 560 * On the 64-bit kernel, this is where things get more subtle.
544 561 * Recall that in the 64-bit kernel, when we enter the kernel we
545 562 * deliberately -don't- reload the segment selectors we came in on
546 563 * for %ds, %es, %fs or %gs. Messing with selectors is expensive,
547 564 * and the underlying descriptors are essentially ignored by the
548 565 * hardware in long mode - except for the base that we override with
549 566 * the gsbase MSRs.
550 567 *
551 568 * However, there's one unfortunate issue with this rosy picture --
552 569 * a descriptor that's not marked as 'present' will still generate
553 570 * an #np when loading a segment register.
554 571 *
555 572 * Consider this case. An lwp creates a harmless LDT entry, points
556 573 * one of its segment registers at it, then tells the kernel (here)
557 574 * to delete it. In the 32-bit kernel, the #np will happen on the
558 575 * way back to userland where we reload the segment registers, and be
559 576 * handled in kern_gpfault(). In the 64-bit kernel, the same thing
560 577 * will happen in the normal case too. However, if we're trying to
561 578 * use a debugger that wants to save and restore the segment registers,
562 579 * and the debugger thinks that we have valid segment registers, we
563 580 * have the problem that the debugger will try and restore the
564 581 * segment register that points at the now 'not present' descriptor
565 582 * and will take a #np right there.
566 583 *
567 584 * We should obviously fix the debugger to be paranoid about
568 585 * -not- restoring segment registers that point to bad descriptors;
569 586 * however we can prevent the problem here if we check to see if any
570 587 * of the segment registers are still pointing at the thing we're
571 588 * destroying; if they are, return an error instead. (That also seems
572 589 * a lot better failure mode than SIGKILL and a core file
573 590 * from kern_gpfault() too.)
574 591 */
575 592 if (SI86SSD_PRES(ssd) == 0) {
576 593 kthread_t *t;
577 594 int bad = 0;
578 595
579 596 /*
580 597 * Look carefully at the segment registers of every lwp
581 598 * in the process (they're all stopped by our caller).
582 599 * If we're about to invalidate a descriptor that's still
583 600 * being referenced by *any* of them, return an error,
584 601 * rather than having them #gp on their way out of the kernel.
585 602 */
586 603 ASSERT(pp->p_lwprcnt == 1);
587 604
588 605 mutex_enter(&pp->p_lock);
589 606 t = pp->p_tlist;
590 607 do {
591 608 klwp_t *lwp = ttolwp(t);
592 609 struct regs *rp = lwp->lwp_regs;
593 610 #if defined(__amd64)
594 611 pcb_t *pcb = &lwp->lwp_pcb;
595 612 #endif
596 613
597 614 if (ssd->sel == rp->r_cs || ssd->sel == rp->r_ss) {
598 615 bad = 1;
599 616 break;
600 617 }
601 618
602 619 #if defined(__amd64)
603 620 if (pcb->pcb_rupdate == 1) {
604 621 if (ssd->sel == pcb->pcb_ds ||
605 622 ssd->sel == pcb->pcb_es ||
606 623 ssd->sel == pcb->pcb_fs ||
607 624 ssd->sel == pcb->pcb_gs) {
608 625 bad = 1;
609 626 break;
610 627 }
611 628 } else
612 629 #endif
613 630 {
614 631 if (ssd->sel == rp->r_ds ||
615 632 ssd->sel == rp->r_es ||
616 633 ssd->sel == rp->r_fs ||
617 634 ssd->sel == rp->r_gs) {
618 635 bad = 1;
619 636 break;
620 637 }
621 638 }
622 639
623 640 } while ((t = t->t_forw) != pp->p_tlist);
624 641 mutex_exit(&pp->p_lock);
625 642
626 643 if (bad) {
627 644 mutex_exit(&pp->p_ldtlock);
628 645 return (EBUSY);
629 646 }
630 647 }
631 648
632 649 /*
633 650 * If acc1 is zero, clear the descriptor (including the 'present' bit)
634 651 */
635 652 if (ssd->acc1 == 0) {
636 653 rc = ldt_update_segd(ldp, &null_udesc);
637 654 mutex_exit(&pp->p_ldtlock);
638 655 return (rc);
639 656 }
640 657
641 658 /*
642 659 * Check segment type, allow segment not present and
643 660 * only user DPL (3).
644 661 */
645 662 if (SI86SSD_DPL(ssd) != SEL_UPL) {
646 663 mutex_exit(&pp->p_ldtlock);
647 664 return (EINVAL);
648 665 }
649 666
650 667 #if defined(__amd64)
651 668 /*
652 669 * Do not allow 32-bit applications to create 64-bit mode code
653 670 * segments.
654 671 */
655 672 if (SI86SSD_ISUSEG(ssd) && ((SI86SSD_TYPE(ssd) >> 3) & 1) == 1 &&
656 673 SI86SSD_ISLONG(ssd)) {
657 674 mutex_exit(&pp->p_ldtlock);
658 675 return (EINVAL);
659 676 }
660 677 #endif /* __amd64 */
661 678
662 679 /*
663 680 * Set up a code or data user segment descriptor.
664 681 */
665 682 if (SI86SSD_ISUSEG(ssd)) {
666 683 ssd_to_usd(ssd, &ndesc);
667 684 rc = ldt_update_segd(ldp, &ndesc);
668 685 mutex_exit(&pp->p_ldtlock);
669 686 return (rc);
670 687 }
671 688
672 689 #if defined(__i386)
673 690 /*
674 691 * Allow a call gate only if the destination is in the LDT
675 692 * and the system is running in 32-bit legacy mode.
676 693 *
677 694 * In long mode 32-bit call gates are redefined as 64-bit call
678 695 * gates and the hw enforces that the target code selector
679 696 * of the call gate must be a 64-bit selector. A #gp fault is
680 697 * generated otherwise. Since we do not allow 32-bit processes
681 698 * to switch themselves to 64-bit mode, we never allow call gates
682 699 * on a 64-bit system.
683 700 */
684 701 if (SI86SSD_TYPE(ssd) == SDT_SYSCGT && SELISLDT(ssd->ls)) {
685 702
686 703
687 704 ssd_to_sgd(ssd, (gate_desc_t *)&ndesc);
688 705 rc = ldt_update_segd(ldp, &ndesc);
689 706 mutex_exit(&pp->p_ldtlock);
690 707 return (rc);
691 708 }
692 709 #endif /* __i386 */
693 710
694 711 mutex_exit(&pp->p_ldtlock);
695 712 return (EINVAL);
696 713 }
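
To make the 64-bit code-segment check in setdscr() concrete, a worked example follows (the type values are the standard SDT_ encodings; SDT_MEMERA is used here only as an illustration and is not referenced by this change):

/*
 * User memory segment types occupy 16-31, and bit 3 of the type separates
 * data from code, which is what (SI86SSD_TYPE(ssd) >> 3) & 1 tests:
 *
 *   SDT_MEMRWA = 19 = 0b10011:  (19 >> 3) & 1 == 0  -> data, allowed
 *   SDT_MEMERA = 27 = 0b11011:  (27 >> 3) & 1 == 1  -> code; together
 *                               with SI86SSD_ISLONG(ssd) this is rejected
 *                               with EINVAL.
 */
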
697 714
698 715 /*
699 716 * Allocate new LDT for process just large enough to contain seli.
700 717 * Note we allocate and grow LDT in PAGESIZE chunks. We do this
701 718 * to simplify the implementation and because on the hypervisor it's
702 719 * required, since the LDT must live on pages that have PROT_WRITE
703 720 * removed and which are given to the hypervisor.
704 721 */
705 722 static void
706 723 ldt_alloc(proc_t *pp, uint_t seli)
707 724 {
708 725 user_desc_t *ldt;
709 726 size_t ldtsz;
710 727 uint_t nsels;
711 728
712 729 ASSERT(MUTEX_HELD(&pp->p_ldtlock));
713 730 ASSERT(pp->p_ldt == NULL);
714 731 ASSERT(pp->p_ldtlimit == 0);
715 732
716 733 /*
717 - * Allocate new LDT just large enough to contain seli.
734 + * Allocate new LDT just large enough to contain seli. The LDT must
735 + * always be allocated in units of pages for KPTI.
718 736 */
719 737 ldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
720 738 nsels = ldtsz / sizeof (user_desc_t);
721 739 ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
722 740
723 741 ldt = kmem_zalloc(ldtsz, KM_SLEEP);
724 742 ASSERT(IS_P2ALIGNED(ldt, PAGESIZE));
725 743
726 744 #if defined(__xpv)
727 745 if (xen_ldt_setprot(ldt, ldtsz, PROT_READ))
728 746 panic("ldt_alloc:xen_ldt_setprot(PROT_READ) failed");
729 747 #endif
730 748
731 749 pp->p_ldt = ldt;
732 750 pp->p_ldtlimit = nsels - 1;
733 751 set_syssegd(&pp->p_ldt_desc, ldt, ldtsz - 1, SDT_SYSLDT, SEL_KPL);
734 752
735 753 if (pp == curproc) {
736 754 kpreempt_disable();
737 755 ldt_load();
738 756 kpreempt_enable();
739 757 }
740 758 }
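
A worked example of the page-sized LDT allocation above, assuming 4 KB pages and the 8-byte user_desc_t:

/*
 *   seli = 100:  (100 + 1) * sizeof (user_desc_t) = 808 bytes
 *                ldtsz = P2ROUNDUP(808, PAGESIZE) = 4096
 *                nsels = 4096 / 8 = 512, so p_ldtlimit becomes 511.
 *
 *   A later SI86DSCR call with seli = 600 exceeds that limit, so setdscr()
 *   calls ldt_grow(), which rounds up to the next page (8192 bytes,
 *   1024 entries).
 */
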
741 759
742 760 static void
743 761 ldt_free(proc_t *pp)
744 762 {
745 763 user_desc_t *ldt;
746 764 size_t ldtsz;
747 765
748 766 ASSERT(pp->p_ldt != NULL);
749 767
750 768 mutex_enter(&pp->p_ldtlock);
751 769 ldt = pp->p_ldt;
752 770 ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
753 771
754 772 ASSERT(IS_P2ALIGNED(ldtsz, PAGESIZE));
755 773
756 774 pp->p_ldt = NULL;
757 775 pp->p_ldtlimit = 0;
758 776 pp->p_ldt_desc = null_sdesc;
759 777 mutex_exit(&pp->p_ldtlock);
760 778
761 779 if (pp == curproc) {
762 780 kpreempt_disable();
763 781 ldt_unload();
764 782 kpreempt_enable();
765 783 }
766 784
767 785 #if defined(__xpv)
768 786 /*
769 787 * We are not allowed to make the ldt writable until after
770 788 * we tell the hypervisor to unload it.
771 789 */
772 790 if (xen_ldt_setprot(ldt, ldtsz, PROT_READ | PROT_WRITE))
773 791 panic("ldt_free:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
774 792 #endif
775 793
776 794 kmem_free(ldt, ldtsz);
777 795 }
778 796
779 797 /*
780 798 * On fork copy new ldt for child.
781 799 */
782 800 static void
783 801 ldt_dup(proc_t *pp, proc_t *cp)
784 802 {
785 803 size_t ldtsz;
786 804
787 805 ASSERT(pp->p_ldt != NULL);
788 806 ASSERT(cp != curproc);
789 807
790 808 /*
791 809 * I assume the parent's ldt can't increase since we're in a fork.
792 810 */
793 811 mutex_enter(&pp->p_ldtlock);
794 812 mutex_enter(&cp->p_ldtlock);
795 813
796 814 ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
797 815
798 816 ldt_alloc(cp, pp->p_ldtlimit);
799 817
800 818 #if defined(__xpv)
801 819 /*
802 820 * Make child's ldt writable so it can be copied into from
803 821 * parent's ldt. This works since ldt_alloc above did not load
804 822 * the ldt since it's for the child process. If we tried to make
805 823 * an LDT writable that is loaded in hw the setprot operation
806 824 * would fail.
807 825 */
808 826 if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ | PROT_WRITE))
809 827 panic("ldt_dup:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
810 828 #endif
811 829
812 830 bcopy(pp->p_ldt, cp->p_ldt, ldtsz);
813 831
814 832 #if defined(__xpv)
815 833 if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ))
816 834 panic("ldt_dup:xen_ldt_setprot(PROT_READ) failed");
817 835 #endif
818 836 mutex_exit(&cp->p_ldtlock);
819 837 mutex_exit(&pp->p_ldtlock);
820 838
821 839 }
822 840
823 841 static void
824 842 ldt_grow(proc_t *pp, uint_t seli)
825 843 {
826 844 user_desc_t *oldt, *nldt;
827 845 uint_t nsels;
828 846 size_t oldtsz, nldtsz;
829 847
830 848 ASSERT(MUTEX_HELD(&pp->p_ldtlock));
831 849 ASSERT(pp->p_ldt != NULL);
832 850 ASSERT(pp->p_ldtlimit != 0);
833 851
834 852 /*
835 - * Allocate larger LDT just large enough to contain seli.
853 + * Allocate larger LDT just large enough to contain seli. The LDT must
854 + * always be allocated in units of pages for KPTI.
836 855 */
837 856 nldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
838 857 nsels = nldtsz / sizeof (user_desc_t);
839 858 ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
840 859 ASSERT(nsels > pp->p_ldtlimit);
841 860
842 861 oldt = pp->p_ldt;
843 862 oldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
844 863
845 864 nldt = kmem_zalloc(nldtsz, KM_SLEEP);
846 865 ASSERT(IS_P2ALIGNED(nldt, PAGESIZE));
847 866
848 867 bcopy(oldt, nldt, oldtsz);
849 868
850 869 /*
851 870 * unload old ldt.
852 871 */
853 872 kpreempt_disable();
854 873 ldt_unload();
855 874 kpreempt_enable();
856 875
857 876 #if defined(__xpv)
858 877
859 878 /*
860 879 * Make old ldt writable and new ldt read only.
861 880 */
862 881 if (xen_ldt_setprot(oldt, oldtsz, PROT_READ | PROT_WRITE))
863 882 panic("ldt_grow:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
864 883
865 884 if (xen_ldt_setprot(nldt, nldtsz, PROT_READ))
866 885 panic("ldt_grow:xen_ldt_setprot(PROT_READ) failed");
867 886 #endif
868 887
869 888 pp->p_ldt = nldt;
870 889 pp->p_ldtlimit = nsels - 1;
871 890
872 891 /*
873 892 * write new ldt segment descriptor.
874 893 */
875 894 set_syssegd(&pp->p_ldt_desc, nldt, nldtsz - 1, SDT_SYSLDT, SEL_KPL);
876 895
877 896 /*
878 897 * load the new ldt.
879 898 */
880 899 kpreempt_disable();
881 900 ldt_load();
882 901 kpreempt_enable();
883 902
884 903 kmem_free(oldt, oldtsz);
885 904 }