/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2018 Joyent, Inc.
 */

/* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
/* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
/*   All Rights Reserved */

/* Copyright (c) 1987, 1988 Microsoft Corporation */
/*   All Rights Reserved */
#include <sys/param.h>
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/errno.h>
#include <sys/fault.h>
#include <sys/syscall.h>
#include <sys/cpuvar.h>
#include <sys/sysi86.h>
#include <sys/psw.h>
#include <sys/cred.h>
#include <sys/policy.h>
#include <sys/thread.h>
#include <sys/debug.h>
#include <sys/ontrap.h>
#include <sys/privregs.h>
#include <sys/x86_archext.h>
#include <sys/vmem.h>
#include <sys/kmem.h>
#include <sys/mman.h>
#include <sys/archsystm.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/faultcode.h>
#include <sys/fp.h>
#include <sys/cmn_err.h>
#include <sys/segments.h>
#include <sys/clock.h>
#include <vm/hat_i86.h>
#if defined(__xpv)
#include <sys/hypervisor.h>
#include <sys/note.h>
#endif

static void ldt_alloc(proc_t *, uint_t);
static void ldt_free(proc_t *);
static void ldt_dup(proc_t *, proc_t *);
static void ldt_grow(proc_t *, uint_t);

/*
 * sysi86 System Call
 */

/* ARGSUSED */
int
sysi86(short cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
{
	struct ssd ssd;
	int error = 0;
	int c;
	proc_t *pp = curproc;

	switch (cmd) {

	/*
	 * The SI86V86 subsystem call of the SYSI86 system call
	 * supports only one subcode -- V86SC_IOPL.
	 */
	case SI86V86:
		if (arg1 == V86SC_IOPL) {
			struct regs *rp = lwptoregs(ttolwp(curthread));
			greg_t oldpl = rp->r_ps & PS_IOPL;
			greg_t newpl = arg2 & PS_IOPL;

			/*
			 * Must be privileged to run this system call
			 * if asking for more I/O privilege.
			 */
			if (newpl > oldpl && (error =
			    secpolicy_sys_config(CRED(), B_FALSE)) != 0)
				return (set_errno(error));
#if defined(__xpv)
			kpreempt_disable();
			installctx(curthread, NULL, xen_disable_user_iopl,
			    xen_enable_user_iopl, NULL, NULL,
			    xen_disable_user_iopl, NULL);
			xen_enable_user_iopl();
			kpreempt_enable();
#else
			rp->r_ps ^= oldpl ^ newpl;
#endif
		} else
			error = EINVAL;
		break;

	/*
	 * Set a segment descriptor
	 */
	case SI86DSCR:
		/*
		 * There are considerable problems here manipulating
		 * resources shared by many running lwps. Get everyone
		 * into a safe state before changing the LDT.
		 */
		if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK1)) {
			error = EINTR;
			break;
		}

		if (get_udatamodel() == DATAMODEL_LP64) {
			error = EINVAL;
			break;
		}

		if (copyin((caddr_t)arg1, &ssd, sizeof (ssd)) < 0) {
			error = EFAULT;
			break;
		}

		error = setdscr(&ssd);

		mutex_enter(&pp->p_lock);
		if (curthread != pp->p_agenttp)
			continuelwps(pp);
		mutex_exit(&pp->p_lock);
		break;

	case SI86FPHW:
		c = fp_kind & 0xff;
		if (suword32((void *)arg1, c) == -1)
			error = EFAULT;
		break;

	case SI86FPSTART:
		/*
		 * arg1 is the address of _fp_hw
		 * arg2 is the desired x87 FCW value
		 * arg3 is the desired SSE MXCSR value
		 * a return value of one means SSE hardware, else none.
		 */
		c = fp_kind & 0xff;
		if (suword32((void *)arg1, c) == -1) {
			error = EFAULT;
			break;
		}
		fpsetcw((uint16_t)arg2, (uint32_t)arg3);
		return ((fp_kind & __FP_SSE) ? 1 : 0);

	/* real time clock management commands */

	case WTODC:
		if ((error = secpolicy_settime(CRED())) == 0) {
			timestruc_t ts;

			mutex_enter(&tod_lock);
			gethrestime(&ts);
			tod_set(ts);
			mutex_exit(&tod_lock);
		}
		break;

/* Give some timezone playing room */
#define	ONEWEEK	(7 * 24 * 60 * 60)

	case SGMTL:
		/*
		 * Called from 32 bit land, negative values
		 * are not sign extended, so we do that here
		 * by casting it to an int and back.  We also
		 * clamp the value to within reason and detect
		 * when a 64 bit call overflows an int.
		 */
		if ((error = secpolicy_settime(CRED())) == 0) {
			int newlag = (int)arg1;

#ifdef _SYSCALL32_IMPL
			if (get_udatamodel() == DATAMODEL_NATIVE &&
			    (long)newlag != (long)arg1) {
				error = EOVERFLOW;
			} else
#endif
			if (newlag >= -ONEWEEK && newlag <= ONEWEEK)
				sgmtl(newlag);
			else
				error = EOVERFLOW;
		}
		break;

	case GGMTL:
		if (get_udatamodel() == DATAMODEL_NATIVE) {
			if (sulword((void *)arg1, ggmtl()) == -1)
				error = EFAULT;
#ifdef _SYSCALL32_IMPL
		} else {
			time_t gmtl;

			if ((gmtl = ggmtl()) > INT32_MAX) {
				/*
				 * Since gmt_lag can at most be
				 * +/- 12 hours, something is
				 * *seriously* messed up here.
				 */
				error = EOVERFLOW;
			} else if (suword32((void *)arg1, (int32_t)gmtl) == -1)
				error = EFAULT;
#endif
		}
		break;

	case RTCSYNC:
		if ((error = secpolicy_settime(CRED())) == 0)
			rtcsync();
		break;

	/* END OF real time clock management commands */

	default:
		error = EINVAL;
		break;
	}
	return (error == 0 ? 0 : set_errno(error));
}

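/*
 * Convert a hardware user segment descriptor (user_desc_t) and its selector
 * into the struct ssd form used by the sysi86(SI86DSCR) interface.
 */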
void
usd_to_ssd(user_desc_t *usd, struct ssd *ssd, selector_t sel)
{
	ssd->bo = USEGD_GETBASE(usd);
	ssd->ls = USEGD_GETLIMIT(usd);
	ssd->sel = sel;

	/*
	 * set type, dpl and present bits.
	 */
	ssd->acc1 = usd->usd_type;
	ssd->acc1 |= usd->usd_dpl << 5;
	ssd->acc1 |= usd->usd_p << (5 + 2);

	/*
	 * set avl, DB and granularity bits.
	 */
	ssd->acc2 = usd->usd_avl;

#if defined(__amd64)
	ssd->acc2 |= usd->usd_long << 1;
#else
	ssd->acc2 |= usd->usd_reserved << 1;
#endif

	ssd->acc2 |= usd->usd_def32 << (1 + 1);
	ssd->acc2 |= usd->usd_gran << (1 + 1 + 1);
}

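/*
 * Convert a struct ssd supplied through sysi86(SI86DSCR) back into a
 * hardware user segment descriptor (user_desc_t).
 */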
static void
ssd_to_usd(struct ssd *ssd, user_desc_t *usd)
{

	ASSERT(bcmp(usd, &null_udesc, sizeof (*usd)) == 0);

	USEGD_SETBASE(usd, ssd->bo);
	USEGD_SETLIMIT(usd, ssd->ls);

	/*
	 * set type, dpl and present bits.
	 */
	usd->usd_type = ssd->acc1;
	usd->usd_dpl = ssd->acc1 >> 5;
	usd->usd_p = ssd->acc1 >> (5 + 2);

	ASSERT(usd->usd_type >= SDT_MEMRO);
	ASSERT(usd->usd_dpl == SEL_UPL);

	/*
	 * 64-bit code selectors are never allowed in the LDT.
	 * Reserved bit is always 0 on 32-bit systems.
	 */
#if defined(__amd64)
	usd->usd_long = 0;
#else
	usd->usd_reserved = 0;
#endif

	/*
	 * set avl, DB and granularity bits.
	 */
	usd->usd_avl = ssd->acc2;
	usd->usd_def32 = ssd->acc2 >> (1 + 1);
	usd->usd_gran = ssd->acc2 >> (1 + 1 + 1);
}

#if defined(__i386)

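/*
 * Convert a struct ssd describing a call gate into a hardware gate
 * descriptor (gate_desc_t).  Only used for LDT call gates on the 32-bit
 * kernel.
 */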
static void
ssd_to_sgd(struct ssd *ssd, gate_desc_t *sgd)
{

	ASSERT(bcmp(sgd, &null_sdesc, sizeof (*sgd)) == 0);

	sgd->sgd_looffset = ssd->bo;
	sgd->sgd_hioffset = ssd->bo >> 16;

	sgd->sgd_selector = ssd->ls;

	/*
	 * set type, dpl and present bits.
	 */
	sgd->sgd_type = ssd->acc1;
	sgd->sgd_dpl = ssd->acc1 >> 5;
	sgd->sgd_p = ssd->acc1 >> 7;
	ASSERT(sgd->sgd_type == SDT_SYSCGT);
	ASSERT(sgd->sgd_dpl == SEL_UPL);
	sgd->sgd_stkcpy = 0;
}

#endif	/* __i386 */

/*
 * Load LDT register with the current process's LDT.
 */
static void
ldt_load(void)
{
#if defined(__xpv)
	xen_set_ldt(get_ssd_base(&curproc->p_ldt_desc),
	    curproc->p_ldtlimit + 1);
#else
	size_t len;
	system_desc_t desc;

	/*
	 * Before we can use the LDT on this CPU, we must install the LDT in
	 * the user mapping table.
	 */
	len = (curproc->p_ldtlimit + 1) * sizeof (user_desc_t);
	bcopy(curproc->p_ldt, CPU->cpu_m.mcpu_ldt, len);
	CPU->cpu_m.mcpu_ldt_len = len;
	set_syssegd(&desc, CPU->cpu_m.mcpu_ldt, len - 1, SDT_SYSLDT, SEL_KPL);
	*((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = desc;

	wr_ldtr(ULDT_SEL);
#endif
}

/*
 * Store a NULL selector in the LDTR. All subsequent illegal references to
 * the LDT will result in a #gp.
 */
void
ldt_unload(void)
{
#if defined(__xpv)
	xen_set_ldt(NULL, 0);
#else
	*((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = null_sdesc;
	wr_ldtr(0);

	bzero(CPU->cpu_m.mcpu_ldt, CPU->cpu_m.mcpu_ldt_len);
	CPU->cpu_m.mcpu_ldt_len = 0;
#endif
}

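/*
 * Process context op: called when switching away from a process that has a
 * private LDT.  Unload the LDT and re-enable fast syscalls on this CPU.
 */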
/*ARGSUSED*/
static void
ldt_savectx(proc_t *p)
{
	ASSERT(p->p_ldt != NULL);
	ASSERT(p == curproc);

#if defined(__amd64)
	/*
	 * The 64-bit kernel must be sure to clear any stale ldt
	 * selectors when context switching away from a process that
	 * has a private ldt. Consider the following example:
	 *
	 * Wine creates an ldt descriptor and points a segment register
	 * to it.
	 *
	 * We then context switch away from the wine lwp to a kernel
	 * thread and hit a breakpoint in the kernel with kmdb.
	 *
	 * When we continue and resume from kmdb we will #gp
	 * fault since kmdb will have saved the stale ldt selector
	 * from wine and will try to restore it, but we are no longer in
	 * the context of the wine process and do not have our
	 * ldtr register pointing to the private ldt.
	 */
	reset_sregs();
#endif

	ldt_unload();
	cpu_fast_syscall_enable(NULL);
}

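/*
 * Process context op: called when switching back to a process that has a
 * private LDT.  Reload the LDT and disable fast syscalls on this CPU.
 */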
static void
ldt_restorectx(proc_t *p)
{
	ASSERT(p->p_ldt != NULL);
	ASSERT(p == curproc);

	ldt_load();
	cpu_fast_syscall_disable(NULL);
}

/*
 * When a process with a private LDT execs, fast syscalls must be enabled for
 * the new process image.
 */
/* ARGSUSED */
static void
ldt_freectx(proc_t *p, int isexec)
{
	ASSERT(p->p_ldt);

	if (isexec) {
		kpreempt_disable();
		cpu_fast_syscall_enable(NULL);
		kpreempt_enable();
	}

	/*
	 * ldt_free() will free the memory used by the private LDT, reset the
	 * process's descriptor, and re-program the LDTR.
	 */
	ldt_free(p);
}

/*
 * Install ctx op that ensures syscall/sysenter are disabled.
 * See comments below.
 *
 * When a thread with a private LDT forks, the new process
 * must have the LDT context ops installed.
 */
/* ARGSUSED */
static void
ldt_installctx(proc_t *p, proc_t *cp)
{
	proc_t *targ = p;
	kthread_t *t;

	/*
	 * If this is a fork, operate on the child process.
	 */
	if (cp != NULL) {
		targ = cp;
		ldt_dup(p, cp);
	}

	/*
	 * The process context ops expect the target process as their argument.
	 */
	ASSERT(removepctx(targ, targ, ldt_savectx, ldt_restorectx,
	    ldt_installctx, ldt_savectx, ldt_freectx) == 0);

	installpctx(targ, targ, ldt_savectx, ldt_restorectx,
	    ldt_installctx, ldt_savectx, ldt_freectx);

	/*
	 * We've just disabled fast system call and return instructions; take
	 * the slow path out to make sure we don't try to use one to return
	 * back to user. We must set t_post_sys for every thread in the
	 * process to make sure none of them escape out via fast return.
	 */

	mutex_enter(&targ->p_lock);
	t = targ->p_tlist;
	do {
		t->t_post_sys = 1;
	} while ((t = t->t_forw) != targ->p_tlist);
	mutex_exit(&targ->p_lock);
}

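/*
 * Install or update the LDT entry described by *ssd in the calling
 * process's private LDT, allocating or growing the LDT as needed.
 * Returns 0 on success or an errno value.
 */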
int
setdscr(struct ssd *ssd)
{
	ushort_t seli;		/* selector index */
	user_desc_t *ldp;	/* descriptor pointer */
	user_desc_t ndesc;	/* new descriptor */
	proc_t *pp = ttoproc(curthread);
	int rc = 0;

	/*
	 * LDT segments: executable and data at DPL 3 only.
	 */
	if (!SELISLDT(ssd->sel) || !SELISUPL(ssd->sel))
		return (EINVAL);

	/*
	 * check the selector index.
	 */
	seli = SELTOIDX(ssd->sel);
	if (seli >= MAXNLDT || seli < LDT_UDBASE)
		return (EINVAL);

	ndesc = null_udesc;
	mutex_enter(&pp->p_ldtlock);

	/*
	 * If this is the first time for this process then setup a
	 * private LDT for it.
	 */
	if (pp->p_ldt == NULL) {
		ldt_alloc(pp, seli);

		/*
		 * Now that this process has a private LDT, the use of
		 * the syscall/sysret and sysenter/sysexit instructions
		 * is forbidden for this process because they destroy
		 * the contents of the %cs and %ss segment registers.
		 *
		 * Explicitly disable them here and add a context handler
		 * to the process. Note that disabling them here means we
		 * can't use sysret or sysexit on the way out of this
		 * system call - so we force this thread to take the slow
		 * path (which doesn't make use of sysenter or sysexit)
		 * back out.
		 */
		kpreempt_disable();
		ldt_installctx(pp, NULL);
		cpu_fast_syscall_disable(NULL);
		ASSERT(curthread->t_post_sys != 0);
		kpreempt_enable();

	} else if (seli > pp->p_ldtlimit) {

		/*
		 * Increase size of ldt to include seli.
		 */
		ldt_grow(pp, seli);
	}

	ASSERT(seli <= pp->p_ldtlimit);
	ldp = &pp->p_ldt[seli];

	/*
	 * On the 64-bit kernel, this is where things get more subtle.
	 * Recall that in the 64-bit kernel, when we enter the kernel we
	 * deliberately -don't- reload the segment selectors we came in on
	 * for %ds, %es, %fs or %gs. Messing with selectors is expensive,
	 * and the underlying descriptors are essentially ignored by the
	 * hardware in long mode - except for the base that we override with
	 * the gsbase MSRs.
	 *
	 * However, there's one unfortunate issue with this rosy picture --
	 * a descriptor that's not marked as 'present' will still generate
	 * an #np when loading a segment register.
	 *
	 * Consider this case. An lwp creates a harmless LDT entry, points
	 * one of its segment registers at it, then tells the kernel (here)
	 * to delete it. In the 32-bit kernel, the #np will happen on the
	 * way back to userland where we reload the segment registers, and be
	 * handled in kern_gpfault(). In the 64-bit kernel, the same thing
	 * will happen in the normal case too. However, if we're trying to
	 * use a debugger that wants to save and restore the segment registers,
	 * and the debugger thinks that we have valid segment registers, we
	 * have the problem that the debugger will try to restore the
	 * segment register that points at the now 'not present' descriptor
	 * and will take a #np right there.
	 *
	 * We should obviously fix the debugger to be paranoid about
	 * -not- restoring segment registers that point to bad descriptors;
	 * however we can prevent the problem here if we check to see if any
	 * of the segment registers are still pointing at the thing we're
	 * destroying; if they are, return an error instead. (That also seems
	 * a much better failure mode than SIGKILL and a core file
	 * from kern_gpfault() too.)
	 */
	if (SI86SSD_PRES(ssd) == 0) {
		kthread_t *t;
		int bad = 0;

		/*
		 * Look carefully at the segment registers of every lwp
		 * in the process (they're all stopped by our caller).
		 * If we're about to invalidate a descriptor that's still
		 * being referenced by *any* of them, return an error,
		 * rather than having them #gp on their way out of the kernel.
		 */
		ASSERT(pp->p_lwprcnt == 1);

		mutex_enter(&pp->p_lock);
		t = pp->p_tlist;
		do {
			klwp_t *lwp = ttolwp(t);
			struct regs *rp = lwp->lwp_regs;
#if defined(__amd64)
			pcb_t *pcb = &lwp->lwp_pcb;
#endif

			if (ssd->sel == rp->r_cs || ssd->sel == rp->r_ss) {
				bad = 1;
				break;
			}

#if defined(__amd64)
			if (pcb->pcb_rupdate == 1) {
				if (ssd->sel == pcb->pcb_ds ||
				    ssd->sel == pcb->pcb_es ||
				    ssd->sel == pcb->pcb_fs ||
				    ssd->sel == pcb->pcb_gs) {
					bad = 1;
					break;
				}
			} else
#endif
			{
				if (ssd->sel == rp->r_ds ||
				    ssd->sel == rp->r_es ||
				    ssd->sel == rp->r_fs ||
				    ssd->sel == rp->r_gs) {
					bad = 1;
					break;
				}
			}

		} while ((t = t->t_forw) != pp->p_tlist);
		mutex_exit(&pp->p_lock);

		if (bad) {
			mutex_exit(&pp->p_ldtlock);
			return (EBUSY);
		}
	}

	/*
	 * If acc1 is zero, clear the descriptor (including the 'present' bit).
	 */
	if (ssd->acc1 == 0) {
		rc = ldt_update_segd(ldp, &null_udesc);
		mutex_exit(&pp->p_ldtlock);
		return (rc);
	}

	/*
	 * Check the segment type; allow a not-present segment, and
	 * only user DPL (3).
	 */
	if (SI86SSD_DPL(ssd) != SEL_UPL) {
		mutex_exit(&pp->p_ldtlock);
		return (EINVAL);
	}

#if defined(__amd64)
	/*
	 * Do not allow 32-bit applications to create 64-bit mode code
	 * segments.
	 */
	if (SI86SSD_ISUSEG(ssd) && ((SI86SSD_TYPE(ssd) >> 3) & 1) == 1 &&
	    SI86SSD_ISLONG(ssd)) {
		mutex_exit(&pp->p_ldtlock);
		return (EINVAL);
	}
#endif /* __amd64 */

	/*
	 * Set up a code or data user segment descriptor.
	 */
	if (SI86SSD_ISUSEG(ssd)) {
		ssd_to_usd(ssd, &ndesc);
		rc = ldt_update_segd(ldp, &ndesc);
		mutex_exit(&pp->p_ldtlock);
		return (rc);
	}

#if defined(__i386)
	/*
	 * Allow a call gate only if the destination is in the LDT
	 * and the system is running in 32-bit legacy mode.
	 *
	 * In long mode 32-bit call gates are redefined as 64-bit call
	 * gates and the hw enforces that the target code selector
	 * of the call gate must be a 64-bit selector; a #gp fault is
	 * generated otherwise. Since we do not allow 32-bit processes
	 * to switch themselves to 64-bits we never allow call gates
	 * on 64-bit systems.
	 */
	if (SI86SSD_TYPE(ssd) == SDT_SYSCGT && SELISLDT(ssd->ls)) {
		ssd_to_sgd(ssd, (gate_desc_t *)&ndesc);
		rc = ldt_update_segd(ldp, &ndesc);
		mutex_exit(&pp->p_ldtlock);
		return (rc);
	}
#endif	/* __i386 */

	mutex_exit(&pp->p_ldtlock);
	return (EINVAL);
}

/*
 * Allocate new LDT for process just large enough to contain seli.
 * Note we allocate and grow LDT in PAGESIZE chunks. We do this
 * to simplify the implementation and because on the hypervisor it's
 * required, since the LDT must live on pages that have PROT_WRITE
 * removed and which are given to the hypervisor.
 */
static void
ldt_alloc(proc_t *pp, uint_t seli)
{
	user_desc_t *ldt;
	size_t ldtsz;
	uint_t nsels;

	ASSERT(MUTEX_HELD(&pp->p_ldtlock));
	ASSERT(pp->p_ldt == NULL);
	ASSERT(pp->p_ldtlimit == 0);

	/*
	 * Allocate new LDT just large enough to contain seli. The LDT must
	 * always be allocated in units of pages for KPTI.
	 */
	ldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
	nsels = ldtsz / sizeof (user_desc_t);
	ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);

	ldt = kmem_zalloc(ldtsz, KM_SLEEP);
	ASSERT(IS_P2ALIGNED(ldt, PAGESIZE));

#if defined(__xpv)
	if (xen_ldt_setprot(ldt, ldtsz, PROT_READ))
		panic("ldt_alloc:xen_ldt_setprot(PROT_READ) failed");
#endif

	pp->p_ldt = ldt;
	pp->p_ldtlimit = nsels - 1;
	set_syssegd(&pp->p_ldt_desc, ldt, ldtsz - 1, SDT_SYSLDT, SEL_KPL);

	if (pp == curproc) {
		kpreempt_disable();
		ldt_load();
		kpreempt_enable();
	}
}

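/*
 * Free a process's private LDT: clear the process's LDT state, unload the
 * LDT if the process is running on this CPU, and release the memory.
 */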
static void
ldt_free(proc_t *pp)
{
	user_desc_t *ldt;
	size_t ldtsz;

	ASSERT(pp->p_ldt != NULL);

	mutex_enter(&pp->p_ldtlock);
	ldt = pp->p_ldt;
	ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);

	ASSERT(IS_P2ALIGNED(ldtsz, PAGESIZE));

	pp->p_ldt = NULL;
	pp->p_ldtlimit = 0;
	pp->p_ldt_desc = null_sdesc;
	mutex_exit(&pp->p_ldtlock);

	if (pp == curproc) {
		kpreempt_disable();
		ldt_unload();
		kpreempt_enable();
	}

#if defined(__xpv)
	/*
	 * We are not allowed to make the ldt writable until after
	 * we tell the hypervisor to unload it.
	 */
	if (xen_ldt_setprot(ldt, ldtsz, PROT_READ | PROT_WRITE))
		panic("ldt_free:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
#endif

	kmem_free(ldt, ldtsz);
}

/*
 * On fork, copy the parent's ldt into a new ldt for the child.
 */
static void
ldt_dup(proc_t *pp, proc_t *cp)
{
	size_t ldtsz;

	ASSERT(pp->p_ldt != NULL);
	ASSERT(cp != curproc);

	/*
	 * I assume the parent's ldt can't increase since we're in a fork.
	 */
	mutex_enter(&pp->p_ldtlock);
	mutex_enter(&cp->p_ldtlock);

	ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);

	ldt_alloc(cp, pp->p_ldtlimit);

#if defined(__xpv)
	/*
	 * Make the child's ldt writable so it can be copied into from the
	 * parent's ldt. This works since ldt_alloc above did not load
	 * the ldt, since it's for the child process. If we tried to make
	 * an LDT writable that is loaded in hw the setprot operation
	 * would fail.
	 */
	if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ | PROT_WRITE))
		panic("ldt_dup:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
#endif

	bcopy(pp->p_ldt, cp->p_ldt, ldtsz);

#if defined(__xpv)
	if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ))
		panic("ldt_dup:xen_ldt_setprot(PROT_READ) failed");
#endif
	mutex_exit(&cp->p_ldtlock);
	mutex_exit(&pp->p_ldtlock);
}

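/*
 * Grow an existing private LDT (in page-sized units) so that it is large
 * enough to contain the descriptor at index seli.
 */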
static void
ldt_grow(proc_t *pp, uint_t seli)
{
	user_desc_t *oldt, *nldt;
	uint_t nsels;
	size_t oldtsz, nldtsz;

	ASSERT(MUTEX_HELD(&pp->p_ldtlock));
	ASSERT(pp->p_ldt != NULL);
	ASSERT(pp->p_ldtlimit != 0);

	/*
	 * Allocate larger LDT just large enough to contain seli. The LDT must
	 * always be allocated in units of pages for KPTI.
	 */
	nldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
	nsels = nldtsz / sizeof (user_desc_t);
	ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
	ASSERT(nsels > pp->p_ldtlimit);

	oldt = pp->p_ldt;
	oldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);

	nldt = kmem_zalloc(nldtsz, KM_SLEEP);
	ASSERT(IS_P2ALIGNED(nldt, PAGESIZE));

	bcopy(oldt, nldt, oldtsz);

	/*
	 * unload old ldt.
	 */
	kpreempt_disable();
	ldt_unload();
	kpreempt_enable();

#if defined(__xpv)

	/*
	 * Make old ldt writable and new ldt read only.
	 */
	if (xen_ldt_setprot(oldt, oldtsz, PROT_READ | PROT_WRITE))
		panic("ldt_grow:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");

	if (xen_ldt_setprot(nldt, nldtsz, PROT_READ))
		panic("ldt_grow:xen_ldt_setprot(PROT_READ) failed");
#endif

	pp->p_ldt = nldt;
	pp->p_ldtlimit = nsels - 1;

	/*
	 * write new ldt segment descriptor.
	 */
	set_syssegd(&pp->p_ldt_desc, nldt, nldtsz - 1, SDT_SYSLDT, SEL_KPL);

	/*
	 * load the new ldt.
	 */
	kpreempt_disable();
	ldt_load();
	kpreempt_enable();

	kmem_free(oldt, oldtsz);
}