8956 Implement KPTI Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com> Reviewed by: Robert Mustacchi <rm@joyent.com>
1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */ 27 /* All Rights Reserved */ 28 29 /* Copyright (c) 1987, 1988 Microsoft Corporation */ 30 /* All Rights Reserved */ 31 32 #include <sys/param.h> 33 #include <sys/types.h> 34 #include <sys/sysmacros.h> 35 #include <sys/systm.h> 36 #include <sys/signal.h> 37 #include <sys/errno.h> 38 #include <sys/fault.h> 39 #include <sys/syscall.h> 40 #include <sys/cpuvar.h> 41 #include <sys/sysi86.h> 42 #include <sys/psw.h> 43 #include <sys/cred.h> 44 #include <sys/policy.h> 45 #include <sys/thread.h> 46 #include <sys/debug.h> 47 #include <sys/ontrap.h> 48 #include <sys/privregs.h> 49 #include <sys/x86_archext.h> 50 #include <sys/vmem.h> 51 #include <sys/kmem.h> 52 #include <sys/mman.h> 53 #include <sys/archsystm.h> 54 #include <vm/hat.h> 55 #include <vm/as.h> 56 #include <vm/seg.h> 57 #include <vm/seg_kmem.h> 58 #include <vm/faultcode.h> 59 #include <sys/fp.h> 60 #include <sys/cmn_err.h> 61 #include <sys/segments.h> 62 #include <sys/clock.h> 63 #if defined(__xpv) 64 #include <sys/hypervisor.h> 65 #include <sys/note.h> 66 #endif 67 68 static void ldt_alloc(proc_t *, uint_t); 69 static void ldt_free(proc_t *); 70 static void ldt_dup(proc_t *, proc_t *); 71 static void ldt_grow(proc_t *, uint_t); 72 73 /* 74 * sysi86 System Call 75 */ 76 77 /* ARGSUSED */ 78 int 79 sysi86(short cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3) 80 { 81 struct ssd ssd; 82 int error = 0; 83 int c; 84 proc_t *pp = curproc; 85 86 switch (cmd) { 87 88 /* 89 * The SI86V86 subsystem call of the SYSI86 system call 90 * supports only one subcode -- V86SC_IOPL. 91 */ 92 case SI86V86: 93 if (arg1 == V86SC_IOPL) { 94 struct regs *rp = lwptoregs(ttolwp(curthread)); 95 greg_t oldpl = rp->r_ps & PS_IOPL; 96 greg_t newpl = arg2 & PS_IOPL; 97 98 /* 99 * Must be privileged to run this system call 100 * if giving more io privilege. 101 */ 102 if (newpl > oldpl && (error = 103 secpolicy_sys_config(CRED(), B_FALSE)) != 0) 104 return (set_errno(error)); 105 #if defined(__xpv) 106 kpreempt_disable(); 107 installctx(curthread, NULL, xen_disable_user_iopl, 108 xen_enable_user_iopl, NULL, NULL, 109 xen_disable_user_iopl, NULL); 110 xen_enable_user_iopl(); 111 kpreempt_enable(); 112 #else 113 rp->r_ps ^= oldpl ^ newpl; 114 #endif 115 } else 116 error = EINVAL; 117 break; 118 119 /* 120 * Set a segment descriptor 121 */ 122 case SI86DSCR: 123 /* 124 * There are considerable problems here manipulating 125 * resources shared by many running lwps. Get everyone 126 * into a safe state before changing the LDT. 127 */ 128 if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK1)) { 129 error = EINTR; 130 break; 131 } 132 133 if (get_udatamodel() == DATAMODEL_LP64) { 134 error = EINVAL; 135 break; 136 } 137 138 if (copyin((caddr_t)arg1, &ssd, sizeof (ssd)) < 0) { 139 error = EFAULT; 140 break; 141 } 142 143 error = setdscr(&ssd); 144 145 mutex_enter(&pp->p_lock); 146 if (curthread != pp->p_agenttp) 147 continuelwps(pp); 148 mutex_exit(&pp->p_lock); 149 break; 150 151 case SI86FPHW: 152 c = fp_kind & 0xff; 153 if (suword32((void *)arg1, c) == -1) 154 error = EFAULT; 155 break; 156 157 case SI86FPSTART: 158 /* 159 * arg1 is the address of _fp_hw 160 * arg2 is the desired x87 FCW value 161 * arg3 is the desired SSE MXCSR value 162 * a return value of one means SSE hardware, else none. 163 */ 164 c = fp_kind & 0xff; 165 if (suword32((void *)arg1, c) == -1) { 166 error = EFAULT; 167 break; 168 } 169 fpsetcw((uint16_t)arg2, (uint32_t)arg3); 170 return ((fp_kind & __FP_SSE) ? 1 : 0); 171 172 /* real time clock management commands */ 173 174 case WTODC: 175 if ((error = secpolicy_settime(CRED())) == 0) { 176 timestruc_t ts; 177 mutex_enter(&tod_lock); 178 gethrestime(&ts); 179 tod_set(ts); 180 mutex_exit(&tod_lock); 181 } 182 break; 183 184 /* Give some timezone playing room */ 185 #define ONEWEEK (7 * 24 * 60 * 60) 186 187 case SGMTL: 188 /* 189 * Called from 32 bit land, negative values 190 * are not sign extended, so we do that here 191 * by casting it to an int and back. We also 192 * clamp the value to within reason and detect 193 * when a 64 bit call overflows an int. 194 */ 195 if ((error = secpolicy_settime(CRED())) == 0) { 196 int newlag = (int)arg1; 197 198 #ifdef _SYSCALL32_IMPL 199 if (get_udatamodel() == DATAMODEL_NATIVE && 200 (long)newlag != (long)arg1) { 201 error = EOVERFLOW; 202 } else 203 #endif 204 if (newlag >= -ONEWEEK && newlag <= ONEWEEK) 205 sgmtl(newlag); 206 else 207 error = EOVERFLOW; 208 } 209 break; 210 211 case GGMTL: 212 if (get_udatamodel() == DATAMODEL_NATIVE) { 213 if (sulword((void *)arg1, ggmtl()) == -1) 214 error = EFAULT; 215 #ifdef _SYSCALL32_IMPL 216 } else { 217 time_t gmtl; 218 219 if ((gmtl = ggmtl()) > INT32_MAX) { 220 /* 221 * Since gmt_lag can at most be 222 * +/- 12 hours, something is 223 * *seriously* messed up here. 224 */ 225 error = EOVERFLOW; 226 } else if (suword32((void *)arg1, (int32_t)gmtl) == -1) 227 error = EFAULT; 228 #endif 229 } 230 break; 231 232 case RTCSYNC: 233 if ((error = secpolicy_settime(CRED())) == 0) 234 rtcsync(); 235 break; 236 237 /* END OF real time clock management commands */ 238 239 default: 240 error = EINVAL; 241 break; 242 } 243 return (error == 0 ? 0 : set_errno(error)); 244 } 245 246 void 247 usd_to_ssd(user_desc_t *usd, struct ssd *ssd, selector_t sel) 248 { 249 ssd->bo = USEGD_GETBASE(usd); 250 ssd->ls = USEGD_GETLIMIT(usd); 251 ssd->sel = sel; 252 253 /* 254 * set type, dpl and present bits. 255 */ 256 ssd->acc1 = usd->usd_type; 257 ssd->acc1 |= usd->usd_dpl << 5; 258 ssd->acc1 |= usd->usd_p << (5 + 2); 259 260 /* 261 * set avl, DB and granularity bits. 262 */ 263 ssd->acc2 = usd->usd_avl; 264 265 #if defined(__amd64) 266 ssd->acc2 |= usd->usd_long << 1; 267 #else 268 ssd->acc2 |= usd->usd_reserved << 1; 269 #endif 270 271 ssd->acc2 |= usd->usd_def32 << (1 + 1); 272 ssd->acc2 |= usd->usd_gran << (1 + 1 + 1); 273 } 274 275 static void 276 ssd_to_usd(struct ssd *ssd, user_desc_t *usd) 277 { 278 279 ASSERT(bcmp(usd, &null_udesc, sizeof (*usd)) == 0); 280 281 USEGD_SETBASE(usd, ssd->bo); 282 USEGD_SETLIMIT(usd, ssd->ls); 283 284 /* 285 * set type, dpl and present bits. 286 */ 287 usd->usd_type = ssd->acc1; 288 usd->usd_dpl = ssd->acc1 >> 5; 289 usd->usd_p = ssd->acc1 >> (5 + 2); 290 291 ASSERT(usd->usd_type >= SDT_MEMRO); 292 ASSERT(usd->usd_dpl == SEL_UPL); 293 294 /* 295 * 64-bit code selectors are never allowed in the LDT. 296 * Reserved bit is always 0 on 32-bit systems. 297 */ 298 #if defined(__amd64) 299 usd->usd_long = 0; 300 #else 301 usd->usd_reserved = 0; 302 #endif 303 304 /* 305 * set avl, DB and granularity bits. 306 */ 307 usd->usd_avl = ssd->acc2; 308 usd->usd_def32 = ssd->acc2 >> (1 + 1); 309 usd->usd_gran = ssd->acc2 >> (1 + 1 + 1); 310 } 311 312 313 #if defined(__i386) 314 315 static void 316 ssd_to_sgd(struct ssd *ssd, gate_desc_t *sgd) 317 { 318 319 ASSERT(bcmp(sgd, &null_sdesc, sizeof (*sgd)) == 0); 320 321 sgd->sgd_looffset = ssd->bo; 322 sgd->sgd_hioffset = ssd->bo >> 16; 323 324 sgd->sgd_selector = ssd->ls; 325 326 /* 327 * set type, dpl and present bits. 328 */ 329 sgd->sgd_type = ssd->acc1; 330 sgd->sgd_dpl = ssd->acc1 >> 5; 331 sgd->sgd_p = ssd->acc1 >> 7; 332 ASSERT(sgd->sgd_type == SDT_SYSCGT); 333 ASSERT(sgd->sgd_dpl == SEL_UPL); 334 sgd->sgd_stkcpy = 0; 335 } 336 337 #endif /* __i386 */ 338 339 /* 340 * Load LDT register with the current process's LDT. 341 */ 342 static void 343 ldt_load(void) 344 { 345 #if defined(__xpv) 346 xen_set_ldt(get_ssd_base(&curproc->p_ldt_desc), 347 curproc->p_ldtlimit + 1); 348 #else 349 *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = curproc->p_ldt_desc; 350 wr_ldtr(ULDT_SEL); 351 #endif 352 } 353 354 /* 355 * Store a NULL selector in the LDTR. All subsequent illegal references to 356 * the LDT will result in a #gp. 357 */ 358 void 359 ldt_unload(void) 360 { 361 #if defined(__xpv) 362 xen_set_ldt(NULL, 0); 363 #else 364 *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = null_sdesc; 365 wr_ldtr(0); 366 #endif 367 } 368 369 /*ARGSUSED*/ 370 static void 371 ldt_savectx(proc_t *p) 372 { 373 ASSERT(p->p_ldt != NULL); 374 ASSERT(p == curproc); 375 376 #if defined(__amd64) 377 /* 378 * The 64-bit kernel must be sure to clear any stale ldt 379 * selectors when context switching away from a process that 380 * has a private ldt. Consider the following example: 381 * 382 * Wine creats a ldt descriptor and points a segment register 383 * to it. 384 * 385 * We then context switch away from wine lwp to kernel 386 * thread and hit breakpoint in kernel with kmdb 387 * 388 * When we continue and resume from kmdb we will #gp 389 * fault since kmdb will have saved the stale ldt selector 390 * from wine and will try to restore it but we are no longer in 391 * the context of the wine process and do not have our 392 * ldtr register pointing to the private ldt. 393 */ 394 reset_sregs(); 395 #endif 396 397 ldt_unload(); 398 cpu_fast_syscall_enable(NULL); 399 } 400 401 static void 402 ldt_restorectx(proc_t *p) 403 { 404 ASSERT(p->p_ldt != NULL); 405 ASSERT(p == curproc); 406 407 ldt_load(); 408 cpu_fast_syscall_disable(NULL); 409 } 410 411 /* 412 * When a process with a private LDT execs, fast syscalls must be enabled for 413 * the new process image. 414 */ 415 /* ARGSUSED */ 416 static void 417 ldt_freectx(proc_t *p, int isexec) 418 { 419 ASSERT(p->p_ldt); 420 421 if (isexec) { 422 kpreempt_disable(); 423 cpu_fast_syscall_enable(NULL); 424 kpreempt_enable(); 425 } 426 427 /* 428 * ldt_free() will free the memory used by the private LDT, reset the 429 * process's descriptor, and re-program the LDTR. 430 */ 431 ldt_free(p); 432 } 433 434 /* 435 * Install ctx op that ensures syscall/sysenter are disabled. 436 * See comments below. 437 * 438 * When a thread with a private LDT forks, the new process 439 * must have the LDT context ops installed. 440 */ 441 /* ARGSUSED */ 442 static void 443 ldt_installctx(proc_t *p, proc_t *cp) 444 { 445 proc_t *targ = p; 446 kthread_t *t; 447 448 /* 449 * If this is a fork, operate on the child process. 450 */ 451 if (cp != NULL) { 452 targ = cp; 453 ldt_dup(p, cp); 454 } 455 456 /* 457 * The process context ops expect the target process as their argument. 458 */ 459 ASSERT(removepctx(targ, targ, ldt_savectx, ldt_restorectx, 460 ldt_installctx, ldt_savectx, ldt_freectx) == 0); 461 462 installpctx(targ, targ, ldt_savectx, ldt_restorectx, 463 ldt_installctx, ldt_savectx, ldt_freectx); 464 465 /* 466 * We've just disabled fast system call and return instructions; take 467 * the slow path out to make sure we don't try to use one to return 468 * back to user. We must set t_post_sys for every thread in the 469 * process to make sure none of them escape out via fast return. 470 */ 471 472 mutex_enter(&targ->p_lock); 473 t = targ->p_tlist; 474 do { 475 t->t_post_sys = 1; 476 } while ((t = t->t_forw) != targ->p_tlist); 477 mutex_exit(&targ->p_lock); 478 } 479 480 int 481 setdscr(struct ssd *ssd) 482 { 483 ushort_t seli; /* selector index */ 484 user_desc_t *ldp; /* descriptor pointer */ 485 user_desc_t ndesc; /* new descriptor */ 486 proc_t *pp = ttoproc(curthread); 487 int rc = 0; 488 489 /* 490 * LDT segments: executable and data at DPL 3 only. 491 */ 492 if (!SELISLDT(ssd->sel) || !SELISUPL(ssd->sel)) 493 return (EINVAL); 494 495 /* 496 * check the selector index. 497 */ 498 seli = SELTOIDX(ssd->sel); 499 if (seli >= MAXNLDT || seli < LDT_UDBASE) 500 return (EINVAL); 501 502 ndesc = null_udesc; 503 mutex_enter(&pp->p_ldtlock); 504 505 /* 506 * If this is the first time for this process then setup a 507 * private LDT for it. 508 */ 509 if (pp->p_ldt == NULL) { 510 ldt_alloc(pp, seli); 511 512 /* 513 * Now that this process has a private LDT, the use of 514 * the syscall/sysret and sysenter/sysexit instructions 515 * is forbidden for this processes because they destroy 516 * the contents of %cs and %ss segment registers. 517 * 518 * Explicity disable them here and add a context handler 519 * to the process. Note that disabling 520 * them here means we can't use sysret or sysexit on 521 * the way out of this system call - so we force this 522 * thread to take the slow path (which doesn't make use 523 * of sysenter or sysexit) back out. 524 */ 525 kpreempt_disable(); 526 ldt_installctx(pp, NULL); 527 cpu_fast_syscall_disable(NULL); 528 ASSERT(curthread->t_post_sys != 0); 529 kpreempt_enable(); 530 531 } else if (seli > pp->p_ldtlimit) { 532 533 /* 534 * Increase size of ldt to include seli. 535 */ 536 ldt_grow(pp, seli); 537 } 538 539 ASSERT(seli <= pp->p_ldtlimit); 540 ldp = &pp->p_ldt[seli]; 541 542 /* 543 * On the 64-bit kernel, this is where things get more subtle. 544 * Recall that in the 64-bit kernel, when we enter the kernel we 545 * deliberately -don't- reload the segment selectors we came in on 546 * for %ds, %es, %fs or %gs. Messing with selectors is expensive, 547 * and the underlying descriptors are essentially ignored by the 548 * hardware in long mode - except for the base that we override with 549 * the gsbase MSRs. 550 * 551 * However, there's one unfortunate issue with this rosy picture -- 552 * a descriptor that's not marked as 'present' will still generate 553 * an #np when loading a segment register. 554 * 555 * Consider this case. An lwp creates a harmless LDT entry, points 556 * one of it's segment registers at it, then tells the kernel (here) 557 * to delete it. In the 32-bit kernel, the #np will happen on the 558 * way back to userland where we reload the segment registers, and be 559 * handled in kern_gpfault(). In the 64-bit kernel, the same thing 560 * will happen in the normal case too. However, if we're trying to 561 * use a debugger that wants to save and restore the segment registers, 562 * and the debugger things that we have valid segment registers, we 563 * have the problem that the debugger will try and restore the 564 * segment register that points at the now 'not present' descriptor 565 * and will take a #np right there. 566 * 567 * We should obviously fix the debugger to be paranoid about 568 * -not- restoring segment registers that point to bad descriptors; 569 * however we can prevent the problem here if we check to see if any 570 * of the segment registers are still pointing at the thing we're 571 * destroying; if they are, return an error instead. (That also seems 572 * a lot better failure mode than SIGKILL and a core file 573 * from kern_gpfault() too.) 574 */ 575 if (SI86SSD_PRES(ssd) == 0) { 576 kthread_t *t; 577 int bad = 0; 578 579 /* 580 * Look carefully at the segment registers of every lwp 581 * in the process (they're all stopped by our caller). 582 * If we're about to invalidate a descriptor that's still 583 * being referenced by *any* of them, return an error, 584 * rather than having them #gp on their way out of the kernel. 585 */ 586 ASSERT(pp->p_lwprcnt == 1); 587 588 mutex_enter(&pp->p_lock); 589 t = pp->p_tlist; 590 do { 591 klwp_t *lwp = ttolwp(t); 592 struct regs *rp = lwp->lwp_regs; 593 #if defined(__amd64) 594 pcb_t *pcb = &lwp->lwp_pcb; 595 #endif 596 597 if (ssd->sel == rp->r_cs || ssd->sel == rp->r_ss) { 598 bad = 1; 599 break; 600 } 601 602 #if defined(__amd64) 603 if (pcb->pcb_rupdate == 1) { 604 if (ssd->sel == pcb->pcb_ds || 605 ssd->sel == pcb->pcb_es || 606 ssd->sel == pcb->pcb_fs || 607 ssd->sel == pcb->pcb_gs) { 608 bad = 1; 609 break; 610 } 611 } else 612 #endif 613 { 614 if (ssd->sel == rp->r_ds || 615 ssd->sel == rp->r_es || 616 ssd->sel == rp->r_fs || 617 ssd->sel == rp->r_gs) { 618 bad = 1; 619 break; 620 } 621 } 622 623 } while ((t = t->t_forw) != pp->p_tlist); 624 mutex_exit(&pp->p_lock); 625 626 if (bad) { 627 mutex_exit(&pp->p_ldtlock); 628 return (EBUSY); 629 } 630 } 631 632 /* 633 * If acc1 is zero, clear the descriptor (including the 'present' bit) 634 */ 635 if (ssd->acc1 == 0) { 636 rc = ldt_update_segd(ldp, &null_udesc); 637 mutex_exit(&pp->p_ldtlock); 638 return (rc); 639 } 640 641 /* 642 * Check segment type, allow segment not present and 643 * only user DPL (3). 644 */ 645 if (SI86SSD_DPL(ssd) != SEL_UPL) { 646 mutex_exit(&pp->p_ldtlock); 647 return (EINVAL); 648 } 649 650 #if defined(__amd64) 651 /* 652 * Do not allow 32-bit applications to create 64-bit mode code 653 * segments. 654 */ 655 if (SI86SSD_ISUSEG(ssd) && ((SI86SSD_TYPE(ssd) >> 3) & 1) == 1 && 656 SI86SSD_ISLONG(ssd)) { 657 mutex_exit(&pp->p_ldtlock); 658 return (EINVAL); 659 } 660 #endif /* __amd64 */ 661 662 /* 663 * Set up a code or data user segment descriptor. 664 */ 665 if (SI86SSD_ISUSEG(ssd)) { 666 ssd_to_usd(ssd, &ndesc); 667 rc = ldt_update_segd(ldp, &ndesc); 668 mutex_exit(&pp->p_ldtlock); 669 return (rc); 670 } 671 672 #if defined(__i386) 673 /* 674 * Allow a call gate only if the destination is in the LDT 675 * and the system is running in 32-bit legacy mode. 676 * 677 * In long mode 32-bit call gates are redefined as 64-bit call 678 * gates and the hw enforces that the target code selector 679 * of the call gate must be 64-bit selector. A #gp fault is 680 * generated if otherwise. Since we do not allow 32-bit processes 681 * to switch themselves to 64-bits we never allow call gates 682 * on 64-bit system system. 683 */ 684 if (SI86SSD_TYPE(ssd) == SDT_SYSCGT && SELISLDT(ssd->ls)) { 685 686 687 ssd_to_sgd(ssd, (gate_desc_t *)&ndesc); 688 rc = ldt_update_segd(ldp, &ndesc); 689 mutex_exit(&pp->p_ldtlock); 690 return (rc); 691 } 692 #endif /* __i386 */ 693 694 mutex_exit(&pp->p_ldtlock); 695 return (EINVAL); 696 } 697 698 /* 699 * Allocate new LDT for process just large enough to contain seli. 700 * Note we allocate and grow LDT in PAGESIZE chunks. We do this 701 * to simplify the implementation and because on the hypervisor it's 702 * required, since the LDT must live on pages that have PROT_WRITE 703 * removed and which are given to the hypervisor. 704 */ 705 static void 706 ldt_alloc(proc_t *pp, uint_t seli) 707 { 708 user_desc_t *ldt; 709 size_t ldtsz; 710 uint_t nsels; 711 712 ASSERT(MUTEX_HELD(&pp->p_ldtlock)); 713 ASSERT(pp->p_ldt == NULL); 714 ASSERT(pp->p_ldtlimit == 0); 715 716 /* 717 * Allocate new LDT just large enough to contain seli. 718 */ 719 ldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE); 720 nsels = ldtsz / sizeof (user_desc_t); 721 ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT); 722 723 ldt = kmem_zalloc(ldtsz, KM_SLEEP); 724 ASSERT(IS_P2ALIGNED(ldt, PAGESIZE)); 725 726 #if defined(__xpv) 727 if (xen_ldt_setprot(ldt, ldtsz, PROT_READ)) 728 panic("ldt_alloc:xen_ldt_setprot(PROT_READ) failed"); 729 #endif 730 731 pp->p_ldt = ldt; 732 pp->p_ldtlimit = nsels - 1; 733 set_syssegd(&pp->p_ldt_desc, ldt, ldtsz - 1, SDT_SYSLDT, SEL_KPL); 734 735 if (pp == curproc) { 736 kpreempt_disable(); 737 ldt_load(); 738 kpreempt_enable(); 739 } 740 } 741 742 static void 743 ldt_free(proc_t *pp) 744 { 745 user_desc_t *ldt; 746 size_t ldtsz; 747 748 ASSERT(pp->p_ldt != NULL); 749 750 mutex_enter(&pp->p_ldtlock); 751 ldt = pp->p_ldt; 752 ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t); 753 754 ASSERT(IS_P2ALIGNED(ldtsz, PAGESIZE)); 755 756 pp->p_ldt = NULL; 757 pp->p_ldtlimit = 0; 758 pp->p_ldt_desc = null_sdesc; 759 mutex_exit(&pp->p_ldtlock); 760 761 if (pp == curproc) { 762 kpreempt_disable(); 763 ldt_unload(); 764 kpreempt_enable(); 765 } 766 767 #if defined(__xpv) 768 /* 769 * We are not allowed to make the ldt writable until after 770 * we tell the hypervisor to unload it. 771 */ 772 if (xen_ldt_setprot(ldt, ldtsz, PROT_READ | PROT_WRITE)) 773 panic("ldt_free:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed"); 774 #endif 775 776 kmem_free(ldt, ldtsz); 777 } 778 779 /* 780 * On fork copy new ldt for child. 781 */ 782 static void 783 ldt_dup(proc_t *pp, proc_t *cp) 784 { 785 size_t ldtsz; 786 787 ASSERT(pp->p_ldt != NULL); 788 ASSERT(cp != curproc); 789 790 /* 791 * I assume the parent's ldt can't increase since we're in a fork. 792 */ 793 mutex_enter(&pp->p_ldtlock); 794 mutex_enter(&cp->p_ldtlock); 795 796 ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t); 797 798 ldt_alloc(cp, pp->p_ldtlimit); 799 800 #if defined(__xpv) 801 /* 802 * Make child's ldt writable so it can be copied into from 803 * parent's ldt. This works since ldt_alloc above did not load 804 * the ldt since its for the child process. If we tried to make 805 * an LDT writable that is loaded in hw the setprot operation 806 * would fail. 807 */ 808 if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ | PROT_WRITE)) 809 panic("ldt_dup:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed"); 810 #endif 811 812 bcopy(pp->p_ldt, cp->p_ldt, ldtsz); 813 814 #if defined(__xpv) 815 if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ)) 816 panic("ldt_dup:xen_ldt_setprot(PROT_READ) failed"); 817 #endif 818 mutex_exit(&cp->p_ldtlock); 819 mutex_exit(&pp->p_ldtlock); 820 821 } 822 823 static void 824 ldt_grow(proc_t *pp, uint_t seli) 825 { 826 user_desc_t *oldt, *nldt; 827 uint_t nsels; 828 size_t oldtsz, nldtsz; 829 830 ASSERT(MUTEX_HELD(&pp->p_ldtlock)); 831 ASSERT(pp->p_ldt != NULL); 832 ASSERT(pp->p_ldtlimit != 0); 833 834 /* 835 * Allocate larger LDT just large enough to contain seli. 836 */ 837 nldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE); 838 nsels = nldtsz / sizeof (user_desc_t); 839 ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT); 840 ASSERT(nsels > pp->p_ldtlimit); 841 842 oldt = pp->p_ldt; 843 oldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t); 844 845 nldt = kmem_zalloc(nldtsz, KM_SLEEP); 846 ASSERT(IS_P2ALIGNED(nldt, PAGESIZE)); 847 848 bcopy(oldt, nldt, oldtsz); 849 850 /* 851 * unload old ldt. 852 */ 853 kpreempt_disable(); 854 ldt_unload(); 855 kpreempt_enable(); 856 857 #if defined(__xpv) 858 859 /* 860 * Make old ldt writable and new ldt read only. 861 */ 862 if (xen_ldt_setprot(oldt, oldtsz, PROT_READ | PROT_WRITE)) 863 panic("ldt_grow:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed"); 864 865 if (xen_ldt_setprot(nldt, nldtsz, PROT_READ)) 866 panic("ldt_grow:xen_ldt_setprot(PROT_READ) failed"); 867 #endif 868 869 pp->p_ldt = nldt; 870 pp->p_ldtlimit = nsels - 1; 871 872 /* 873 * write new ldt segment descriptor. 874 */ 875 set_syssegd(&pp->p_ldt_desc, nldt, nldtsz - 1, SDT_SYSLDT, SEL_KPL); 876 877 /* 878 * load the new ldt. 879 */ 880 kpreempt_disable(); 881 ldt_load(); 882 kpreempt_enable(); 883 884 kmem_free(oldt, oldtsz); 885 } --- EOF ---