8956 Implement KPTI
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.

  23  */
  24 
  25 /*      Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
  26 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T       */
  27 /*        All Rights Reserved   */
  28 
  29 /*      Copyright (c) 1987, 1988 Microsoft Corporation  */
  30 /*        All Rights Reserved   */
  31 
  32 #include <sys/param.h>
  33 #include <sys/types.h>
  34 #include <sys/sysmacros.h>
  35 #include <sys/systm.h>
  36 #include <sys/signal.h>
  37 #include <sys/errno.h>
  38 #include <sys/fault.h>
  39 #include <sys/syscall.h>
  40 #include <sys/cpuvar.h>
  41 #include <sys/sysi86.h>
  42 #include <sys/psw.h>
  43 #include <sys/cred.h>
  44 #include <sys/policy.h>
  45 #include <sys/thread.h>
  46 #include <sys/debug.h>
  47 #include <sys/ontrap.h>
  48 #include <sys/privregs.h>
  49 #include <sys/x86_archext.h>
  50 #include <sys/vmem.h>
  51 #include <sys/kmem.h>
  52 #include <sys/mman.h>
  53 #include <sys/archsystm.h>
  54 #include <vm/hat.h>
  55 #include <vm/as.h>
  56 #include <vm/seg.h>
  57 #include <vm/seg_kmem.h>
  58 #include <vm/faultcode.h>
  59 #include <sys/fp.h>
  60 #include <sys/cmn_err.h>
  61 #include <sys/segments.h>
  62 #include <sys/clock.h>

  63 #if defined(__xpv)
  64 #include <sys/hypervisor.h>
  65 #include <sys/note.h>
  66 #endif
  67 
/*
 * Forward declarations for the file-local LDT helpers that are used
 * before their definitions further down in this file.
 */
static void ldt_alloc(proc_t *, uint_t);
static void ldt_free(proc_t *);
static void ldt_dup(proc_t *, proc_t *);
static void ldt_grow(proc_t *, uint_t);
  72 
  73 /*
  74  * sysi86 System Call
  75  */
  76 
  77 /* ARGSUSED */
  78 int
  79 sysi86(short cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
  80 {
  81         struct ssd ssd;
  82         int error = 0;
  83         int c;
  84         proc_t *pp = curproc;
  85 
  86         switch (cmd) {
  87 
  88         /*
  89          * The SI86V86 subsystem call of the SYSI86 system call
  90          * supports only one subcode -- V86SC_IOPL.
  91          */
  92         case SI86V86:
  93                 if (arg1 == V86SC_IOPL) {
  94                         struct regs *rp = lwptoregs(ttolwp(curthread));
  95                         greg_t oldpl = rp->r_ps & PS_IOPL;
  96                         greg_t newpl = arg2 & PS_IOPL;
  97 
  98                         /*
  99                          * Must be privileged to run this system call
 100                          * if giving more io privilege.
 101                          */
 102                         if (newpl > oldpl && (error =
 103                             secpolicy_sys_config(CRED(), B_FALSE)) != 0)
 104                                 return (set_errno(error));
 105 #if defined(__xpv)
 106                         kpreempt_disable();
 107                         installctx(curthread, NULL, xen_disable_user_iopl,
 108                             xen_enable_user_iopl, NULL, NULL,
 109                             xen_disable_user_iopl, NULL);
 110                         xen_enable_user_iopl();
 111                         kpreempt_enable();
 112 #else
 113                         rp->r_ps ^= oldpl ^ newpl;
 114 #endif
 115                 } else
 116                         error = EINVAL;
 117                 break;
 118 
 119         /*
 120          * Set a segment descriptor
 121          */
 122         case SI86DSCR:
 123                 /*
 124                  * There are considerable problems here manipulating
 125                  * resources shared by many running lwps.  Get everyone
 126                  * into a safe state before changing the LDT.
 127                  */
 128                 if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK1)) {
 129                         error = EINTR;
 130                         break;
 131                 }
 132 
 133                 if (get_udatamodel() == DATAMODEL_LP64) {
 134                         error = EINVAL;
 135                         break;
 136                 }
 137 
 138                 if (copyin((caddr_t)arg1, &ssd, sizeof (ssd)) < 0) {
 139                         error = EFAULT;
 140                         break;
 141                 }
 142 
 143                 error = setdscr(&ssd);
 144 
 145                 mutex_enter(&pp->p_lock);
 146                 if (curthread != pp->p_agenttp)
 147                         continuelwps(pp);
 148                 mutex_exit(&pp->p_lock);
 149                 break;
 150 
 151         case SI86FPHW:
 152                 c = fp_kind & 0xff;
 153                 if (suword32((void *)arg1, c) == -1)
 154                         error = EFAULT;
 155                 break;
 156 
 157         case SI86FPSTART:
 158                 /*
 159                  * arg1 is the address of _fp_hw
 160                  * arg2 is the desired x87 FCW value
 161                  * arg3 is the desired SSE MXCSR value
 162                  * a return value of one means SSE hardware, else none.
 163                  */
 164                 c = fp_kind & 0xff;
 165                 if (suword32((void *)arg1, c) == -1) {
 166                         error = EFAULT;
 167                         break;
 168                 }
 169                 fpsetcw((uint16_t)arg2, (uint32_t)arg3);
 170                 return ((fp_kind & __FP_SSE) ? 1 : 0);
 171 
 172         /* real time clock management commands */
 173 
 174         case WTODC:
 175                 if ((error = secpolicy_settime(CRED())) == 0) {
 176                         timestruc_t ts;
 177                         mutex_enter(&tod_lock);
 178                         gethrestime(&ts);
 179                         tod_set(ts);
 180                         mutex_exit(&tod_lock);
 181                 }
 182                 break;
 183 
 184 /* Give some timezone playing room */
 185 #define ONEWEEK (7 * 24 * 60 * 60)
 186 
 187         case SGMTL:
 188                 /*
 189                  * Called from 32 bit land, negative values
 190                  * are not sign extended, so we do that here
 191                  * by casting it to an int and back.  We also
 192                  * clamp the value to within reason and detect
 193                  * when a 64 bit call overflows an int.
 194                  */
 195                 if ((error = secpolicy_settime(CRED())) == 0) {
 196                         int newlag = (int)arg1;
 197 
 198 #ifdef _SYSCALL32_IMPL
 199                         if (get_udatamodel() == DATAMODEL_NATIVE &&
 200                             (long)newlag != (long)arg1) {
 201                                 error = EOVERFLOW;
 202                         } else
 203 #endif
 204                         if (newlag >= -ONEWEEK && newlag <= ONEWEEK)
 205                                 sgmtl(newlag);
 206                         else
 207                                 error = EOVERFLOW;
 208                 }
 209                 break;
 210 
 211         case GGMTL:
 212                 if (get_udatamodel() == DATAMODEL_NATIVE) {
 213                         if (sulword((void *)arg1, ggmtl()) == -1)
 214                                 error = EFAULT;
 215 #ifdef _SYSCALL32_IMPL
 216                 } else {
 217                         time_t gmtl;
 218 
 219                         if ((gmtl = ggmtl()) > INT32_MAX) {
 220                                 /*
 221                                  * Since gmt_lag can at most be
 222                                  * +/- 12 hours, something is
 223                                  * *seriously* messed up here.
 224                                  */
 225                                 error = EOVERFLOW;
 226                         } else if (suword32((void *)arg1, (int32_t)gmtl) == -1)
 227                                 error = EFAULT;
 228 #endif
 229                 }
 230                 break;
 231 
 232         case RTCSYNC:
 233                 if ((error = secpolicy_settime(CRED())) == 0)
 234                         rtcsync();
 235                 break;
 236 
 237         /* END OF real time clock management commands */
 238 
 239         default:
 240                 error = EINVAL;
 241                 break;
 242         }
 243         return (error == 0 ? 0 : set_errno(error));
 244 }
 245 
 246 void
 247 usd_to_ssd(user_desc_t *usd, struct ssd *ssd, selector_t sel)
 248 {
 249         ssd->bo = USEGD_GETBASE(usd);
 250         ssd->ls = USEGD_GETLIMIT(usd);
 251         ssd->sel = sel;
 252 
 253         /*
 254          * set type, dpl and present bits.
 255          */
 256         ssd->acc1 = usd->usd_type;
 257         ssd->acc1 |= usd->usd_dpl << 5;
 258         ssd->acc1 |= usd->usd_p << (5 + 2);
 259 
 260         /*
 261          * set avl, DB and granularity bits.
 262          */
 263         ssd->acc2 = usd->usd_avl;
 264 
 265 #if defined(__amd64)
 266         ssd->acc2 |= usd->usd_long << 1;
 267 #else
 268         ssd->acc2 |= usd->usd_reserved << 1;
 269 #endif
 270 
 271         ssd->acc2 |= usd->usd_def32 << (1 + 1);
 272         ssd->acc2 |= usd->usd_gran << (1 + 1 + 1);
 273 }
 274 
 275 static void
 276 ssd_to_usd(struct ssd *ssd, user_desc_t *usd)
 277 {
 278 
 279         ASSERT(bcmp(usd, &null_udesc, sizeof (*usd)) == 0);
 280 
 281         USEGD_SETBASE(usd, ssd->bo);
 282         USEGD_SETLIMIT(usd, ssd->ls);
 283 
 284         /*
 285          * set type, dpl and present bits.
 286          */
 287         usd->usd_type = ssd->acc1;
 288         usd->usd_dpl = ssd->acc1 >> 5;
 289         usd->usd_p = ssd->acc1 >> (5 + 2);
 290 
 291         ASSERT(usd->usd_type >= SDT_MEMRO);
 292         ASSERT(usd->usd_dpl == SEL_UPL);
 293 
 294         /*
 295          * 64-bit code selectors are never allowed in the LDT.
 296          * Reserved bit is always 0 on 32-bit systems.
 297          */
 298 #if defined(__amd64)
 299         usd->usd_long = 0;
 300 #else
 301         usd->usd_reserved = 0;
 302 #endif
 303 
 304         /*
 305          * set avl, DB and granularity bits.
 306          */
 307         usd->usd_avl = ssd->acc2;
 308         usd->usd_def32 = ssd->acc2 >> (1 + 1);
 309         usd->usd_gran = ssd->acc2 >> (1 + 1 + 1);
 310 }
 311 
 312 
 313 #if defined(__i386)
 314 
 315 static void
 316 ssd_to_sgd(struct ssd *ssd, gate_desc_t *sgd)
 317 {
 318 
 319         ASSERT(bcmp(sgd, &null_sdesc, sizeof (*sgd)) == 0);
 320 
 321         sgd->sgd_looffset = ssd->bo;
 322         sgd->sgd_hioffset = ssd->bo >> 16;
 323 
 324         sgd->sgd_selector = ssd->ls;
 325 
 326         /*
 327          * set type, dpl and present bits.
 328          */
 329         sgd->sgd_type = ssd->acc1;
 330         sgd->sgd_dpl = ssd->acc1 >> 5;
 331         sgd->sgd_p = ssd->acc1 >> 7;
 332         ASSERT(sgd->sgd_type == SDT_SYSCGT);
 333         ASSERT(sgd->sgd_dpl == SEL_UPL);
 334         sgd->sgd_stkcpy = 0;
 335 }
 336 
 337 #endif  /* __i386 */
 338 
 339 /*
 340  * Load LDT register with the current process's LDT.
 341  */
 342 static void
 343 ldt_load(void)
 344 {
 345 #if defined(__xpv)
 346         xen_set_ldt(get_ssd_base(&curproc->p_ldt_desc),
 347             curproc->p_ldtlimit + 1);
 348 #else
 349         *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = curproc->p_ldt_desc;












 350         wr_ldtr(ULDT_SEL);
 351 #endif
 352 }
 353 
 354 /*
 355  * Store a NULL selector in the LDTR. All subsequent illegal references to
 356  * the LDT will result in a #gp.
 357  */
 358 void
 359 ldt_unload(void)
 360 {
 361 #if defined(__xpv)
 362         xen_set_ldt(NULL, 0);
 363 #else
 364         *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = null_sdesc;
 365         wr_ldtr(0);



 366 #endif
 367 }
 368 
/*
 * Process context op (registered through installpctx() in
 * ldt_installctx()) for a process with a private LDT.  'p' must be the
 * current process.  Clears stale segment registers on amd64, unloads
 * the LDT and re-enables fast system calls.
 */
/*ARGSUSED*/
static void
ldt_savectx(proc_t *p)
{
        ASSERT(p->p_ldt != NULL);
        ASSERT(p == curproc);

#if defined(__amd64)
        /*
         * The 64-bit kernel must be sure to clear any stale ldt
         * selectors when context switching away from a process that
         * has a private ldt. Consider the following example:
         *
         *      Wine creates a ldt descriptor and points a segment register
         *      to it.
         *
         *      We then context switch away from wine lwp to kernel
         *      thread and hit breakpoint in kernel with kmdb
         *
         *      When we continue and resume from kmdb we will #gp
         *      fault since kmdb will have saved the stale ldt selector
         *      from wine and will try to restore it but we are no longer in
         *      the context of the wine process and do not have our
         *      ldtr register pointing to the private ldt.
         */
        reset_sregs();
#endif

        ldt_unload();
        cpu_fast_syscall_enable(NULL);
}
 400 
/*
 * Process context op (registered through installpctx() in
 * ldt_installctx()) that undoes ldt_savectx(): reloads the private LDT
 * of 'p', which must be the current process, and disables fast system
 * calls again.
 */
static void
ldt_restorectx(proc_t *p)
{
        ASSERT(p->p_ldt != NULL);
        ASSERT(p == curproc);

        ldt_load();
        cpu_fast_syscall_disable(NULL);
}
 410 
/*
 * When a process with a private LDT execs, fast syscalls must be enabled for
 * the new process image.
 *
 * 'isexec' is non-zero when the context is being freed because of an
 * exec; only then are fast system calls re-enabled here.  In all cases
 * the private LDT of 'p' is released.
 */
/* ARGSUSED */
static void
ldt_freectx(proc_t *p, int isexec)
{
        ASSERT(p->p_ldt);

        if (isexec) {
                /* Preemption is disabled around the per-CPU update. */
                kpreempt_disable();
                cpu_fast_syscall_enable(NULL);
                kpreempt_enable();
        }

        /*
         * ldt_free() will free the memory used by the private LDT, reset the
         * process's descriptor, and re-program the LDTR.
         */
        ldt_free(p);
}
 433 
 434 /*
 435  * Install ctx op that ensures syscall/sysenter are disabled.
 436  * See comments below.
 437  *
 438  * When a thread with a private LDT forks, the new process
 439  * must have the LDT context ops installed.
 440  */
 441 /* ARGSUSED */
 442 static void
 443 ldt_installctx(proc_t *p, proc_t *cp)
 444 {
 445         proc_t          *targ = p;
 446         kthread_t       *t;
 447 
 448         /*
 449          * If this is a fork, operate on the child process.
 450          */
 451         if (cp != NULL) {
 452                 targ = cp;
 453                 ldt_dup(p, cp);
 454         }
 455 
 456         /*
 457          * The process context ops expect the target process as their argument.
 458          */
 459         ASSERT(removepctx(targ, targ, ldt_savectx, ldt_restorectx,
 460             ldt_installctx, ldt_savectx, ldt_freectx) == 0);
 461 
 462         installpctx(targ, targ, ldt_savectx, ldt_restorectx,
 463             ldt_installctx, ldt_savectx, ldt_freectx);
 464 
 465         /*
 466          * We've just disabled fast system call and return instructions; take
 467          * the slow path out to make sure we don't try to use one to return
 468          * back to user. We must set t_post_sys for every thread in the
 469          * process to make sure none of them escape out via fast return.
 470          */
 471 
 472         mutex_enter(&targ->p_lock);
 473         t = targ->p_tlist;
 474         do {
 475                 t->t_post_sys = 1;
 476         } while ((t = t->t_forw) != targ->p_tlist);
 477         mutex_exit(&targ->p_lock);
 478 }
 479 
 480 int
 481 setdscr(struct ssd *ssd)
 482 {
 483         ushort_t seli;          /* selector index */
 484         user_desc_t *ldp;       /* descriptor pointer */
 485         user_desc_t ndesc;      /* new descriptor */
 486         proc_t  *pp = ttoproc(curthread);
 487         int     rc = 0;
 488 
 489         /*
 490          * LDT segments: executable and data at DPL 3 only.
 491          */
 492         if (!SELISLDT(ssd->sel) || !SELISUPL(ssd->sel))
 493                 return (EINVAL);
 494 
 495         /*
 496          * check the selector index.
 497          */
 498         seli = SELTOIDX(ssd->sel);
 499         if (seli >= MAXNLDT || seli < LDT_UDBASE)
 500                 return (EINVAL);
 501 
 502         ndesc = null_udesc;
 503         mutex_enter(&pp->p_ldtlock);
 504 
 505         /*
 506          * If this is the first time for this process then setup a
 507          * private LDT for it.
 508          */
 509         if (pp->p_ldt == NULL) {
 510                 ldt_alloc(pp, seli);
 511 
 512                 /*
 513                  * Now that this process has a private LDT, the use of
 514                  * the syscall/sysret and sysenter/sysexit instructions
 515                  * is forbidden for this processes because they destroy
 516                  * the contents of %cs and %ss segment registers.
 517                  *
 518                  * Explicity disable them here and add a context handler
 519                  * to the process. Note that disabling
 520                  * them here means we can't use sysret or sysexit on
 521                  * the way out of this system call - so we force this
 522                  * thread to take the slow path (which doesn't make use
 523                  * of sysenter or sysexit) back out.
 524                  */
 525                 kpreempt_disable();
 526                 ldt_installctx(pp, NULL);
 527                 cpu_fast_syscall_disable(NULL);
 528                 ASSERT(curthread->t_post_sys != 0);
 529                 kpreempt_enable();
 530 
 531         } else if (seli > pp->p_ldtlimit) {
 532 
 533                 /*
 534                  * Increase size of ldt to include seli.
 535                  */
 536                 ldt_grow(pp, seli);
 537         }
 538 
 539         ASSERT(seli <= pp->p_ldtlimit);
 540         ldp = &pp->p_ldt[seli];
 541 
 542         /*
 543          * On the 64-bit kernel, this is where things get more subtle.
 544          * Recall that in the 64-bit kernel, when we enter the kernel we
 545          * deliberately -don't- reload the segment selectors we came in on
 546          * for %ds, %es, %fs or %gs. Messing with selectors is expensive,
 547          * and the underlying descriptors are essentially ignored by the
 548          * hardware in long mode - except for the base that we override with
 549          * the gsbase MSRs.
 550          *
 551          * However, there's one unfortunate issue with this rosy picture --
 552          * a descriptor that's not marked as 'present' will still generate
 553          * an #np when loading a segment register.
 554          *
 555          * Consider this case.  An lwp creates a harmless LDT entry, points
 556          * one of it's segment registers at it, then tells the kernel (here)
 557          * to delete it.  In the 32-bit kernel, the #np will happen on the
 558          * way back to userland where we reload the segment registers, and be
 559          * handled in kern_gpfault().  In the 64-bit kernel, the same thing
 560          * will happen in the normal case too.  However, if we're trying to
 561          * use a debugger that wants to save and restore the segment registers,
 562          * and the debugger things that we have valid segment registers, we
 563          * have the problem that the debugger will try and restore the
 564          * segment register that points at the now 'not present' descriptor
 565          * and will take a #np right there.
 566          *
 567          * We should obviously fix the debugger to be paranoid about
 568          * -not- restoring segment registers that point to bad descriptors;
 569          * however we can prevent the problem here if we check to see if any
 570          * of the segment registers are still pointing at the thing we're
 571          * destroying; if they are, return an error instead. (That also seems
 572          * a lot better failure mode than SIGKILL and a core file
 573          * from kern_gpfault() too.)
 574          */
 575         if (SI86SSD_PRES(ssd) == 0) {
 576                 kthread_t *t;
 577                 int bad = 0;
 578 
 579                 /*
 580                  * Look carefully at the segment registers of every lwp
 581                  * in the process (they're all stopped by our caller).
 582                  * If we're about to invalidate a descriptor that's still
 583                  * being referenced by *any* of them, return an error,
 584                  * rather than having them #gp on their way out of the kernel.
 585                  */
 586                 ASSERT(pp->p_lwprcnt == 1);
 587 
 588                 mutex_enter(&pp->p_lock);
 589                 t = pp->p_tlist;
 590                 do {
 591                         klwp_t *lwp = ttolwp(t);
 592                         struct regs *rp = lwp->lwp_regs;
 593 #if defined(__amd64)
 594                         pcb_t *pcb = &lwp->lwp_pcb;
 595 #endif
 596 
 597                         if (ssd->sel == rp->r_cs || ssd->sel == rp->r_ss) {
 598                                 bad = 1;
 599                                 break;
 600                         }
 601 
 602 #if defined(__amd64)
 603                         if (pcb->pcb_rupdate == 1) {
 604                                 if (ssd->sel == pcb->pcb_ds ||
 605                                     ssd->sel == pcb->pcb_es ||
 606                                     ssd->sel == pcb->pcb_fs ||
 607                                     ssd->sel == pcb->pcb_gs) {
 608                                         bad = 1;
 609                                         break;
 610                                 }
 611                         } else
 612 #endif
 613                         {
 614                                 if (ssd->sel == rp->r_ds ||
 615                                     ssd->sel == rp->r_es ||
 616                                     ssd->sel == rp->r_fs ||
 617                                     ssd->sel == rp->r_gs) {
 618                                         bad = 1;
 619                                         break;
 620                                 }
 621                         }
 622 
 623                 } while ((t = t->t_forw) != pp->p_tlist);
 624                 mutex_exit(&pp->p_lock);
 625 
 626                 if (bad) {
 627                         mutex_exit(&pp->p_ldtlock);
 628                         return (EBUSY);
 629                 }
 630         }
 631 
 632         /*
 633          * If acc1 is zero, clear the descriptor (including the 'present' bit)
 634          */
 635         if (ssd->acc1 == 0) {
 636                 rc  = ldt_update_segd(ldp, &null_udesc);
 637                 mutex_exit(&pp->p_ldtlock);
 638                 return (rc);
 639         }
 640 
 641         /*
 642          * Check segment type, allow segment not present and
 643          * only user DPL (3).
 644          */
 645         if (SI86SSD_DPL(ssd) != SEL_UPL) {
 646                 mutex_exit(&pp->p_ldtlock);
 647                 return (EINVAL);
 648         }
 649 
 650 #if defined(__amd64)
 651         /*
 652          * Do not allow 32-bit applications to create 64-bit mode code
 653          * segments.
 654          */
 655         if (SI86SSD_ISUSEG(ssd) && ((SI86SSD_TYPE(ssd) >> 3) & 1) == 1 &&
 656             SI86SSD_ISLONG(ssd)) {
 657                 mutex_exit(&pp->p_ldtlock);
 658                 return (EINVAL);
 659         }
 660 #endif /* __amd64 */
 661 
 662         /*
 663          * Set up a code or data user segment descriptor.
 664          */
 665         if (SI86SSD_ISUSEG(ssd)) {
 666                 ssd_to_usd(ssd, &ndesc);
 667                 rc = ldt_update_segd(ldp, &ndesc);
 668                 mutex_exit(&pp->p_ldtlock);
 669                 return (rc);
 670         }
 671 
 672 #if defined(__i386)
 673         /*
 674          * Allow a call gate only if the destination is in the LDT
 675          * and the system is running in 32-bit legacy mode.
 676          *
 677          * In long mode 32-bit call gates are redefined as 64-bit call
 678          * gates and the hw enforces that the target code selector
 679          * of the call gate must be 64-bit selector. A #gp fault is
 680          * generated if otherwise. Since we do not allow 32-bit processes
 681          * to switch themselves to 64-bits we never allow call gates
 682          * on 64-bit system system.
 683          */
 684         if (SI86SSD_TYPE(ssd) == SDT_SYSCGT && SELISLDT(ssd->ls)) {
 685 
 686 
 687                 ssd_to_sgd(ssd, (gate_desc_t *)&ndesc);
 688                 rc = ldt_update_segd(ldp, &ndesc);
 689                 mutex_exit(&pp->p_ldtlock);
 690                 return (rc);
 691         }
 692 #endif  /* __i386 */
 693 
 694         mutex_exit(&pp->p_ldtlock);
 695         return (EINVAL);
 696 }
 697 
 698 /*
 699  * Allocate new LDT for process just large enough to contain seli.
 700  * Note we allocate and grow LDT in PAGESIZE chunks. We do this
 701  * to simplify the implementation and because on the hypervisor it's
 702  * required, since the LDT must live on pages that have PROT_WRITE
 703  * removed and which are given to the hypervisor.
 704  */
 705 static void
 706 ldt_alloc(proc_t *pp, uint_t seli)
 707 {
 708         user_desc_t     *ldt;
 709         size_t          ldtsz;
 710         uint_t          nsels;
 711 
 712         ASSERT(MUTEX_HELD(&pp->p_ldtlock));
 713         ASSERT(pp->p_ldt == NULL);
 714         ASSERT(pp->p_ldtlimit == 0);
 715 
 716         /*
 717          * Allocate new LDT just large enough to contain seli.

 718          */
 719         ldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
 720         nsels = ldtsz / sizeof (user_desc_t);
 721         ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
 722 
 723         ldt = kmem_zalloc(ldtsz, KM_SLEEP);
 724         ASSERT(IS_P2ALIGNED(ldt, PAGESIZE));
 725 
 726 #if defined(__xpv)
 727         if (xen_ldt_setprot(ldt, ldtsz, PROT_READ))
 728                 panic("ldt_alloc:xen_ldt_setprot(PROT_READ) failed");
 729 #endif
 730 
 731         pp->p_ldt = ldt;
 732         pp->p_ldtlimit = nsels - 1;
 733         set_syssegd(&pp->p_ldt_desc, ldt, ldtsz - 1, SDT_SYSLDT, SEL_KPL);
 734 
 735         if (pp == curproc) {
 736                 kpreempt_disable();
 737                 ldt_load();
 738                 kpreempt_enable();
 739         }
 740 }
 741 
 742 static void
 743 ldt_free(proc_t *pp)
 744 {
 745         user_desc_t     *ldt;
 746         size_t          ldtsz;
 747 
 748         ASSERT(pp->p_ldt != NULL);
 749 
 750         mutex_enter(&pp->p_ldtlock);
 751         ldt = pp->p_ldt;
 752         ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
 753 
 754         ASSERT(IS_P2ALIGNED(ldtsz, PAGESIZE));
 755 
 756         pp->p_ldt = NULL;
 757         pp->p_ldtlimit = 0;
 758         pp->p_ldt_desc = null_sdesc;
 759         mutex_exit(&pp->p_ldtlock);
 760 
 761         if (pp == curproc) {
 762                 kpreempt_disable();
 763                 ldt_unload();
 764                 kpreempt_enable();
 765         }
 766 
 767 #if defined(__xpv)
 768         /*
 769          * We are not allowed to make the ldt writable until after
 770          * we tell the hypervisor to unload it.
 771          */
 772         if (xen_ldt_setprot(ldt, ldtsz, PROT_READ | PROT_WRITE))
 773                 panic("ldt_free:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
 774 #endif
 775 
 776         kmem_free(ldt, ldtsz);
 777 }
 778 
 779 /*
 780  * On fork copy new ldt for child.
 781  */
 782 static void
 783 ldt_dup(proc_t *pp, proc_t *cp)
 784 {
 785         size_t  ldtsz;
 786 
 787         ASSERT(pp->p_ldt != NULL);
 788         ASSERT(cp != curproc);
 789 
 790         /*
 791          * I assume the parent's ldt can't increase since we're in a fork.
 792          */
 793         mutex_enter(&pp->p_ldtlock);
 794         mutex_enter(&cp->p_ldtlock);
 795 
 796         ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
 797 
 798         ldt_alloc(cp, pp->p_ldtlimit);
 799 
 800 #if defined(__xpv)
 801         /*
 802          * Make child's ldt writable so it can be copied into from
 803          * parent's ldt. This works since ldt_alloc above did not load
 804          * the ldt since its for the child process. If we tried to make
 805          * an LDT writable that is loaded in hw the setprot operation
 806          * would fail.
 807          */
 808         if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ | PROT_WRITE))
 809                 panic("ldt_dup:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
 810 #endif
 811 
 812         bcopy(pp->p_ldt, cp->p_ldt, ldtsz);
 813 
 814 #if defined(__xpv)
 815         if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ))
 816                 panic("ldt_dup:xen_ldt_setprot(PROT_READ) failed");
 817 #endif
 818         mutex_exit(&cp->p_ldtlock);
 819         mutex_exit(&pp->p_ldtlock);
 820 
 821 }
 822 
 823 static void
 824 ldt_grow(proc_t *pp, uint_t seli)
 825 {
 826         user_desc_t     *oldt, *nldt;
 827         uint_t          nsels;
 828         size_t          oldtsz, nldtsz;
 829 
 830         ASSERT(MUTEX_HELD(&pp->p_ldtlock));
 831         ASSERT(pp->p_ldt != NULL);
 832         ASSERT(pp->p_ldtlimit != 0);
 833 
 834         /*
 835          * Allocate larger LDT just large enough to contain seli.

 836          */
 837         nldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
 838         nsels = nldtsz / sizeof (user_desc_t);
 839         ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
 840         ASSERT(nsels > pp->p_ldtlimit);
 841 
 842         oldt = pp->p_ldt;
 843         oldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
 844 
 845         nldt = kmem_zalloc(nldtsz, KM_SLEEP);
 846         ASSERT(IS_P2ALIGNED(nldt, PAGESIZE));
 847 
 848         bcopy(oldt, nldt, oldtsz);
 849 
 850         /*
 851          * unload old ldt.
 852          */
 853         kpreempt_disable();
 854         ldt_unload();
 855         kpreempt_enable();
 856 
 857 #if defined(__xpv)
 858 
 859         /*
 860          * Make old ldt writable and new ldt read only.
 861          */
 862         if (xen_ldt_setprot(oldt, oldtsz, PROT_READ | PROT_WRITE))
 863                 panic("ldt_grow:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
 864 
 865         if (xen_ldt_setprot(nldt, nldtsz, PROT_READ))
 866                 panic("ldt_grow:xen_ldt_setprot(PROT_READ) failed");
 867 #endif
 868 
 869         pp->p_ldt = nldt;
 870         pp->p_ldtlimit = nsels - 1;
 871 
 872         /*
 873          * write new ldt segment descriptor.
 874          */
 875         set_syssegd(&pp->p_ldt_desc, nldt, nldtsz - 1, SDT_SYSLDT, SEL_KPL);
 876 
 877         /*
 878          * load the new ldt.
 879          */
 880         kpreempt_disable();
 881         ldt_load();
 882         kpreempt_enable();
 883 
 884         kmem_free(oldt, oldtsz);
 885 }
--- EOF ---