8956 Implement KPTI
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2018 Joyent, Inc.
  24  */
  25 
  26 /*      Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
  27 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T       */
  28 /*        All Rights Reserved   */
  29 
  30 /*      Copyright (c) 1987, 1988 Microsoft Corporation  */
  31 /*        All Rights Reserved   */
  32 
  33 #include <sys/param.h>
  34 #include <sys/types.h>
  35 #include <sys/sysmacros.h>
  36 #include <sys/systm.h>
  37 #include <sys/signal.h>
  38 #include <sys/errno.h>
  39 #include <sys/fault.h>
  40 #include <sys/syscall.h>
  41 #include <sys/cpuvar.h>
  42 #include <sys/sysi86.h>
  43 #include <sys/psw.h>
  44 #include <sys/cred.h>
  45 #include <sys/policy.h>
  46 #include <sys/thread.h>
  47 #include <sys/debug.h>
  48 #include <sys/ontrap.h>
  49 #include <sys/privregs.h>
  50 #include <sys/x86_archext.h>
  51 #include <sys/vmem.h>
  52 #include <sys/kmem.h>
  53 #include <sys/mman.h>
  54 #include <sys/archsystm.h>
  55 #include <vm/hat.h>
  56 #include <vm/as.h>
  57 #include <vm/seg.h>
  58 #include <vm/seg_kmem.h>
  59 #include <vm/faultcode.h>
  60 #include <sys/fp.h>
  61 #include <sys/cmn_err.h>
  62 #include <sys/segments.h>
  63 #include <sys/clock.h>
  64 #include <vm/hat_i86.h>
  65 #if defined(__xpv)
  66 #include <sys/hypervisor.h>
  67 #include <sys/note.h>
  68 #endif
  69 
  70 static void ldt_alloc(proc_t *, uint_t);
  71 static void ldt_free(proc_t *);
  72 static void ldt_dup(proc_t *, proc_t *);
  73 static void ldt_grow(proc_t *, uint_t);
  74 
  75 /*
  76  * sysi86 System Call
  77  */
  78 
  79 /* ARGSUSED */
  80 int
  81 sysi86(short cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
  82 {
  83         struct ssd ssd;
  84         int error = 0;
  85         int c;
  86         proc_t *pp = curproc;
  87 
  88         switch (cmd) {
  89 
  90         /*
  91          * The SI86V86 subsystem call of the SYSI86 system call
  92          * supports only one subcode -- V86SC_IOPL.
  93          */
  94         case SI86V86:
  95                 if (arg1 == V86SC_IOPL) {
  96                         struct regs *rp = lwptoregs(ttolwp(curthread));
  97                         greg_t oldpl = rp->r_ps & PS_IOPL;
  98                         greg_t newpl = arg2 & PS_IOPL;
  99 
 100                         /*
 101                          * Must be privileged to run this system call
 102                          * if giving more io privilege.
 103                          */
 104                         if (newpl > oldpl && (error =
 105                             secpolicy_sys_config(CRED(), B_FALSE)) != 0)
 106                                 return (set_errno(error));
 107 #if defined(__xpv)
 108                         kpreempt_disable();
 109                         installctx(curthread, NULL, xen_disable_user_iopl,
 110                             xen_enable_user_iopl, NULL, NULL,
 111                             xen_disable_user_iopl, NULL);
 112                         xen_enable_user_iopl();
 113                         kpreempt_enable();
 114 #else
 115                         rp->r_ps ^= oldpl ^ newpl;
 116 #endif
 117                 } else
 118                         error = EINVAL;
 119                 break;
 120 
 121         /*
 122          * Set a segment descriptor
 123          */
 124         case SI86DSCR:
 125                 /*
 126                  * There are considerable problems here manipulating
 127                  * resources shared by many running lwps.  Get everyone
 128                  * into a safe state before changing the LDT.
 129                  */
 130                 if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK1)) {
 131                         error = EINTR;
 132                         break;
 133                 }
 134 
 135                 if (get_udatamodel() == DATAMODEL_LP64) {
 136                         error = EINVAL;
 137                         break;
 138                 }
 139 
 140                 if (copyin((caddr_t)arg1, &ssd, sizeof (ssd)) < 0) {
 141                         error = EFAULT;
 142                         break;
 143                 }
 144 
 145                 error = setdscr(&ssd);
 146 
 147                 mutex_enter(&pp->p_lock);
 148                 if (curthread != pp->p_agenttp)
 149                         continuelwps(pp);
 150                 mutex_exit(&pp->p_lock);
 151                 break;
 152 
 153         case SI86FPHW:
 154                 c = fp_kind & 0xff;
 155                 if (suword32((void *)arg1, c) == -1)
 156                         error = EFAULT;
 157                 break;
 158 
 159         case SI86FPSTART:
 160                 /*
 161                  * arg1 is the address of _fp_hw
 162                  * arg2 is the desired x87 FCW value
 163                  * arg3 is the desired SSE MXCSR value
 164                  * a return value of one means SSE hardware, else none.
 165                  */
 166                 c = fp_kind & 0xff;
 167                 if (suword32((void *)arg1, c) == -1) {
 168                         error = EFAULT;
 169                         break;
 170                 }
 171                 fpsetcw((uint16_t)arg2, (uint32_t)arg3);
 172                 return ((fp_kind & __FP_SSE) ? 1 : 0);
 173 
 174         /* real time clock management commands */
 175 
 176         case WTODC:
 177                 if ((error = secpolicy_settime(CRED())) == 0) {
 178                         timestruc_t ts;
 179                         mutex_enter(&tod_lock);
 180                         gethrestime(&ts);
 181                         tod_set(ts);
 182                         mutex_exit(&tod_lock);
 183                 }
 184                 break;
 185 
 186 /* Give some timezone playing room */
 187 #define ONEWEEK (7 * 24 * 60 * 60)
 188 
 189         case SGMTL:
 190                 /*
 191                  * Called from 32 bit land, negative values
 192                  * are not sign extended, so we do that here
 193                  * by casting it to an int and back.  We also
 194                  * clamp the value to within reason and detect
 195                  * when a 64 bit call overflows an int.
 196                  */
 197                 if ((error = secpolicy_settime(CRED())) == 0) {
 198                         int newlag = (int)arg1;
 199 
 200 #ifdef _SYSCALL32_IMPL
 201                         if (get_udatamodel() == DATAMODEL_NATIVE &&
 202                             (long)newlag != (long)arg1) {
 203                                 error = EOVERFLOW;
 204                         } else
 205 #endif
 206                         if (newlag >= -ONEWEEK && newlag <= ONEWEEK)
 207                                 sgmtl(newlag);
 208                         else
 209                                 error = EOVERFLOW;
 210                 }
 211                 break;
 212 
 213         case GGMTL:
 214                 if (get_udatamodel() == DATAMODEL_NATIVE) {
 215                         if (sulword((void *)arg1, ggmtl()) == -1)
 216                                 error = EFAULT;
 217 #ifdef _SYSCALL32_IMPL
 218                 } else {
 219                         time_t gmtl;
 220 
 221                         if ((gmtl = ggmtl()) > INT32_MAX) {
 222                                 /*
 223                                  * Since gmt_lag can at most be
 224                                  * +/- 12 hours, something is
 225                                  * *seriously* messed up here.
 226                                  */
 227                                 error = EOVERFLOW;
 228                         } else if (suword32((void *)arg1, (int32_t)gmtl) == -1)
 229                                 error = EFAULT;
 230 #endif
 231                 }
 232                 break;
 233 
 234         case RTCSYNC:
 235                 if ((error = secpolicy_settime(CRED())) == 0)
 236                         rtcsync();
 237                 break;
 238 
 239         /* END OF real time clock management commands */
 240 
 241         default:
 242                 error = EINVAL;
 243                 break;
 244         }
 245         return (error == 0 ? 0 : set_errno(error));
 246 }
 247 
 248 void
 249 usd_to_ssd(user_desc_t *usd, struct ssd *ssd, selector_t sel)
 250 {
 251         ssd->bo = USEGD_GETBASE(usd);
 252         ssd->ls = USEGD_GETLIMIT(usd);
 253         ssd->sel = sel;
 254 
 255         /*
 256          * set type, dpl and present bits.
 257          */
 258         ssd->acc1 = usd->usd_type;
 259         ssd->acc1 |= usd->usd_dpl << 5;
 260         ssd->acc1 |= usd->usd_p << (5 + 2);
 261 
 262         /*
 263          * set avl, DB and granularity bits.
 264          */
 265         ssd->acc2 = usd->usd_avl;
 266 
 267 #if defined(__amd64)
 268         ssd->acc2 |= usd->usd_long << 1;
 269 #else
 270         ssd->acc2 |= usd->usd_reserved << 1;
 271 #endif
 272 
 273         ssd->acc2 |= usd->usd_def32 << (1 + 1);
 274         ssd->acc2 |= usd->usd_gran << (1 + 1 + 1);
 275 }
 276 
 277 static void
 278 ssd_to_usd(struct ssd *ssd, user_desc_t *usd)
 279 {
 280 
 281         ASSERT(bcmp(usd, &null_udesc, sizeof (*usd)) == 0);
 282 
 283         USEGD_SETBASE(usd, ssd->bo);
 284         USEGD_SETLIMIT(usd, ssd->ls);
 285 
 286         /*
 287          * set type, dpl and present bits.
 288          */
 289         usd->usd_type = ssd->acc1;
 290         usd->usd_dpl = ssd->acc1 >> 5;
 291         usd->usd_p = ssd->acc1 >> (5 + 2);
 292 
 293         ASSERT(usd->usd_type >= SDT_MEMRO);
 294         ASSERT(usd->usd_dpl == SEL_UPL);
 295 
 296         /*
 297          * 64-bit code selectors are never allowed in the LDT.
 298          * Reserved bit is always 0 on 32-bit systems.
 299          */
 300 #if defined(__amd64)
 301         usd->usd_long = 0;
 302 #else
 303         usd->usd_reserved = 0;
 304 #endif
 305 
 306         /*
 307          * set avl, DB and granularity bits.
 308          */
 309         usd->usd_avl = ssd->acc2;
 310         usd->usd_def32 = ssd->acc2 >> (1 + 1);
 311         usd->usd_gran = ssd->acc2 >> (1 + 1 + 1);
 312 }
 313 
 314 
 315 #if defined(__i386)
 316 
 317 static void
 318 ssd_to_sgd(struct ssd *ssd, gate_desc_t *sgd)
 319 {
 320 
 321         ASSERT(bcmp(sgd, &null_sdesc, sizeof (*sgd)) == 0);
 322 
 323         sgd->sgd_looffset = ssd->bo;
 324         sgd->sgd_hioffset = ssd->bo >> 16;
 325 
 326         sgd->sgd_selector = ssd->ls;
 327 
 328         /*
 329          * set type, dpl and present bits.
 330          */
 331         sgd->sgd_type = ssd->acc1;
 332         sgd->sgd_dpl = ssd->acc1 >> 5;
 333         sgd->sgd_p = ssd->acc1 >> 7;
 334         ASSERT(sgd->sgd_type == SDT_SYSCGT);
 335         ASSERT(sgd->sgd_dpl == SEL_UPL);
 336         sgd->sgd_stkcpy = 0;
 337 }
 338 
 339 #endif  /* __i386 */
 340 
 341 /*
 342  * Load LDT register with the current process's LDT.
 343  */
 344 static void
 345 ldt_load(void)
 346 {
 347 #if defined(__xpv)
 348         xen_set_ldt(get_ssd_base(&curproc->p_ldt_desc),
 349             curproc->p_ldtlimit + 1);
 350 #else
 351         size_t len;
 352         system_desc_t desc;
 353 
 354         /*
 355          * Before we can use the LDT on this CPU, we must install the LDT in the
 356          * user mapping table.
 357          */
 358         len = (curproc->p_ldtlimit + 1) * sizeof (user_desc_t);
 359         bcopy(curproc->p_ldt, CPU->cpu_m.mcpu_ldt, len);
 360         CPU->cpu_m.mcpu_ldt_len = len;
 361         set_syssegd(&desc, CPU->cpu_m.mcpu_ldt, len - 1, SDT_SYSLDT, SEL_KPL);
 362         *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = desc;
 363 
 364         wr_ldtr(ULDT_SEL);
 365 #endif
 366 }
 367 
 368 /*
 369  * Store a NULL selector in the LDTR. All subsequent illegal references to
 370  * the LDT will result in a #gp.
 371  */
 372 void
 373 ldt_unload(void)
 374 {
 375 #if defined(__xpv)
 376         xen_set_ldt(NULL, 0);
 377 #else
 378         *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = null_sdesc;
 379         wr_ldtr(0);
 380 
 381         bzero(CPU->cpu_m.mcpu_ldt, CPU->cpu_m.mcpu_ldt_len);
 382         CPU->cpu_m.mcpu_ldt_len = 0;
 383 #endif
 384 }
 385 
 386 /*ARGSUSED*/
 387 static void
 388 ldt_savectx(proc_t *p)
 389 {
 390         ASSERT(p->p_ldt != NULL);
 391         ASSERT(p == curproc);
 392 
 393 #if defined(__amd64)
 394         /*
 395          * The 64-bit kernel must be sure to clear any stale ldt
 396          * selectors when context switching away from a process that
 397          * has a private ldt. Consider the following example:
 398          *
 399          *      Wine creats a ldt descriptor and points a segment register
 400          *      to it.
 401          *
 402          *      We then context switch away from wine lwp to kernel
 403          *      thread and hit breakpoint in kernel with kmdb
 404          *
 405          *      When we continue and resume from kmdb we will #gp
 406          *      fault since kmdb will have saved the stale ldt selector
 407          *      from wine and will try to restore it but we are no longer in
 408          *      the context of the wine process and do not have our
 409          *      ldtr register pointing to the private ldt.
 410          */
 411         reset_sregs();
 412 #endif
 413 
 414         ldt_unload();
 415         cpu_fast_syscall_enable(NULL);
 416 }
 417 
 418 static void
 419 ldt_restorectx(proc_t *p)
 420 {
 421         ASSERT(p->p_ldt != NULL);
 422         ASSERT(p == curproc);
 423 
 424         ldt_load();
 425         cpu_fast_syscall_disable(NULL);
 426 }
 427 
 428 /*
 429  * When a process with a private LDT execs, fast syscalls must be enabled for
 430  * the new process image.
 431  */
 432 /* ARGSUSED */
 433 static void
 434 ldt_freectx(proc_t *p, int isexec)
 435 {
 436         ASSERT(p->p_ldt);
 437 
 438         if (isexec) {
 439                 kpreempt_disable();
 440                 cpu_fast_syscall_enable(NULL);
 441                 kpreempt_enable();
 442         }
 443 
 444         /*
 445          * ldt_free() will free the memory used by the private LDT, reset the
 446          * process's descriptor, and re-program the LDTR.
 447          */
 448         ldt_free(p);
 449 }
 450 
 451 /*
 452  * Install ctx op that ensures syscall/sysenter are disabled.
 453  * See comments below.
 454  *
 455  * When a thread with a private LDT forks, the new process
 456  * must have the LDT context ops installed.
 457  */
 458 /* ARGSUSED */
 459 static void
 460 ldt_installctx(proc_t *p, proc_t *cp)
 461 {
 462         proc_t          *targ = p;
 463         kthread_t       *t;
 464 
 465         /*
 466          * If this is a fork, operate on the child process.
 467          */
 468         if (cp != NULL) {
 469                 targ = cp;
 470                 ldt_dup(p, cp);
 471         }
 472 
 473         /*
 474          * The process context ops expect the target process as their argument.
 475          */
 476         ASSERT(removepctx(targ, targ, ldt_savectx, ldt_restorectx,
 477             ldt_installctx, ldt_savectx, ldt_freectx) == 0);
 478 
 479         installpctx(targ, targ, ldt_savectx, ldt_restorectx,
 480             ldt_installctx, ldt_savectx, ldt_freectx);
 481 
 482         /*
 483          * We've just disabled fast system call and return instructions; take
 484          * the slow path out to make sure we don't try to use one to return
 485          * back to user. We must set t_post_sys for every thread in the
 486          * process to make sure none of them escape out via fast return.
 487          */
 488 
 489         mutex_enter(&targ->p_lock);
 490         t = targ->p_tlist;
 491         do {
 492                 t->t_post_sys = 1;
 493         } while ((t = t->t_forw) != targ->p_tlist);
 494         mutex_exit(&targ->p_lock);
 495 }
 496 
 497 int
 498 setdscr(struct ssd *ssd)
 499 {
 500         ushort_t seli;          /* selector index */
 501         user_desc_t *ldp;       /* descriptor pointer */
 502         user_desc_t ndesc;      /* new descriptor */
 503         proc_t  *pp = ttoproc(curthread);
 504         int     rc = 0;
 505 
 506         /*
 507          * LDT segments: executable and data at DPL 3 only.
 508          */
 509         if (!SELISLDT(ssd->sel) || !SELISUPL(ssd->sel))
 510                 return (EINVAL);
 511 
 512         /*
 513          * check the selector index.
 514          */
 515         seli = SELTOIDX(ssd->sel);
 516         if (seli >= MAXNLDT || seli < LDT_UDBASE)
 517                 return (EINVAL);
 518 
 519         ndesc = null_udesc;
 520         mutex_enter(&pp->p_ldtlock);
 521 
 522         /*
 523          * If this is the first time for this process then setup a
 524          * private LDT for it.
 525          */
 526         if (pp->p_ldt == NULL) {
 527                 ldt_alloc(pp, seli);
 528 
 529                 /*
 530                  * Now that this process has a private LDT, the use of
 531                  * the syscall/sysret and sysenter/sysexit instructions
 532                  * is forbidden for this processes because they destroy
 533                  * the contents of %cs and %ss segment registers.
 534                  *
 535                  * Explicity disable them here and add a context handler
 536                  * to the process. Note that disabling
 537                  * them here means we can't use sysret or sysexit on
 538                  * the way out of this system call - so we force this
 539                  * thread to take the slow path (which doesn't make use
 540                  * of sysenter or sysexit) back out.
 541                  */
 542                 kpreempt_disable();
 543                 ldt_installctx(pp, NULL);
 544                 cpu_fast_syscall_disable(NULL);
 545                 ASSERT(curthread->t_post_sys != 0);
 546                 kpreempt_enable();
 547 
 548         } else if (seli > pp->p_ldtlimit) {
 549 
 550                 /*
 551                  * Increase size of ldt to include seli.
 552                  */
 553                 ldt_grow(pp, seli);
 554         }
 555 
 556         ASSERT(seli <= pp->p_ldtlimit);
 557         ldp = &pp->p_ldt[seli];
 558 
 559         /*
 560          * On the 64-bit kernel, this is where things get more subtle.
 561          * Recall that in the 64-bit kernel, when we enter the kernel we
 562          * deliberately -don't- reload the segment selectors we came in on
 563          * for %ds, %es, %fs or %gs. Messing with selectors is expensive,
 564          * and the underlying descriptors are essentially ignored by the
 565          * hardware in long mode - except for the base that we override with
 566          * the gsbase MSRs.
 567          *
 568          * However, there's one unfortunate issue with this rosy picture --
 569          * a descriptor that's not marked as 'present' will still generate
 570          * an #np when loading a segment register.
 571          *
 572          * Consider this case.  An lwp creates a harmless LDT entry, points
 573          * one of it's segment registers at it, then tells the kernel (here)
 574          * to delete it.  In the 32-bit kernel, the #np will happen on the
 575          * way back to userland where we reload the segment registers, and be
 576          * handled in kern_gpfault().  In the 64-bit kernel, the same thing
 577          * will happen in the normal case too.  However, if we're trying to
 578          * use a debugger that wants to save and restore the segment registers,
 579          * and the debugger things that we have valid segment registers, we
 580          * have the problem that the debugger will try and restore the
 581          * segment register that points at the now 'not present' descriptor
 582          * and will take a #np right there.
 583          *
 584          * We should obviously fix the debugger to be paranoid about
 585          * -not- restoring segment registers that point to bad descriptors;
 586          * however we can prevent the problem here if we check to see if any
 587          * of the segment registers are still pointing at the thing we're
 588          * destroying; if they are, return an error instead. (That also seems
 589          * a lot better failure mode than SIGKILL and a core file
 590          * from kern_gpfault() too.)
 591          */
 592         if (SI86SSD_PRES(ssd) == 0) {
 593                 kthread_t *t;
 594                 int bad = 0;
 595 
 596                 /*
 597                  * Look carefully at the segment registers of every lwp
 598                  * in the process (they're all stopped by our caller).
 599                  * If we're about to invalidate a descriptor that's still
 600                  * being referenced by *any* of them, return an error,
 601                  * rather than having them #gp on their way out of the kernel.
 602                  */
 603                 ASSERT(pp->p_lwprcnt == 1);
 604 
 605                 mutex_enter(&pp->p_lock);
 606                 t = pp->p_tlist;
 607                 do {
 608                         klwp_t *lwp = ttolwp(t);
 609                         struct regs *rp = lwp->lwp_regs;
 610 #if defined(__amd64)
 611                         pcb_t *pcb = &lwp->lwp_pcb;
 612 #endif
 613 
 614                         if (ssd->sel == rp->r_cs || ssd->sel == rp->r_ss) {
 615                                 bad = 1;
 616                                 break;
 617                         }
 618 
 619 #if defined(__amd64)
 620                         if (pcb->pcb_rupdate == 1) {
 621                                 if (ssd->sel == pcb->pcb_ds ||
 622                                     ssd->sel == pcb->pcb_es ||
 623                                     ssd->sel == pcb->pcb_fs ||
 624                                     ssd->sel == pcb->pcb_gs) {
 625                                         bad = 1;
 626                                         break;
 627                                 }
 628                         } else
 629 #endif
 630                         {
 631                                 if (ssd->sel == rp->r_ds ||
 632                                     ssd->sel == rp->r_es ||
 633                                     ssd->sel == rp->r_fs ||
 634                                     ssd->sel == rp->r_gs) {
 635                                         bad = 1;
 636                                         break;
 637                                 }
 638                         }
 639 
 640                 } while ((t = t->t_forw) != pp->p_tlist);
 641                 mutex_exit(&pp->p_lock);
 642 
 643                 if (bad) {
 644                         mutex_exit(&pp->p_ldtlock);
 645                         return (EBUSY);
 646                 }
 647         }
 648 
 649         /*
 650          * If acc1 is zero, clear the descriptor (including the 'present' bit)
 651          */
 652         if (ssd->acc1 == 0) {
 653                 rc  = ldt_update_segd(ldp, &null_udesc);
 654                 mutex_exit(&pp->p_ldtlock);
 655                 return (rc);
 656         }
 657 
 658         /*
 659          * Check segment type, allow segment not present and
 660          * only user DPL (3).
 661          */
 662         if (SI86SSD_DPL(ssd) != SEL_UPL) {
 663                 mutex_exit(&pp->p_ldtlock);
 664                 return (EINVAL);
 665         }
 666 
 667 #if defined(__amd64)
 668         /*
 669          * Do not allow 32-bit applications to create 64-bit mode code
 670          * segments.
 671          */
 672         if (SI86SSD_ISUSEG(ssd) && ((SI86SSD_TYPE(ssd) >> 3) & 1) == 1 &&
 673             SI86SSD_ISLONG(ssd)) {
 674                 mutex_exit(&pp->p_ldtlock);
 675                 return (EINVAL);
 676         }
 677 #endif /* __amd64 */
 678 
 679         /*
 680          * Set up a code or data user segment descriptor.
 681          */
 682         if (SI86SSD_ISUSEG(ssd)) {
 683                 ssd_to_usd(ssd, &ndesc);
 684                 rc = ldt_update_segd(ldp, &ndesc);
 685                 mutex_exit(&pp->p_ldtlock);
 686                 return (rc);
 687         }
 688 
 689 #if defined(__i386)
 690         /*
 691          * Allow a call gate only if the destination is in the LDT
 692          * and the system is running in 32-bit legacy mode.
 693          *
 694          * In long mode 32-bit call gates are redefined as 64-bit call
 695          * gates and the hw enforces that the target code selector
 696          * of the call gate must be 64-bit selector. A #gp fault is
 697          * generated if otherwise. Since we do not allow 32-bit processes
 698          * to switch themselves to 64-bits we never allow call gates
 699          * on 64-bit system system.
 700          */
 701         if (SI86SSD_TYPE(ssd) == SDT_SYSCGT && SELISLDT(ssd->ls)) {
 702 
 703 
 704                 ssd_to_sgd(ssd, (gate_desc_t *)&ndesc);
 705                 rc = ldt_update_segd(ldp, &ndesc);
 706                 mutex_exit(&pp->p_ldtlock);
 707                 return (rc);
 708         }
 709 #endif  /* __i386 */
 710 
 711         mutex_exit(&pp->p_ldtlock);
 712         return (EINVAL);
 713 }
 714 
 715 /*
 716  * Allocate new LDT for process just large enough to contain seli.
 717  * Note we allocate and grow LDT in PAGESIZE chunks. We do this
 718  * to simplify the implementation and because on the hypervisor it's
 719  * required, since the LDT must live on pages that have PROT_WRITE
 720  * removed and which are given to the hypervisor.
 721  */
 722 static void
 723 ldt_alloc(proc_t *pp, uint_t seli)
 724 {
 725         user_desc_t     *ldt;
 726         size_t          ldtsz;
 727         uint_t          nsels;
 728 
 729         ASSERT(MUTEX_HELD(&pp->p_ldtlock));
 730         ASSERT(pp->p_ldt == NULL);
 731         ASSERT(pp->p_ldtlimit == 0);
 732 
 733         /*
 734          * Allocate new LDT just large enough to contain seli. The LDT must
 735          * always be allocated in units of pages for KPTI.
 736          */
 737         ldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
 738         nsels = ldtsz / sizeof (user_desc_t);
 739         ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
 740 
 741         ldt = kmem_zalloc(ldtsz, KM_SLEEP);
 742         ASSERT(IS_P2ALIGNED(ldt, PAGESIZE));
 743 
 744 #if defined(__xpv)
 745         if (xen_ldt_setprot(ldt, ldtsz, PROT_READ))
 746                 panic("ldt_alloc:xen_ldt_setprot(PROT_READ) failed");
 747 #endif
 748 
 749         pp->p_ldt = ldt;
 750         pp->p_ldtlimit = nsels - 1;
 751         set_syssegd(&pp->p_ldt_desc, ldt, ldtsz - 1, SDT_SYSLDT, SEL_KPL);
 752 
 753         if (pp == curproc) {
 754                 kpreempt_disable();
 755                 ldt_load();
 756                 kpreempt_enable();
 757         }
 758 }
 759 
 760 static void
 761 ldt_free(proc_t *pp)
 762 {
 763         user_desc_t     *ldt;
 764         size_t          ldtsz;
 765 
 766         ASSERT(pp->p_ldt != NULL);
 767 
 768         mutex_enter(&pp->p_ldtlock);
 769         ldt = pp->p_ldt;
 770         ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
 771 
 772         ASSERT(IS_P2ALIGNED(ldtsz, PAGESIZE));
 773 
 774         pp->p_ldt = NULL;
 775         pp->p_ldtlimit = 0;
 776         pp->p_ldt_desc = null_sdesc;
 777         mutex_exit(&pp->p_ldtlock);
 778 
 779         if (pp == curproc) {
 780                 kpreempt_disable();
 781                 ldt_unload();
 782                 kpreempt_enable();
 783         }
 784 
 785 #if defined(__xpv)
 786         /*
 787          * We are not allowed to make the ldt writable until after
 788          * we tell the hypervisor to unload it.
 789          */
 790         if (xen_ldt_setprot(ldt, ldtsz, PROT_READ | PROT_WRITE))
 791                 panic("ldt_free:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
 792 #endif
 793 
 794         kmem_free(ldt, ldtsz);
 795 }
 796 
 797 /*
 798  * On fork copy new ldt for child.
 799  */
 800 static void
 801 ldt_dup(proc_t *pp, proc_t *cp)
 802 {
 803         size_t  ldtsz;
 804 
 805         ASSERT(pp->p_ldt != NULL);
 806         ASSERT(cp != curproc);
 807 
 808         /*
 809          * I assume the parent's ldt can't increase since we're in a fork.
 810          */
 811         mutex_enter(&pp->p_ldtlock);
 812         mutex_enter(&cp->p_ldtlock);
 813 
 814         ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
 815 
 816         ldt_alloc(cp, pp->p_ldtlimit);
 817 
 818 #if defined(__xpv)
 819         /*
 820          * Make child's ldt writable so it can be copied into from
 821          * parent's ldt. This works since ldt_alloc above did not load
 822          * the ldt since its for the child process. If we tried to make
 823          * an LDT writable that is loaded in hw the setprot operation
 824          * would fail.
 825          */
 826         if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ | PROT_WRITE))
 827                 panic("ldt_dup:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
 828 #endif
 829 
 830         bcopy(pp->p_ldt, cp->p_ldt, ldtsz);
 831 
 832 #if defined(__xpv)
 833         if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ))
 834                 panic("ldt_dup:xen_ldt_setprot(PROT_READ) failed");
 835 #endif
 836         mutex_exit(&cp->p_ldtlock);
 837         mutex_exit(&pp->p_ldtlock);
 838 
 839 }
 840 
 841 static void
 842 ldt_grow(proc_t *pp, uint_t seli)
 843 {
 844         user_desc_t     *oldt, *nldt;
 845         uint_t          nsels;
 846         size_t          oldtsz, nldtsz;
 847 
 848         ASSERT(MUTEX_HELD(&pp->p_ldtlock));
 849         ASSERT(pp->p_ldt != NULL);
 850         ASSERT(pp->p_ldtlimit != 0);
 851 
 852         /*
 853          * Allocate larger LDT just large enough to contain seli. The LDT must
 854          * always be allocated in units of pages for KPTI.
 855          */
 856         nldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
 857         nsels = nldtsz / sizeof (user_desc_t);
 858         ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
 859         ASSERT(nsels > pp->p_ldtlimit);
 860 
 861         oldt = pp->p_ldt;
 862         oldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
 863 
 864         nldt = kmem_zalloc(nldtsz, KM_SLEEP);
 865         ASSERT(IS_P2ALIGNED(nldt, PAGESIZE));
 866 
 867         bcopy(oldt, nldt, oldtsz);
 868 
 869         /*
 870          * unload old ldt.
 871          */
 872         kpreempt_disable();
 873         ldt_unload();
 874         kpreempt_enable();
 875 
 876 #if defined(__xpv)
 877 
 878         /*
 879          * Make old ldt writable and new ldt read only.
 880          */
 881         if (xen_ldt_setprot(oldt, oldtsz, PROT_READ | PROT_WRITE))
 882                 panic("ldt_grow:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
 883 
 884         if (xen_ldt_setprot(nldt, nldtsz, PROT_READ))
 885                 panic("ldt_grow:xen_ldt_setprot(PROT_READ) failed");
 886 #endif
 887 
 888         pp->p_ldt = nldt;
 889         pp->p_ldtlimit = nsels - 1;
 890 
 891         /*
 892          * write new ldt segment descriptor.
 893          */
 894         set_syssegd(&pp->p_ldt_desc, nldt, nldtsz - 1, SDT_SYSLDT, SEL_KPL);
 895 
 896         /*
 897          * load the new ldt.
 898          */
 899         kpreempt_disable();
 900         ldt_load();
 901         kpreempt_enable();
 902 
 903         kmem_free(oldt, oldtsz);
 904 }
--- EOF ---