8956 Implement KPTI
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
    
      
    
          --- old/usr/src/uts/i86pc/vm/i86_mmu.c
          +++ new/usr/src/uts/i86pc/vm/i86_mmu.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
       24 + *
       25 + * Copyright 2018 Joyent, Inc.
  24   26   */
  25   27  
  26   28  #include <sys/t_lock.h>
  27   29  #include <sys/memlist.h>
  28   30  #include <sys/cpuvar.h>
  29   31  #include <sys/vmem.h>
  30   32  #include <sys/mman.h>
  31   33  #include <sys/vm.h>
  32   34  #include <sys/kmem.h>
  33   35  #include <sys/cmn_err.h>
  34   36  #include <sys/debug.h>
  35   37  #include <sys/vm_machparam.h>
  36   38  #include <sys/tss.h>
  37   39  #include <sys/vnode.h>
  38   40  #include <vm/hat.h>
  39   41  #include <vm/anon.h>
  40   42  #include <vm/as.h>
  41   43  #include <vm/page.h>
  42   44  #include <vm/seg.h>
  43   45  #include <vm/seg_kmem.h>
  44   46  #include <vm/seg_map.h>
  45   47  #include <vm/hat_i86.h>
  46   48  #include <sys/promif.h>
  47   49  #include <sys/x86_archext.h>
  48   50  #include <sys/systm.h>
  49   51  #include <sys/archsystm.h>
  50   52  #include <sys/sunddi.h>
  51   53  #include <sys/ddidmareq.h>
  52   54  #include <sys/controlregs.h>
  53   55  #include <sys/reboot.h>
  
  54   56  #include <sys/kdi.h>
  55   57  #include <sys/bootconf.h>
  56   58  #include <sys/bootsvcs.h>
  57   59  #include <sys/bootinfo.h>
  58   60  #include <vm/kboot_mmu.h>
  59   61  
  60   62  #ifdef __xpv
  61   63  #include <sys/hypervisor.h>
  62   64  #endif
  63   65  
  64      -caddr_t
  65      -i86devmap(pfn_t pf, pgcnt_t pgcnt, uint_t prot)
  66      -{
  67      -        caddr_t addr;
  68      -        caddr_t addr1;
  69      -        page_t *pp;
       66 +#define ON_USER_HAT(cpu) \
       67 +        ((cpu)->cpu_m.mcpu_current_hat != NULL && \
       68 +        (cpu)->cpu_m.mcpu_current_hat != kas.a_hat)
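/*
 * ON_USER_HAT(), above, is true when this CPU is currently running on a
 * user process HAT (mcpu_current_hat is set and is not kas.a_hat), i.e.
 * when a PCID_USER context may hold live TLB entries worth invalidating.
 */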
  70   69  
  71      -        addr1 = addr = vmem_alloc(heap_arena, mmu_ptob(pgcnt), VM_SLEEP);
  72      -
  73      -        for (; pgcnt != 0; addr += MMU_PAGESIZE, ++pf, --pgcnt) {
  74      -                pp = page_numtopp_nolock(pf);
  75      -                if (pp == NULL) {
  76      -                        hat_devload(kas.a_hat, addr, MMU_PAGESIZE, pf,
  77      -                            prot | HAT_NOSYNC, HAT_LOAD_LOCK);
  78      -                } else {
  79      -                        hat_memload(kas.a_hat, addr, pp,
  80      -                            prot | HAT_NOSYNC, HAT_LOAD_LOCK);
  81      -                }
  82      -        }
  83      -
  84      -        return (addr1);
  85      -}
  86      -
  87   70  /*
  88      - * This routine is like page_numtopp, but accepts only free pages, which
  89      - * it allocates (unfrees) and returns with the exclusive lock held.
  90      - * It is used by machdep.c/dma_init() to find contiguous free pages.
  91      - *
  92      - * XXX this and some others should probably be in vm_machdep.c
  93      - */
  94      -page_t *
  95      -page_numtopp_alloc(pfn_t pfnum)
  96      -{
  97      -        page_t *pp;
  98      -
  99      -retry:
 100      -        pp = page_numtopp_nolock(pfnum);
 101      -        if (pp == NULL) {
 102      -                return (NULL);
 103      -        }
 104      -
 105      -        if (!page_trylock(pp, SE_EXCL)) {
 106      -                return (NULL);
 107      -        }
 108      -
 109      -        if (page_pptonum(pp) != pfnum) {
 110      -                page_unlock(pp);
 111      -                goto retry;
 112      -        }
 113      -
 114      -        if (!PP_ISFREE(pp)) {
 115      -                page_unlock(pp);
 116      -                return (NULL);
 117      -        }
 118      -        if (pp->p_szc) {
 119      -                page_demote_free_pages(pp);
 120      -                page_unlock(pp);
 121      -                goto retry;
 122      -        }
 123      -
 124      -        /* If associated with a vnode, destroy mappings */
 125      -
 126      -        if (pp->p_vnode) {
 127      -
 128      -                page_destroy_free(pp);
 129      -
 130      -                if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_NO_RECLAIM)) {
 131      -                        return (NULL);
 132      -                }
 133      -
 134      -                if (page_pptonum(pp) != pfnum) {
 135      -                        page_unlock(pp);
 136      -                        goto retry;
 137      -                }
 138      -        }
 139      -
 140      -        if (!PP_ISFREE(pp)) {
 141      -                page_unlock(pp);
 142      -                return (NULL);
 143      -        }
 144      -
 145      -        if (!page_reclaim(pp, (kmutex_t *)NULL))
 146      -                return (NULL);
 147      -
 148      -        return (pp);
 149      -}
 150      -
 151      -/*
 152   71   * Flag is not set early in boot. Once it is set we are no longer
 153   72   * using boot's page tables.
 154   73   */
 155   74  uint_t khat_running = 0;
 156   75  
 157   76  /*
 158   77   * This procedure is callable only while the boot loader is in charge of the
 159   78   * MMU. It assumes that PA == VA for page table pointers.  It doesn't live in
 160   79   * kboot_mmu.c since it's used from common code.
 161   80   */
 162   81  pfn_t
 163   82  va_to_pfn(void *vaddr)
 164   83  {
 165   84          uintptr_t       des_va = ALIGN2PAGE(vaddr);
 166   85          uintptr_t       va = des_va;
 167   86          size_t          len;
 168   87          uint_t          prot;
 169   88          pfn_t           pfn;
 170   89  
 171   90          if (khat_running)
 172   91                  panic("va_to_pfn(): called too late\n");
 173   92  
 174   93          if (kbm_probe(&va, &len, &pfn, &prot) == 0)
 175   94                  return (PFN_INVALID);
 176   95          if (va > des_va)
 177   96                  return (PFN_INVALID);
 178   97          if (va < des_va)
 179   98                  pfn += mmu_btop(des_va - va);
 180   99          return (pfn);
 181  100  }
 182  101  
 183  102  /*
 184  103   * Initialize a special area in the kernel that always holds some PTEs for
 185  104   * faster performance. This always holds segmap's PTEs.
 186  105   * In the 32 bit kernel this maps the kernel heap too.
 187  106   */
 188  107  void
 189  108  hat_kmap_init(uintptr_t base, size_t len)
 190  109  {
 191  110          uintptr_t map_addr;     /* base rounded down to large page size */
 192  111          uintptr_t map_eaddr;    /* base + len rounded up */
 193  112          size_t map_len;
 194  113          caddr_t ptes;           /* mapping area in kernel for kmap ptes */
 195  114          size_t window_size;     /* size of mapping area for ptes */
 196  115          ulong_t htable_cnt;     /* # of page tables to cover map_len */
 197  116          ulong_t i;
 198  117          htable_t *ht;
 199  118          uintptr_t va;
 200  119  
 201  120          /*
 202  121           * We have to map in an area that matches an entire page table.
 203  122           * The PTEs are large page aligned to avoid spurious pagefaults
 204  123           * on the hypervisor.
 205  124           */
 206  125          map_addr = base & LEVEL_MASK(1);
 207  126          map_eaddr = (base + len + LEVEL_SIZE(1) - 1) & LEVEL_MASK(1);
 208  127          map_len = map_eaddr - map_addr;
 209  128          window_size = mmu_btop(map_len) * mmu.pte_size;
 210  129          window_size = (window_size + LEVEL_SIZE(1)) & LEVEL_MASK(1);
 211  130          htable_cnt = map_len >> LEVEL_SHIFT(1);
 212  131  
 213  132          /*
 214  133           * allocate vmem for the kmap_ptes
 215  134           */
 216  135          ptes = vmem_xalloc(heap_arena, window_size, LEVEL_SIZE(1), 0,
 217  136              0, NULL, NULL, VM_SLEEP);
 218  137          mmu.kmap_htables =
 219  138              kmem_alloc(htable_cnt * sizeof (htable_t *), KM_SLEEP);
 220  139  
 221  140          /*
 222  141           * Map the page tables that cover kmap into the allocated range.
 223  142           * Note we don't ever htable_release() the kmap page tables - they
 224  143           * can't ever be stolen, freed, etc.
 225  144           */
 226  145          for (va = map_addr, i = 0; i < htable_cnt; va += LEVEL_SIZE(1), ++i) {
 227  146                  ht = htable_create(kas.a_hat, va, 0, NULL);
 228  147                  if (ht == NULL)
 229  148                          panic("hat_kmap_init: ht == NULL");
 230  149                  mmu.kmap_htables[i] = ht;
 231  150  
 232  151                  hat_devload(kas.a_hat, ptes + i * MMU_PAGESIZE,
 233  152                      MMU_PAGESIZE, ht->ht_pfn,
 234  153  #ifdef __xpv
 235  154                      PROT_READ | HAT_NOSYNC | HAT_UNORDERED_OK,
 236  155  #else
 237  156                      PROT_READ | PROT_WRITE | HAT_NOSYNC | HAT_UNORDERED_OK,
 238  157  #endif
 239  158                      HAT_LOAD | HAT_LOAD_NOCONSIST);
 240  159          }
 241  160  
 242  161          /*
 243  162           * set information in mmu to activate handling of kmap
 244  163           */
 245  164          mmu.kmap_addr = map_addr;
 246  165          mmu.kmap_eaddr = map_eaddr;
 247  166          mmu.kmap_ptes = (x86pte_t *)ptes;
 248  167  }
 249  168  
 250  169  extern caddr_t  kpm_vbase;
 251  170  extern size_t   kpm_size;
 252  171  
 253  172  #ifdef __xpv
 254  173  /*
 255  174   * Create the initial segkpm mappings for the hypervisor. To avoid having
 256  175   * to deal with page tables being read only, we make all mappings
 257  176   * read only at first.
 258  177   */
 259  178  static void
 260  179  xen_kpm_create(paddr_t paddr, level_t lvl)
 261  180  {
 262  181          ulong_t pg_off;
 263  182  
 264  183          for (pg_off = 0; pg_off < LEVEL_SIZE(lvl); pg_off += MMU_PAGESIZE) {
 265  184                  kbm_map((uintptr_t)kpm_vbase + paddr, (paddr_t)0, 0, 1);
 266  185                  kbm_read_only((uintptr_t)kpm_vbase + paddr + pg_off,
 267  186                      paddr + pg_off);
 268  187          }
 269  188  }
 270  189  
 271  190  /*
 272  191   * Try to make all kpm mappings writable. Failures are ok, as those
 273  192   * are just pagetable, GDT, etc. pages.
 274  193   */
 275  194  static void
 276  195  xen_kpm_finish_init(void)
 277  196  {
 278  197          pfn_t gdtpfn = mmu_btop(CPU->cpu_m.mcpu_gdtpa);
 279  198          pfn_t pfn;
 280  199          page_t *pp;
 281  200  
 282  201          for (pfn = 0; pfn < mfn_count; ++pfn) {
 283  202                  /*
 284  203                   * skip gdt
 285  204                   */
 286  205                  if (pfn == gdtpfn)
 287  206                          continue;
 288  207  
 289  208                  /*
 290  209                   * p_index is a hint that this is a pagetable
 291  210                   */
 292  211                  pp = page_numtopp_nolock(pfn);
 293  212                  if (pp && pp->p_index) {
 294  213                          pp->p_index = 0;
 295  214                          continue;
 296  215                  }
 297  216                  (void) xen_kpm_page(pfn, PT_VALID | PT_WRITABLE);
 298  217          }
 299  218  }
 300  219  #endif
 301  220  
 302  221  /*
 303  222   * Routine to pre-allocate data structures for hat_kern_setup(). It computes
 304  223   * how many pagetables it needs by walking the boot loader's page tables.
 305  224   */
 306  225  /*ARGSUSED*/
 307  226  void
 308  227  hat_kern_alloc(
 309  228          caddr_t segmap_base,
 310  229          size_t  segmap_size,
 311  230          caddr_t ekernelheap)
 312  231  {
 313  232          uintptr_t       last_va = (uintptr_t)-1;        /* catch 1st time */
 314  233          uintptr_t       va = 0;
 315  234          size_t          size;
 316  235          pfn_t           pfn;
 317  236          uint_t          prot;
 318  237          uint_t          table_cnt = 1;
 319  238          uint_t          mapping_cnt;
 320  239          level_t         start_level;
 321  240          level_t         l;
 322  241          struct memlist  *pmem;
 323  242          level_t         lpagel = mmu.max_page_level;
 324  243          uint64_t        paddr;
 325  244          int64_t         psize;
 326  245          int             nwindows;
 327  246  
 328  247          if (kpm_size > 0) {
 329  248                  /*
 330  249                   * Create the kpm page tables.  When running on the
 331  250                   * hypervisor these are made read/only at first.
 332  251                   * Later we'll add write permission where possible.
 333  252                   */
 334  253                  for (pmem = phys_install; pmem; pmem = pmem->ml_next) {
 335  254                          paddr = pmem->ml_address;
 336  255                          psize = pmem->ml_size;
 337  256                          while (psize >= MMU_PAGESIZE) {
 338  257                                  /* find the largest page size */
 339  258                                  for (l = lpagel; l > 0; l--) {
 340  259                                          if ((paddr & LEVEL_OFFSET(l)) == 0 &&
 341  260                                              psize > LEVEL_SIZE(l))
 342  261                                                  break;
 343  262                                  }
 344  263  
 345  264  #if defined(__xpv)
 346  265                                  /*
 347  266                                   * Create read/only mappings to avoid
 348  267                                   * conflicting with pagetable usage
 349  268                                   */
 350  269                                  xen_kpm_create(paddr, l);
 351  270  #else
 352  271                                  kbm_map((uintptr_t)kpm_vbase + paddr, paddr,
 353  272                                      l, 1);
 354  273  #endif
 355  274                                  paddr += LEVEL_SIZE(l);
 356  275                                  psize -= LEVEL_SIZE(l);
 357  276                          }
 358  277                  }
 359  278          }
 360  279  
 361  280          /*
 362  281           * If this machine doesn't have a kpm segment, we need to allocate
 363  282           * a small number of 'windows' which can be used to map pagetables.
 364  283           */
 365  284          nwindows = (kpm_size == 0) ? 2 * NCPU : 0;
 366  285  
 367  286  #if defined(__xpv)
 368  287          /*
 369  288           * On a hypervisor, these windows are also used by the xpv_panic
 370  289           * code, where we need one window for each level of the pagetable
 371  290           * hierarchy.
 372  291           */
 373  292          nwindows = MAX(nwindows, mmu.max_level);
 374  293  #endif
 375  294  
 376  295          if (nwindows != 0) {
 377  296                  /*
 378  297                   * Create the page windows and 1 page of VA in
 379  298                   * which we map the PTEs of those windows.
 380  299                   */
 381  300                  mmu.pwin_base = vmem_xalloc(heap_arena, nwindows * MMU_PAGESIZE,
 382  301                      LEVEL_SIZE(1), 0, 0, NULL, NULL, VM_SLEEP);
 383  302                  ASSERT(nwindows <= MMU_PAGESIZE / mmu.pte_size);
 384  303                  mmu.pwin_pte_va = vmem_xalloc(heap_arena, MMU_PAGESIZE,
 385  304                      MMU_PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);
 386  305  
 387  306                  /*
 388  307                   * Find/Create the page table window mappings.
 389  308                   */
 390  309                  paddr = 0;
 391  310                  (void) find_pte((uintptr_t)mmu.pwin_base, &paddr, 0, 0);
 392  311                  ASSERT(paddr != 0);
 393  312                  ASSERT((paddr & MMU_PAGEOFFSET) == 0);
 394  313                  mmu.pwin_pte_pa = paddr;
 395  314  #ifdef __xpv
 396  315                  (void) find_pte((uintptr_t)mmu.pwin_pte_va, NULL, 0, 0);
 397  316                  kbm_read_only((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa);
 398  317  #else
 399  318                  kbm_map((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa, 0, 1);
 400  319  #endif
 401  320          }
 402  321  
 403  322          /*
 404  323           * Walk the boot loader's page tables and figure out
 405  324           * how many tables and page mappings there will be.
 406  325           */
 407  326          while (kbm_probe(&va, &size, &pfn, &prot) != 0) {
 408  327                  /*
 409  328                   * At each level, if the last_va falls into a new htable,
 410  329                   * increment table_cnt. We can stop at the 1st level where
 411  330                   * they are in the same htable.
 412  331                   */
 413  332                  start_level = 0;
 414  333                  while (start_level <= mmu.max_page_level) {
 415  334                          if (size == LEVEL_SIZE(start_level))
 416  335                                  break;
 417  336                          start_level++;
 418  337                  }
 419  338  
 420  339                  for (l = start_level; l < mmu.max_level; ++l) {
 421  340                          if (va >> LEVEL_SHIFT(l + 1) ==
 422  341                              last_va >> LEVEL_SHIFT(l + 1))
 423  342                                  break;
 424  343                          ++table_cnt;
 425  344                  }
 426  345                  last_va = va;
 427  346                  l = (start_level == 0) ? 1 : start_level;
 428  347                  va = (va & LEVEL_MASK(l)) + LEVEL_SIZE(l);
  
 429  348          }
 430  349  
 431  350          /*
 432  351           * Besides the boot loader mappings, we're going to fill in
 433  352           * the entire top level page table for the kernel. Make sure there's
 434  353           * enough reserve for that too.
 435  354           */
 436  355          table_cnt += mmu.top_level_count - ((kernelbase >>
 437  356              LEVEL_SHIFT(mmu.max_level)) & (mmu.top_level_count - 1));
 438  357  
 439      -#if defined(__i386)
 440  358          /*
 441      -         * The 32 bit PAE hat allocates tables one level below the top when
 442      -         * kernelbase isn't 1 Gig aligned. We'll just be sloppy and allocate
 443      -         * a bunch more to the reserve. Any unused will be returned later.
 444      -         * Note we've already counted these mappings, just not the extra
 445      -         * pagetables.
 446      -         */
 447      -        if (mmu.pae_hat != 0 && (kernelbase & LEVEL_OFFSET(mmu.max_level)) != 0)
 448      -                table_cnt += mmu.ptes_per_table -
 449      -                    ((kernelbase & LEVEL_OFFSET(mmu.max_level)) >>
 450      -                    LEVEL_SHIFT(mmu.max_level - 1));
 451      -#endif
 452      -
 453      -        /*
 454  359           * Add 1/4 more into table_cnt for extra slop.  The unused
 455  360           * slop is freed back when we htable_adjust_reserve() later.
 456  361           */
 457  362          table_cnt += table_cnt >> 2;
 458  363  
 459  364          /*
 460  365           * We only need mapping entries (hments) for shared pages.
 461  366           * This should be far, far fewer than the total possible,
 462  367           * We'll allocate enough for 1/16 of all possible PTEs.
 463  368           */
 464  369          mapping_cnt = (table_cnt * mmu.ptes_per_table) >> 4;
 465  370  
 466  371          /*
 467  372           * Now create the initial htable/hment reserves
 468  373           */
 469  374          htable_initial_reserve(table_cnt);
 470  375          hment_reserve(mapping_cnt);
 471  376          x86pte_cpu_init(CPU);
 472  377  }
 473  378  
 474  379  
 475  380  /*
 476  381   * This routine handles the work of creating the kernel's initial mappings
 477  382   * by deciphering the mappings in the page tables created by the boot program.
 478  383   *
 479  384   * We maintain large page mappings, but only to a level 1 pagesize.
 480  385   * The boot loader can only add new mappings once this function starts.
 481  386   * In particular it can not change the pagesize used for any existing
 482  387   * mappings or this code breaks!
 483  388   */
 484  389  
 485  390  void
  
 486  391  hat_kern_setup(void)
 487  392  {
 488  393          /*
 489  394           * Attach htables to the existing pagetables
 490  395           */
 491  396          /* BEGIN CSTYLED */
 492  397          htable_attach(kas.a_hat, 0, mmu.max_level, NULL,
 493  398  #ifdef __xpv
 494  399              mmu_btop(xen_info->pt_base - ONE_GIG));
 495  400  #else
 496      -            mmu_btop(getcr3()));
      401 +            mmu_btop(getcr3_pa()));
 497  402  #endif
 498  403          /* END CSTYLED */
 499  404  
 500      -#if defined(__i386) && !defined(__xpv)
 501      -        CPU->cpu_tss->tss_cr3 = dftss0->tss_cr3 = getcr3();
 502      -#endif /* __i386 */
 503      -
 504      -#if defined(__xpv) && defined(__amd64)
      405 +#if defined(__xpv)
 505  406          /*
 506  407           * Try to make the kpm mappings r/w. Failures here are OK, as
 507  408           * it's probably just a pagetable
 508  409           */
 509  410          xen_kpm_finish_init();
 510  411  #endif
 511  412  
 512  413          /*
 513  414           * The kernel HAT is now officially open for business.
 514  415           */
 515  416          khat_running = 1;
 516  417  
 517  418          CPUSET_ATOMIC_ADD(kas.a_hat->hat_cpus, CPU->cpu_id);
 518  419          CPU->cpu_current_hat = kas.a_hat;
 519  420  }
      421 +
      422 +#ifndef __xpv
      423 +
      424 +/*
      425 + * Note that the INVPCID_ALL* variants can be used even in the !PCIDE case, but
      426 + * INVPCID_ADDR isn't.
      427 + */
      428 +static void
      429 +invpcid(uint64_t type, uint64_t pcid, uintptr_t addr)
      430 +{
      431 +        ulong_t flag;
      432 +        uint64_t cr4;
      433 +
      434 +        if (x86_use_invpcid == 1) {
      435 +                ASSERT(is_x86_feature(x86_featureset, X86FSET_INVPCID));
      436 +                invpcid_insn(type, pcid, addr);
      437 +                return;
      438 +        }
      439 +
      440 +        switch (type) {
      441 +        case INVPCID_ALL_GLOBAL:
      442 +                flag = intr_clear();
      443 +                cr4 = getcr4();
      444 +                setcr4(cr4 & ~(ulong_t)CR4_PGE);
      445 +                setcr4(cr4 | CR4_PGE);
      446 +                intr_restore(flag);
      447 +                break;
      448 +
      449 +        case INVPCID_ALL_NONGLOBAL:
      450 +                if (!(getcr4() & CR4_PCIDE)) {
      451 +                        reload_cr3();
      452 +                } else {
      453 +                        flag = intr_clear();
      454 +                        cr4 = getcr4();
      455 +                        setcr4(cr4 & ~(ulong_t)CR4_PGE);
      456 +                        setcr4(cr4 | CR4_PGE);
      457 +                        intr_restore(flag);
      458 +                }
      459 +                break;
      460 +
      461 +        case INVPCID_ADDR:
      462 +                if (pcid == PCID_USER) {
      463 +                        flag = intr_clear();
      464 +                        ASSERT(addr < kernelbase);
      465 +                        ASSERT(ON_USER_HAT(CPU));
      466 +                        ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0);
      467 +                        tr_mmu_flush_user_range(addr, MMU_PAGESIZE,
      468 +                            MMU_PAGESIZE, CPU->cpu_m.mcpu_kpti.kf_user_cr3);
      469 +                        intr_restore(flag);
      470 +                } else {
      471 +                        mmu_invlpg((caddr_t)addr);
      472 +                }
      473 +                break;
      474 +
      475 +        default:
      476 +                panic("unsupported invpcid(%lu)", type);
      477 +                break;
      478 +        }
      479 +}
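/*
 * The CR4.PGE clear/set pairs above lean on the architectural rule that a
 * write to CR4 which changes PGE invalidates every TLB entry, global and
 * non-global, for all PCIDs.  That is also why the PCIDE fallback for
 * INVPCID_ALL_NONGLOBAL reuses the same sequence and simply over-flushes
 * the global entries as well.
 */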
      480 +
      481 +/*
      482 + * Flush one kernel mapping.
      483 + *
      484 + * We want to assert on kernel space here mainly for reasoning about the PCIDE
      485 + * case: namely, this flush should never need to flush a non-current PCID
      486 + * mapping.  This presumes we never have reason to flush the kernel regions
      487 + * available to PCID_USER (the trampolines and so on).  It also relies on
      488 + * PCID_KERNEL == PCID_NONE.
      489 + */
      490 +void
      491 +mmu_flush_tlb_kpage(uintptr_t va)
      492 +{
      493 +        ASSERT(va >= kernelbase);
      494 +        ASSERT(getpcid() == PCID_KERNEL);
      495 +        mmu_invlpg((caddr_t)va);
      496 +}
      497 +
      498 +/*
      499 + * Flush one mapping: local CPU version of hat_tlb_inval().
      500 + *
      501 + * If this is a userspace address in the PCIDE case, we need two invalidations,
      502 + * one for any potentially stale PCID_USER mapping, as well as any established
      503 + * while in the kernel.
      504 + */
      505 +void
      506 +mmu_flush_tlb_page(uintptr_t va)
      507 +{
      508 +        ASSERT(getpcid() == PCID_KERNEL);
      509 +
      510 +        if (va >= kernelbase) {
      511 +                mmu_flush_tlb_kpage(va);
      512 +                return;
      513 +        }
      514 +
      515 +        if (!(getcr4() & CR4_PCIDE)) {
      516 +                mmu_invlpg((caddr_t)va);
      517 +                return;
      518 +        }
      519 +
      520 +        /*
      521 +         * Yes, kas will need to flush below kernelspace, at least during boot.
      522 +         * But there's no PCID_USER context.
      523 +         */
      524 +        if (ON_USER_HAT(CPU))
      525 +                invpcid(INVPCID_ADDR, PCID_USER, va);
      526 +        invpcid(INVPCID_ADDR, PCID_KERNEL, va);
      527 +}
      528 +
      529 +static void
      530 +mmu_flush_tlb_range(uintptr_t addr, size_t len, size_t pgsz)
      531 +{
      532 +        EQUIV(addr < kernelbase, (addr + len - 1) < kernelbase);
      533 +        ASSERT(len > 0);
      534 +        ASSERT(pgsz != 0);
      535 +
      536 +        if (!(getcr4() & CR4_PCIDE) || x86_use_invpcid == 1) {
      537 +                for (uintptr_t va = addr; va < (addr + len); va += pgsz)
      538 +                        mmu_flush_tlb_page(va);
      539 +                return;
      540 +        }
      541 +
      542 +        /*
      543 +         * As an emulated invpcid() in the PCIDE case requires jumping
      544 +         * cr3s, we batch the invalidations.  We should only need to flush the
      545 +         * user range if we're on a user-space HAT.
      546 +         */
      547 +        if (addr < kernelbase && ON_USER_HAT(CPU)) {
      548 +                ulong_t flag = intr_clear();
      549 +                ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0);
      550 +                tr_mmu_flush_user_range(addr, len, pgsz,
      551 +                    CPU->cpu_m.mcpu_kpti.kf_user_cr3);
      552 +                intr_restore(flag);
      553 +        }
      554 +
      555 +        for (uintptr_t va = addr; va < (addr + len); va += pgsz)
      556 +                mmu_invlpg((caddr_t)va);
      557 +}
      558 +
      559 +/*
      560 + * MMU TLB (and PT cache) flushing on this CPU.
      561 + *
      562 + * FLUSH_TLB_ALL: invalidate everything, all PCIDs, all PT_GLOBAL.
      563 + * FLUSH_TLB_NONGLOBAL: invalidate all PCIDs, excluding PT_GLOBAL
      564 + * FLUSH_TLB_RANGE: invalidate the given range, including PCID_USER
      565 + * mappings as appropriate.  If using invpcid, PT_GLOBAL mappings are not
      566 + * invalidated.
      567 + */
      568 +void
      569 +mmu_flush_tlb(flush_tlb_type_t type, tlb_range_t *range)
      570 +{
      571 +        ASSERT(getpcid() == PCID_KERNEL);
      572 +
      573 +        switch (type) {
      574 +        case FLUSH_TLB_ALL:
      575 +                ASSERT(range == NULL);
      576 +                invpcid(INVPCID_ALL_GLOBAL, 0, 0);
      577 +                break;
      578 +
      579 +        case FLUSH_TLB_NONGLOBAL:
      580 +                ASSERT(range == NULL);
      581 +                invpcid(INVPCID_ALL_NONGLOBAL, 0, 0);
      582 +                break;
      583 +
      584 +        case FLUSH_TLB_RANGE: {
      585 +                mmu_flush_tlb_range(range->tr_va, TLB_RANGE_LEN(range),
      586 +                    LEVEL_SIZE(range->tr_level));
      587 +                break;
      588 +        }
      589 +
      590 +        default:
      591 +                panic("invalid call mmu_flush_tlb(%d)", type);
      592 +                break;
      593 +        }
      594 +}
      595 +
      596 +#endif /* ! __xpv */
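
For context on how the new interface is meant to be driven, here is a minimal
caller-side sketch (not part of the change itself).  It assumes tlb_range_t
carries a starting VA in tr_va, a page count in tr_cnt, and a level in
tr_level, with TLB_RANGE_LEN() deriving the byte length from them; only
mmu_flush_tlb(), FLUSH_TLB_RANGE, and FLUSH_TLB_NONGLOBAL appear in the diff
above.

/*
 * Hypothetical usage sketch: flush a run of base (4K) pages on the local
 * CPU, then discard all non-global entries.  The tr_va/tr_cnt/tr_level
 * field names are assumptions based on how mmu_flush_tlb() consumes the
 * range; they are not taken from this webrev.
 */
static void
flush_example(uintptr_t va, pgcnt_t npages)
{
        tlb_range_t range;

        range.tr_va = va;               /* starting virtual address */
        range.tr_cnt = npages;          /* number of pages at tr_level */
        range.tr_level = 0;             /* level 0 == MMU_PAGESIZE pages */

        /* Invalidate just that range, including any PCID_USER copies. */
        mmu_flush_tlb(FLUSH_TLB_RANGE, &range);

        /* Or drop everything except PT_GLOBAL kernel mappings. */
        mmu_flush_tlb(FLUSH_TLB_NONGLOBAL, NULL);
}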
    