8956 Implement KPTI
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
9208 hati_demap_func should take pagesize into account
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Tim Kordas <tim.kordas@joyent.com>
Reviewed by: Yuri Pankov <yuripv@yuripv.net>


  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 /*
  25  * Copyright (c) 2010, Intel Corporation.
  26  * All rights reserved.
  27  */
  28 /*
  29  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  30  * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
  31  */
  32 
  33 /*
  34  * VM - Hardware Address Translation management for i386 and amd64
  35  *
  36  * Implementation of the interfaces described in <common/vm/hat.h>
  37  *
  38  * Nearly all the details of how the hardware is managed should not be
  39  * visible outside this layer except for misc. machine specific functions
  40  * that work in conjunction with this code.
  41  *
  42  * Routines used only inside of i86pc/vm start with hati_ for HAT Internal.
  43  */
  44 
  45 #include <sys/machparam.h>
  46 #include <sys/machsystm.h>
  47 #include <sys/mman.h>
  48 #include <sys/types.h>
  49 #include <sys/systm.h>
  50 #include <sys/cpuvar.h>
  51 #include <sys/thread.h>
  52 #include <sys/proc.h>
  53 #include <sys/cpu.h>
  54 #include <sys/kmem.h>
  55 #include <sys/disp.h>
  56 #include <sys/shm.h>
  57 #include <sys/sysmacros.h>
  58 #include <sys/machparam.h>
  59 #include <sys/vmem.h>
  60 #include <sys/vmsystm.h>
  61 #include <sys/promif.h>
  62 #include <sys/var.h>
  63 #include <sys/x86_archext.h>
  64 #include <sys/atomic.h>


  78 #include <vm/seg_kpm.h>
  79 #include <vm/vm_dep.h>
  80 #ifdef __xpv
  81 #include <sys/hypervisor.h>
  82 #endif
  83 #include <vm/kboot_mmu.h>
  84 #include <vm/seg_spt.h>
  85 
  86 #include <sys/cmn_err.h>
  87 
  88 /*
  89  * Basic parameters for hat operation.
  90  */
  91 struct hat_mmu_info mmu;
  92 
  93 /*
  94  * The page that is the kernel's top level pagetable.
  95  *
  96  * For 32 bit PAE support on i86pc, the kernel hat will use the 1st 4 entries
  97  * on this 4K page for its top level page table. The remaining groups of
  98  * 4 entries are used for per processor copies of user VLP pagetables for
  99  * running threads.  See hat_switch() and reload_pae32() for details.
 100  *
 101  * vlp_page[0..3] - level==2 PTEs for kernel HAT
 102  * vlp_page[4..7] - level==2 PTEs for user thread on cpu 0
 103  * vlp_page[8..11] - level==2 PTEs for user thread on cpu 1
 104  * etc...
 105  */
 106 static x86pte_t *vlp_page;
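For illustration, a minimal sketch of the per-CPU indexing described in the comment above; vlp_cpu_slice() is a made-up helper and the standalone typedefs are assumptions, but the arithmetic matches what reload_pae32() and hat_switch() do further down in this file.

#include <stdint.h>

typedef uint64_t x86pte_t;      /* PAE PTEs are 64 bits wide */
#define VLP_NUM_PTES    4       /* each hat owns a group of 4 level==2 entries */

/*
 * Illustrative only: group 0 of vlp_page belongs to the kernel hat,
 * group (cpu_id + 1) to the user thread currently running on that CPU.
 */
static x86pte_t *
vlp_cpu_slice(x86pte_t *vlp_page, unsigned int cpu_id)
{
        return (vlp_page + (cpu_id + 1) * VLP_NUM_PTES);
}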
 107 
 108 /*
 109  * forward declaration of internal utility routines
 110  */
 111 static x86pte_t hati_update_pte(htable_t *ht, uint_t entry, x86pte_t expected,
 112         x86pte_t new);
 113 
 114 /*
 115  * The kernel address space exists in all HATs. To implement this the
 116  * kernel reserves a fixed number of entries in the topmost level(s) of page
 117  * tables. The values are setup during startup and then copied to every user
 118  * hat created by hat_alloc(). This means that kernelbase must be:
 119  *
 120  *        4Meg aligned for 32 bit kernels
 121  *      512Gig aligned for 64 bit (x86_64) kernels
 122  *
 123  * The hat_kernel_range_ts describe what needs to be copied from kernel hat
 124  * to each user hat.
 125  */
 126 typedef struct hat_kernel_range {
 127         level_t         hkr_level;
 128         uintptr_t       hkr_start_va;
 129         uintptr_t       hkr_end_va;     /* zero means to end of memory */
 130 } hat_kernel_range_t;
 131 #define NUM_KERNEL_RANGE 2
 132 static hat_kernel_range_t kernel_ranges[NUM_KERNEL_RANGE];
 133 static int num_kernel_ranges;
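A small self-contained illustration of the alignment rule stated above; the sample kernelbase value is an assumption chosen only for the example, not a value taken from this file.

#include <assert.h>
#include <stdint.h>

int
main(void)
{
        uint64_t align_64bit = 1ULL << 39;  /* 512 Gig: one top level slot on amd64 */
        uint64_t align_32bit = 1ULL << 22;  /* 4 Meg: one top level slot on non-PAE i386 */
        uint64_t kernelbase = 0xfffffd8000000000ULL;    /* hypothetical 64 bit kernelbase */

        /* the reserved kernel entries can only be copied verbatim if this holds */
        assert((kernelbase & (align_64bit - 1)) == 0);
        (void) align_32bit;
        return (0);
}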
 134 
 135 uint_t use_boot_reserve = 1;    /* cleared after early boot process */
 136 uint_t can_steal_post_boot = 0; /* set late in boot to enable stealing */
 137 
 138 /*


 153 
 154 
 155 #ifdef DEBUG
 156 uint_t  map1gcnt;
 157 #endif
 158 
 159 
 160 /*
 161  * A cpuset for all cpus. This is used for kernel address cross calls, since
 162  * the kernel addresses apply to all cpus.
 163  */
 164 cpuset_t khat_cpuset;
 165 
 166 /*
 167  * management stuff for hat structures
 168  */
 169 kmutex_t        hat_list_lock;
 170 kcondvar_t      hat_list_cv;
 171 kmem_cache_t    *hat_cache;
 172 kmem_cache_t    *hat_hash_cache;
 173 kmem_cache_t    *vlp_hash_cache;
 174 
 175 /*
 176  * Simple statistics
 177  */
 178 struct hatstats hatstat;
 179 
 180 /*
 181  * Some earlier hypervisor versions do not emulate cmpxchg of PTEs
 182  * correctly.  For such hypervisors we must set PT_USER for kernel
 183  * entries ourselves (normally the emulation would set PT_USER for
 184  * kernel entries and PT_USER|PT_GLOBAL for user entries).  pt_kern is
 185  * thus set appropriately.  Note that dboot/kbm is OK, as only the full
 186  * HAT uses cmpxchg() and the other paths (hypercall etc.) were never
 187  * incorrect.
 188  */
 189 int pt_kern;
 190 
 191 /*
 192  * useful stuff for atomic access/clearing/setting REF/MOD/RO bits in page_t's.
 193  */
 194 extern void atomic_orb(uchar_t *addr, uchar_t val);
 195 extern void atomic_andb(uchar_t *addr, uchar_t val);
 196 
 197 #ifndef __xpv
 198 extern pfn_t memseg_get_start(struct memseg *);
 199 #endif
 200 
 201 #define PP_GETRM(pp, rmmask)    (pp->p_nrm & rmmask)
 202 #define PP_ISMOD(pp)            PP_GETRM(pp, P_MOD)
 203 #define PP_ISREF(pp)            PP_GETRM(pp, P_REF)
 204 #define PP_ISRO(pp)             PP_GETRM(pp, P_RO)
 205 
 206 #define PP_SETRM(pp, rm)        atomic_orb(&(pp->p_nrm), rm)
 207 #define PP_SETMOD(pp)           PP_SETRM(pp, P_MOD)
 208 #define PP_SETREF(pp)           PP_SETRM(pp, P_REF)
 209 #define PP_SETRO(pp)            PP_SETRM(pp, P_RO)
 210 
 211 #define PP_CLRRM(pp, rm)        atomic_andb(&(pp->p_nrm), ~(rm))
 212 #define PP_CLRMOD(pp)           PP_CLRRM(pp, P_MOD)
 213 #define PP_CLRREF(pp)           PP_CLRRM(pp, P_REF)
 214 #define PP_CLRRO(pp)            PP_CLRRM(pp, P_RO)
 215 #define PP_CLRALL(pp)           PP_CLRRM(pp, P_MOD | P_REF | P_RO)
 216 
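As a usage sketch of the REF/MOD/RO accessors above, a hypothetical helper (not part of this file, and relying on the surrounding kernel headers for page_t) that records fault state using only the lock-free atomic byte operations:

/*
 * Illustrative only: mark a page referenced, and also modified when the
 * access was a write.  The PP_* wrappers above use atomic_orb(), so no
 * lock is needed here.
 */
static void
hati_mark_refmod(page_t *pp, uint_t is_write)
{
        PP_SETREF(pp);
        if (is_write)
                PP_SETMOD(pp);
}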


 219  */
 220 /*ARGSUSED*/
 221 static int
 222 hati_constructor(void *buf, void *handle, int kmflags)
 223 {
 224         hat_t   *hat = buf;
 225 
 226         mutex_init(&hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL);
 227         bzero(hat->hat_pages_mapped,
 228             sizeof (pgcnt_t) * (mmu.max_page_level + 1));
 229         hat->hat_ism_pgcnt = 0;
 230         hat->hat_stats = 0;
 231         hat->hat_flags = 0;
 232         CPUSET_ZERO(hat->hat_cpus);
 233         hat->hat_htable = NULL;
 234         hat->hat_ht_hash = NULL;
 235         return (0);
 236 }
 237 
 238 /*
 239  * Allocate a hat structure for as. We also create the top level
 240  * htable and initialize it to contain the kernel hat entries.
 241  */
 242 hat_t *
 243 hat_alloc(struct as *as)
 244 {
 245         hat_t                   *hat;
 246         htable_t                *ht;    /* top level htable */
 247         uint_t                  use_vlp;
 248         uint_t                  r;
 249         hat_kernel_range_t      *rp;
 250         uintptr_t               va;
 251         uintptr_t               eva;
 252         uint_t                  start;
 253         uint_t                  cnt;
 254         htable_t                *src;
 255 
 256         /*
 257          * Once we start creating user process HATs we can enable
 258          * the htable_steal() code.
 259          */
 260         if (can_steal_post_boot == 0)
 261                 can_steal_post_boot = 1;
 262 
 263         ASSERT(AS_WRITE_HELD(as));
 264         hat = kmem_cache_alloc(hat_cache, KM_SLEEP);
 265         hat->hat_as = as;
 266         mutex_init(&hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL);
 267         ASSERT(hat->hat_flags == 0);
 268 
 269 #if defined(__xpv)
 270         /*
 271          * No VLP stuff on the hypervisor due to the 64-bit split top level
 272          * page tables.  On 32-bit it's not needed as the hypervisor takes
 273          * care of copying the top level PTEs to a below 4Gig page.
 274          */
 275         use_vlp = 0;
 276 #else   /* __xpv */
 277         /* 32 bit processes use a VLP style hat when running with PAE */
 278 #if defined(__amd64)
 279         use_vlp = (ttoproc(curthread)->p_model == DATAMODEL_ILP32);
 280 #elif defined(__i386)
 281         use_vlp = mmu.pae_hat;
 282 #endif
 283 #endif  /* __xpv */
 284         if (use_vlp) {
 285                 hat->hat_flags = HAT_VLP;
 286                 bzero(hat->hat_vlp_ptes, VLP_SIZE);
 287         }
 288 
 289         /*
 290          * Allocate the htable hash
 291          */
 292         if ((hat->hat_flags & HAT_VLP)) {
 293                 hat->hat_num_hash = mmu.vlp_hash_cnt;
 294                 hat->hat_ht_hash = kmem_cache_alloc(vlp_hash_cache, KM_SLEEP);
 295         } else {
 296                 hat->hat_num_hash = mmu.hash_cnt;
 297                 hat->hat_ht_hash = kmem_cache_alloc(hat_hash_cache, KM_SLEEP);
 298         }
 299         bzero(hat->hat_ht_hash, hat->hat_num_hash * sizeof (htable_t *));
 300 
 301         /*
 302          * Initialize Kernel HAT entries at the top of the top level page
 303          * tables for the new hat.
 304          */
 305         hat->hat_htable = NULL;
 306         hat->hat_ht_cached = NULL;
 307         XPV_DISALLOW_MIGRATE();
 308         ht = htable_create(hat, (uintptr_t)0, TOP_LEVEL(hat), NULL);
 309         hat->hat_htable = ht;
 310 
 311 #if defined(__amd64)
 312         if (hat->hat_flags & HAT_VLP)
 313                 goto init_done;
 314 #endif
 315 
 316         for (r = 0; r < num_kernel_ranges; ++r) {
 317                 rp = &kernel_ranges[r];
 318                 for (va = rp->hkr_start_va; va != rp->hkr_end_va;
 319                     va += cnt * LEVEL_SIZE(rp->hkr_level)) {
 320 
 321                         if (rp->hkr_level == TOP_LEVEL(hat))
 322                                 ht = hat->hat_htable;
 323                         else
 324                                 ht = htable_create(hat, va, rp->hkr_level,
 325                                     NULL);
 326 
 327                         start = htable_va2entry(va, ht);
 328                         cnt = HTABLE_NUM_PTES(ht) - start;
 329                         eva = va +
 330                             ((uintptr_t)cnt << LEVEL_SHIFT(rp->hkr_level));
 331                         if (rp->hkr_end_va != 0 &&
 332                             (eva > rp->hkr_end_va || eva == 0))
 333                                 cnt = htable_va2entry(rp->hkr_end_va, ht) -
 334                                     start;
 335 
 336 #if defined(__i386) && !defined(__xpv)
 337                         if (ht->ht_flags & HTABLE_VLP) {
 338                                 bcopy(&vlp_page[start],
 339                                     &hat->hat_vlp_ptes[start],
 340                                     cnt * sizeof (x86pte_t));
 341                                 continue;
 342                         }
 343 #endif
 344                         src = htable_lookup(kas.a_hat, va, rp->hkr_level);
 345                         ASSERT(src != NULL);
 346                         x86pte_copy(src, ht, start, cnt);
 347                         htable_release(src);
 348                 }
 349         }
 350 
 351 init_done:
 352 
 353 #if defined(__xpv)
 354         /*
 355          * Pin top level page tables after initializing them
 356          */
 357         xen_pin(hat->hat_htable->ht_pfn, mmu.max_level);
 358 #if defined(__amd64)
 359         xen_pin(hat->hat_user_ptable, mmu.max_level);
 360 #endif
 361 #endif
 362         XPV_ALLOW_MIGRATE();
 363 
 364         /*
 365          * Put it at the start of the global list of all hats (used by stealing)
 366          *
 367          * kas.a_hat is not in the list but is instead used to find the
 368          * first and last items in the list.
 369          *
 370          * - kas.a_hat->hat_next points to the start of the user hats.
 371          *   The list ends where hat->hat_next == NULL
 372          *
 373          * - kas.a_hat->hat_prev points to the last of the user hats.
 374          *   The list begins where hat->hat_prev == NULL
 375          */
 376         mutex_enter(&hat_list_lock);
 377         hat->hat_prev = NULL;
 378         hat->hat_next = kas.a_hat->hat_next;
 379         if (hat->hat_next)
 380                 hat->hat_next->hat_prev = hat;
 381         else
 382                 kas.a_hat->hat_prev = hat;
 383         kas.a_hat->hat_next = hat;
 384         mutex_exit(&hat_list_lock);
 385 
 386         return (hat);
 387 }
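The list discipline described in the comment above can be summarized with a short sketch; hat_list_walk() is illustrative only and does not exist in this file.

/*
 * Illustrative only: visit every user hat.  kas.a_hat is a sentinel that
 * is not itself on the list; its hat_next points at the first user hat
 * and its hat_prev at the last.  hat_list_lock must be held throughout.
 */
static void
hat_list_walk(void (*func)(hat_t *))
{
        hat_t *hat;

        ASSERT(MUTEX_HELD(&hat_list_lock));
        for (hat = kas.a_hat->hat_next; hat != NULL; hat = hat->hat_next)
                func(hat);
}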
 388 
 389 /*
 390  * The process has finished executing but its 'as' has not been cleaned up yet.
 391  */
 392 /*ARGSUSED*/
 393 void
 394 hat_free_start(hat_t *hat)
 395 {
 396         ASSERT(AS_WRITE_HELD(hat->hat_as));
 397 
 398         /*
 399          * If the hat is currently a stealing victim, wait for the stealing
 400          * to finish.  Once we mark it as HAT_FREEING, htable_steal()
 401          * won't look at its pagetables anymore.
 402          */
 403         mutex_enter(&hat_list_lock);
 404         while (hat->hat_flags & HAT_VICTIM)
 405                 cv_wait(&hat_list_cv, &hat_list_lock);
 406         hat->hat_flags |= HAT_FREEING;
 407         mutex_exit(&hat_list_lock);


 424 
 425         /*
 426          * Remove it from the list of HATs
 427          */
 428         mutex_enter(&hat_list_lock);
 429         if (hat->hat_prev)
 430                 hat->hat_prev->hat_next = hat->hat_next;
 431         else
 432                 kas.a_hat->hat_next = hat->hat_next;
 433         if (hat->hat_next)
 434                 hat->hat_next->hat_prev = hat->hat_prev;
 435         else
 436                 kas.a_hat->hat_prev = hat->hat_prev;
 437         mutex_exit(&hat_list_lock);
 438         hat->hat_next = hat->hat_prev = NULL;
 439 
 440 #if defined(__xpv)
 441         /*
 442          * On the hypervisor, unpin top level page table(s)
 443          */
 444         xen_unpin(hat->hat_htable->ht_pfn);
 445 #if defined(__amd64)
 446         xen_unpin(hat->hat_user_ptable);
 447 #endif
 448 #endif
 449 
 450         /*
 451          * Make a pass through the htables freeing them all up.
 452          */
 453         htable_purge_hat(hat);
 454 
 455         /*
 456          * Decide which kmem cache the hash table came from, then free it.
 457          */
 458         if (hat->hat_flags & HAT_VLP)
 459                 cache = vlp_hash_cache;
 460         else
 461                 cache = hat_hash_cache;
 462         kmem_cache_free(cache, hat->hat_ht_hash);
 463         hat->hat_ht_hash = NULL;
 464 
 465         hat->hat_flags = 0;
 466         kmem_cache_free(hat_cache, hat);
 467 }
 468 
 469 /*
 470  * round kernelbase down to a supported value to use for _userlimit
 471  *
 472  * userlimit must be aligned down to an entry in the top level htable.
 473  * The one exception is for 32 bit HAT's running PAE.
 474  */
 475 uintptr_t
 476 hat_kernelbase(uintptr_t va)
 477 {
 478 #if defined(__i386)
 479         va &= LEVEL_MASK(1);
 480 #endif
 481         if (IN_VA_HOLE(va))
 482                 panic("_userlimit %p will fall in VA hole\n", (void *)va);
 483         return (va);
 484 }
 485 


 500                             cpuid_opteron_erratum(CPU, 6671130)) {
 501                                 lvl = 1;
 502                         }
 503                         if (plat_mnode_xcheck(LEVEL_SIZE(2) >>
 504                             LEVEL_SHIFT(0))) {
 505                                 lvl = 1;
 506                         }
 507                 } else {
 508                         lvl = 1;
 509                 }
 510         }
 511         mmu.max_page_level = lvl;
 512 
 513         if ((lvl == 2) && (enable_1gpg == 0))
 514                 mmu.umax_page_level = 1;
 515         else
 516                 mmu.umax_page_level = lvl;
 517 }
 518 
 519 /*
 520  * Initialize hat data structures based on processor MMU information.
 521  */
 522 void
 523 mmu_init(void)
 524 {
 525         uint_t max_htables;
 526         uint_t pa_bits;
 527         uint_t va_bits;
 528         int i;
 529 
 530         /*
 531          * If the CPU enabled the page table global bit, use it for the kernel.
 532          * This is bit 7 in CR4 (PGE - Page Global Enable).
 533          */
 534         if (is_x86_feature(x86_featureset, X86FSET_PGE) &&
 535             (getcr4() & CR4_PGE) != 0)
 536                 mmu.pt_global = PT_GLOBAL;
 537 
 538         /*
 539          * Detect NX and PAE usage.
 540          */
 541         mmu.pae_hat = kbm_pae_support;
 542         if (kbm_nx_support)
 543                 mmu.pt_nx = PT_NX;
 544         else
 545                 mmu.pt_nx = 0;
 546 
 547         /*
 548          * Use CPU info to set various MMU parameters
 549          */
 550         cpuid_get_addrsize(CPU, &pa_bits, &va_bits);
 551 
 552         if (va_bits < sizeof (void *) * NBBY) {
 553                 mmu.hole_start = (1ul << (va_bits - 1));
 554                 mmu.hole_end = 0ul - mmu.hole_start - 1;
 555         } else {
 556                 mmu.hole_end = 0;
 557                 mmu.hole_start = mmu.hole_end - 1;
 558         }
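        /*
         * Worked example (illustrative): with va_bits == 48,
         *   hole_start = 1ul << 47          == 0x0000800000000000
         *   hole_end   = 0 - hole_start - 1 == 0xffff7fffffffffff
         * i.e. the hole is the non-canonical region between the lower
         * and upper halves of the 64 bit address space.
         */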


 576                 mmu.pte_size = 8;       /* 8 byte PTEs */
 577                 mmu.pte_size_shift = 3;
 578         } else {
 579                 mmu.pte_size = 4;       /* 4 byte PTEs */
 580                 mmu.pte_size_shift = 2;
 581         }
 582 
 583         if (mmu.pae_hat && !is_x86_feature(x86_featureset, X86FSET_PAE))
 584                 panic("Processor does not support PAE");
 585 
 586         if (!is_x86_feature(x86_featureset, X86FSET_CX8))
 587                 panic("Processor does not support cmpxchg8b instruction");
 588 
 589 #if defined(__amd64)
 590 
 591         mmu.num_level = 4;
 592         mmu.max_level = 3;
 593         mmu.ptes_per_table = 512;
 594         mmu.top_level_count = 512;
 595 
 596         mmu.level_shift[0] = 12;
 597         mmu.level_shift[1] = 21;
 598         mmu.level_shift[2] = 30;
 599         mmu.level_shift[3] = 39;
 600 
 601 #elif defined(__i386)
 602 
 603         if (mmu.pae_hat) {
 604                 mmu.num_level = 3;
 605                 mmu.max_level = 2;
 606                 mmu.ptes_per_table = 512;
 607                 mmu.top_level_count = 4;
 608 
 609                 mmu.level_shift[0] = 12;
 610                 mmu.level_shift[1] = 21;
 611                 mmu.level_shift[2] = 30;
 612 
 613         } else {
 614                 mmu.num_level = 2;
 615                 mmu.max_level = 1;
 616                 mmu.ptes_per_table = 1024;
 617                 mmu.top_level_count = 1024;
 618 
 619                 mmu.level_shift[0] = 12;
 620                 mmu.level_shift[1] = 22;
 621         }
 622 
 623 #endif  /* __i386 */
 624 
 625         for (i = 0; i < mmu.num_level; ++i) {
 626                 mmu.level_size[i] = 1UL << mmu.level_shift[i];
 627                 mmu.level_offset[i] = mmu.level_size[i] - 1;
 628                 mmu.level_mask[i] = ~mmu.level_offset[i];
 629         }
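        /*
         * Worked example (illustrative), using the amd64 shifts set above:
         *   level 0: shift 12 -> 4K pages
         *   level 1: shift 21 -> 2M pages
         *   level 2: shift 30 -> 1G pages
         *   level 3: shift 39 -> 512G covered per top level entry
         */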
 630 
 631         set_max_page_level();
 632 
 633         mmu_page_sizes = mmu.max_page_level + 1;
 634         mmu_exported_page_sizes = mmu.umax_page_level + 1;
 635 
 636         /* restrict legacy applications from using pagesizes 1g and above */
 637         mmu_legacy_page_sizes =
 638             (mmu_exported_page_sizes > 2) ? 2 : mmu_exported_page_sizes;
 639 
 640 
 641         for (i = 0; i <= mmu.max_page_level; ++i) {
 642                 mmu.pte_bits[i] = PT_VALID | pt_kern;
 643                 if (i > 0)
 644                         mmu.pte_bits[i] |= PT_PAGESIZE;
 645         }
 646 
 647         /*
 648          * NOTE: Legacy 32 bit PAE mode only has the PT_VALID bit at the top level.
 649          */
 650         for (i = 1; i < mmu.num_level; ++i)
 651                 mmu.ptp_bits[i] = PT_PTPBITS;
 652 
 653 #if defined(__i386)
 654         mmu.ptp_bits[2] = PT_VALID;
 655 #endif
 656 
 657         /*
 658          * Compute how many hash table entries to have per process for htables.
 659          * We start with 1 page's worth of entries.
 660          *
 661          * If physical memory is small, reduce the amount needed to cover it.
 662          */
 663         max_htables = physmax / mmu.ptes_per_table;
 664         mmu.hash_cnt = MMU_PAGESIZE / sizeof (htable_t *);
 665         while (mmu.hash_cnt > 16 && mmu.hash_cnt >= max_htables)
 666                 mmu.hash_cnt >>= 1;
 667         mmu.vlp_hash_cnt = mmu.hash_cnt;
 668 
 669 #if defined(__amd64)
 670         /*
 671          * If running in 64 bits and physical memory is large,
 672          * increase the size of the cache to cover all of memory for
 673          * a 64 bit process.
 674          */
 675 #define HASH_MAX_LENGTH 4
 676         while (mmu.hash_cnt * HASH_MAX_LENGTH < max_htables)
 677                 mmu.hash_cnt <<= 1;
 678 #endif
 679 }
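For readers unfamiliar with the sizing heuristic used above, here is a hedged standalone sketch of the same arithmetic; the page size, pointer size, and memory size are assumptions chosen only for the example.

#include <stdio.h>

int
main(void)
{
        unsigned long pagesize = 4096;                   /* bytes per page */
        unsigned long ptes_per_table = 512;              /* amd64 */
        unsigned long physmax = (16UL << 30) / pagesize; /* pretend 16 GiB of RAM */
        unsigned long max_htables = physmax / ptes_per_table;
        unsigned long hash_cnt = pagesize / sizeof (void *); /* one page of buckets */

        /* shrink on small memory machines */
        while (hash_cnt > 16 && hash_cnt >= max_htables)
                hash_cnt >>= 1;

        /* grow so the average chain stays near HASH_MAX_LENGTH (4) entries */
        while (hash_cnt * 4 < max_htables)
                hash_cnt <<= 1;

        printf("max_htables=%lu hash_cnt=%lu\n", max_htables, hash_cnt);
        return (0);
}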
 680 
 681 
 682 /*
 683  * initialize hat data structures
 684  */
 685 void
 686 hat_init()
 687 {


 696         }
 697 #endif
 698 
 699         cv_init(&hat_list_cv, NULL, CV_DEFAULT, NULL);
 700 
 701         /*
 702          * initialize kmem caches
 703          */
 704         htable_init();
 705         hment_init();
 706 
 707         hat_cache = kmem_cache_create("hat_t",
 708             sizeof (hat_t), 0, hati_constructor, NULL, NULL,
 709             NULL, 0, 0);
 710 
 711         hat_hash_cache = kmem_cache_create("HatHash",
 712             mmu.hash_cnt * sizeof (htable_t *), 0, NULL, NULL, NULL,
 713             NULL, 0, 0);
 714 
 715         /*
 716          * VLP hats can use a smaller hash table size on large memory machines
 717          */
 718         if (mmu.hash_cnt == mmu.vlp_hash_cnt) {
 719                 vlp_hash_cache = hat_hash_cache;
 720         } else {
 721                 vlp_hash_cache = kmem_cache_create("HatVlpHash",
 722                     mmu.vlp_hash_cnt * sizeof (htable_t *), 0, NULL, NULL, NULL,
 723                     NULL, 0, 0);
 724         }
 725 
 726         /*
 727          * Set up the kernel's hat
 728          */
 729         AS_LOCK_ENTER(&kas, RW_WRITER);
 730         kas.a_hat = kmem_cache_alloc(hat_cache, KM_NOSLEEP);
 731         mutex_init(&kas.a_hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL);
 732         kas.a_hat->hat_as = &kas;
 733         kas.a_hat->hat_flags = 0;
 734         AS_LOCK_EXIT(&kas);
 735 
 736         CPUSET_ZERO(khat_cpuset);
 737         CPUSET_ADD(khat_cpuset, CPU->cpu_id);
 738 
 739         /*
 740          * The kernel hat's next pointer serves as the head of the hat list.
 741          * The kernel hat's prev pointer tracks the last hat on the list for
 742          * htable_steal() to use.
 743          */
 744         kas.a_hat->hat_next = NULL;
 745         kas.a_hat->hat_prev = NULL;
 746 
 747         /*
 748          * Allocate an htable hash bucket for the kernel
 749          * XX64 - tune for 64 bit procs
 750          */
 751         kas.a_hat->hat_num_hash = mmu.hash_cnt;
 752         kas.a_hat->hat_ht_hash = kmem_cache_alloc(hat_hash_cache, KM_NOSLEEP);
 753         bzero(kas.a_hat->hat_ht_hash, mmu.hash_cnt * sizeof (htable_t *));
 754 
 755         /*
 756          * zero out the top level and cached htable pointers
 757          */
 758         kas.a_hat->hat_ht_cached = NULL;
 759         kas.a_hat->hat_htable = NULL;
 760 
 761         /*
 762          * Pre-allocate hrm_hashtab before enabling the collection of
 763          * refmod statistics.  Allocating on the fly would mean us
 764          * running the risk of suffering recursive mutex enters or
 765          * deadlocks.
 766          */
 767         hrm_hashtab = kmem_zalloc(HRM_HASHSIZE * sizeof (struct hrmstat *),
 768             KM_SLEEP);
 769 }
 770 
 771 /*
 772  * Prepare CPU specific pagetables for VLP processes on 64 bit kernels.
 773  *
 774  * Each CPU has a set of 2 pagetables that are reused for any 32 bit
 775  * process it runs. They are the top level pagetable, hci_vlp_l3ptes, and
 776  * the next to top level table for the bottom 512 Gig, hci_vlp_l2ptes.
 777  */
 778 /*ARGSUSED*/
 779 static void
 780 hat_vlp_setup(struct cpu *cpu)
 781 {
 782 #if defined(__amd64) && !defined(__xpv)
 783         struct hat_cpu_info *hci = cpu->cpu_hat_info;
 784         pfn_t pfn;
 785 
 786         /*
 787          * allocate the level==2 page table for the bottom most
 788          * 512Gig of address space (this is where 32 bit apps live)
 789          */
 790         ASSERT(hci != NULL);
 791         hci->hci_vlp_l2ptes = kmem_zalloc(MMU_PAGESIZE, KM_SLEEP);
 792 
 793         /*
 794          * Allocate a top level pagetable and copy the kernel's
 795          * entries into it. Then link in hci_vlp_l2ptes in the 1st entry.
 796          */
 797         hci->hci_vlp_l3ptes = kmem_zalloc(MMU_PAGESIZE, KM_SLEEP);
 798         hci->hci_vlp_pfn =
 799             hat_getpfnum(kas.a_hat, (caddr_t)hci->hci_vlp_l3ptes);
 800         ASSERT(hci->hci_vlp_pfn != PFN_INVALID);
 801         bcopy(vlp_page, hci->hci_vlp_l3ptes, MMU_PAGESIZE);
 802 
 803         pfn = hat_getpfnum(kas.a_hat, (caddr_t)hci->hci_vlp_l2ptes);
 804         ASSERT(pfn != PFN_INVALID);
 805         hci->hci_vlp_l3ptes[0] = MAKEPTP(pfn, 2);
 806 #endif /* __amd64 && !__xpv */
 807 }
 808 
 809 /*ARGSUSED*/
 810 static void
 811 hat_vlp_teardown(cpu_t *cpu)
 812 {
 813 #if defined(__amd64) && !defined(__xpv)
 814         struct hat_cpu_info *hci;
 815 
 816         if ((hci = cpu->cpu_hat_info) == NULL)
 817                 return;
 818         if (hci->hci_vlp_l2ptes)
 819                 kmem_free(hci->hci_vlp_l2ptes, MMU_PAGESIZE);
 820         if (hci->hci_vlp_l3ptes)
 821                 kmem_free(hci->hci_vlp_l3ptes, MMU_PAGESIZE);
 822 #endif
 823 }
 824 
 825 #define NEXT_HKR(r, l, s, e) {                  \
 826         kernel_ranges[r].hkr_level = l;         \
 827         kernel_ranges[r].hkr_start_va = s;      \
 828         kernel_ranges[r].hkr_end_va = e;        \
 829         ++r;                                    \
 830 }
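A hedged sketch of how the NEXT_HKR() macro above might be used from hat_init_finish() (whose body is partly elided from this listing); the level and range below are placeholders, not the values the real code installs.

        /*
         * Illustrative only -- describe a single top level (level 3) range
         * running from kernelbase to the end of memory (hkr_end_va == 0
         * means "to end of memory"), then record how many ranges exist.
         */
        uint_t r = 0;

        NEXT_HKR(r, 3, kernelbase, 0);
        num_kernel_ranges = r;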
 831 
 832 /*
 833  * Finish filling in the kernel hat.
 834  * Pre-fill all top level kernel page table entries for the kernel's
 835  * part of the address range.  From this point on we can't use any new
 836  * kernel large pages if they need PTEs at max_level.
 837  *
 838  * Create the kmap mappings.
 839  */
 840 void
 841 hat_init_finish(void)


 897 
 898                         if (IN_HYPERVISOR_VA(va))
 899                                 continue;
 900 
 901                         /* can/must skip if a page mapping already exists */
 902                         if (rp->hkr_level <= mmu.max_page_level &&
 903                             (ht = htable_getpage(kas.a_hat, va, NULL)) !=
 904                             NULL) {
 905                                 htable_release(ht);
 906                                 continue;
 907                         }
 908 
 909                         (void) htable_create(kas.a_hat, va, rp->hkr_level - 1,
 910                             NULL);
 911                 }
 912         }
 913 
 914         /*
 915          * 32 bit PAE metal kernels use only 4 of the 512 entries in the
 916          * page holding the top level pagetable. We use the remainder for
 917          * the "per CPU" page tables for VLP processes.
 918          * Map the top level kernel pagetable into the kernel to make
 919          * it easy to use bcopy to access these tables.
 920          */
 921         if (mmu.pae_hat) {
 922                 vlp_page = vmem_alloc(heap_arena, MMU_PAGESIZE, VM_SLEEP);
 923                 hat_devload(kas.a_hat, (caddr_t)vlp_page, MMU_PAGESIZE,
 924                     kas.a_hat->hat_htable->ht_pfn,
 925 #if !defined(__xpv)
 926                     PROT_WRITE |
 927 #endif
 928                     PROT_READ | HAT_NOSYNC | HAT_UNORDERED_OK,
 929                     HAT_LOAD | HAT_LOAD_NOCONSIST);
 930         }
 931         hat_vlp_setup(CPU);
 932 
 933         /*
 934          * Create kmap (cached mappings of kernel PTEs)
 935          * for 32 bit we map from segmap_start .. ekernelheap
 936          * for 64 bit we map from segmap_start .. segmap_start + segmapsize;
 937          */
 938 #if defined(__i386)
 939         size = (uintptr_t)ekernelheap - segmap_start;
 940 #elif defined(__amd64)
 941         size = segmapsize;
 942 #endif
 943         hat_kmap_init((uintptr_t)segmap_start, size);
 944 }
 945 
 946 /*
 947  * On 32 bit PAE mode, PTE's are 64 bits, but ordinary atomic memory references
 948  * are 32 bit, so for safety we must use atomic_cas_64() to install these.
 949  */
 950 #ifdef __i386
 951 static void
 952 reload_pae32(hat_t *hat, cpu_t *cpu)
 953 {
 954         x86pte_t *src;
 955         x86pte_t *dest;
 956         x86pte_t pte;
 957         int i;
 958 
 959         /*
 960          * Load the 4 entries of the level 2 page table into this
 961          * cpu's range of the vlp_page and point cr3 at them.
 962          */
 963         ASSERT(mmu.pae_hat);
 964         src = hat->hat_vlp_ptes;
 965         dest = vlp_page + (cpu->cpu_id + 1) * VLP_NUM_PTES;
 966         for (i = 0; i < VLP_NUM_PTES; ++i) {
 967                 for (;;) {
 968                         pte = dest[i];
 969                         if (pte == src[i])
 970                                 break;
 971                         if (atomic_cas_64(dest + i, pte, src[i]) != src[i])
 972                                 break;
 973                 }
 974         }
 975 }
 976 #endif
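To make the comment above concrete, a standalone sketch of why the 64 bit PTE install must be atomic on a 32 bit CPU, written with C11 atomics in place of the kernel's atomic_cas_64():

#include <stdatomic.h>
#include <stdint.h>

typedef uint64_t x86pte_t;

/*
 * Illustrative only: a plain 64 bit store on a 32 bit CPU may be split
 * into two 32 bit stores, so another CPU (or the page walker) could
 * observe a torn PTE.  A compare-and-swap installs the whole 64 bit
 * value atomically, which is the property atomic_cas_64() provides.
 */
static void
install_pte(_Atomic x86pte_t *slot, x86pte_t new)
{
        x86pte_t old = atomic_load(slot);

        while (old != new &&
            !atomic_compare_exchange_weak(slot, &old, new))
                ;       /* a failed exchange refreshes 'old'; retry */
}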
 977 
 978 /*
 979  * Switch to a new active hat, maintaining bit masks to track active CPUs.
 980  *
 981  * On the 32-bit PAE hypervisor, %cr3 is a 64-bit value; on metal it
 982  * remains a 32-bit value.
 983  */
 984 void
 985 hat_switch(hat_t *hat)
 986 {
 987         uint64_t        newcr3;
 988         cpu_t           *cpu = CPU;
 989         hat_t           *old = cpu->cpu_current_hat;
 990 
 991         /*
 992          * set up this information first, so we don't miss any cross calls
 993          */
 994         if (old != NULL) {
 995                 if (old == hat)
 996                         return;
 997                 if (old != kas.a_hat)
 998                         CPUSET_ATOMIC_DEL(old->hat_cpus, cpu->cpu_id);
 999         }
1000 
1001         /*
1002          * Add this CPU to the active set for this HAT.
1003          */
1004         if (hat != kas.a_hat) {
1005                 CPUSET_ATOMIC_ADD(hat->hat_cpus, cpu->cpu_id);
1006         }
1007         cpu->cpu_current_hat = hat;
1008 
1009         /*
1010          * now go ahead and load cr3
1011          */
1012         if (hat->hat_flags & HAT_VLP) {
1013 #if defined(__amd64)
1014                 x86pte_t *vlpptep = cpu->cpu_hat_info->hci_vlp_l2ptes;
1015 
1016                 VLP_COPY(hat->hat_vlp_ptes, vlpptep);
1017                 newcr3 = MAKECR3(cpu->cpu_hat_info->hci_vlp_pfn);
1018 #elif defined(__i386)
1019                 reload_pae32(hat, cpu);
1020                 newcr3 = MAKECR3(kas.a_hat->hat_htable->ht_pfn) +
1021                     (cpu->cpu_id + 1) * VLP_SIZE;
1022 #endif
1023         } else {
1024                 newcr3 = MAKECR3((uint64_t)hat->hat_htable->ht_pfn);
1025         }
1026 #ifdef __xpv
1027         {
1028                 struct mmuext_op t[2];
1029                 uint_t retcnt;
1030                 uint_t opcnt = 1;
1031 
1032                 t[0].cmd = MMUEXT_NEW_BASEPTR;
1033                 t[0].arg1.mfn = mmu_btop(pa_to_ma(newcr3));
1034 #if defined(__amd64)
1035                 /*
1036                  * There's an interesting problem here, as to what to
1037                  * actually specify when switching to the kernel hat.
1038                  * For now we'll reuse the kernel hat again.
1039                  */
1040                 t[1].cmd = MMUEXT_NEW_USER_BASEPTR;
1041                 if (hat == kas.a_hat)
1042                         t[1].arg1.mfn = mmu_btop(pa_to_ma(newcr3));
1043                 else
1044                         t[1].arg1.mfn = pfn_to_mfn(hat->hat_user_ptable);
1045                 ++opcnt;
1046 #endif  /* __amd64 */
1047                 if (HYPERVISOR_mmuext_op(t, opcnt, &retcnt, DOMID_SELF) < 0)
1048                         panic("HYPERVISOR_mmu_update() failed");
1049                 ASSERT(retcnt == opcnt);
1050 
1051         }
1052 #else
1053         setcr3(newcr3);
1054 #endif
1055         ASSERT(cpu == CPU);
1056 }
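As a worked note on the i386 VLP branch above (assuming VLP_SIZE is VLP_NUM_PTES * sizeof (x86pte_t), i.e. 4 entries of 8 bytes = 32 bytes): PAE only requires %cr3 to be 32 byte aligned, so each CPU can be pointed at its own group of 4 PDPTEs inside the single page that backs vlp_page.

/*
 * Illustrative arithmetic: for cpu_id == 2 the value loaded into cr3 is
 * the physical address of the kernel's top level page plus
 * (2 + 1) * 32 == 96, i.e. the byte offset of CPU 2's group of 4
 * entries within vlp_page (group 0 being the kernel's own entries).
 */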
1057 
1058 /*
1059  * Utility to return a valid x86pte_t from protections, pfn, and level number
1060  */
1061 static x86pte_t
1062 hati_mkpte(pfn_t pfn, uint_t attr, level_t level, uint_t flags)
1063 {
1064         x86pte_t        pte;
1065         uint_t          cache_attr = attr & HAT_ORDER_MASK;
1066 
1067         pte = MAKEPTE(pfn, level);
1068 
1069         if (attr & PROT_WRITE)
1070                 PTE_SET(pte, PT_WRITABLE);
1071 
1072         if (attr & PROT_USER)
1073                 PTE_SET(pte, PT_USER);
1074 


1346                 goto done;
1347         }
1348 
1349         /*
1350          * If the mapping didn't change there is nothing more to do.
1351          */
1352         if (PTE_EQUIV(pte, old_pte))
1353                 goto done;
1354 
1355         /*
1356          * Install a new mapping in the page's mapping list
1357          */
1358         if (!PTE_ISVALID(old_pte)) {
1359                 if (is_consist) {
1360                         hment_assign(ht, entry, pp, hm);
1361                         x86_hm_exit(pp);
1362                 } else {
1363                         ASSERT(flags & HAT_LOAD_NOCONSIST);
1364                 }
1365 #if defined(__amd64)
1366                 if (ht->ht_flags & HTABLE_VLP) {
1367                         cpu_t *cpu = CPU;
1368                         x86pte_t *vlpptep = cpu->cpu_hat_info->hci_vlp_l2ptes;
1369                         VLP_COPY(hat->hat_vlp_ptes, vlpptep);
1370                 }
1371 #endif
1372                 HTABLE_INC(ht->ht_valid_cnt);
1373                 PGCNT_INC(hat, l);
1374                 return (rv);
1375         }
1376 
1377         /*
1378          * Remap's are more complicated:
1379          *  - HAT_LOAD_REMAP must be specified if changing the pfn.
1380          *    We also require that NOCONSIST be specified.
1381          *  - Otherwise only permission or caching bits may change.
1382          */
1383         if (!PTE_ISPAGE(old_pte, l))
1384                 panic("non-null/page mapping pte=" FMT_PTE, old_pte);
1385 
1386         if (PTE2PFN(old_pte, l) != PTE2PFN(pte, l)) {
1387                 REMAPASSERT(flags & HAT_LOAD_REMAP);
1388                 REMAPASSERT(flags & HAT_LOAD_NOCONSIST);
1389                 REMAPASSERT(PTE_GET(old_pte, PT_SOFTWARE) >= PT_NOCONSIST);


1421         hat_t           *hat,
1422         uintptr_t       va,
1423         page_t          *pp,
1424         uint_t          attr,
1425         uint_t          flags,
1426         level_t         level,
1427         pfn_t           pfn)
1428 {
1429         htable_t        *ht;
1430         uint_t          entry;
1431         x86pte_t        pte;
1432         int             rv = 0;
1433 
1434         /*
1435          * The number 16 is arbitrary and here to catch a recursion problem
1436          * early before we blow out the kernel stack.
1437          */
1438         ++curthread->t_hatdepth;
1439         ASSERT(curthread->t_hatdepth < 16);
1440 
1441         ASSERT(hat == kas.a_hat || AS_LOCK_HELD(hat->hat_as));
1442 
1443         if (flags & HAT_LOAD_SHARE)
1444                 hat->hat_flags |= HAT_SHARED;
1445 
1446         /*
1447          * Find the page table that maps this page if it already exists.
1448          */
1449         ht = htable_lookup(hat, va, level);
1450 
1451         /*
1452          * We must have HAT_LOAD_NOCONSIST if page_t is NULL.
1453          */
1454         if (pp == NULL)
1455                 flags |= HAT_LOAD_NOCONSIST;
1456 
1457         if (ht == NULL) {
1458                 ht = htable_create(hat, va, level, NULL);
1459                 ASSERT(ht != NULL);
1460         }
1461         entry = htable_va2entry(va, ht);
1462 
1463         /*
1464          * a bunch of paranoid error checking
1465          */
1466         ASSERT(ht->ht_busy > 0);
1467         if (ht->ht_vaddr > va || va > HTABLE_LAST_PAGE(ht))
1468                 panic("hati_load_common: bad htable %p, va %p",
1469                     (void *)ht, (void *)va);
1470         ASSERT(ht->ht_level == level);
1471 
1472         /*
1473          * construct the new PTE
1474          */
1475         if (hat == kas.a_hat)
1476                 attr &= ~PROT_USER;
1477         pte = hati_mkpte(pfn, attr, level, flags);
1478         if (hat == kas.a_hat && va >= kernelbase)
1479                 PTE_SET(pte, mmu.pt_global);
1480 
1481         /*
1482          * establish the mapping
1483          */
1484         rv = hati_pte_map(ht, entry, pp, pte, flags, NULL);
1485 
1486         /*
1487          * release the htable and any reserves
1488          */
1489         htable_release(ht);


1899                             "htable=%p, vaddr=%p\n", (void *)ht, (void *)vaddr);
1900                 HTABLE_LOCK_DEC(ht);
1901 
1902                 vaddr += LEVEL_SIZE(ht->ht_level);
1903         }
1904         if (ht)
1905                 htable_release(ht);
1906         XPV_ALLOW_MIGRATE();
1907 }
1908 
1909 /* ARGSUSED */
1910 void
1911 hat_unlock_region(struct hat *hat, caddr_t addr, size_t len,
1912     hat_region_cookie_t rcookie)
1913 {
1914         panic("No shared region support on x86");
1915 }
1916 
1917 #if !defined(__xpv)
1918 /*
1919  * Cross call service routine to demap a virtual page on
1920  * the current CPU or flush all mappings in TLB.
1921  */
1922 /*ARGSUSED*/
1923 static int
1924 hati_demap_func(xc_arg_t a1, xc_arg_t a2, xc_arg_t a3)
1925 {
1926         hat_t   *hat = (hat_t *)a1;
1927         caddr_t addr = (caddr_t)a2;
1928         size_t len = (size_t)a3;
1929 
1930         /*
1931          * If the target hat isn't the kernel and this CPU isn't operating
1932          * in the target hat, we can ignore the cross call.
1933          */
1934         if (hat != kas.a_hat && hat != CPU->cpu_current_hat)
1935                 return (0);
1936 
1937         /*
1938          * For a normal address, we flush a range of contiguous mappings
1939          */
1940         if ((uintptr_t)addr != DEMAP_ALL_ADDR) {
1941                 for (size_t i = 0; i < len; i += MMU_PAGESIZE)
1942                         mmu_tlbflush_entry(addr + i);
1943                 return (0);
1944         }
1945 
1946         /*
1947          * Otherwise we reload cr3 to effect a complete TLB flush.
1948          *
1949          * A reload of cr3 on a VLP process also means we must recopy the
1950          * pte values from the struct hat.
1951          */
1952         if (hat->hat_flags & HAT_VLP) {
1953 #if defined(__amd64)
1954                 x86pte_t *vlpptep = CPU->cpu_hat_info->hci_vlp_l2ptes;
1955 
1956                 VLP_COPY(hat->hat_vlp_ptes, vlpptep);
1957 #elif defined(__i386)
1958                 reload_pae32(hat, CPU);
1959 #endif
1960         }
1961         reload_cr3();
1962         return (0);
1963 }
1964 
1965 /*
1966  * Flush all TLB entries, including global (ie. kernel) ones.
1967  */
1968 static void
1969 flush_all_tlb_entries(void)
1970 {
1971         ulong_t cr4 = getcr4();
1972 
1973         if (cr4 & CR4_PGE) {
1974                 setcr4(cr4 & ~(ulong_t)CR4_PGE);
1975                 setcr4(cr4);
1976 
1977                 /*
1978                  * 32 bit PAE also always needs a reload_cr3()
1979                  */
1980                 if (mmu.max_level == 2)
1981                         reload_cr3();
1982         } else {
1983                 reload_cr3();
1984         }
1985 }
1986 
1987 #define TLB_CPU_HALTED  (01ul)
1988 #define TLB_INVAL_ALL   (02ul)
1989 #define CAS_TLB_INFO(cpu, old, new)     \
1990         atomic_cas_ulong((ulong_t *)&(cpu)->cpu_m.mcpu_tlb_info, (old), (new))
1991 
1992 /*
1993  * Record that a CPU is going idle
1994  */
1995 void
1996 tlb_going_idle(void)
1997 {
1998         atomic_or_ulong((ulong_t *)&CPU->cpu_m.mcpu_tlb_info, TLB_CPU_HALTED);
1999 }
2000 
2001 /*
2002  * Service a delayed TLB flush if coming out of being idle.
2003  * It will be called from cpu idle notification with interrupts disabled.
2004  */
2005 void
2006 tlb_service(void)
2007 {
2008         ulong_t tlb_info;
2009         ulong_t found;
2010 
2011         /*
2012          * We only have to do something if coming out of being idle.
2013          */
2014         tlb_info = CPU->cpu_m.mcpu_tlb_info;
2015         if (tlb_info & TLB_CPU_HALTED) {
2016                 ASSERT(CPU->cpu_current_hat == kas.a_hat);
2017 
2018                 /*
2019                  * Atomic clear and fetch of old state.
2020                  */
2021                 while ((found = CAS_TLB_INFO(CPU, tlb_info, 0)) != tlb_info) {
2022                         ASSERT(found & TLB_CPU_HALTED);
2023                         tlb_info = found;
2024                         SMT_PAUSE();
2025                 }
2026                 if (tlb_info & TLB_INVAL_ALL)
2027                         flush_all_tlb_entries();
2028         }
2029 }
2030 #endif /* !__xpv */
2031 
2032 /*
2033  * Internal routine to do cross calls to invalidate a range of pages on
2034  * all CPUs using a given hat.
2035  */
2036 void
2037 hat_tlb_inval_range(hat_t *hat, uintptr_t va, size_t len)
2038 {
2039         extern int      flushes_require_xcalls; /* from mp_startup.c */
2040         cpuset_t        justme;
2041         cpuset_t        cpus_to_shootdown;
2042 #ifndef __xpv
2043         cpuset_t        check_cpus;
2044         cpu_t           *cpup;
2045         int             c;
2046 #endif
2047 
2048         /*
2049          * If the hat is being destroyed, there are no more users, so
2050          * demap need not do anything.
2051          */
2052         if (hat->hat_flags & HAT_FREEING)
2053                 return;
2054 
2055         /*
2056          * If demapping from a shared pagetable, we best demap the
2057          * entire set of user TLBs, since we don't know what addresses
2058          * these were shared at.
2059          */
2060         if (hat->hat_flags & HAT_SHARED) {
2061                 hat = kas.a_hat;
2062                 va = DEMAP_ALL_ADDR;
2063         }
2064 
2065         /*
2066          * if not running with multiple CPUs, don't use cross calls
2067          */
2068         if (panicstr || !flushes_require_xcalls) {
2069 #ifdef __xpv
2070                 if (va == DEMAP_ALL_ADDR) {
2071                         xen_flush_tlb();
2072                 } else {
2073                         for (size_t i = 0; i < len; i += MMU_PAGESIZE)
2074                                 xen_flush_va((caddr_t)(va + i));
2075                 }
2076 #else
2077                 (void) hati_demap_func((xc_arg_t)hat,
2078                     (xc_arg_t)va, (xc_arg_t)len);
2079 #endif
2080                 return;
2081         }
2082 
2083 
2084         /*
2085          * Determine CPUs to shootdown. Kernel changes always do all CPUs.
2086          * Otherwise it's just CPUs currently executing in this hat.
2087          */
2088         kpreempt_disable();
2089         CPUSET_ONLY(justme, CPU->cpu_id);
2090         if (hat == kas.a_hat)
2091                 cpus_to_shootdown = khat_cpuset;
2092         else
2093                 cpus_to_shootdown = hat->hat_cpus;
2094 
2095 #ifndef __xpv
2096         /*
2097          * If any CPUs in the set are idle, just request a delayed flush
2098          * and avoid waking them up.
2099          */
2100         check_cpus = cpus_to_shootdown;
2101         for (c = 0; c < NCPU && !CPUSET_ISNULL(check_cpus); ++c) {
2102                 ulong_t tlb_info;
2103 
2104                 if (!CPU_IN_SET(check_cpus, c))
2105                         continue;
2106                 CPUSET_DEL(check_cpus, c);
2107                 cpup = cpu[c];
2108                 if (cpup == NULL)
2109                         continue;
2110 
2111                 tlb_info = cpup->cpu_m.mcpu_tlb_info;
2112                 while (tlb_info == TLB_CPU_HALTED) {
2113                         (void) CAS_TLB_INFO(cpup, TLB_CPU_HALTED,
2114                             TLB_CPU_HALTED | TLB_INVAL_ALL);
2115                         SMT_PAUSE();
2116                         tlb_info = cpup->cpu_m.mcpu_tlb_info;
2117                 }
2118                 if (tlb_info == (TLB_CPU_HALTED | TLB_INVAL_ALL)) {
2119                         HATSTAT_INC(hs_tlb_inval_delayed);
2120                         CPUSET_DEL(cpus_to_shootdown, c);
2121                 }
2122         }
2123 #endif
2124 
2125         if (CPUSET_ISNULL(cpus_to_shootdown) ||
2126             CPUSET_ISEQUAL(cpus_to_shootdown, justme)) {
2127 
2128 #ifdef __xpv
2129                 if (va == DEMAP_ALL_ADDR) {
2130                         xen_flush_tlb();
2131                 } else {
2132                         for (size_t i = 0; i < len; i += MMU_PAGESIZE)
2133                                 xen_flush_va((caddr_t)(va + i));
2134                 }
2135 #else
2136                 (void) hati_demap_func((xc_arg_t)hat,
2137                     (xc_arg_t)va, (xc_arg_t)len);
2138 #endif
2139 
2140         } else {
2141 
2142                 CPUSET_ADD(cpus_to_shootdown, CPU->cpu_id);
2143 #ifdef __xpv
2144                 if (va == DEMAP_ALL_ADDR) {
2145                         xen_gflush_tlb(cpus_to_shootdown);
2146                 } else {
2147                         for (size_t i = 0; i < len; i += MMU_PAGESIZE) {
2148                                 xen_gflush_va((caddr_t)(va + i),
2149                                     cpus_to_shootdown);
2150                         }
2151                 }
2152 #else
2153                 xc_call((xc_arg_t)hat, (xc_arg_t)va, (xc_arg_t)len,
2154                     CPUSET2BV(cpus_to_shootdown), hati_demap_func);
2155 #endif
2156 
2157         }
2158         kpreempt_enable();
2159 }
2160 
2161 void
2162 hat_tlb_inval(hat_t *hat, uintptr_t va)
2163 {
2164         hat_tlb_inval_range(hat, va, MMU_PAGESIZE);
2165 }
2166 
2167 /*
2168  * Interior routine for HAT_UNLOADs from hat_unload_callback(),
2169  * hat_kmap_unload() OR from hat_steal() code.  This routine doesn't
2170  * handle releasing of the htables.
2171  */
2172 void
2173 hat_pte_unmap(
2174         htable_t        *ht,
2175         uint_t          entry,
2176         uint_t          flags,
2177         x86pte_t        old_pte,
2178         void            *pte_ptr,
2179         boolean_t       tlb)
2180 {
2181         hat_t           *hat = ht->ht_hat;
2182         hment_t         *hm = NULL;
2183         page_t          *pp = NULL;
2184         level_t         l = ht->ht_level;


2311 hat_unload(hat_t *hat, caddr_t addr, size_t len, uint_t flags)
2312 {
2313         uintptr_t va = (uintptr_t)addr;
2314 
2315         XPV_DISALLOW_MIGRATE();
2316         ASSERT(hat == kas.a_hat || va + len <= _userlimit);
2317 
2318         /*
2319          * special case for performance.
2320          */
2321         if (mmu.kmap_addr <= va && va < mmu.kmap_eaddr) {
2322                 ASSERT(hat == kas.a_hat);
2323                 hat_kmap_unload(addr, len, flags);
2324         } else {
2325                 hat_unload_callback(hat, addr, len, flags, NULL);
2326         }
2327         XPV_ALLOW_MIGRATE();
2328 }
2329 
2330 /*
2331  * Do the callbacks for ranges being unloaded.
2332  */
2333 typedef struct range_info {
2334         uintptr_t       rng_va;
2335         ulong_t         rng_cnt;
2336         level_t         rng_level;
2337 } range_info_t;
2338 
2339 /*
2340  * Invalidate the TLB, and perform the callback to the upper level VM system,
2341  * for the specified ranges of contiguous pages.
2342  */
2343 static void
2344 handle_ranges(hat_t *hat, hat_callback_t *cb, uint_t cnt, range_info_t *range)
2345 {
2346         while (cnt > 0) {
2347                 size_t len;
2348 
2349                 --cnt;
2350                 len = range[cnt].rng_cnt << LEVEL_SHIFT(range[cnt].rng_level);
2351                 hat_tlb_inval_range(hat, (uintptr_t)range[cnt].rng_va, len);
2352 
2353                 if (cb != NULL) {
2354                         cb->hcb_start_addr = (caddr_t)range[cnt].rng_va;
2355                         cb->hcb_end_addr = cb->hcb_start_addr;
2356                         cb->hcb_end_addr += len;
2357                         cb->hcb_function(cb);
2358                 }
2359         }
2360 }
2361 
2362 /*
2363  * Unload a given range of addresses (has optional callback)
2364  *
2365  * Flags:
2366  * define       HAT_UNLOAD              0x00
2367  * define       HAT_UNLOAD_NOSYNC       0x02
2368  * define       HAT_UNLOAD_UNLOCK       0x04
2369  * define       HAT_UNLOAD_OTHER        0x08 - not used
2370  * define       HAT_UNLOAD_UNMAP        0x10 - same as HAT_UNLOAD
2371  */
2372 #define MAX_UNLOAD_CNT (8)
2373 void
2374 hat_unload_callback(
2375         hat_t           *hat,
2376         caddr_t         addr,
2377         size_t          len,
2378         uint_t          flags,
2379         hat_callback_t  *cb)
2380 {
2381         uintptr_t       vaddr = (uintptr_t)addr;
2382         uintptr_t       eaddr = vaddr + len;
2383         htable_t        *ht = NULL;
2384         uint_t          entry;
2385         uintptr_t       contig_va = (uintptr_t)-1L;
2386         range_info_t    r[MAX_UNLOAD_CNT];
2387         uint_t          r_cnt = 0;
2388         x86pte_t        old_pte;
2389 
2390         XPV_DISALLOW_MIGRATE();
2391         ASSERT(hat == kas.a_hat || eaddr <= _userlimit);
2392         ASSERT(IS_PAGEALIGNED(vaddr));
2393         ASSERT(IS_PAGEALIGNED(eaddr));
2394 
2395         /*
2396          * Special case a single page being unloaded for speed. This happens
2397          * quite frequently, COW faults after a fork() for example.
2398          */
2399         if (cb == NULL && len == MMU_PAGESIZE) {
2400                 ht = htable_getpte(hat, vaddr, &entry, &old_pte, 0);
2401                 if (ht != NULL) {
2402                         if (PTE_ISVALID(old_pte)) {
2403                                 hat_pte_unmap(ht, entry, flags, old_pte,
2404                                     NULL, B_TRUE);
2405                         }
2406                         htable_release(ht);
2407                 }
2408                 XPV_ALLOW_MIGRATE();
2409                 return;
2410         }
2411 
2412         while (vaddr < eaddr) {
2413                 old_pte = htable_walk(hat, &ht, &vaddr, eaddr);
2414                 if (ht == NULL)
2415                         break;
2416 
2417                 ASSERT(!IN_VA_HOLE(vaddr));
2418 
2419                 if (vaddr < (uintptr_t)addr)
2420                         panic("hat_unload_callback(): unmap inside large page");
2421 
2422                 /*
2423                  * We'll do the call backs for contiguous ranges
2424                  */
2425                 if (vaddr != contig_va ||
2426                     (r_cnt > 0 && r[r_cnt - 1].rng_level != ht->ht_level)) {
2427                         if (r_cnt == MAX_UNLOAD_CNT) {
2428                                 handle_ranges(hat, cb, r_cnt, r);
2429                                 r_cnt = 0;
2430                         }
2431                         r[r_cnt].rng_va = vaddr;
2432                         r[r_cnt].rng_cnt = 0;
2433                         r[r_cnt].rng_level = ht->ht_level;
2434                         ++r_cnt;
2435                 }
2436 
2437                 /*
2438                  * Unload one mapping (for a single page) from the page tables.
2439                  * Note that we do not remove the mapping from the TLB yet,
2440                  * as indicated by the tlb=FALSE argument to hat_pte_unmap().
2441                  * handle_ranges() will clear the TLB entries with one call to
2442                  * hat_tlb_inval_range() per contiguous range.  This is
2443          * safe because the page cannot be reused until the
2444                  * callback is made (or we return).
2445                  */
2446                 entry = htable_va2entry(vaddr, ht);
2447                 hat_pte_unmap(ht, entry, flags, old_pte, NULL, B_FALSE);
2448                 ASSERT(ht->ht_level <= mmu.max_page_level);
2449                 vaddr += LEVEL_SIZE(ht->ht_level);
2450                 contig_va = vaddr;
2451                 ++r[r_cnt - 1].rng_cnt;
2452         }
2453         if (ht)
2454                 htable_release(ht);
2455 
2456         /*
2457          * handle last range for callbacks
2458          */
2459         if (r_cnt > 0)
2460                 handle_ranges(hat, cb, r_cnt, r);
2461         XPV_ALLOW_MIGRATE();
2462 }
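A hedged usage sketch of hat_unload_callback(); my_unload_cb() and example_unload() are made-up names, and only the hat_callback_t fields that handle_ranges() above fills in (hcb_start_addr, hcb_end_addr, hcb_function) are relied upon.

/*
 * Illustrative only: unload a user range and learn, one contiguous
 * chunk at a time, which addresses have just been invalidated.
 */
static void
my_unload_cb(hat_callback_t *cb)
{
        /* [cb->hcb_start_addr, cb->hcb_end_addr) is no longer mapped */
}

static void
example_unload(hat_t *hat, caddr_t addr, size_t len)
{
        hat_callback_t cb;

        bzero(&cb, sizeof (cb));
        cb.hcb_function = my_unload_cb;
        hat_unload_callback(hat, addr, len, HAT_UNLOAD_UNLOCK, &cb);
}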
2463 
2464 /*
2465  * Invalidate a virtual address translation on a slave CPU during
2466  * panic() dumps.
2467  */
2468 void
2469 hat_flush_range(hat_t *hat, caddr_t va, size_t size)
2470 {
2471         ssize_t sz;
2472         caddr_t endva = va + size;
2473 
2474         while (va < endva) {
2475                 sz = hat_getpagesize(hat, va);
2476                 if (sz < 0) {
2477 #ifdef __xpv
2478                         xen_flush_tlb();
2479 #else
2480                         flush_all_tlb_entries();
2481 #endif
2482                         break;
2483                 }
2484 #ifdef __xpv
2485                 xen_flush_va(va);
2486 #else
2487                 mmu_tlbflush_entry(va);
2488 #endif
2489                 va += sz;
2490         }
2491 }
2492 
2493 /*
2494  * synchronize mapping with software data structures
2495  *
2496  * This interface is currently only used by the working set monitor
2497  * driver.
2498  */
2499 /*ARGSUSED*/
2500 void
2501 hat_sync(hat_t *hat, caddr_t addr, size_t len, uint_t flags)
2502 {
2503         uintptr_t       vaddr = (uintptr_t)addr;
2504         uintptr_t       eaddr = vaddr + len;
2505         htable_t        *ht = NULL;
2506         uint_t          entry;
2507         x86pte_t        pte;


3133                          */
3134                         ht = htable_lookup(hat, vaddr, l);
3135                         if (ht == NULL)
3136                                 continue;
3137                         if (ht->ht_flags & HTABLE_SHARED_PFN) {
3138                                 /*
3139                                  * clear page count, set valid_cnt to 0,
3140                                  * let htable_release() finish the job
3141                                  */
3142                                 hat->hat_ism_pgcnt -= ht->ht_valid_cnt <<
3143                                     (LEVEL_SHIFT(ht->ht_level) - MMU_PAGESHIFT);
3144                                 ht->ht_valid_cnt = 0;
3145                                 need_demaps = 1;
3146                         }
3147                         htable_release(ht);
3148                 }
3149         }
3150 
3151         /*
3152          * flush the TLBs - since we're probably dealing with MANY mappings
3153          * we do just one CR3 reload.
3154          */
3155         if (!(hat->hat_flags & HAT_FREEING) && need_demaps)
3156                 hat_tlb_inval(hat, DEMAP_ALL_ADDR);
3157 
3158         /*
3159          * Now go back and clean up any unaligned mappings that
3160          * couldn't share pagetables.
3161          */
3162         if (!is_it_dism(hat, addr))
3163                 flags |= HAT_UNLOAD_UNLOCK;
3164         hat_unload(hat, addr, len, flags);
3165         XPV_ALLOW_MIGRATE();
3166 }
3167 
3168 
3169 /*
3170  * hat_reserve() does nothing
3171  */
3172 /*ARGSUSED*/
3173 void


3916         htable_t        *ht;
3917 
3918         XPV_DISALLOW_MIGRATE();
3919         /*
3920          * invalidate any leftover mapping and decrement the htable valid count
3921          */
3922 #ifdef __xpv
3923         if (HYPERVISOR_update_va_mapping((uintptr_t)addr, 0,
3924             UVMF_INVLPG | UVMF_LOCAL))
3925                 panic("HYPERVISOR_update_va_mapping() failed");
3926 #else
3927         {
3928                 x86pte_t *pteptr;
3929 
3930                 pteptr = x86pte_mapin(mmu_btop(pte_pa),
3931                     (pte_pa & MMU_PAGEOFFSET) >> mmu.pte_size_shift, NULL);
3932                 if (mmu.pae_hat)
3933                         *pteptr = 0;
3934                 else
3935                         *(x86pte32_t *)pteptr = 0;
3936                 mmu_tlbflush_entry(addr);
3937                 x86pte_mapout();
3938         }
3939 #endif
3940 
3941         ht = htable_getpte(kas.a_hat, ALIGN2PAGE(addr), NULL, NULL, 0);
3942         if (ht == NULL)
3943                 panic("hat_mempte_release(): invalid address");
3944         ASSERT(ht->ht_level == 0);
3945         HTABLE_DEC(ht->ht_valid_cnt);
3946         htable_release(ht);
3947         XPV_ALLOW_MIGRATE();
3948 }
3949 
3950 /*
3951  * Apply a temporary CPU private mapping to a page. We flush the TLB only
3952  * on this CPU, so this ought to have been called with preemption disabled.
3953  */
3954 void
3955 hat_mempte_remap(
3956         pfn_t           pfn,


3977         ASSERT(ht->ht_level == 0);
3978         ASSERT(ht->ht_valid_cnt > 0);
3979         ASSERT(ht->ht_pfn == mmu_btop(pte_pa));
3980         htable_release(ht);
3981 #endif
3982         XPV_DISALLOW_MIGRATE();
3983         pte = hati_mkpte(pfn, attr, 0, flags);
3984 #ifdef __xpv
3985         if (HYPERVISOR_update_va_mapping(va, pte, UVMF_INVLPG | UVMF_LOCAL))
3986                 panic("HYPERVISOR_update_va_mapping() failed");
3987 #else
3988         {
3989                 x86pte_t *pteptr;
3990 
3991                 pteptr = x86pte_mapin(mmu_btop(pte_pa),
3992                     (pte_pa & MMU_PAGEOFFSET) >> mmu.pte_size_shift, NULL);
3993                 if (mmu.pae_hat)
3994                         *(x86pte_t *)pteptr = pte;
3995                 else
3996                         *(x86pte32_t *)pteptr = (x86pte32_t)pte;
3997                 mmu_tlbflush_entry(addr);
3998                 x86pte_mapout();
3999         }
4000 #endif
4001         XPV_ALLOW_MIGRATE();
4002 }
4003 
4004 
4005 
4006 /*
4007  * Hat locking functions
4008  * XXX - these two functions are currently being used by hatstats;
4009  *      they can be removed by using a per-as mutex for hatstats.
4010  */
4011 void
4012 hat_enter(hat_t *hat)
4013 {
4014         mutex_enter(&hat->hat_mutex);
4015 }
4016 
4017 void
4018 hat_exit(hat_t *hat)
4019 {
4020         mutex_exit(&hat->hat_mutex);
4021 }
4022 
4023 /*
4024  * HAT part of cpu initialization.
4025  */
4026 void
4027 hat_cpu_online(struct cpu *cpup)
4028 {
4029         if (cpup != CPU) {
4030                 x86pte_cpu_init(cpup);
4031                 hat_vlp_setup(cpup);
4032         }
4033         CPUSET_ATOMIC_ADD(khat_cpuset, cpup->cpu_id);
4034 }
4035 
4036 /*
4037  * HAT part of cpu deletion.
4038  * (currently, we only call this after the cpu is safely passivated.)
4039  */
4040 void
4041 hat_cpu_offline(struct cpu *cpup)
4042 {
4043         ASSERT(cpup != CPU);
4044 
4045         CPUSET_ATOMIC_DEL(khat_cpuset, cpup->cpu_id);
4046         hat_vlp_teardown(cpup);
4047         x86pte_cpu_fini(cpup);
4048 }
4049 
4050 /*
4051  * Function called after all CPUs are brought online.
4052  * Used to remove low address boot mappings.
4053  */
4054 void
4055 clear_boot_mappings(uintptr_t low, uintptr_t high)
4056 {
4057         uintptr_t vaddr = low;
4058         htable_t *ht = NULL;
4059         level_t level;
4060         uint_t entry;
4061         x86pte_t pte;
4062 
4063         /*
4064          * On the 1st CPU we can unload the prom mappings; basically we blow away
4065          * all virtual mappings under _userlimit.
4066          */


4473                 *pte_ma = base_ma + (entry << mmu.pte_size_shift);
4474         }
4475         XPV_ALLOW_MIGRATE();
4476 }
4477 
4478 void
4479 hat_release_mapping(hat_t *hat, caddr_t addr)
4480 {
4481         htable_t *ht;
4482 
4483         ASSERT(IS_P2ALIGNED((uintptr_t)addr, MMU_PAGESIZE));
4484         XPV_DISALLOW_MIGRATE();
4485         ht = htable_lookup(hat, (uintptr_t)addr, 0);
4486         ASSERT(ht != NULL);
4487         ASSERT(ht->ht_busy >= 2);
4488         htable_release(ht);
4489         htable_release(ht);
4490         XPV_ALLOW_MIGRATE();
4491 }
4492 #endif  /* __xpv */

  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 /*
  25  * Copyright (c) 2010, Intel Corporation.
  26  * All rights reserved.
  27  */
  28 /*
  29  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  30  * Copyright 2018 Joyent, Inc.  All rights reserved.
  31  * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
  32  */
  33 
  34 /*
  35  * VM - Hardware Address Translation management for i386 and amd64
  36  *
  37  * Implementation of the interfaces described in <common/vm/hat.h>
  38  *
  39  * Nearly all the details of how the hardware is managed should not be
  40  * visible outside this layer except for misc. machine specific functions
  41  * that work in conjunction with this code.
  42  *
  43  * Routines used only inside of i86pc/vm start with hati_ for HAT Internal.
  44  */
  45 
  46 /*
  47  * amd64 HAT Design
  48  *
  49  * ----------
  50  * Background
  51  * ----------
  52  *
  53  * On x86, the address space is shared between a user process and the kernel.
  54  * This is different from SPARC. Conventionally, the kernel lives at the top of
  55  * the address space and the user process gets to enjoy the rest of it. If you
  56  * look at the image of the address map in uts/i86pc/os/startup.c, you'll get a
  57  * rough sense of how the address space is laid out and used.
  58  *
  59  * Every unique address space is represented by an instance of a HAT structure
  60  * called a 'hat_t'. In addition to a hat_t structure for each process, there is
  61  * also one that is used for the kernel (kas.a_hat), and each CPU ultimately
  62  * also has a HAT.
  63  *
  64  * Each HAT contains a pointer to its root page table. This root page table is
  65  * what we call an L3 page table in illumos and Intel calls the PML4. It is the
  66  * physical address of the L3 table that we place in the %cr3 register which the
  67  * processor uses.
  68  *
  69  * Each of the many layers of the page table is represented by a structure
  70  * called an htable_t. The htable_t manages a set of 512 8-byte entries. The
  71  * number of entries in a given page table is constant across all different
  72  * level page tables. Note, this is only true on amd64. This has not always been
  73  * the case on x86.
  74  *
  75  * Each entry in a page table, generally referred to as a PTE, may refer to
  76  * another page table or a memory location, depending on the level of the page
  77  * table and the use of large pages. Importantly, the top-level L3 page table
  78  * (PML4) only supports linking to further page tables. This is also true on
  79  * systems which support a 5th level page table (which we do not currently
  80  * support).
  81  *
  82  * Historically, on x86, when a process was running on a CPU, the root of the
  83  * page
  83  * table was inserted into %cr3 on each CPU on which it was currently running.
  84  * When processes would switch (by calling hat_switch()), then the value in %cr3
  85  * on that CPU would change to that of the new HAT. While this behavior is still
  86  * maintained in the xpv kernel, this is not what is done today.
  87  *
  88  * -------------------
  89  * Per-CPU Page Tables
  90  * -------------------
  91  *
  92  * Throughout the system the 64-bit kernel has a notion of what it calls a
  93  * per-CPU page table or PCP. The notion of a per-CPU page table was
  94  * introduced as part of the original work to support x86 PAE. On the 64-bit
  95  * kernel, it was originally used for 32-bit processes running on the 64-bit
  96  * kernel. The rationale behind this was that each 32-bit process could have all
  97  * of its memory represented in a single L2 page table as each L2 page table
  98  * entry represents 1 GiB of memory.
  99  *
 100  * Following on from this, since all of the L3 page table entries for a 32-bit
 101  * process are basically identical except for the first entry in the page
 102  * table, the obvious optimization was to share those page table entries.
 103  * This gave rise to the idea of a per-CPU page table.
 104  *
 105  * The way this works is that we have a member in the machcpu_t called the
 106  * mcpu_hat_info. That structure contains two different 4k pages: one that
 107  * represents the L3 page table and one that represents an L2 page table. When
 108  * the CPU starts up, the L3 page table entries are copied in from the kernel's
 109  * page table. The L3 kernel entries do not change throughout the lifetime of
 110  * the kernel. The kernel portion of these L3 pages for each CPU has the same
 111  * records, meaning that they point to the same L2 page tables and thus see a
 112  * consistent view of the world.
 113  *
 114  * When a 32-bit process is loaded into this world, we copy the 32-bit process's
 115  * four top-level page table entries into the CPU's L2 page table and then set
 116  * the CPU's first L3 page table entry to point to the CPU's L2 page.
 117  * Specifically, in hat_pcp_update(), we're copying from the process's
 118  * HAT_COPIED_32 HAT into the page tables specific to this CPU.
 119  *
 120  * As part of the implementation of kernel page table isolation, this was also
 121  * extended to 64-bit processes. When a 64-bit process runs, we'll copy their L3
 122  * PTEs across into the current CPU's L3 page table. (As we can't do the
 123  * first-L3-entry trick for 64-bit processes, ->hci_pcp_l2ptes is unused in this
 124  * case.)
 125  *
 126  * The use of per-CPU page tables has a lot of implementation ramifications. A
 127  * HAT that runs a user process will be flagged with the HAT_COPIED flag to
 128  * indicate that it is using the per-CPU page table functionality. In tandem
 129  * with the HAT, the top-level htable_t will be flagged with the HTABLE_COPIED
 130  * flag. If the HAT represents a 32-bit process, then we will also set the
 131  * HAT_COPIED_32 flag on that hat_t.
 132  *
 133  * These two flags work together. The top-level htable_t when using per-CPU page
 134  * tables is 'virtual'. We never allocate a ptable for this htable_t (i.e.
 135  * ht->ht_pfn is PFN_INVALID).  Instead, when we need to modify a PTE in an
 136  * HTABLE_COPIED ptable, x86pte_access_pagetable() will redirect any accesses to
 137  * ht_hat->hat_copied_ptes.
 138  *
 139  * Of course, such a modification won't actually modify the HAT_PCP page tables
 140  * that were copied from the HAT_COPIED htable. When we change the top level
 141  * page table entries (L2 PTEs for a 32-bit process and L3 PTEs for a 64-bit
 142  * process), we need to make sure to trigger hat_pcp_update() on all CPUs that
 143  * are currently tied to this HAT (including the current CPU).
 144  *
 145  * To do this, PCP piggy-backs on TLB invalidation, specifically via the
 146  * hat_tlb_inval() path from link_ptp() and unlink_ptp().
 147  *
 148  * (Importantly, in all such cases, when this is in operation, the top-level
 149  * entry should not be able to refer to an actual page table entry that can be
 150  * changed and consolidated into a large page. If large page consolidation is
 151  * required here, then there will be much that needs to be reconsidered.)
 152  *
 153  * -----------------------------------------------
 154  * Kernel Page Table Isolation and the Per-CPU HAT
 155  * -----------------------------------------------
 156  *
 157  * All Intel CPUs that support speculative execution and paging are subject to a
 158  * series of bugs that have been termed 'Meltdown'. These exploits allow a user
 159  * process to read kernel memory through cache side channels and speculative
 160  * execution. To mitigate this on vulnerable CPUs, we need to use a technique
 161  * called kernel page table isolation. What this requires is that we have two
 162  * different page table roots. When executing in kernel mode, we will use a %cr3
 163  * value that has both the user and kernel pages. However when executing in user
 164  * mode, we will need to have a %cr3 that has all of the user pages; however,
 165  * only a subset of the kernel pages required to operate.
 166  *
 167  * These kernel pages that we need mapped are:
 168  *
 169  *   o Kernel Text that allows us to switch between the cr3 values.
 170  *   o The current global descriptor table (GDT)
 171  *   o The current interrupt descriptor table (IDT)
 172  *   o The current task switching state (TSS)
 173  *   o The current local descriptor table (LDT)
 174  *   o Stacks and scratch space used by the interrupt handlers
 175  *
 176  * For more information on the stack switching techniques, construction of the
 177  * trampolines, and more, please see i86pc/ml/kpti_trampolines.s. The most
 178  * important part of these mappings are the following two constraints:
 179  *
 180  *   o The mappings are all per-CPU (except for read-only text)
 181  *   o The mappings are static. They are all established before the CPU is
 182  *     started (with the exception of the boot CPU).
 183  *
 184  * To facilitate the kernel page table isolation we employ our per-CPU
 185  * page tables discussed in the previous section and add the notion of a per-CPU
 186  * HAT. Fundamentally we have a second page table root. There are both a kernel
 187  * L3 page table (hci_pcp_l3ptes) and a user L3 page table (hci_user_l3ptes).
 188  * Both will have the user page table entries copied into them, the same way
 189  * that we discussed in the section 'Per-CPU Page Tables'.
 190  *
 191  * The complex part of this is how we construct the set of kernel mappings
 192  * that should be present when running with the user page table. To answer that,
 193  * we add the notion of a per-CPU HAT. This HAT functions like a normal HAT,
 194  * except that it's not really associated with an address space the same way
 195  * that other HATs are.
 196  *
 197  * This HAT lives off of the 'struct hat_cpu_info' which is a member of the
 198  * machcpu in the member hci_user_hat. We use this per-CPU HAT to create the set
 199  * of kernel mappings that should be present on this CPU. The kernel mappings
 200  * are added to the per-CPU HAT through the function hati_cpu_punchin(). Once a
 201  * mapping has been punched in, it may not be punched out. The reason that we
 202  * opt to leverage a HAT structure is that it knows how to allocate and manage
 203  * all of the lower level page tables as required.
 204  *
 205  * Because all of the mappings are present at the beginning of time for this CPU
 206  * and none of the mappings are in the kernel pageable segment, we don't have to
 207  * worry about faulting on these HAT structures and thus the notion of the
 208  * current HAT that we're using is always the appropriate HAT for the process
 209  * (usually a user HAT or the kernel's HAT).
 210  *
 211  * A further constraint we place on the system with these per-CPU HATs is that
 212  * they are not subject to htable_steal(). Because each CPU will have a rather
 213  * fixed number of page tables, the same way that we don't steal from the
 214  * kernel's HAT, it was determined that we should not steal from this HAT due to
 215  * the complications involved and somewhat criminal nature of htable_steal().
 216  *
 217  * The per-CPU HAT is initialized in hat_pcp_setup() which is called as part of
 218  * onlining the CPU, but before the CPU is actually started. The per-CPU HAT is
 219  * removed in hat_pcp_teardown() which is called when a CPU is being offlined to
 220  * be removed from the system (which is different from what psradm usually
 221  * does).
 222  *
 223  * Finally, once the CPU has been onlined, the set of mappings in the per-CPU
 224  * HAT must not change. The HAT-related functions that we call are not meant to
 225  * be called when we're switching between processes. For example, it is quite
 226  * possible that if they were, they would try to grab an htable mutex which
 227  * another thread might hold. One needs to treat hat_switch() as though it were
 228  * above LOCK_LEVEL; it therefore _must not_ block under any circumstance.
 229  */
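As an editor's illustration of the page table walk sketched above (a standalone sketch, not part of hat_i86.c): each of the four amd64 levels consumes 9 bits of the virtual address, using the shifts that mmu_init() establishes further down in this file (level_shift[] = {12, 21, 30, 39}, 512 entries per table). For a 64-bit HAT_COPIED process, the L3 index computed this way over user addresses is, conceptually, the hat_copied_ptes slot that gets copied into the per-CPU tables; htable_va2entry() performs the equivalent extraction against a real htable. The constants and the example address below are assumptions chosen purely for illustration.

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative only: per-level index extraction for a 4-level amd64 walk. */
    static unsigned int
    pt_index(uintptr_t va, int level)
    {
            static const int shift[] = { 12, 21, 30, 39 };

            return ((unsigned int)((va >> shift[level]) & 0x1ff));  /* 9 bits */
    }

    int
    main(void)
    {
            uintptr_t va = 0x00007fffdeadb000UL;    /* arbitrary user address */
            int l;

            for (l = 3; l >= 0; l--)
                    printf("L%d index = %u\n", l, pt_index(va, l));
            return (0);
    }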
 230 
 231 #include <sys/machparam.h>
 232 #include <sys/machsystm.h>
 233 #include <sys/mman.h>
 234 #include <sys/types.h>
 235 #include <sys/systm.h>
 236 #include <sys/cpuvar.h>
 237 #include <sys/thread.h>
 238 #include <sys/proc.h>
 239 #include <sys/cpu.h>
 240 #include <sys/kmem.h>
 241 #include <sys/disp.h>
 242 #include <sys/shm.h>
 243 #include <sys/sysmacros.h>
 244 #include <sys/machparam.h>
 245 #include <sys/vmem.h>
 246 #include <sys/vmsystm.h>
 247 #include <sys/promif.h>
 248 #include <sys/var.h>
 249 #include <sys/x86_archext.h>
 250 #include <sys/atomic.h>


 264 #include <vm/seg_kpm.h>
 265 #include <vm/vm_dep.h>
 266 #ifdef __xpv
 267 #include <sys/hypervisor.h>
 268 #endif
 269 #include <vm/kboot_mmu.h>
 270 #include <vm/seg_spt.h>
 271 
 272 #include <sys/cmn_err.h>
 273 
 274 /*
 275  * Basic parameters for hat operation.
 276  */
 277 struct hat_mmu_info mmu;
 278 
 279 /*
 280  * The page that is the kernel's top level pagetable.
 281  *
 282  * For 32 bit PAE support on i86pc, the kernel hat will use the 1st 4 entries
 283  * on this 4K page for its top level page table. The remaining groups of
 284  * 4 entries are used for per processor copies of user PCP pagetables for
 285  * running threads.  See hat_switch() and reload_pae32() for details.
 286  *
 287  * pcp_page[0..3] - level==2 PTEs for kernel HAT
 288  * pcp_page[4..7] - level==2 PTEs for user thread on cpu 0
 289  * pcp_page[8..11] - level==2 PTEs for user thread on cpu 1
 290  * etc...
 291  *
 292  * On the 64-bit kernel, this is the normal root of the page table and there is
 293  * nothing special about it when used for other CPUs.
 294  */
 295 static x86pte_t *pcp_page;
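A minimal sketch (editor's illustration, not illumos code) of the indexing described in the comment above: group 0 holds the kernel HAT's four level==2 PTEs, and the user thread on CPU n uses the group starting at index (n + 1) * 4. PTES_PER_GROUP below is a stand-in for the 4-entry groups (MAX_COPIED_PTES in the 32-bit PAE case); reload_pae32(), further down, does exactly this arithmetic.

    #include <stdint.h>

    typedef uint64_t x86pte_t;      /* 8-byte PAE-format PTE, for illustration */

    #define PTES_PER_GROUP  4       /* one group of level==2 PTEs per HAT */

    /* Return the start of CPU cpu_id's group within the pcp page. */
    static x86pte_t *
    pcp_slice(x86pte_t *page, unsigned int cpu_id)
    {
            return (page + (cpu_id + 1) * PTES_PER_GROUP);
    }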
 296 
 297 /*
 298  * forward declaration of internal utility routines
 299  */
 300 static x86pte_t hati_update_pte(htable_t *ht, uint_t entry, x86pte_t expected,
 301         x86pte_t new);
 302 
 303 /*
 304  * The kernel address space exists in all non-HAT_COPIED HATs. To implement this
 305  * the kernel reserves a fixed number of entries in the topmost level(s) of page
 306  * tables. The values are set up during startup and then copied to every user
 307  * hat
 307  * created by hat_alloc(). This means that kernelbase must be:
 308  *
 309  *        4Meg aligned for 32 bit kernels
 310  *      512Gig aligned for x86_64 64 bit kernel
 311  *
 312  * The hat_kernel_range_ts describe what needs to be copied from kernel hat
 313  * to each user hat.
 314  */
 315 typedef struct hat_kernel_range {
 316         level_t         hkr_level;
 317         uintptr_t       hkr_start_va;
 318         uintptr_t       hkr_end_va;     /* zero means to end of memory */
 319 } hat_kernel_range_t;
 320 #define NUM_KERNEL_RANGE 2
 321 static hat_kernel_range_t kernel_ranges[NUM_KERNEL_RANGE];
 322 static int num_kernel_ranges;
 323 
 324 uint_t use_boot_reserve = 1;    /* cleared after early boot process */
 325 uint_t can_steal_post_boot = 0; /* set late in boot to enable stealing */
 326 
 327 /*


 342 
 343 
 344 #ifdef DEBUG
 345 uint_t  map1gcnt;
 346 #endif
 347 
 348 
 349 /*
 350  * A cpuset for all cpus. This is used for kernel address cross calls, since
 351  * the kernel addresses apply to all cpus.
 352  */
 353 cpuset_t khat_cpuset;
 354 
 355 /*
 356  * management stuff for hat structures
 357  */
 358 kmutex_t        hat_list_lock;
 359 kcondvar_t      hat_list_cv;
 360 kmem_cache_t    *hat_cache;
 361 kmem_cache_t    *hat_hash_cache;
 362 kmem_cache_t    *hat32_hash_cache;
 363 
 364 /*
 365  * Simple statistics
 366  */
 367 struct hatstats hatstat;
 368 
 369 /*
 370  * Some earlier hypervisor versions do not emulate cmpxchg of PTEs
 371  * correctly.  For such hypervisors we must set PT_USER for kernel
 372  * entries ourselves (normally the emulation would set PT_USER for
 373  * kernel entries and PT_USER|PT_GLOBAL for user entries).  pt_kern is
 374  * thus set appropriately.  Note that dboot/kbm is OK, as only the full
 375  * HAT uses cmpxchg() and the other paths (hypercall etc.) were never
 376  * incorrect.
 377  */
 378 int pt_kern;
 379 
 380 #ifndef __xpv
 381 extern pfn_t memseg_get_start(struct memseg *);
 382 #endif
 383 
 384 #define PP_GETRM(pp, rmmask)    (pp->p_nrm & rmmask)
 385 #define PP_ISMOD(pp)            PP_GETRM(pp, P_MOD)
 386 #define PP_ISREF(pp)            PP_GETRM(pp, P_REF)
 387 #define PP_ISRO(pp)             PP_GETRM(pp, P_RO)
 388 
 389 #define PP_SETRM(pp, rm)        atomic_orb(&(pp->p_nrm), rm)
 390 #define PP_SETMOD(pp)           PP_SETRM(pp, P_MOD)
 391 #define PP_SETREF(pp)           PP_SETRM(pp, P_REF)
 392 #define PP_SETRO(pp)            PP_SETRM(pp, P_RO)
 393 
 394 #define PP_CLRRM(pp, rm)        atomic_andb(&(pp->p_nrm), ~(rm))
 395 #define PP_CLRMOD(pp)           PP_CLRRM(pp, P_MOD)
 396 #define PP_CLRREF(pp)           PP_CLRRM(pp, P_REF)
 397 #define PP_CLRRO(pp)            PP_CLRRM(pp, P_RO)
 398 #define PP_CLRALL(pp)           PP_CLRRM(pp, P_MOD | P_REF | P_RO)
 399 


 402  */
 403 /*ARGSUSED*/
 404 static int
 405 hati_constructor(void *buf, void *handle, int kmflags)
 406 {
 407         hat_t   *hat = buf;
 408 
 409         mutex_init(&hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL);
 410         bzero(hat->hat_pages_mapped,
 411             sizeof (pgcnt_t) * (mmu.max_page_level + 1));
 412         hat->hat_ism_pgcnt = 0;
 413         hat->hat_stats = 0;
 414         hat->hat_flags = 0;
 415         CPUSET_ZERO(hat->hat_cpus);
 416         hat->hat_htable = NULL;
 417         hat->hat_ht_hash = NULL;
 418         return (0);
 419 }
 420 
 421 /*
 422  * Put it at the start of the global list of all hats (used by stealing)
 423  *
 424  * kas.a_hat is not in the list but is instead used to find the
 425  * first and last items in the list.
 426  *
 427  * - kas.a_hat->hat_next points to the start of the user hats.
 428  *   The list ends where hat->hat_next == NULL
 429  *
 430  * - kas.a_hat->hat_prev points to the last of the user hats.
 431  *   The list begins where hat->hat_prev == NULL
 432  */
 433 static void
 434 hat_list_append(hat_t *hat)
 435 {
 436         mutex_enter(&hat_list_lock);
 437         hat->hat_prev = NULL;
 438         hat->hat_next = kas.a_hat->hat_next;
 439         if (hat->hat_next)
 440                 hat->hat_next->hat_prev = hat;
 441         else
 442                 kas.a_hat->hat_prev = hat;
 443         kas.a_hat->hat_next = hat;
 444         mutex_exit(&hat_list_lock);
 445 }
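The append/head-tail discipline described in the comment above is easy to misread, so here is a small user-space toy model (editor's illustration, not illumos code) that mirrors hat_list_append() and the kas.a_hat head/tail convention; hat_list_lock and all other hat_t fields are deliberately omitted.

    /*
     * Toy model of the hat list: kas_hat itself is not on the list, but its
     * hat_next and hat_prev point at the first and last user hats.
     */
    #include <assert.h>
    #include <stddef.h>

    typedef struct toy_hat {
            struct toy_hat *hat_next;
            struct toy_hat *hat_prev;
    } toy_hat_t;

    static toy_hat_t kas_hat;               /* stands in for *kas.a_hat */

    /* Mirrors hat_list_append(): new hats go on the front of the list. */
    static void
    toy_list_append(toy_hat_t *hat)
    {
            hat->hat_prev = NULL;
            hat->hat_next = kas_hat.hat_next;
            if (hat->hat_next != NULL)
                    hat->hat_next->hat_prev = hat;
            else
                    kas_hat.hat_prev = hat;
            kas_hat.hat_next = hat;
    }

    int
    main(void)
    {
            toy_hat_t a, b;

            toy_list_append(&a);
            toy_list_append(&b);
            /* b, appended last, is at the head; a remains the tail. */
            assert(kas_hat.hat_next == &b && kas_hat.hat_prev == &a);
            return (0);
    }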
 446 
 447 /*
 448  * Allocate a hat structure for as. We also create the top level
 449  * htable and initialize it to contain the kernel hat entries.
 450  */
 451 hat_t *
 452 hat_alloc(struct as *as)
 453 {
 454         hat_t                   *hat;
 455         htable_t                *ht;    /* top level htable */
 456         uint_t                  use_copied;
 457         uint_t                  r;
 458         hat_kernel_range_t      *rp;
 459         uintptr_t               va;
 460         uintptr_t               eva;
 461         uint_t                  start;
 462         uint_t                  cnt;
 463         htable_t                *src;
 464         boolean_t               use_hat32_cache;
 465 
 466         /*
 467          * Once we start creating user process HATs we can enable
 468          * the htable_steal() code.
 469          */
 470         if (can_steal_post_boot == 0)
 471                 can_steal_post_boot = 1;
 472 
 473         ASSERT(AS_WRITE_HELD(as));
 474         hat = kmem_cache_alloc(hat_cache, KM_SLEEP);
 475         hat->hat_as = as;
 476         mutex_init(&hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL);
 477         ASSERT(hat->hat_flags == 0);
 478 
 479 #if defined(__xpv)
 480         /*
 481          * No PCP stuff on the hypervisor due to the 64-bit split top level
 482          * page tables.  On 32-bit it's not needed as the hypervisor takes
 483          * care of copying the top level PTEs to a below 4Gig page.
 484          */
 485         use_copied = 0;
 486         use_hat32_cache = B_FALSE;
 487         hat->hat_max_level = mmu.max_level;
 488         hat->hat_num_copied = 0;
 489         hat->hat_flags = 0;
 490 #else   /* __xpv */
 491 
 492         /*
 493          * All processes use HAT_COPIED on the 64-bit kernel if KPTI is
 494          * turned on.
 495          */
 496         if (ttoproc(curthread)->p_model == DATAMODEL_ILP32) {
 497                 use_copied = 1;
 498                 hat->hat_max_level = mmu.max_level32;
 499                 hat->hat_num_copied = mmu.num_copied_ents32;
 500                 use_hat32_cache = B_TRUE;
 501                 hat->hat_flags |= HAT_COPIED_32;
 502                 HATSTAT_INC(hs_hat_copied32);
 503         } else if (kpti_enable == 1) {
 504                 use_copied = 1;
 505                 hat->hat_max_level = mmu.max_level;
 506                 hat->hat_num_copied = mmu.num_copied_ents;
 507                 use_hat32_cache = B_FALSE;
 508                 HATSTAT_INC(hs_hat_copied64);
 509         } else {
 510                 use_copied = 0;
 511                 use_hat32_cache = B_FALSE;
 512                 hat->hat_max_level = mmu.max_level;
 513                 hat->hat_num_copied = 0;
 514                 hat->hat_flags = 0;
 515                 HATSTAT_INC(hs_hat_normal64);
 516         }
 517 #endif  /* __xpv */
 518         if (use_copied) {
 519                 hat->hat_flags |= HAT_COPIED;
 520                 bzero(hat->hat_copied_ptes, sizeof (hat->hat_copied_ptes));
 521         }
 522 
 523         /*
 524          * Allocate the htable hash. For 32-bit PCP processes we use the
 525          * hat32_hash_cache. However, for 64-bit PCP processes we do not as the
 526          * number of entries that they have to handle is closer to
 527          * hat_hash_cache in count (though there will be more wastage when we
 528          * have more DRAM in the system and thus push down the user address
 529          * range).
 530          */
 531         if (use_hat32_cache) {
 532                 hat->hat_num_hash = mmu.hat32_hash_cnt;
 533                 hat->hat_ht_hash = kmem_cache_alloc(hat32_hash_cache, KM_SLEEP);
 534         } else {
 535                 hat->hat_num_hash = mmu.hash_cnt;
 536                 hat->hat_ht_hash = kmem_cache_alloc(hat_hash_cache, KM_SLEEP);
 537         }
 538         bzero(hat->hat_ht_hash, hat->hat_num_hash * sizeof (htable_t *));
 539 
 540         /*
 541          * Initialize Kernel HAT entries at the top of the top level page
 542          * tables for the new hat.
 543          */
 544         hat->hat_htable = NULL;
 545         hat->hat_ht_cached = NULL;
 546         XPV_DISALLOW_MIGRATE();
 547         ht = htable_create(hat, (uintptr_t)0, TOP_LEVEL(hat), NULL);
 548         hat->hat_htable = ht;
 549 
 550 #if defined(__amd64)
 551         if (hat->hat_flags & HAT_COPIED)
 552                 goto init_done;
 553 #endif
 554 
 555         for (r = 0; r < num_kernel_ranges; ++r) {
 556                 rp = &kernel_ranges[r];
 557                 for (va = rp->hkr_start_va; va != rp->hkr_end_va;
 558                     va += cnt * LEVEL_SIZE(rp->hkr_level)) {
 559 
 560                         if (rp->hkr_level == TOP_LEVEL(hat))
 561                                 ht = hat->hat_htable;
 562                         else
 563                                 ht = htable_create(hat, va, rp->hkr_level,
 564                                     NULL);
 565 
 566                         start = htable_va2entry(va, ht);
 567                         cnt = HTABLE_NUM_PTES(ht) - start;
 568                         eva = va +
 569                             ((uintptr_t)cnt << LEVEL_SHIFT(rp->hkr_level));
 570                         if (rp->hkr_end_va != 0 &&
 571                             (eva > rp->hkr_end_va || eva == 0))
 572                                 cnt = htable_va2entry(rp->hkr_end_va, ht) -
 573                                     start;
 574 
 575 #if defined(__i386) && !defined(__xpv)
 576                         if (ht->ht_flags & HTABLE_COPIED) {
 577                                 bcopy(&pcp_page[start],
 578                                     &hat->hat_copied_ptes[start],
 579                                     cnt * sizeof (x86pte_t));
 580                                 continue;
 581                         }
 582 #endif
 583                         src = htable_lookup(kas.a_hat, va, rp->hkr_level);
 584                         ASSERT(src != NULL);
 585                         x86pte_copy(src, ht, start, cnt);
 586                         htable_release(src);
 587                 }
 588         }
 589 
 590 init_done:
 591 
 592 #if defined(__xpv)
 593         /*
 594          * Pin top level page tables after initializing them
 595          */
 596         xen_pin(hat->hat_htable->ht_pfn, mmu.max_level);
 597 #if defined(__amd64)
 598         xen_pin(hat->hat_user_ptable, mmu.max_level);
 599 #endif
 600 #endif
 601         XPV_ALLOW_MIGRATE();
 602 
 603         hat_list_append(hat);
 604 
 605         return (hat);
 606 }
 607 
 608 #if !defined(__xpv)
 609 /*
 610  * Cons up a HAT for a CPU. This represents the user mappings. This will have
 611  * various kernel pages punched into it manually. Importantly, this hat is
 612  * ineligible for stealing. We really don't want to deal with this ever
 613  * faulting and figuring out that this is happening, much like we don't with
 614  * kas.
 615  */
 616 static hat_t *
 617 hat_cpu_alloc(cpu_t *cpu)
 618 {
 619         hat_t *hat;
 620         htable_t *ht;
 621 
 622         hat = kmem_cache_alloc(hat_cache, KM_SLEEP);
 623         hat->hat_as = NULL;
 624         mutex_init(&hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL);
 625         hat->hat_max_level = mmu.max_level;
 626         hat->hat_num_copied = 0;
 627         hat->hat_flags = HAT_PCP;
 628 
 629         hat->hat_num_hash = mmu.hash_cnt;
 630         hat->hat_ht_hash = kmem_cache_alloc(hat_hash_cache, KM_SLEEP);
 631         bzero(hat->hat_ht_hash, hat->hat_num_hash * sizeof (htable_t *));
 632 
 633         hat->hat_next = hat->hat_prev = NULL;
 634 
 635         /*
 636          * Because this HAT will only ever be used by the current CPU, we'll go
 637          * ahead and set the CPUSET up to only point to the CPU in question.
 638          */
 639         CPUSET_ADD(hat->hat_cpus, cpu->cpu_id);
 640 
 641         hat->hat_htable = NULL;
 642         hat->hat_ht_cached = NULL;
 643         ht = htable_create(hat, (uintptr_t)0, TOP_LEVEL(hat), NULL);
 644         hat->hat_htable = ht;
 645 
 646         hat_list_append(hat);
 647 
 648         return (hat);
 649 }
 650 #endif /* !__xpv */
 651 
 652 /*
 653  * process has finished executing but the as has not been cleaned up yet.
 654  */
 655 /*ARGSUSED*/
 656 void
 657 hat_free_start(hat_t *hat)
 658 {
 659         ASSERT(AS_WRITE_HELD(hat->hat_as));
 660 
 661         /*
 662          * If the hat is currently a stealing victim, wait for the stealing
 663          * to finish.  Once we mark it as HAT_FREEING, htable_steal()
 664          * won't look at its pagetables anymore.
 665          */
 666         mutex_enter(&hat_list_lock);
 667         while (hat->hat_flags & HAT_VICTIM)
 668                 cv_wait(&hat_list_cv, &hat_list_lock);
 669         hat->hat_flags |= HAT_FREEING;
 670         mutex_exit(&hat_list_lock);


 687 
 688         /*
 689          * Remove it from the list of HATs
 690          */
 691         mutex_enter(&hat_list_lock);
 692         if (hat->hat_prev)
 693                 hat->hat_prev->hat_next = hat->hat_next;
 694         else
 695                 kas.a_hat->hat_next = hat->hat_next;
 696         if (hat->hat_next)
 697                 hat->hat_next->hat_prev = hat->hat_prev;
 698         else
 699                 kas.a_hat->hat_prev = hat->hat_prev;
 700         mutex_exit(&hat_list_lock);
 701         hat->hat_next = hat->hat_prev = NULL;
 702 
 703 #if defined(__xpv)
 704         /*
 705          * On the hypervisor, unpin top level page table(s)
 706          */
 707         VERIFY3U(hat->hat_flags & HAT_PCP, ==, 0);
 708         xen_unpin(hat->hat_htable->ht_pfn);
 709 #if defined(__amd64)
 710         xen_unpin(hat->hat_user_ptable);
 711 #endif
 712 #endif
 713 
 714         /*
 715          * Make a pass through the htables freeing them all up.
 716          */
 717         htable_purge_hat(hat);
 718 
 719         /*
 720          * Decide which kmem cache the hash table came from, then free it.
 721          */
 722         if (hat->hat_flags & HAT_COPIED) {
 723 #if defined(__amd64)
 724                 if (hat->hat_flags & HAT_COPIED_32) {
 725                         cache = hat32_hash_cache;
 726                 } else {
 727                         cache = hat_hash_cache;
 728                 }
 729 #else
 730                 cache = hat32_hash_cache;
 731 #endif
 732         } else {
 733                 cache = hat_hash_cache;
 734         }
 735         kmem_cache_free(cache, hat->hat_ht_hash);
 736         hat->hat_ht_hash = NULL;
 737 
 738         hat->hat_flags = 0;
 739         hat->hat_max_level = 0;
 740         hat->hat_num_copied = 0;
 741         kmem_cache_free(hat_cache, hat);
 742 }
 743 
 744 /*
 745  * round kernelbase down to a supported value to use for _userlimit
 746  *
 747  * userlimit must be aligned down to an entry in the top level htable.
 748  * The one exception is for 32 bit HATs running PAE.
 749  */
 750 uintptr_t
 751 hat_kernelbase(uintptr_t va)
 752 {
 753 #if defined(__i386)
 754         va &= LEVEL_MASK(1);
 755 #endif
 756         if (IN_VA_HOLE(va))
 757                 panic("_userlimit %p will fall in VA hole\n", (void *)va);
 758         return (va);
 759 }
 760 


 775                             cpuid_opteron_erratum(CPU, 6671130)) {
 776                                 lvl = 1;
 777                         }
 778                         if (plat_mnode_xcheck(LEVEL_SIZE(2) >>
 779                             LEVEL_SHIFT(0))) {
 780                                 lvl = 1;
 781                         }
 782                 } else {
 783                         lvl = 1;
 784                 }
 785         }
 786         mmu.max_page_level = lvl;
 787 
 788         if ((lvl == 2) && (enable_1gpg == 0))
 789                 mmu.umax_page_level = 1;
 790         else
 791                 mmu.umax_page_level = lvl;
 792 }
 793 
 794 /*
 795  * Determine the number of slots that are in use in the top-most level page
 796  * table for user memory. This is based on _userlimit. In effect this is similar
 797  * to htable_va2entry, but without the convenience of having an htable.
 798  */
 799 void
 800 mmu_calc_user_slots(void)
 801 {
 802         uint_t ent, nptes;
 803         uintptr_t shift;
 804 
 805         nptes = mmu.top_level_count;
 806         shift = _userlimit >> mmu.level_shift[mmu.max_level];
 807         ent = shift & (nptes - 1);
 808 
 809         /*
 810          * Ent tells us the slot that the page for _userlimit would fit in. We
 811          * need to add one to this to cover the total number of entries.
 812          */
 813         mmu.top_level_uslots = ent + 1;
 814 
 815         /*
 816          * When running 32-bit compatibility processes on a 64-bit kernel, we
 817          * will only need to use one slot.
 818          */
 819         mmu.top_level_uslots32 = 1;
 820 
 821         /*
 822          * Record the number of PCP page table entries that we'll need to copy
 823          * around. For 64-bit processes this is the number of user slots. For
 824          * 32-bit processes, this is four 1 GiB pages.
 825          */
 826         mmu.num_copied_ents = mmu.top_level_uslots;
 827         mmu.num_copied_ents32 = 4;
 828 }
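To make the arithmetic above concrete, here is a worked example with a purely hypothetical _userlimit (the real value is derived from kernelbase at boot), assuming the amd64 4-level layout configured in mmu_init() below (top_level_count == 512, level_shift[3] == 39):

    _userlimit        = 0x00007fffffff0000      (hypothetical)
    shift             = _userlimit >> 39   == 255
    ent               = 255 & (512 - 1)    == 255
    top_level_uslots  = ent + 1            == 256

so in this hypothetical configuration mmu.num_copied_ents would be 256 for 64-bit HAT_COPIED processes, while num_copied_ents32 stays at 4.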
 829 
 830 /*
 831  * Initialize hat data structures based on processor MMU information.
 832  */
 833 void
 834 mmu_init(void)
 835 {
 836         uint_t max_htables;
 837         uint_t pa_bits;
 838         uint_t va_bits;
 839         int i;
 840 
 841         /*
 842          * If the CPU has enabled the page table global bit, use it for the kernel.
 843          * This is bit 7 in CR4 (PGE - Page Global Enable).
 844          */
 845         if (is_x86_feature(x86_featureset, X86FSET_PGE) &&
 846             (getcr4() & CR4_PGE) != 0)
 847                 mmu.pt_global = PT_GLOBAL;
 848 
 849 #if !defined(__xpv)
 850         /*
 851          * The 64-bit x86 kernel has split user/kernel page tables. As such we
 852          * cannot have the global bit set. The simplest way for us to deal with
 853          * this is to just say that pt_global is zero, so the global bit isn't
 854          * present.
 855          */
 856         if (kpti_enable == 1)
 857                 mmu.pt_global = 0;
 858 #endif
 859 
 860         /*
 861          * Detect NX and PAE usage.
 862          */
 863         mmu.pae_hat = kbm_pae_support;
 864         if (kbm_nx_support)
 865                 mmu.pt_nx = PT_NX;
 866         else
 867                 mmu.pt_nx = 0;
 868 
 869         /*
 870          * Use CPU info to set various MMU parameters
 871          */
 872         cpuid_get_addrsize(CPU, &pa_bits, &va_bits);
 873 
 874         if (va_bits < sizeof (void *) * NBBY) {
 875                 mmu.hole_start = (1ul << (va_bits - 1));
 876                 mmu.hole_end = 0ul - mmu.hole_start - 1;
 877         } else {
 878                 mmu.hole_end = 0;
 879                 mmu.hole_start = mmu.hole_end - 1;
 880         }
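        /*
         * For illustration (editor's note, not in the original source): with
         * va_bits == 48 the first branch above yields
         *      hole_start = 1ul << 47              == 0x0000800000000000
         *      hole_end   = 0ul - hole_start - 1   == 0xffff7fffffffffff
         * i.e. the non-canonical gap between the user and kernel halves of
         * the address space.
         */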


 898                 mmu.pte_size = 8;       /* 8 byte PTEs */
 899                 mmu.pte_size_shift = 3;
 900         } else {
 901                 mmu.pte_size = 4;       /* 4 byte PTEs */
 902                 mmu.pte_size_shift = 2;
 903         }
 904 
 905         if (mmu.pae_hat && !is_x86_feature(x86_featureset, X86FSET_PAE))
 906                 panic("Processor does not support PAE");
 907 
 908         if (!is_x86_feature(x86_featureset, X86FSET_CX8))
 909                 panic("Processor does not support cmpxchg8b instruction");
 910 
 911 #if defined(__amd64)
 912 
 913         mmu.num_level = 4;
 914         mmu.max_level = 3;
 915         mmu.ptes_per_table = 512;
 916         mmu.top_level_count = 512;
 917 
 918         /*
 919          * A 32-bit process tops out at level 2, where each PTE covers 1 GB.
 920          */
 921         mmu.max_level32 = 2;
 922 
 923         mmu.level_shift[0] = 12;
 924         mmu.level_shift[1] = 21;
 925         mmu.level_shift[2] = 30;
 926         mmu.level_shift[3] = 39;
 927 
 928 #elif defined(__i386)
 929 
 930         if (mmu.pae_hat) {
 931                 mmu.num_level = 3;
 932                 mmu.max_level = 2;
 933                 mmu.ptes_per_table = 512;
 934                 mmu.top_level_count = 4;
 935 
 936                 mmu.level_shift[0] = 12;
 937                 mmu.level_shift[1] = 21;
 938                 mmu.level_shift[2] = 30;
 939 
 940         } else {
 941                 mmu.num_level = 2;
 942                 mmu.max_level = 1;
 943                 mmu.ptes_per_table = 1024;
 944                 mmu.top_level_count = 1024;
 945 
 946                 mmu.level_shift[0] = 12;
 947                 mmu.level_shift[1] = 22;
 948         }
 949 
 950 #endif  /* __i386 */
 951 
 952         for (i = 0; i < mmu.num_level; ++i) {
 953                 mmu.level_size[i] = 1UL << mmu.level_shift[i];
 954                 mmu.level_offset[i] = mmu.level_size[i] - 1;
 955                 mmu.level_mask[i] = ~mmu.level_offset[i];
 956         }
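        /*
         * For illustration (editor's note, not in the original source): on
         * amd64 the loop above gives level 1 a size of 1UL << 21 (2MB), an
         * offset of 0x1fffff and a mask of ~0x1fffff; level 2 covers 1GB and
         * level 3 covers 512GB.
         */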
 957 
 958         set_max_page_level();
 959         mmu_calc_user_slots();
 960 
 961         mmu_page_sizes = mmu.max_page_level + 1;
 962         mmu_exported_page_sizes = mmu.umax_page_level + 1;
 963 
 964         /* restrict legacy applications from using pagesizes 1g and above */
 965         mmu_legacy_page_sizes =
 966             (mmu_exported_page_sizes > 2) ? 2 : mmu_exported_page_sizes;
 967 
 968 
 969         for (i = 0; i <= mmu.max_page_level; ++i) {
 970                 mmu.pte_bits[i] = PT_VALID | pt_kern;
 971                 if (i > 0)
 972                         mmu.pte_bits[i] |= PT_PAGESIZE;
 973         }
 974 
 975         /*
 976          * NOTE: Legacy 32 bit PAE mode only has the PT_VALID bit at the top level.
 977          */
 978         for (i = 1; i < mmu.num_level; ++i)
 979                 mmu.ptp_bits[i] = PT_PTPBITS;
 980 
 981 #if defined(__i386)
 982         mmu.ptp_bits[2] = PT_VALID;
 983 #endif
 984 
 985         /*
 986          * Compute how many hash table entries to have per process for htables.
 987          * We start with 1 page's worth of entries.
 988          *
 989          * If physical memory is small, reduce the amount needed to cover it.
 990          */
 991         max_htables = physmax / mmu.ptes_per_table;
 992         mmu.hash_cnt = MMU_PAGESIZE / sizeof (htable_t *);
 993         while (mmu.hash_cnt > 16 && mmu.hash_cnt >= max_htables)
 994                 mmu.hash_cnt >>= 1;
 995         mmu.hat32_hash_cnt = mmu.hash_cnt;
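        /*
         * For illustration (editor's note, not in the original source): with
         * 4K pages and 8-byte htable pointers this starts at 4096 / 8 = 512
         * hash buckets; it is halved above for small-memory machines and, for
         * amd64 below, doubled until the expected chain length is at most
         * HASH_MAX_LENGTH.
         */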
 996 
 997 #if defined(__amd64)
 998         /*
 999          * If running in 64 bits and physical memory is large,
1000          * increase the size of the cache to cover all of memory for
1001          * a 64 bit process.
1002          */
1003 #define HASH_MAX_LENGTH 4
1004         while (mmu.hash_cnt * HASH_MAX_LENGTH < max_htables)
1005                 mmu.hash_cnt <<= 1;
1006 #endif
1007 }
1008 
1009 
1010 /*
1011  * initialize hat data structures
1012  */
1013 void
1014 hat_init()
1015 {


1024         }
1025 #endif
1026 
1027         cv_init(&hat_list_cv, NULL, CV_DEFAULT, NULL);
1028 
1029         /*
1030          * initialize kmem caches
1031          */
1032         htable_init();
1033         hment_init();
1034 
1035         hat_cache = kmem_cache_create("hat_t",
1036             sizeof (hat_t), 0, hati_constructor, NULL, NULL,
1037             NULL, 0, 0);
1038 
1039         hat_hash_cache = kmem_cache_create("HatHash",
1040             mmu.hash_cnt * sizeof (htable_t *), 0, NULL, NULL, NULL,
1041             NULL, 0, 0);
1042 
1043         /*
1044          * 32-bit PCP hats can use a smaller hash table size on large memory
1045          * machines
1046          */
1047         if (mmu.hash_cnt == mmu.hat32_hash_cnt) {
1048                 hat32_hash_cache = hat_hash_cache;
1049         } else {
1050                 hat32_hash_cache = kmem_cache_create("Hat32Hash",
1051                     mmu.hat32_hash_cnt * sizeof (htable_t *), 0, NULL, NULL,
1052                     NULL, NULL, 0, 0);
1053         }
1054 
1055         /*
1056          * Set up the kernel's hat
1057          */
1058         AS_LOCK_ENTER(&kas, RW_WRITER);
1059         kas.a_hat = kmem_cache_alloc(hat_cache, KM_NOSLEEP);
1060         mutex_init(&kas.a_hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL);
1061         kas.a_hat->hat_as = &kas;
1062         kas.a_hat->hat_flags = 0;
1063         AS_LOCK_EXIT(&kas);
1064 
1065         CPUSET_ZERO(khat_cpuset);
1066         CPUSET_ADD(khat_cpuset, CPU->cpu_id);
1067 
1068         /*
1069          * The kernel HAT doesn't use PCP regardless of architecture.
1070          */
1071         ASSERT3U(mmu.max_level, >, 0);
1072         kas.a_hat->hat_max_level = mmu.max_level;
1073         kas.a_hat->hat_num_copied = 0;
1074 
1075         /*
1076          * The kernel hat's next pointer serves as the head of the hat list.
1077          * The kernel hat's prev pointer tracks the last hat on the list for
1078          * htable_steal() to use.
1079          */
1080         kas.a_hat->hat_next = NULL;
1081         kas.a_hat->hat_prev = NULL;
1082 
1083         /*
1084          * Allocate an htable hash bucket for the kernel
1085          * XX64 - tune for 64 bit procs
1086          */
1087         kas.a_hat->hat_num_hash = mmu.hash_cnt;
1088         kas.a_hat->hat_ht_hash = kmem_cache_alloc(hat_hash_cache, KM_NOSLEEP);
1089         bzero(kas.a_hat->hat_ht_hash, mmu.hash_cnt * sizeof (htable_t *));
1090 
1091         /*
1092          * zero out the top level and cached htable pointers
1093          */
1094         kas.a_hat->hat_ht_cached = NULL;
1095         kas.a_hat->hat_htable = NULL;
1096 
1097         /*
1098          * Pre-allocate hrm_hashtab before enabling the collection of
1099          * refmod statistics.  Allocating it on the fly would run the
1100          * risk of recursive mutex enters or
1101          * deadlocks.
1102          */
1103         hrm_hashtab = kmem_zalloc(HRM_HASHSIZE * sizeof (struct hrmstat *),
1104             KM_SLEEP);
1105 }
1106 
1107 
1108 extern void kpti_tramp_start();
1109 extern void kpti_tramp_end();
1110 
1111 extern void kdi_isr_start();
1112 extern void kdi_isr_end();
1113 
1114 extern gate_desc_t kdi_idt[NIDT];
1115 
1116 /*
1117  * Prepare per-CPU pagetables for all processes on the 64 bit kernel.
1118  *
1119  * Each CPU has a set of 2 pagetables that are reused for any 32 bit
1120  * process it runs. They are the top level pagetable, hci_pcp_l3ptes, and
1121  * the next to top level table for the bottom 512 Gig, hci_pcp_l2ptes.
1122  */
1123 /*ARGSUSED*/
1124 static void
1125 hat_pcp_setup(struct cpu *cpu)
1126 {
1127 #if !defined(__xpv)
1128         struct hat_cpu_info *hci = cpu->cpu_hat_info;
1129         uintptr_t va;
1130         size_t len;
1131 
1132         /*
1133          * allocate the level==2 page table for the bottom most
1134          * 512Gig of address space (this is where 32 bit apps live)
1135          */
1136         ASSERT(hci != NULL);
1137         hci->hci_pcp_l2ptes = kmem_zalloc(MMU_PAGESIZE, KM_SLEEP);
1138 
1139         /*
1140          * Allocate a top level pagetable and copy the kernel's
1141          * entries into it. Then link in hci_pcp_l2ptes in the 1st entry.
1142          */
1143         hci->hci_pcp_l3ptes = kmem_zalloc(MMU_PAGESIZE, KM_SLEEP);
1144         hci->hci_pcp_l3pfn =
1145             hat_getpfnum(kas.a_hat, (caddr_t)hci->hci_pcp_l3ptes);
1146         ASSERT3U(hci->hci_pcp_l3pfn, !=, PFN_INVALID);
1147         bcopy(pcp_page, hci->hci_pcp_l3ptes, MMU_PAGESIZE);
1148 
1149         hci->hci_pcp_l2pfn =
1150             hat_getpfnum(kas.a_hat, (caddr_t)hci->hci_pcp_l2ptes);
1151         ASSERT3U(hci->hci_pcp_l2pfn, !=, PFN_INVALID);
1152 
1153         /*
1154          * Now go through and allocate the user version of these structures.
1155          * Unlike with the kernel version, we allocate a hat to represent the
1156          * top-level page table as that will make it much simpler when we need
1157          * to patch through user entries.
1158          */
1159         hci->hci_user_hat = hat_cpu_alloc(cpu);
1160         hci->hci_user_l3pfn = hci->hci_user_hat->hat_htable->ht_pfn;
1161         ASSERT3U(hci->hci_user_l3pfn, !=, PFN_INVALID);
1162         hci->hci_user_l3ptes =
1163             (x86pte_t *)hat_kpm_mapin_pfn(hci->hci_user_l3pfn);
1164 
1165         /* Skip the rest of this if KPTI is switched off at boot. */
1166         if (kpti_enable != 1)
1167                 return;
1168 
1169         /*
1170          * OK, now that we have this we need to go through and punch the normal
1171          * holes in the CPU's hat for this. At this point we'll punch in the
1172          * following:
1173          *
1174          *   o GDT
1175          *   o IDT
1176          *   o LDT
1177          *   o Trampoline Code
1178          *   o machcpu KPTI page
1179          *   o kmdb ISR code page (just trampolines)
1180          *
1181          * If this is cpu0, then we also can initialize the following because
1182          * they'll have already been allocated.
1183          *
1184          *   o TSS for CPU 0
1185          *   o Double Fault for CPU 0
1186          *
1187          * The following items have yet to be allocated and have not been
1188          * punched in yet. They will be punched in later:
1189          *
1190          *   o TSS (mach_cpucontext_alloc_tables())
1191          *   o Double Fault Stack (mach_cpucontext_alloc_tables())
1192          */
1193         hati_cpu_punchin(cpu, (uintptr_t)cpu->cpu_gdt, PROT_READ);
1194         hati_cpu_punchin(cpu, (uintptr_t)cpu->cpu_idt, PROT_READ);
1195 
1196         /*
1197          * As the KDI IDT is only active during kmdb sessions (including single
1198          * stepping), typically we don't actually need this punched in (we
1199          * consider the routines that switch to the user cr3 to be toxic).  But
1200          * if we ever accidentally end up on the user cr3 while on this IDT,
1201          * we'd prefer not to triple fault.
1202          */
1203         hati_cpu_punchin(cpu, (uintptr_t)&kdi_idt, PROT_READ);
1204 
1205         CTASSERT(((uintptr_t)&kpti_tramp_start % MMU_PAGESIZE) == 0);
1206         CTASSERT(((uintptr_t)&kpti_tramp_end % MMU_PAGESIZE) == 0);
1207         for (va = (uintptr_t)&kpti_tramp_start;
1208             va < (uintptr_t)&kpti_tramp_end; va += MMU_PAGESIZE) {
1209                 hati_cpu_punchin(cpu, va, PROT_READ | PROT_EXEC);
1210         }
1211 
1212         VERIFY3U(((uintptr_t)cpu->cpu_m.mcpu_ldt) % MMU_PAGESIZE, ==, 0);
1213         for (va = (uintptr_t)cpu->cpu_m.mcpu_ldt, len = LDT_CPU_SIZE;
1214             len >= MMU_PAGESIZE; va += MMU_PAGESIZE, len -= MMU_PAGESIZE) {
1215                 hati_cpu_punchin(cpu, va, PROT_READ);
1216         }
1217 
1218         /* mcpu_pad2 is the start of the page containing the kpti_frames. */
1219         hati_cpu_punchin(cpu, (uintptr_t)&cpu->cpu_m.mcpu_pad2[0],
1220             PROT_READ | PROT_WRITE);
1221 
1222         if (cpu == &cpus[0]) {
1223                 /*
1224                  * CPU0 uses a global for its double fault stack to deal with
1225                  * the chicken and egg problem. We need to punch it into its
1226                  * user HAT.
1227                  */
1228                 extern char dblfault_stack0[];
1229 
1230                 hati_cpu_punchin(cpu, (uintptr_t)cpu->cpu_m.mcpu_tss,
1231                     PROT_READ);
1232 
1233                 for (va = (uintptr_t)dblfault_stack0,
1234                     len = DEFAULTSTKSZ; len >= MMU_PAGESIZE;
1235                     va += MMU_PAGESIZE, len -= MMU_PAGESIZE) {
1236                         hati_cpu_punchin(cpu, va, PROT_READ | PROT_WRITE);
1237                 }
1238         }
1239 
1240         CTASSERT(((uintptr_t)&kdi_isr_start % MMU_PAGESIZE) == 0);
1241         CTASSERT(((uintptr_t)&kdi_isr_end % MMU_PAGESIZE) == 0);
1242         for (va = (uintptr_t)&kdi_isr_start;
1243             va < (uintptr_t)&kdi_isr_end; va += MMU_PAGESIZE) {
1244                 hati_cpu_punchin(cpu, va, PROT_READ | PROT_EXEC);
1245         }
1246 #endif /* !__xpv */
1247 }
1248 
1249 /*ARGSUSED*/
1250 static void
1251 hat_pcp_teardown(cpu_t *cpu)
1252 {
1253 #if !defined(__xpv)
1254         struct hat_cpu_info *hci;
1255 
1256         if ((hci = cpu->cpu_hat_info) == NULL)
1257                 return;
1258         if (hci->hci_pcp_l2ptes != NULL)
1259                 kmem_free(hci->hci_pcp_l2ptes, MMU_PAGESIZE);
1260         if (hci->hci_pcp_l3ptes != NULL)
1261                 kmem_free(hci->hci_pcp_l3ptes, MMU_PAGESIZE);
1262         if (hci->hci_user_hat != NULL) {
1263                 hat_free_start(hci->hci_user_hat);
1264                 hat_free_end(hci->hci_user_hat);
1265         }
1266 #endif
1267 }
1268 
1269 #define NEXT_HKR(r, l, s, e) {                  \
1270         kernel_ranges[r].hkr_level = l;         \
1271         kernel_ranges[r].hkr_start_va = s;      \
1272         kernel_ranges[r].hkr_end_va = e;        \
1273         ++r;                                    \
1274 }
1275 
1276 /*
1277  * Finish filling in the kernel hat.
1278  * Pre fill in all top level kernel page table entries for the kernel's
1279  * part of the address range.  From this point on we can't use any new
1280  * kernel large pages if they need PTE's at max_level
1281  *
1282  * create the kmap mappings.
1283  */
1284 void
1285 hat_init_finish(void)


1341 
1342                         if (IN_HYPERVISOR_VA(va))
1343                                 continue;
1344 
1345                         /* can/must skip if a page mapping already exists */
1346                         if (rp->hkr_level <= mmu.max_page_level &&
1347                             (ht = htable_getpage(kas.a_hat, va, NULL)) !=
1348                             NULL) {
1349                                 htable_release(ht);
1350                                 continue;
1351                         }
1352 
1353                         (void) htable_create(kas.a_hat, va, rp->hkr_level - 1,
1354                             NULL);
1355                 }
1356         }
1357 
1358         /*
1359          * 32 bit PAE metal kernels use only 4 of the 512 entries in the
1360          * page holding the top level pagetable. We use the remainder for
1361          * the "per CPU" page tables for PCP processes.
1362          * Map the top level kernel pagetable into the kernel to make
1363  * it easy to use bcopy to access these tables.
1364  *
1365  * PAE is required for the 64-bit kernel, which uses this page as well
1366  * for its per-CPU pagetables. See the big theory statement.
1367          */
1368         if (mmu.pae_hat) {
1369                 pcp_page = vmem_alloc(heap_arena, MMU_PAGESIZE, VM_SLEEP);
1370                 hat_devload(kas.a_hat, (caddr_t)pcp_page, MMU_PAGESIZE,
1371                     kas.a_hat->hat_htable->ht_pfn,
1372 #if !defined(__xpv)
1373                     PROT_WRITE |
1374 #endif
1375                     PROT_READ | HAT_NOSYNC | HAT_UNORDERED_OK,
1376                     HAT_LOAD | HAT_LOAD_NOCONSIST);
1377         }
1378         hat_pcp_setup(CPU);
1379 
1380         /*
1381          * Create kmap (cached mappings of kernel PTEs)
1382          * for 32 bit we map from segmap_start .. ekernelheap
1383          * for 64 bit we map from segmap_start .. segmap_start + segmapsize;
1384          */
1385 #if defined(__i386)
1386         size = (uintptr_t)ekernelheap - segmap_start;
1387 #elif defined(__amd64)
1388         size = segmapsize;
1389 #endif
1390         hat_kmap_init((uintptr_t)segmap_start, size);
1391 
1392 #if !defined(__xpv)
1393         ASSERT3U(kas.a_hat->hat_htable->ht_pfn, !=, PFN_INVALID);
1394         ASSERT3U(kpti_safe_cr3, ==,
1395             MAKECR3(kas.a_hat->hat_htable->ht_pfn, PCID_KERNEL));
1396 #endif
1397 }
1398 
1399 /*
1400  * In 32 bit PAE mode, PTEs are 64 bits, but ordinary atomic memory references
1401  * are 32 bit, so for safety we must use atomic_cas_64() to install these.
1402  */
1403 #ifdef __i386
1404 static void
1405 reload_pae32(hat_t *hat, cpu_t *cpu)
1406 {
1407         x86pte_t *src;
1408         x86pte_t *dest;
1409         x86pte_t pte;
1410         int i;
1411 
1412         /*
1413          * Load the 4 entries of the level 2 page table into this
1414          * cpu's range of the pcp_page and point cr3 at them.
1415          */
1416         ASSERT(mmu.pae_hat);
1417         src = hat->hat_copied_ptes;
1418         dest = pcp_page + (cpu->cpu_id + 1) * MAX_COPIED_PTES;
1419         for (i = 0; i < MAX_COPIED_PTES; ++i) {
1420                 for (;;) {
1421                         pte = dest[i];
1422                         if (pte == src[i])
1423                                 break;
1424                         if (atomic_cas_64(dest + i, pte, src[i]) == pte)
1425                                 break;
1426                 }
1427         }
1428 }
1429 #endif
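
/*
 * An illustrative, userland-style sketch of the same compare-and-swap retry
 * pattern used by reload_pae32() above, written with C11 atomics over a
 * plain uint64_t array standing in for the PTE slots.  All names here are
 * hypothetical; this is not kernel code.
 */
#include <stdatomic.h>
#include <stdint.h>

static void
copy_slots_atomically(_Atomic uint64_t *dest, const uint64_t *src, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		/* Re-read the destination until src[i] has been installed. */
		uint64_t old = atomic_load(&dest[i]);
		while (old != src[i] &&
		    !atomic_compare_exchange_weak(&dest[i], &old, src[i]))
			;	/* a failed CAS reloads 'old' for us */
	}
}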
1430 
1431 /*
1432  * Update the PCP data on the CPU cpu to match the given hat. If this is a
1433  * 32-bit process, then we must update the L2 pages and then the L3. If this
1434  * is a 64-bit process, then we only need to update the L3 entries.
1435  */
1436 static void
1437 hat_pcp_update(cpu_t *cpu, const hat_t *hat)
1438 {
1439         ASSERT3U(hat->hat_flags & HAT_COPIED, !=, 0);
1440 
1441         if ((hat->hat_flags & HAT_COPIED_32) != 0) {
1442                 const x86pte_t *l2src;
1443                 x86pte_t *l2dst, *l3ptes, *l3uptes;
1444                 /*
1445                  * This is a 32-bit process. To set this up, we need to do the
1446                  * following:
1447                  *
1448                  *  - Copy the 4 L2 PTEs into the dedicated L2 table
1449                  *  - Zero the user L3 PTEs in the user and kernel page table
1450                  *  - Set the first L3 PTE to point to the CPU L2 table
1451                  */
1452                 l2src = hat->hat_copied_ptes;
1453                 l2dst = cpu->cpu_hat_info->hci_pcp_l2ptes;
1454                 l3ptes = cpu->cpu_hat_info->hci_pcp_l3ptes;
1455                 l3uptes = cpu->cpu_hat_info->hci_user_l3ptes;
1456 
1457                 l2dst[0] = l2src[0];
1458                 l2dst[1] = l2src[1];
1459                 l2dst[2] = l2src[2];
1460                 l2dst[3] = l2src[3];
1461 
1462                 /*
1463                  * Make sure to use the mmu struct to get the number of
1464                  * slots. The number of PLP entries that this hat has will
1465                  * always be smaller, as it's a 32-bit process.
1466                  */
1467                 bzero(l3ptes, sizeof (x86pte_t) * mmu.top_level_uslots);
1468                 l3ptes[0] = MAKEPTP(cpu->cpu_hat_info->hci_pcp_l2pfn, 2);
1469                 bzero(l3uptes, sizeof (x86pte_t) * mmu.top_level_uslots);
1470                 l3uptes[0] = MAKEPTP(cpu->cpu_hat_info->hci_pcp_l2pfn, 2);
1471         } else {
1472                 /*
1473                  * This is a 64-bit process. To set this up, we need to do the
1474                  * following:
1475                  *
1476                  *  - Zero the 4 L2 PTEs in the CPU structure for safety
1477                  *  - Copy over the new user L3 PTEs into the kernel page table
1478                  *  - Copy over the new user L3 PTEs into the user page table
1479                  */
1480                 ASSERT3S(kpti_enable, ==, 1);
1481                 bzero(cpu->cpu_hat_info->hci_pcp_l2ptes, sizeof (x86pte_t) * 4);
1482                 bcopy(hat->hat_copied_ptes, cpu->cpu_hat_info->hci_pcp_l3ptes,
1483                     sizeof (x86pte_t) * mmu.top_level_uslots);
1484                 bcopy(hat->hat_copied_ptes, cpu->cpu_hat_info->hci_user_l3ptes,
1485                     sizeof (x86pte_t) * mmu.top_level_uslots);
1486         }
1487 }
1488 
1489 static void
1490 reset_kpti(struct kpti_frame *fr, uint64_t kcr3, uint64_t ucr3)
1491 {
1492         ASSERT3U(fr->kf_tr_flag, ==, 0);
1493 #if DEBUG
1494         if (fr->kf_kernel_cr3 != 0) {
1495                 ASSERT3U(fr->kf_lower_redzone, ==, 0xdeadbeefdeadbeef);
1496                 ASSERT3U(fr->kf_middle_redzone, ==, 0xdeadbeefdeadbeef);
1497                 ASSERT3U(fr->kf_upper_redzone, ==, 0xdeadbeefdeadbeef);
1498         }
1499 #endif
1500 
1501         bzero(fr, offsetof(struct kpti_frame, kf_kernel_cr3));
1502         bzero(&fr->kf_unused, sizeof (struct kpti_frame) -
1503             offsetof(struct kpti_frame, kf_unused));
1504 
1505         fr->kf_kernel_cr3 = kcr3;
1506         fr->kf_user_cr3 = ucr3;
1507         fr->kf_tr_ret_rsp = (uintptr_t)&fr->kf_tr_rsp;
1508 
1509         fr->kf_lower_redzone = 0xdeadbeefdeadbeef;
1510         fr->kf_middle_redzone = 0xdeadbeefdeadbeef;
1511         fr->kf_upper_redzone = 0xdeadbeefdeadbeef;
1512 }
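
/*
 * A minimal sketch of the offsetof()-based partial clear that reset_kpti()
 * performs: zero everything before and after a small preserved window of
 * fields, then refill that window.  The struct and names below are
 * hypothetical stand-ins, not the real kpti_frame layout.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct frame {
	uint64_t f_scratch[4];		/* cleared by the first memset */
	uint64_t f_kernel_cr3;		/* preserved window starts here */
	uint64_t f_user_cr3;
	uint64_t f_unused;		/* cleared from here to the end */
	uint64_t f_redzone;
};

static void
reset_frame(struct frame *fr, uint64_t kcr3, uint64_t ucr3)
{
	/* Clear [start, f_kernel_cr3) and [f_unused, end of struct). */
	memset(fr, 0, offsetof(struct frame, f_kernel_cr3));
	memset(&fr->f_unused, 0,
	    sizeof (struct frame) - offsetof(struct frame, f_unused));

	/* Refill the fields we care about, redzone last. */
	fr->f_kernel_cr3 = kcr3;
	fr->f_user_cr3 = ucr3;
	fr->f_redzone = 0xdeadbeefdeadbeefULL;
}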
1513 
1514 #ifdef __xpv
1515 static void
1516 hat_switch_xen(hat_t *hat)
1517 {
1518         struct mmuext_op t[2];
1519         uint_t retcnt;
1520         uint_t opcnt = 1;
1521         uint64_t newcr3;
1522 
1523         ASSERT(!(hat->hat_flags & HAT_COPIED));
1524         ASSERT(!(getcr4() & CR4_PCIDE));
1525 
1526         newcr3 = MAKECR3((uint64_t)hat->hat_htable->ht_pfn, PCID_NONE);
1527 
1528         t[0].cmd = MMUEXT_NEW_BASEPTR;
1529         t[0].arg1.mfn = mmu_btop(pa_to_ma(newcr3));
1530 
1531         /*
1532          * There's an interesting problem here, as to what to actually specify
1533          * when switching to the kernel hat.  For now we'll reuse the kernel hat
1534          * again.
1535          */
1536         t[1].cmd = MMUEXT_NEW_USER_BASEPTR;
1537         if (hat == kas.a_hat)
1538                 t[1].arg1.mfn = mmu_btop(pa_to_ma(newcr3));
1539         else
1540                 t[1].arg1.mfn = pfn_to_mfn(hat->hat_user_ptable);
1541         ++opcnt;
1542 
1543         if (HYPERVISOR_mmuext_op(t, opcnt, &retcnt, DOMID_SELF) < 0)
1544                 panic("HYPERVISOR_mmuext_op() failed");
1545         ASSERT(retcnt == opcnt);
1546 }
1547 #endif /* __xpv */
1548 
1549 /*
1550  * Switch to a new active hat, maintaining bit masks to track active CPUs.
1551  *
1552  * With KPTI, all our HATs except kas should be using PCP.  Thus, to switch
1553  * HATs, we need to copy over the new user PTEs, then set our trampoline context
1554  * as appropriate.
1555  *
1556  * If lacking PCID, we then load our new cr3, which will flush the TLB: we may
1557  * have established userspace TLB entries via kernel accesses, and these are no
1558  * longer valid.  We have to do this eagerly, as we just deleted this CPU from
1559  * ->hat_cpus, so would no longer see any TLB shootdowns.
1560  *
1561  * With PCID enabled, things get a little more complicated.  We would like to
1562  * keep TLB context around when entering and exiting the kernel, and to do this,
1563  * we partition the TLB into two different spaces:
1564  *
1565  * PCID_KERNEL is defined as zero, and used both by kas and all other address
1566  * spaces while in the kernel (post-trampoline).
1567  *
1568  * PCID_USER is used while in userspace.  Therefore, userspace cannot use any
1569  * lingering PCID_KERNEL entries to kernel addresses it should not be able to
1570  * read.
1571  *
1572  * The trampoline cr3s are set not to invalidate on a mov to %cr3. This means if
1573  * we take a journey through the kernel without switching HATs, we have some
1574  * hope of keeping our TLB state around.
1575  *
1576  * On a hat switch, rather than deal with any necessary flushes on the way out
1577  * of the trampolines, we do them upfront here. If we're switching from kas, we
1578  * shouldn't need any invalidation.
1579  *
1580  * Otherwise, we can have stale userspace entries for both PCID_USER (what
1581  * happened before we move onto the kcr3) and PCID_KERNEL (any subsequent
1582  * userspace accesses such as ddi_copyin()).  Since setcr3() won't do these
1583  * flushes on its own in PCIDE, we'll do a non-flushing load and then
1584  * invalidate everything.
1585  */
1586 void
1587 hat_switch(hat_t *hat)
1588 {

1589         cpu_t *cpu = CPU;
1590         hat_t *old = cpu->cpu_current_hat;
1591 
1592         /*
1593          * set up this information first, so we don't miss any cross calls
1594          */
1595         if (old != NULL) {
1596                 if (old == hat)
1597                         return;
1598                 if (old != kas.a_hat)
1599                         CPUSET_ATOMIC_DEL(old->hat_cpus, cpu->cpu_id);
1600         }
1601 
1602         /*
1603          * Add this CPU to the active set for this HAT.
1604          */
1605         if (hat != kas.a_hat) {
1606                 CPUSET_ATOMIC_ADD(hat->hat_cpus, cpu->cpu_id);
1607         }
1608         cpu->cpu_current_hat = hat;
1609 
1610 #if defined(__xpv)
1611         hat_switch_xen(hat);
1612 #else
1613         struct hat_cpu_info *info = cpu->cpu_m.mcpu_hat_info;
1614         uint64_t pcide = getcr4() & CR4_PCIDE;
1615         uint64_t kcr3, ucr3;
1616         pfn_t tl_kpfn;
1617         ulong_t flag;
1618 
1619         EQUIV(kpti_enable, !mmu.pt_global);
1620 
1621         if (hat->hat_flags & HAT_COPIED) {
1622                 hat_pcp_update(cpu, hat);
1623                 tl_kpfn = info->hci_pcp_l3pfn;


1624         } else {
1625                 IMPLY(kpti_enable, hat == kas.a_hat);
1626                 tl_kpfn = hat->hat_htable->ht_pfn;
1627         }





1628 
1629         if (pcide) {
1630                 ASSERT(kpti_enable);
1631 
1632                 kcr3 = MAKECR3(tl_kpfn, PCID_KERNEL) | CR3_NOINVL_BIT;
1633                 ucr3 = MAKECR3(info->hci_user_l3pfn, PCID_USER) |
1634                     CR3_NOINVL_BIT;
1635 
1636                 setcr3(kcr3);
1637                 if (old != kas.a_hat)
1638                         mmu_flush_tlb(FLUSH_TLB_ALL, NULL);
1639         } else {
1640                 kcr3 = MAKECR3(tl_kpfn, PCID_NONE);
1641                 ucr3 = kpti_enable ?
1642                     MAKECR3(info->hci_user_l3pfn, PCID_NONE) :
1643                     0;
1644 
1645                 setcr3(kcr3);
1646         }
1647 
1648         /*
1649          * We will already be taking shootdowns for our new HAT, and as KPTI
1650          * invpcid emulation needs to use kf_user_cr3, make sure we don't get
1651          * any cross calls while we're inconsistent.  Note that it's harmless to
1652          * have a *stale* kf_user_cr3 (we just did a FLUSH_TLB_ALL), but a
1653          * *zero* kf_user_cr3 is not going to go very well.
1654          */
1655         if (pcide)
1656                 flag = intr_clear();








1657 
1658         reset_kpti(&cpu->cpu_m.mcpu_kpti, kcr3, ucr3);
1659         reset_kpti(&cpu->cpu_m.mcpu_kpti_flt, kcr3, ucr3);
1660         reset_kpti(&cpu->cpu_m.mcpu_kpti_dbg, kcr3, ucr3);
1661 
1662         if (pcide)
1663                 intr_restore(flag);
1664 
1665 #endif /* !__xpv */
1666 
1667         ASSERT(cpu == CPU);
1668 }
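
/*
 * A hedged sketch (not the kernel's MAKECR3/setcr3) of how the cr3 values
 * above are composed when PCIDs are in use: the 4K-aligned top-level
 * pagetable physical address occupies the upper bits, the PCID sits in bits
 * 11:0, and architectural bit 63 asks the processor not to flush that
 * PCID's TLB entries on the load.  Names and constants are illustrative.
 */
#include <stdint.h>

#define	EX_CR3_PCID_MASK	0x0fffULL
#define	EX_CR3_NOINVAL_BIT	(1ULL << 63)

static inline uint64_t
example_make_cr3(uint64_t pt_paddr, uint16_t pcid, int noinval)
{
	uint64_t cr3;

	/* pt_paddr is assumed 4K aligned, so its low 12 bits are free. */
	cr3 = (pt_paddr & ~EX_CR3_PCID_MASK) | (pcid & EX_CR3_PCID_MASK);
	if (noinval)
		cr3 |= EX_CR3_NOINVAL_BIT;
	return (cr3);
}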
1669 
1670 /*
1671  * Utility to return a valid x86pte_t from protections, pfn, and level number
1672  */
1673 static x86pte_t
1674 hati_mkpte(pfn_t pfn, uint_t attr, level_t level, uint_t flags)
1675 {
1676         x86pte_t        pte;
1677         uint_t          cache_attr = attr & HAT_ORDER_MASK;
1678 
1679         pte = MAKEPTE(pfn, level);
1680 
1681         if (attr & PROT_WRITE)
1682                 PTE_SET(pte, PT_WRITABLE);
1683 
1684         if (attr & PROT_USER)
1685                 PTE_SET(pte, PT_USER);
1686 


1958                 goto done;
1959         }
1960 
1961         /*
1962          * If the mapping didn't change there is nothing more to do.
1963          */
1964         if (PTE_EQUIV(pte, old_pte))
1965                 goto done;
1966 
1967         /*
1968          * Install a new mapping in the page's mapping list
1969          */
1970         if (!PTE_ISVALID(old_pte)) {
1971                 if (is_consist) {
1972                         hment_assign(ht, entry, pp, hm);
1973                         x86_hm_exit(pp);
1974                 } else {
1975                         ASSERT(flags & HAT_LOAD_NOCONSIST);
1976                 }
1977 #if defined(__amd64)
1978                 if (ht->ht_flags & HTABLE_COPIED) {
1979                         cpu_t *cpu = CPU;
1980                         hat_pcp_update(cpu, hat);

1981                 }
1982 #endif
1983                 HTABLE_INC(ht->ht_valid_cnt);
1984                 PGCNT_INC(hat, l);
1985                 return (rv);
1986         }
1987 
1988         /*
1989  * Remaps are more complicated:
1990          *  - HAT_LOAD_REMAP must be specified if changing the pfn.
1991          *    We also require that NOCONSIST be specified.
1992          *  - Otherwise only permission or caching bits may change.
1993          */
1994         if (!PTE_ISPAGE(old_pte, l))
1995                 panic("non-null/page mapping pte=" FMT_PTE, old_pte);
1996 
1997         if (PTE2PFN(old_pte, l) != PTE2PFN(pte, l)) {
1998                 REMAPASSERT(flags & HAT_LOAD_REMAP);
1999                 REMAPASSERT(flags & HAT_LOAD_NOCONSIST);
2000                 REMAPASSERT(PTE_GET(old_pte, PT_SOFTWARE) >= PT_NOCONSIST);


2032         hat_t           *hat,
2033         uintptr_t       va,
2034         page_t          *pp,
2035         uint_t          attr,
2036         uint_t          flags,
2037         level_t         level,
2038         pfn_t           pfn)
2039 {
2040         htable_t        *ht;
2041         uint_t          entry;
2042         x86pte_t        pte;
2043         int             rv = 0;
2044 
2045         /*
2046          * The number 16 is arbitrary and here to catch a recursion problem
2047          * early before we blow out the kernel stack.
2048          */
2049         ++curthread->t_hatdepth;
2050         ASSERT(curthread->t_hatdepth < 16);
2051 
2052         ASSERT(hat == kas.a_hat || (hat->hat_flags & HAT_PCP) != 0 ||
2053             AS_LOCK_HELD(hat->hat_as));
2054 
2055         if (flags & HAT_LOAD_SHARE)
2056                 hat->hat_flags |= HAT_SHARED;
2057 
2058         /*
2059          * Find the page table that maps this page if it already exists.
2060          */
2061         ht = htable_lookup(hat, va, level);
2062 
2063         /*
2064          * We must have HAT_LOAD_NOCONSIST if page_t is NULL.
2065          */
2066         if (pp == NULL)
2067                 flags |= HAT_LOAD_NOCONSIST;
2068 
2069         if (ht == NULL) {
2070                 ht = htable_create(hat, va, level, NULL);
2071                 ASSERT(ht != NULL);
2072         }
2073         /*
2074          * htable_va2entry checks this condition as well, but it won't include
2075          * much useful info in the panic. So we do it in advance here to include
2076          * all the context.
2077          */
2078         if (ht->ht_vaddr > va || va > HTABLE_LAST_PAGE(ht)) {
2079                 panic("hati_load_common: bad htable: va=%p, last page=%p, "
2080                     "ht->ht_vaddr=%p, ht->ht_level=%d", (void *)va,
2081                     (void *)HTABLE_LAST_PAGE(ht), (void *)ht->ht_vaddr,
2082                     (int)ht->ht_level);
2083         }
2084         entry = htable_va2entry(va, ht);
2085 
2086         /*
2087          * a bunch of paranoid error checking
2088          */
2089         ASSERT(ht->ht_busy > 0);



2090         ASSERT(ht->ht_level == level);
2091 
2092         /*
2093          * construct the new PTE
2094          */
2095         if (hat == kas.a_hat)
2096                 attr &= ~PROT_USER;
2097         pte = hati_mkpte(pfn, attr, level, flags);
2098         if (hat == kas.a_hat && va >= kernelbase)
2099                 PTE_SET(pte, mmu.pt_global);
2100 
2101         /*
2102          * establish the mapping
2103          */
2104         rv = hati_pte_map(ht, entry, pp, pte, flags, NULL);
2105 
2106         /*
2107          * release the htable and any reserves
2108          */
2109         htable_release(ht);


2519                             "htable=%p, vaddr=%p\n", (void *)ht, (void *)vaddr);
2520                 HTABLE_LOCK_DEC(ht);
2521 
2522                 vaddr += LEVEL_SIZE(ht->ht_level);
2523         }
2524         if (ht)
2525                 htable_release(ht);
2526         XPV_ALLOW_MIGRATE();
2527 }
2528 
2529 /* ARGSUSED */
2530 void
2531 hat_unlock_region(struct hat *hat, caddr_t addr, size_t len,
2532     hat_region_cookie_t rcookie)
2533 {
2534         panic("No shared region support on x86");
2535 }
2536 
2537 #if !defined(__xpv)
2538 /*
2539  * Cross call service routine to demap a range of virtual
2540  * pages on the current CPU or flush all mappings in TLB.
2541  */

2542 static int
2543 hati_demap_func(xc_arg_t a1, xc_arg_t a2, xc_arg_t a3)
2544 {
2545         _NOTE(ARGUNUSED(a3));
2546         hat_t           *hat = (hat_t *)a1;
2547         tlb_range_t     *range = (tlb_range_t *)a2;

2548 
2549         /*
2550          * If the target hat isn't the kernel and this CPU isn't operating
2551          * in the target hat, we can ignore the cross call.
2552          */
2553         if (hat != kas.a_hat && hat != CPU->cpu_current_hat)
2554                 return (0);
2555 
2556         if (range->tr_va != DEMAP_ALL_ADDR) {
2557                 mmu_flush_tlb(FLUSH_TLB_RANGE, range);




2558                 return (0);
2559         }
2560 
2561         /*
2562          * We are flushing all of userspace.
2563          *
2564          * When using PCP, we first need to update this CPU's idea of the PCP
2565          * PTEs.
2566          */
2567         if (hat->hat_flags & HAT_COPIED) {
2568 #if defined(__amd64)
2569                 hat_pcp_update(CPU, hat);


2570 #elif defined(__i386)
2571                 reload_pae32(hat, CPU);
2572 #endif
2573         }
2574 
2575         mmu_flush_tlb(FLUSH_TLB_NONGLOBAL, NULL);
2576         return (0);
2577 }
2578 
2579 #define TLBIDLE_CPU_HALTED      (0x1UL)
2580 #define TLBIDLE_INVAL_ALL       (0x2UL)






















2581 #define CAS_TLB_INFO(cpu, old, new)     \
2582         atomic_cas_ulong((ulong_t *)&(cpu)->cpu_m.mcpu_tlb_info, (old), (new))
2583 
2584 /*
2585  * Record that a CPU is going idle
2586  */
2587 void
2588 tlb_going_idle(void)
2589 {
2590         atomic_or_ulong((ulong_t *)&CPU->cpu_m.mcpu_tlb_info,
2591             TLBIDLE_CPU_HALTED);
2592 }
2593 
2594 /*
2595  * Service a delayed TLB flush when a CPU comes out of being idle.
2596  * This is called from the cpu idle notification with interrupts disabled.
2597  */
2598 void
2599 tlb_service(void)
2600 {
2601         ulong_t tlb_info;
2602         ulong_t found;
2603 
2604         /*
2605          * We only have to do something if coming out of being idle.
2606          */
2607         tlb_info = CPU->cpu_m.mcpu_tlb_info;
2608         if (tlb_info & TLBIDLE_CPU_HALTED) {
2609                 ASSERT(CPU->cpu_current_hat == kas.a_hat);
2610 
2611                 /*
2612                  * Atomic clear and fetch of old state.
2613                  */
2614                 while ((found = CAS_TLB_INFO(CPU, tlb_info, 0)) != tlb_info) {
2615                         ASSERT(found & TLBIDLE_CPU_HALTED);
2616                         tlb_info = found;
2617                         SMT_PAUSE();
2618                 }
2619                 if (tlb_info & TLBIDLE_INVAL_ALL)
2620                         mmu_flush_tlb(FLUSH_TLB_ALL, NULL);
2621         }
2622 }
2623 #endif /* !__xpv */
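
/*
 * A simplified, userland-style sketch of the delayed-flush handshake above,
 * using C11 atomics: an idle CPU publishes a HALTED flag; a remote CPU that
 * wants a shootdown tries to tag the halted CPU with INVAL_ALL instead of
 * waking it; on wakeup the flags are cleared atomically and any pending
 * INVAL_ALL is serviced.  All names are hypothetical, not kernel interfaces.
 */
#include <stdatomic.h>

#define	EX_CPU_HALTED	0x1UL
#define	EX_INVAL_ALL	0x2UL

/* Called by a remote CPU: returns 1 if the flush could be deferred. */
static int
example_defer_flush(_Atomic unsigned long *tlb_info)
{
	unsigned long old = EX_CPU_HALTED;

	/* Only a CPU that is (still) halted can absorb a deferred flush. */
	return (atomic_compare_exchange_strong(tlb_info, &old,
	    EX_CPU_HALTED | EX_INVAL_ALL) ||
	    old == (EX_CPU_HALTED | EX_INVAL_ALL));
}

/* Called by the waking CPU: clears the flags, reports a pending flush. */
static int
example_wakeup_needs_flush(_Atomic unsigned long *tlb_info)
{
	unsigned long old = atomic_exchange(tlb_info, 0);

	return ((old & EX_INVAL_ALL) != 0);
}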
2624 
2625 /*
2626  * Internal routine to do cross calls to invalidate a range of pages on
2627  * all CPUs using a given hat.
2628  */
2629 void
2630 hat_tlb_inval_range(hat_t *hat, tlb_range_t *in_range)
2631 {
2632         extern int      flushes_require_xcalls; /* from mp_startup.c */
2633         cpuset_t        justme;
2634         cpuset_t        cpus_to_shootdown;
2635         tlb_range_t     range = *in_range;
2636 #ifndef __xpv
2637         cpuset_t        check_cpus;
2638         cpu_t           *cpup;
2639         int             c;
2640 #endif
2641 
2642         /*
2643          * If the hat is being destroyed, there are no more users, so
2644          * demap need not do anything.
2645          */
2646         if (hat->hat_flags & HAT_FREEING)
2647                 return;
2648 
2649         /*
2650          * If demapping from a shared pagetable, we best demap the
2651          * entire set of user TLBs, since we don't know what addresses
2652          * these were shared at.
2653          */
2654         if (hat->hat_flags & HAT_SHARED) {
2655                 hat = kas.a_hat;
2656                 range.tr_va = DEMAP_ALL_ADDR;
2657         }
2658 
2659         /*
2660          * if not running with multiple CPUs, don't use cross calls
2661          */
2662         if (panicstr || !flushes_require_xcalls) {
2663 #ifdef __xpv
2664                 if (range.tr_va == DEMAP_ALL_ADDR) {
2665                         xen_flush_tlb();
2666                 } else {
2667                         for (size_t i = 0; i < TLB_RANGE_LEN(&range);
2668                             i += MMU_PAGESIZE) {
2669                                 xen_flush_va((caddr_t)(range.tr_va + i));
2670                         }
2671                 }
2672 #else
2673                 (void) hati_demap_func((xc_arg_t)hat, (xc_arg_t)&range, 0);

2674 #endif
2675                 return;
2676         }
2677 
2678 
2679         /*
2680          * Determine CPUs to shootdown. Kernel changes always do all CPUs.
2681          * Otherwise it's just CPUs currently executing in this hat.
2682          */
2683         kpreempt_disable();
2684         CPUSET_ONLY(justme, CPU->cpu_id);
2685         if (hat == kas.a_hat)
2686                 cpus_to_shootdown = khat_cpuset;
2687         else
2688                 cpus_to_shootdown = hat->hat_cpus;
2689 
2690 #ifndef __xpv
2691         /*
2692          * If any CPUs in the set are idle, just request a delayed flush
2693          * and avoid waking them up.
2694          */
2695         check_cpus = cpus_to_shootdown;
2696         for (c = 0; c < NCPU && !CPUSET_ISNULL(check_cpus); ++c) {
2697                 ulong_t tlb_info;
2698 
2699                 if (!CPU_IN_SET(check_cpus, c))
2700                         continue;
2701                 CPUSET_DEL(check_cpus, c);
2702                 cpup = cpu[c];
2703                 if (cpup == NULL)
2704                         continue;
2705 
2706                 tlb_info = cpup->cpu_m.mcpu_tlb_info;
2707                 while (tlb_info == TLBIDLE_CPU_HALTED) {
2708                         (void) CAS_TLB_INFO(cpup, TLBIDLE_CPU_HALTED,
2709                             TLBIDLE_CPU_HALTED | TLBIDLE_INVAL_ALL);
2710                         SMT_PAUSE();
2711                         tlb_info = cpup->cpu_m.mcpu_tlb_info;
2712                 }
2713                 if (tlb_info == (TLBIDLE_CPU_HALTED | TLBIDLE_INVAL_ALL)) {
2714                         HATSTAT_INC(hs_tlb_inval_delayed);
2715                         CPUSET_DEL(cpus_to_shootdown, c);
2716                 }
2717         }
2718 #endif
2719 
2720         if (CPUSET_ISNULL(cpus_to_shootdown) ||
2721             CPUSET_ISEQUAL(cpus_to_shootdown, justme)) {
2722 
2723 #ifdef __xpv
2724                 if (range.tr_va == DEMAP_ALL_ADDR) {
2725                         xen_flush_tlb();
2726                 } else {
2727                         for (size_t i = 0; i < TLB_RANGE_LEN(&range);
2728                             i += MMU_PAGESIZE) {
2729                                 xen_flush_va((caddr_t)(range.tr_va + i));
2730                         }
2731                 }
2732 #else
2733                 (void) hati_demap_func((xc_arg_t)hat, (xc_arg_t)&range, 0);

2734 #endif
2735 
2736         } else {
2737 
2738                 CPUSET_ADD(cpus_to_shootdown, CPU->cpu_id);
2739 #ifdef __xpv
2740                 if (range.tr_va == DEMAP_ALL_ADDR) {
2741                         xen_gflush_tlb(cpus_to_shootdown);
2742                 } else {
2743                         for (size_t i = 0; i < TLB_RANGE_LEN(&range);
2744                             i += MMU_PAGESIZE) {
2745                                 xen_gflush_va((caddr_t)(range.tr_va + i),
2746                                     cpus_to_shootdown);
2747                         }
2748                 }
2749 #else
2750                 xc_call((xc_arg_t)hat, (xc_arg_t)&range, 0,
2751                     CPUSET2BV(cpus_to_shootdown), hati_demap_func);
2752 #endif
2753 
2754         }
2755         kpreempt_enable();
2756 }
2757 
2758 void
2759 hat_tlb_inval(hat_t *hat, uintptr_t va)
2760 {
2761         /*
2762          * Create range for a single page.
2763          */
2764         tlb_range_t range;
2765         range.tr_va = va;
2766         range.tr_cnt = 1; /* one page */
2767         range.tr_level = MIN_PAGE_LEVEL; /* pages are MMU_PAGESIZE */
2768 
2769         hat_tlb_inval_range(hat, &range);
2770 }
2771 
2772 /*
2773  * Interior routine for HAT_UNLOADs from hat_unload_callback(),
2774  * hat_kmap_unload() OR from hat_steal() code.  This routine doesn't
2775  * handle releasing of the htables.
2776  */
2777 void
2778 hat_pte_unmap(
2779         htable_t        *ht,
2780         uint_t          entry,
2781         uint_t          flags,
2782         x86pte_t        old_pte,
2783         void            *pte_ptr,
2784         boolean_t       tlb)
2785 {
2786         hat_t           *hat = ht->ht_hat;
2787         hment_t         *hm = NULL;
2788         page_t          *pp = NULL;
2789         level_t         l = ht->ht_level;


2916 hat_unload(hat_t *hat, caddr_t addr, size_t len, uint_t flags)
2917 {
2918         uintptr_t va = (uintptr_t)addr;
2919 
2920         XPV_DISALLOW_MIGRATE();
2921         ASSERT(hat == kas.a_hat || va + len <= _userlimit);
2922 
2923         /*
2924          * special case for performance.
2925          */
2926         if (mmu.kmap_addr <= va && va < mmu.kmap_eaddr) {
2927                 ASSERT(hat == kas.a_hat);
2928                 hat_kmap_unload(addr, len, flags);
2929         } else {
2930                 hat_unload_callback(hat, addr, len, flags, NULL);
2931         }
2932         XPV_ALLOW_MIGRATE();
2933 }
2934 
2935 /*









2936  * Invalidate the TLB, and perform the callback to the upper level VM system,
2937  * for the specified ranges of contiguous pages.
2938  */
2939 static void
2940 handle_ranges(hat_t *hat, hat_callback_t *cb, uint_t cnt, tlb_range_t *range)
2941 {
2942         while (cnt > 0) {


2943                 --cnt;
2944                 hat_tlb_inval_range(hat, &range[cnt]);

2945 
2946                 if (cb != NULL) {
2947                         cb->hcb_start_addr = (caddr_t)range[cnt].tr_va;
2948                         cb->hcb_end_addr = cb->hcb_start_addr;
2949                         cb->hcb_end_addr += range[cnt].tr_cnt <<
2950                             LEVEL_SHIFT(range[cnt].tr_level);
2951                         cb->hcb_function(cb);
2952                 }
2953         }
2954 }
2955 
2956 /*
2957  * Unload a given range of addresses (has optional callback)
2958  *
2959  * Flags:
2960  * define       HAT_UNLOAD              0x00
2961  * define       HAT_UNLOAD_NOSYNC       0x02
2962  * define       HAT_UNLOAD_UNLOCK       0x04
2963  * define       HAT_UNLOAD_OTHER        0x08 - not used
2964  * define       HAT_UNLOAD_UNMAP        0x10 - same as HAT_UNLOAD
2965  */
2966 #define MAX_UNLOAD_CNT (8)
2967 void
2968 hat_unload_callback(
2969         hat_t           *hat,
2970         caddr_t         addr,
2971         size_t          len,
2972         uint_t          flags,
2973         hat_callback_t  *cb)
2974 {
2975         uintptr_t       vaddr = (uintptr_t)addr;
2976         uintptr_t       eaddr = vaddr + len;
2977         htable_t        *ht = NULL;
2978         uint_t          entry;
2979         uintptr_t       contig_va = (uintptr_t)-1L;
2980         tlb_range_t     r[MAX_UNLOAD_CNT];
2981         uint_t          r_cnt = 0;
2982         x86pte_t        old_pte;
2983 
2984         XPV_DISALLOW_MIGRATE();
2985         ASSERT(hat == kas.a_hat || eaddr <= _userlimit);
2986         ASSERT(IS_PAGEALIGNED(vaddr));
2987         ASSERT(IS_PAGEALIGNED(eaddr));
2988 
2989         /*
2990          * Special case a single page being unloaded for speed. This happens
2991          * quite frequently, COW faults after a fork() for example.
2992          */
2993         if (cb == NULL && len == MMU_PAGESIZE) {
2994                 ht = htable_getpte(hat, vaddr, &entry, &old_pte, 0);
2995                 if (ht != NULL) {
2996                         if (PTE_ISVALID(old_pte)) {
2997                                 hat_pte_unmap(ht, entry, flags, old_pte,
2998                                     NULL, B_TRUE);
2999                         }
3000                         htable_release(ht);
3001                 }
3002                 XPV_ALLOW_MIGRATE();
3003                 return;
3004         }
3005 
3006         while (vaddr < eaddr) {
3007                 old_pte = htable_walk(hat, &ht, &vaddr, eaddr);
3008                 if (ht == NULL)
3009                         break;
3010 
3011                 ASSERT(!IN_VA_HOLE(vaddr));
3012 
3013                 if (vaddr < (uintptr_t)addr)
3014                         panic("hat_unload_callback(): unmap inside large page");
3015 
3016                 /*
3017                  * We'll do the callbacks for contiguous ranges.
3018                  */
3019                 if (vaddr != contig_va ||
3020                     (r_cnt > 0 && r[r_cnt - 1].tr_level != ht->ht_level)) {
3021                         if (r_cnt == MAX_UNLOAD_CNT) {
3022                                 handle_ranges(hat, cb, r_cnt, r);
3023                                 r_cnt = 0;
3024                         }
3025                         r[r_cnt].tr_va = vaddr;
3026                         r[r_cnt].tr_cnt = 0;
3027                         r[r_cnt].tr_level = ht->ht_level;
3028                         ++r_cnt;
3029                 }
3030 
3031                 /*
3032                  * Unload one mapping (for a single page) from the page tables.
3033                  * Note that we do not remove the mapping from the TLB yet,
3034                  * as indicated by the tlb=FALSE argument to hat_pte_unmap().
3035                  * handle_ranges() will clear the TLB entries with one call to
3036                  * hat_tlb_inval_range() per contiguous range.  This is
3037                  * safe because the page can not be reused until the
3038                  * callback is made (or we return).
3039                  */
3040                 entry = htable_va2entry(vaddr, ht);
3041                 hat_pte_unmap(ht, entry, flags, old_pte, NULL, B_FALSE);
3042                 ASSERT(ht->ht_level <= mmu.max_page_level);
3043                 vaddr += LEVEL_SIZE(ht->ht_level);
3044                 contig_va = vaddr;
3045                 ++r[r_cnt - 1].tr_cnt;
3046         }
3047         if (ht)
3048                 htable_release(ht);
3049 
3050         /*
3051          * handle last range for callbacks
3052          */
3053         if (r_cnt > 0)
3054                 handle_ranges(hat, cb, r_cnt, r);
3055         XPV_ALLOW_MIGRATE();
3056 }
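
/*
 * A small sketch of the range-batching idea used by hat_unload_callback()
 * above: contiguous runs of pages are accumulated into a fixed array of
 * ranges and the (expensive) flush is issued once per batch rather than
 * once per page.  The types and the flush hook below are hypothetical.
 */
#include <stddef.h>
#include <stdint.h>

#define	EX_MAX_RANGES	8
#define	EX_PAGESIZE	4096UL

typedef struct {
	uintptr_t r_va;		/* start of the run */
	size_t r_cnt;		/* pages in the run */
} ex_range_t;

static void
example_unload_pages(const uintptr_t *pages, size_t npages,
    void (*flush)(const ex_range_t *, size_t))
{
	ex_range_t r[EX_MAX_RANGES];
	size_t r_cnt = 0;
	uintptr_t contig_va = (uintptr_t)-1L;
	size_t i;

	for (i = 0; i < npages; i++) {
		/* Start a new range when the run of pages breaks. */
		if (pages[i] != contig_va) {
			if (r_cnt == EX_MAX_RANGES) {
				flush(r, r_cnt);	/* batch is full */
				r_cnt = 0;
			}
			r[r_cnt].r_va = pages[i];
			r[r_cnt].r_cnt = 0;
			r_cnt++;
		}
		r[r_cnt - 1].r_cnt++;
		contig_va = pages[i] + EX_PAGESIZE;
	}
	if (r_cnt > 0)
		flush(r, r_cnt);	/* final partial batch */
}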
3057 
3058 /*
3059  * Invalidate a virtual address translation on a slave CPU during
3060  * panic() dumps.
3061  */
3062 void
3063 hat_flush_range(hat_t *hat, caddr_t va, size_t size)
3064 {
3065         ssize_t sz;
3066         caddr_t endva = va + size;
3067 
3068         while (va < endva) {
3069                 sz = hat_getpagesize(hat, va);
3070                 if (sz < 0) {
3071 #ifdef __xpv
3072                         xen_flush_tlb();
3073 #else
3074                         mmu_flush_tlb(FLUSH_TLB_ALL, NULL);
3075 #endif
3076                         break;
3077                 }
3078 #ifdef __xpv
3079                 xen_flush_va(va);
3080 #else
3081                 mmu_flush_tlb_kpage((uintptr_t)va);
3082 #endif
3083                 va += sz;
3084         }
3085 }
3086 
3087 /*
3088  * synchronize mapping with software data structures
3089  *
3090  * This interface is currently only used by the working set monitor
3091  * driver.
3092  */
3093 /*ARGSUSED*/
3094 void
3095 hat_sync(hat_t *hat, caddr_t addr, size_t len, uint_t flags)
3096 {
3097         uintptr_t       vaddr = (uintptr_t)addr;
3098         uintptr_t       eaddr = vaddr + len;
3099         htable_t        *ht = NULL;
3100         uint_t          entry;
3101         x86pte_t        pte;


3727                          */
3728                         ht = htable_lookup(hat, vaddr, l);
3729                         if (ht == NULL)
3730                                 continue;
3731                         if (ht->ht_flags & HTABLE_SHARED_PFN) {
3732                                 /*
3733                                  * clear page count, set valid_cnt to 0,
3734                                  * let htable_release() finish the job
3735                                  */
3736                                 hat->hat_ism_pgcnt -= ht->ht_valid_cnt <<
3737                                     (LEVEL_SHIFT(ht->ht_level) - MMU_PAGESHIFT);
3738                                 ht->ht_valid_cnt = 0;
3739                                 need_demaps = 1;
3740                         }
3741                         htable_release(ht);
3742                 }
3743         }
3744 
3745         /*
3746          * flush the TLBs - since we're probably dealing with MANY mappings
3747          * we just do a full invalidation.
3748          */
3749         if (!(hat->hat_flags & HAT_FREEING) && need_demaps)
3750                 hat_tlb_inval(hat, DEMAP_ALL_ADDR);
3751 
3752         /*
3753          * Now go back and clean up any unaligned mappings that
3754          * couldn't share pagetables.
3755          */
3756         if (!is_it_dism(hat, addr))
3757                 flags |= HAT_UNLOAD_UNLOCK;
3758         hat_unload(hat, addr, len, flags);
3759         XPV_ALLOW_MIGRATE();
3760 }
3761 
3762 
3763 /*
3764  * hat_reserve() does nothing
3765  */
3766 /*ARGSUSED*/
3767 void


4510         htable_t        *ht;
4511 
4512         XPV_DISALLOW_MIGRATE();
4513         /*
4514          * invalidate any left over mapping and decrement the htable valid count
4515          */
4516 #ifdef __xpv
4517         if (HYPERVISOR_update_va_mapping((uintptr_t)addr, 0,
4518             UVMF_INVLPG | UVMF_LOCAL))
4519                 panic("HYPERVISOR_update_va_mapping() failed");
4520 #else
4521         {
4522                 x86pte_t *pteptr;
4523 
4524                 pteptr = x86pte_mapin(mmu_btop(pte_pa),
4525                     (pte_pa & MMU_PAGEOFFSET) >> mmu.pte_size_shift, NULL);
4526                 if (mmu.pae_hat)
4527                         *pteptr = 0;
4528                 else
4529                         *(x86pte32_t *)pteptr = 0;
4530                 mmu_flush_tlb_kpage((uintptr_t)addr);
4531                 x86pte_mapout();
4532         }
4533 #endif
4534 
4535         ht = htable_getpte(kas.a_hat, ALIGN2PAGE(addr), NULL, NULL, 0);
4536         if (ht == NULL)
4537                 panic("hat_mempte_release(): invalid address");
4538         ASSERT(ht->ht_level == 0);
4539         HTABLE_DEC(ht->ht_valid_cnt);
4540         htable_release(ht);
4541         XPV_ALLOW_MIGRATE();
4542 }
4543 
4544 /*
4545  * Apply a temporary CPU private mapping to a page. We flush the TLB only
4546  * on this CPU, so this ought to have been called with preemption disabled.
4547  */
4548 void
4549 hat_mempte_remap(
4550         pfn_t           pfn,


4571         ASSERT(ht->ht_level == 0);
4572         ASSERT(ht->ht_valid_cnt > 0);
4573         ASSERT(ht->ht_pfn == mmu_btop(pte_pa));
4574         htable_release(ht);
4575 #endif
4576         XPV_DISALLOW_MIGRATE();
4577         pte = hati_mkpte(pfn, attr, 0, flags);
4578 #ifdef __xpv
4579         if (HYPERVISOR_update_va_mapping(va, pte, UVMF_INVLPG | UVMF_LOCAL))
4580                 panic("HYPERVISOR_update_va_mapping() failed");
4581 #else
4582         {
4583                 x86pte_t *pteptr;
4584 
4585                 pteptr = x86pte_mapin(mmu_btop(pte_pa),
4586                     (pte_pa & MMU_PAGEOFFSET) >> mmu.pte_size_shift, NULL);
4587                 if (mmu.pae_hat)
4588                         *(x86pte_t *)pteptr = pte;
4589                 else
4590                         *(x86pte32_t *)pteptr = (x86pte32_t)pte;
4591                 mmu_flush_tlb_kpage((uintptr_t)addr);
4592                 x86pte_mapout();
4593         }
4594 #endif
4595         XPV_ALLOW_MIGRATE();
4596 }
4597 
4598 
4599 
4600 /*
4601  * Hat locking functions
4602  * XXX - these two functions are currently being used by hatstats;
4603  *      they can be removed by using a per-as mutex for hatstats.
4604  */
4605 void
4606 hat_enter(hat_t *hat)
4607 {
4608         mutex_enter(&hat->hat_mutex);
4609 }
4610 
4611 void
4612 hat_exit(hat_t *hat)
4613 {
4614         mutex_exit(&hat->hat_mutex);
4615 }
4616 
4617 /*
4618  * HAT part of cpu initialization.
4619  */
4620 void
4621 hat_cpu_online(struct cpu *cpup)
4622 {
4623         if (cpup != CPU) {
4624                 x86pte_cpu_init(cpup);
4625                 hat_pcp_setup(cpup);
4626         }
4627         CPUSET_ATOMIC_ADD(khat_cpuset, cpup->cpu_id);
4628 }
4629 
4630 /*
4631  * HAT part of cpu deletion.
4632  * (currently, we only call this after the cpu is safely passivated.)
4633  */
4634 void
4635 hat_cpu_offline(struct cpu *cpup)
4636 {
4637         ASSERT(cpup != CPU);
4638 
4639         CPUSET_ATOMIC_DEL(khat_cpuset, cpup->cpu_id);
4640         hat_pcp_teardown(cpup);
4641         x86pte_cpu_fini(cpup);
4642 }
4643 
4644 /*
4645  * Function called after all CPUs are brought online.
4646  * Used to remove low address boot mappings.
4647  */
4648 void
4649 clear_boot_mappings(uintptr_t low, uintptr_t high)
4650 {
4651         uintptr_t vaddr = low;
4652         htable_t *ht = NULL;
4653         level_t level;
4654         uint_t entry;
4655         x86pte_t pte;
4656 
4657         /*
4658          * On the 1st CPU we can unload the prom mappings; basically we blow
4659          * away all virtual mappings under _userlimit.
4660          */


5067                 *pte_ma = base_ma + (entry << mmu.pte_size_shift);
5068         }
5069         XPV_ALLOW_MIGRATE();
5070 }
5071 
5072 void
5073 hat_release_mapping(hat_t *hat, caddr_t addr)
5074 {
5075         htable_t *ht;
5076 
5077         ASSERT(IS_P2ALIGNED((uintptr_t)addr, MMU_PAGESIZE));
5078         XPV_DISALLOW_MIGRATE();
5079         ht = htable_lookup(hat, (uintptr_t)addr, 0);
5080         ASSERT(ht != NULL);
5081         ASSERT(ht->ht_busy >= 2);
5082         htable_release(ht);
5083         htable_release(ht);
5084         XPV_ALLOW_MIGRATE();
5085 }
5086 #endif  /* __xpv */
5087 
5088 /*
5089  * Helper function to punch in a mapping that we need with the specified
5090  * attributes.
5091  */
5092 void
5093 hati_cpu_punchin(cpu_t *cpu, uintptr_t va, uint_t attrs)
5094 {
5095         int ret;
5096         pfn_t pfn;
5097         hat_t *cpu_hat = cpu->cpu_hat_info->hci_user_hat;
5098 
5099         ASSERT3S(kpti_enable, ==, 1);
5100         ASSERT3P(cpu_hat, !=, NULL);
5101         ASSERT3U(cpu_hat->hat_flags & HAT_PCP, ==, HAT_PCP);
5102         ASSERT3U(va & MMU_PAGEOFFSET, ==, 0);
5103 
5104         pfn = hat_getpfnum(kas.a_hat, (caddr_t)va);
5105         VERIFY3U(pfn, !=, PFN_INVALID);
5106 
5107         /*
5108          * We purposefully don't try to find the page_t. This means that this
5109          * will be marked PT_NOCONSIST; however, given that this is pretty much
5110  * a static mapping that we're using, we should be relatively OK.
5111          */
5112         attrs |= HAT_STORECACHING_OK;
5113         ret = hati_load_common(cpu_hat, va, NULL, attrs, 0, 0, pfn);
5114         VERIFY3S(ret, ==, 0);
5115 }