8956 Implement KPTI
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>


   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2014 by Delphix. All rights reserved.
  25  * Copyright 2015 Joyent, Inc.
  26  */
  27 
  28 #include <sys/types.h>
  29 #include <sys/sysmacros.h>
  30 #include <sys/kmem.h>
  31 #include <sys/atomic.h>
  32 #include <sys/bitmap.h>
  33 #include <sys/machparam.h>
  34 #include <sys/machsystm.h>
  35 #include <sys/mman.h>
  36 #include <sys/systm.h>
  37 #include <sys/cpuvar.h>
  38 #include <sys/thread.h>
  39 #include <sys/proc.h>
  40 #include <sys/cpu.h>
  41 #include <sys/kmem.h>
  42 #include <sys/disp.h>
  43 #include <sys/vmem.h>
  44 #include <sys/vmsystm.h>
  45 #include <sys/promif.h>


 120  * instead of putting them in a hat's htable cache.
 121  */
 122 uint32_t htable_dont_cache = 0;
 123 
 124 /*
 125  * Track the number of active pagetables, so we can know how many to reap
 126  */
 127 static uint32_t active_ptables = 0;
 128 
 129 #ifdef __xpv
 130 /*
 131  * Deal with hypervisor complications.
 132  */
 133 void
 134 xen_flush_va(caddr_t va)
 135 {
 136         struct mmuext_op t;
 137         uint_t count;
 138 
 139         if (IN_XPV_PANIC()) {
 140                 mmu_tlbflush_entry((caddr_t)va);
 141         } else {
 142                 t.cmd = MMUEXT_INVLPG_LOCAL;
 143                 t.arg1.linear_addr = (uintptr_t)va;
 144                 if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
 145                         panic("HYPERVISOR_mmuext_op() failed");
 146                 ASSERT(count == 1);
 147         }
 148 }
 149 
 150 void
 151 xen_gflush_va(caddr_t va, cpuset_t cpus)
 152 {
 153         struct mmuext_op t;
 154         uint_t count;
 155 
 156         if (IN_XPV_PANIC()) {
 157                 mmu_tlbflush_entry((caddr_t)va);
 158                 return;
 159         }
 160 
 161         t.cmd = MMUEXT_INVLPG_MULTI;
 162         t.arg1.linear_addr = (uintptr_t)va;
 163         /*LINTED: constant in conditional context*/
 164         set_xen_guest_handle(t.arg2.vcpumask, &cpus);
 165         if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
 166                 panic("HYPERVISOR_mmuext_op() failed");
 167         ASSERT(count == 1);
 168 }
 169 
 170 void
 171 xen_flush_tlb()
 172 {
 173         struct mmuext_op t;
 174         uint_t count;
 175 
 176         if (IN_XPV_PANIC()) {
 177                 xpv_panic_reload_cr3();

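xen_flush_va() above invalidates a single VA on the local vCPU only, while xen_gflush_va() takes a cpuset and asks the hypervisor for MMUEXT_INVLPG_MULTI. As a rough illustration of how the two fit together, a hypothetical caller (the wrapper name and its arguments are assumptions, not part of this file) might look like:

static void
demap_one_va(caddr_t va, cpuset_t cpus_using, boolean_t local_only)
{
        if (local_only) {
                /* invalidate the TLB entry on this vCPU only */
                xen_flush_va(va);
        } else {
                /* have the hypervisor invalidate it on every vCPU in the set */
                xen_gflush_va(va, cpus_using);
        }
}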

 604         atomic_inc_32(&htable_dont_cache);
 605         for (pass = 0; pass <= passes && stolen < cnt; ++pass) {
 606                 threshold = pass * mmu.ptes_per_table / htable_steal_passes;
 607 
 608                 mutex_enter(&hat_list_lock);
 609 
 610                 /* skip the first hat (kernel) */
 611                 hat = kas.a_hat->hat_next;
 612                 for (;;) {
 613                         /*
 614                          * Skip any hat that is already being stolen from.
 615                          *
 616                          * We skip SHARED hats, as these are dummy
 617                          * hats that host ISM shared page tables.
 618                          *
 619                          * We also skip if HAT_FREEING because hat_pte_unmap()
 620                          * won't zero out the PTEs. That would lead to hitting
 621                          * stale PTEs either here or under hat_unload() when we
 622                          * steal and unload the same page table in competing
 623                          * threads.



 624                          */
 625                         while (hat != NULL &&
 626                             (hat->hat_flags &
 627                             (HAT_VICTIM | HAT_SHARED | HAT_FREEING)) != 0)
 628                                 hat = hat->hat_next;

 629 
 630                         if (hat == NULL)
 631                                 break;
 632 
 633                         /*
 634                          * Mark the HAT as a stealing victim so that it is
 635                          * not freed from under us, e.g. in as_free()
 636                          */
 637                         hat->hat_flags |= HAT_VICTIM;
 638                         mutex_exit(&hat_list_lock);
 639 
 640                         /*
 641                          * Take any htables from the hat's cached "free" list.
 642                          */
 643                         hat_enter(hat);
 644                         while ((ht = hat->hat_ht_cached) != NULL &&
 645                             stolen < cnt) {
 646                                 hat->hat_ht_cached = ht->ht_next;
 647                                 ht->ht_next = list;
 648                                 list = ht;


 651                         hat_exit(hat);
 652 
 653                         /*
 654                          * Don't steal active htables on first pass.
 655                          */
 656                         if (pass != 0 && (stolen < cnt))
 657                                 htable_steal_active(hat, cnt, threshold,
 658                                     &stolen, &list);
 659 
 660                         /*
 661                          * do synchronous teardown for the reap case so that
 662                          * we can forget hat; at this time, hat is
 663                          * guaranteed to be around because HAT_VICTIM is set
 664                          * (see htable_free() for similar code)
 665                          */
 666                         for (ht = list; (ht) && (reap); ht = ht->ht_next) {
 667                                 if (ht->ht_hat == NULL)
 668                                         continue;
 669                                 ASSERT(ht->ht_hat == hat);
 670 #if defined(__xpv) && defined(__amd64)
 671                                 if (!(ht->ht_flags & HTABLE_VLP) &&
 672                                     ht->ht_level == mmu.max_level) {
 673                                         ptable_free(hat->hat_user_ptable);
 674                                         hat->hat_user_ptable = PFN_INVALID;
 675                                 }
 676 #endif
 677                                 /*
 678                                  * forget the hat
 679                                  */
 680                                 ht->ht_hat = NULL;
 681                         }
 682 
 683                         mutex_enter(&hat_list_lock);
 684 
 685                         /*
 686                          * Are we finished?
 687                          */
 688                         if (stolen == cnt) {
 689                                 /*
 690                                  * Try to spread the pain of stealing by moving
 691                                  * the victim HAT to the end of the HAT list.
 692                                  */


 762         atomic_dec_32(&htable_dont_cache);
 763 
 764         /*
 765          * Free up excess reserves
 766          */
 767         htable_adjust_reserve();
 768         hment_adjust_reserve();
 769 }
 770 
 771 /*
 772  * Allocate an htable, stealing one or using the reserve if necessary
 773  */
 774 static htable_t *
 775 htable_alloc(
 776         hat_t           *hat,
 777         uintptr_t       vaddr,
 778         level_t         level,
 779         htable_t        *shared)
 780 {
 781         htable_t        *ht = NULL;
 782         uint_t          is_vlp;
 783         uint_t          is_bare = 0;
 784         uint_t          need_to_zero = 1;
 785         int             kmflags = (can_steal_post_boot ? KM_NOSLEEP : KM_SLEEP);
 786 
 787         if (level < 0 || level > TOP_LEVEL(hat))
 788                 panic("htable_alloc(): level %d out of range\n", level);
 789 
 790         is_vlp = (hat->hat_flags & HAT_VLP) && level == VLP_LEVEL;
 791         if (is_vlp || shared != NULL)

 792                 is_bare = 1;
 793 
 794         /*
 795          * First reuse a cached htable from the hat_ht_cached field, this
 796          * avoids unnecessary trips through kmem/page allocators.
 797          */
 798         if (hat->hat_ht_cached != NULL && !is_bare) {
 799                 hat_enter(hat);
 800                 ht = hat->hat_ht_cached;
 801                 if (ht != NULL) {
 802                         hat->hat_ht_cached = ht->ht_next;
 803                         need_to_zero = 0;
 804                         /* XX64 ASSERT() they're all zero somehow */
 805                         ASSERT(ht->ht_pfn != PFN_INVALID);
 806                 }
 807                 hat_exit(hat);
 808         }
 809 
 810         if (ht == NULL) {
 811                 /*


 913         /*
 914          * Shared page tables have all entries locked and entries may not
 915          * be added or deleted.
 916          */
 917         ht->ht_flags = 0;
 918         if (shared != NULL) {
 919                 ASSERT(shared->ht_valid_cnt > 0);
 920                 ht->ht_flags |= HTABLE_SHARED_PFN;
 921                 ht->ht_pfn = shared->ht_pfn;
 922                 ht->ht_lock_cnt = 0;
 923                 ht->ht_valid_cnt = 0;                /* updated in hat_share() */
 924                 ht->ht_shares = shared;
 925                 need_to_zero = 0;
 926         } else {
 927                 ht->ht_shares = NULL;
 928                 ht->ht_lock_cnt = 0;
 929                 ht->ht_valid_cnt = 0;
 930         }
 931 
 932         /*
 933          * setup flags, etc. for VLP htables
 934          */
 935         if (is_vlp) {
 936                 ht->ht_flags |= HTABLE_VLP;
 937                 ASSERT(ht->ht_pfn == PFN_INVALID);
 938                 need_to_zero = 0;
 939         }
 940 
 941         /*
 942          * fill in the htable
 943          */
 944         ht->ht_hat = hat;
 945         ht->ht_parent = NULL;
 946         ht->ht_vaddr = vaddr;
 947         ht->ht_level = level;
 948         ht->ht_busy = 1;
 949         ht->ht_next = NULL;
 950         ht->ht_prev = NULL;
 951 
 952         /*
 953          * Zero out any freshly allocated page table
 954          */
 955         if (need_to_zero)
 956                 x86pte_zero(ht, 0, mmu.ptes_per_table);

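The hat_ht_cached path in htable_alloc() above is a classic per-owner free list: a reused table skips both the allocator and the x86pte_zero() pass, because cached tables are kept zeroed. A self-contained sketch of the same pattern in plain user-level C (generic names, not the kernel code):

#include <stdlib.h>
#include <string.h>
#include <pthread.h>

struct table {
        struct table *next;
        char page[4096];
};

static struct table *cached;            /* per-owner cache of free tables */
static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

static struct table *
table_alloc(void)
{
        struct table *t;

        pthread_mutex_lock(&cache_lock);
        t = cached;                     /* first try the cache... */
        if (t != NULL)
                cached = t->next;
        pthread_mutex_unlock(&cache_lock);

        if (t == NULL) {                /* ...only then the real allocator */
                t = malloc(sizeof (*t));
                if (t != NULL)
                        memset(t->page, 0, sizeof (t->page));
        }
        return (t);                     /* cached tables are already zero */
}

static void
table_free(struct table *t)
{
        memset(t->page, 0, sizeof (t->page));   /* preserve the invariant */
        pthread_mutex_lock(&cache_lock);
        t->next = cached;
        cached = t;
        pthread_mutex_unlock(&cache_lock);
}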

 967 }
 968 
 969 /*
 970  * Free up an htable, either to a hat's cached list, the reserves or
 971  * back to kmem.
 972  */
 973 static void
 974 htable_free(htable_t *ht)
 975 {
 976         hat_t *hat = ht->ht_hat;
 977 
 978         /*
 979          * If the process isn't exiting, cache the free htable in the hat
 980          * structure. We always do this for the boot time reserve. We don't
 981          * do this if the hat is exiting or we are stealing/reaping htables.
 982          */
 983         if (hat != NULL &&
 984             !(ht->ht_flags & HTABLE_SHARED_PFN) &&
 985             (use_boot_reserve ||
 986             (!(hat->hat_flags & HAT_FREEING) && !htable_dont_cache))) {
 987                 ASSERT((ht->ht_flags & HTABLE_VLP) == 0);
 988                 ASSERT(ht->ht_pfn != PFN_INVALID);
 989                 hat_enter(hat);
 990                 ht->ht_next = hat->hat_ht_cached;
 991                 hat->hat_ht_cached = ht;
 992                 hat_exit(hat);
 993                 return;
 994         }
 995 
 996         /*
 997          * If we have a hardware page table, free it.
 998          * We don't free page tables that are accessed by sharing.
 999          */
1000         if (ht->ht_flags & HTABLE_SHARED_PFN) {
1001                 ASSERT(ht->ht_pfn != PFN_INVALID);
1002         } else if (!(ht->ht_flags & HTABLE_VLP)) {
1003                 ptable_free(ht->ht_pfn);
1004 #if defined(__amd64) && defined(__xpv)
1005                 if (ht->ht_level == mmu.max_level && hat != NULL) {
1006                         ptable_free(hat->hat_user_ptable);
1007                         hat->hat_user_ptable = PFN_INVALID;
1008                 }
1009 #endif
1010         }
1011         ht->ht_pfn = PFN_INVALID;
1012 
1013         /*
1014          * Free it or put into reserves.
1015          */
1016         if (USE_HAT_RESERVES() || htable_reserve_cnt < htable_reserve_amount) {
1017                 htable_put_reserve(ht);
1018         } else {
1019                 kmem_cache_free(htable_cache, ht);
1020                 htable_adjust_reserve();
1021         }
1022 }


1094         x86pte_t        found;
1095         hat_t           *hat = old->ht_hat;
1096 
1097         ASSERT(higher->ht_busy > 0);
1098         ASSERT(higher->ht_valid_cnt > 0);
1099         ASSERT(old->ht_valid_cnt == 0);
1100         found = x86pte_cas(higher, entry, expect, 0);
1101 #ifdef __xpv
1102         /*
1103          * This is weird, but Xen apparently automatically unlinks empty
1104          * pagetables from the upper page table. So allow PTP to be 0 already.
1105          */
1106         if (found != expect && found != 0)
1107 #else
1108         if (found != expect)
1109 #endif
1110                 panic("Bad PTP found=" FMT_PTE ", expected=" FMT_PTE,
1111                     found, expect);
1112 
1113         /*
1114          * When a top level VLP page table entry changes, we must issue
1115          * a reload of cr3 on all processors.
1116          *
1117          * If we don't need to do that, then we still have to INVLPG against
1118          * an address covered by the inner page table, as the latest processors
1119          * have TLB-like caches for non-leaf page table entries.
1120          */
1121         if (!(hat->hat_flags & HAT_FREEING)) {
1122                 hat_tlb_inval(hat, (higher->ht_flags & HTABLE_VLP) ?
1123                     DEMAP_ALL_ADDR : old->ht_vaddr);
1124         }
1125 
1126         HTABLE_DEC(higher->ht_valid_cnt);
1127 }
1128 
1129 /*
1130  * Link an entry for a new table at vaddr and level into the existing table
1131  * one level higher. We are always holding the HASH_ENTER() when doing this.
1132  */
1133 static void
1134 link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr)
1135 {
1136         uint_t          entry = htable_va2entry(vaddr, higher);
1137         x86pte_t        newptp = MAKEPTP(new->ht_pfn, new->ht_level);
1138         x86pte_t        found;
1139 
1140         ASSERT(higher->ht_busy > 0);
1141 
1142         ASSERT(new->ht_level != mmu.max_level);
1143 
1144         HTABLE_INC(higher->ht_valid_cnt);
1145 
1146         found = x86pte_cas(higher, entry, 0, newptp);
1147         if ((found & ~PT_REF) != 0)
1148                 panic("HAT: ptp not 0, found=" FMT_PTE, found);
1149 
1150         /*
1151          * When any top level VLP page table entry changes, we must issue
1152          * a reload of cr3 on all processors using it.

1153          * We also need to do this for the kernel hat on PAE 32 bit kernel.
1154          */
1155         if (
1156 #ifdef __i386
1157             (higher->ht_hat == kas.a_hat && higher->ht_level == VLP_LEVEL) ||

1158 #endif
1159             (higher->ht_flags & HTABLE_VLP))
1160                 hat_tlb_inval(higher->ht_hat, DEMAP_ALL_ADDR);
1161 }
1162 
1163 /*
1164  * Release a hold on an htable. If this is the last use and the pagetable
1165  * is empty we may want to free it, then recursively look at the pagetable
1166  * above it. The recursion is handled by the outer while() loop.
1167  *
1168  * On the metal, during process exit, we don't bother unlinking the tables from
1169  * upper level pagetables. They are instead handled in bulk by hat_free_end().
1170  * We can't do this on the hypervisor as we need the page table to be
1171  * implicitly unpinned before it goes to the free page lists. This can't
1172  * happen unless we fully unlink it from the page table hierarchy.
1173  */
1174 void
1175 htable_release(htable_t *ht)
1176 {
1177         uint_t          hashval;
1178         htable_t        *shared;
1179         htable_t        *higher;


1278 /*
1279  * Find the htable for the pagetable at the given level for the given address.
1280  * If found, it acquires a hold that eventually needs to be htable_release()d.
1281  */
1282 htable_t *
1283 htable_lookup(hat_t *hat, uintptr_t vaddr, level_t level)
1284 {
1285         uintptr_t       base;
1286         uint_t          hashval;
1287         htable_t        *ht = NULL;
1288 
1289         ASSERT(level >= 0);
1290         ASSERT(level <= TOP_LEVEL(hat));
1291 
1292         if (level == TOP_LEVEL(hat)) {
1293 #if defined(__amd64)
1294                 /*
1295                  * 32 bit address spaces on 64 bit kernels need to check
1296                  * for overflow of the 32 bit address space
1297                  */
1298                 if ((hat->hat_flags & HAT_VLP) && vaddr >= ((uint64_t)1 << 32))

1299                         return (NULL);
1300 #endif
1301                 base = 0;
1302         } else {
1303                 base = vaddr & LEVEL_MASK(level + 1);
1304         }
1305 
1306         hashval = HTABLE_HASH(hat, base, level);
1307         HTABLE_ENTER(hashval);
1308         for (ht = hat->hat_ht_hash[hashval]; ht; ht = ht->ht_next) {
1309                 if (ht->ht_hat == hat &&
1310                     ht->ht_vaddr == base &&
1311                     ht->ht_level == level)
1312                         break;
1313         }
1314         if (ht)
1315                 ++ht->ht_busy;
1316 
1317         HTABLE_EXIT(hashval);
1318         return (ht);

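Per the block comment, a successful htable_lookup() returns with ht_busy raised, so every hit must be paired with an htable_release(). A minimal sketch of that pattern using only declarations from this file (the helper itself is hypothetical):

/* hypothetical caller: is there a level-0 pagetable covering va? */
static boolean_t
has_l0_table(hat_t *hat, uintptr_t va)
{
        htable_t *ht;

        ht = htable_lookup(hat, va, 0);
        if (ht == NULL)
                return (B_FALSE);       /* no pagetable at this level */

        /* ...inspect ht; the hold keeps it from being freed... */

        htable_release(ht);             /* drop the ht_busy hold */
        return (B_TRUE);
}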

1926 
1927         ASSERT(mmu.pae_hat != 0);
1928         for (;;) {
1929                 t = p[0];
1930                 t |= (uint64_t)p[1] << 32;
1931                 if ((t & 0xffffffff) == p[0])
1932                         return (t);
1933         }
1934 }
1935 #endif /* __i386 */
1936 
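The #ifdef __i386 loop just above reads a 64-bit PAE PTE with two 32-bit loads and retries until the low word still matches what it started from, so a concurrent writer cannot hand it a torn value. The same idea in a self-contained form (generic names; volatile stands in for the live pagetable):

#include <stdint.h>

/* read a 64-bit value another CPU may be updating, using 32-bit loads */
static uint64_t
read64_untorn(volatile uint32_t *p)
{
        uint64_t t;

        for (;;) {
                t = p[0];                       /* low word first */
                t |= (uint64_t)p[1] << 32;      /* then the high word */
                if ((t & 0xffffffff) == p[0])   /* low word unchanged? */
                        return (t);             /* then the read wasn't torn */
        }
}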
1937 /*
1938  * Disable preemption and establish a mapping to the pagetable with the
1939  * given pfn. This is optimized for the case where it's the same
1940  * pfn we last referenced from this CPU.
1941  */
1942 static x86pte_t *
1943 x86pte_access_pagetable(htable_t *ht, uint_t index)
1944 {
1945         /*
1946          * VLP pagetables are contained in the hat_t
1947          */
1948         if (ht->ht_flags & HTABLE_VLP)
1949                 return (PT_INDEX_PTR(ht->ht_hat->hat_vlp_ptes, index));


1950         return (x86pte_mapin(ht->ht_pfn, index, ht));
1951 }
1952 
1953 /*
1954  * map the given pfn into the page table window.
1955  */
1956 /*ARGSUSED*/
1957 x86pte_t *
1958 x86pte_mapin(pfn_t pfn, uint_t index, htable_t *ht)
1959 {
1960         x86pte_t *pteptr;
1961         x86pte_t pte = 0;
1962         x86pte_t newpte;
1963         int x;
1964 
1965         ASSERT(pfn != PFN_INVALID);
1966 
1967         if (!khat_running) {
1968                 caddr_t va = kbm_remap_window(pfn_to_pa(pfn), 1);
1969                 return (PT_INDEX_PTR(va, index));
1970         }
1971 
1972         /*
1973          * If kpm is available, use it.
1974          */
1975         if (kpm_vbase)
1976                 return (PT_INDEX_PTR(hat_kpm_pfn2va(pfn), index));
1977 
1978         /*
1979          * Disable preemption and grab the CPU's hci_mutex
1980          */
1981         kpreempt_disable();

1982         ASSERT(CPU->cpu_hat_info != NULL);


1983         mutex_enter(&CPU->cpu_hat_info->hci_mutex);
1984         x = PWIN_TABLE(CPU->cpu_id);
1985         pteptr = (x86pte_t *)PWIN_PTE_VA(x);
1986 #ifndef __xpv
1987         if (mmu.pae_hat)
1988                 pte = *pteptr;
1989         else
1990                 pte = *(x86pte32_t *)pteptr;
1991 #endif
1992 
1993         newpte = MAKEPTE(pfn, 0) | mmu.pt_global | mmu.pt_nx;
1994 
1995         /*
1996          * For hardware we can use a writable mapping.
1997          */
1998 #ifdef __xpv
1999         if (IN_XPV_PANIC())
2000 #endif
2001                 newpte |= PT_WRITABLE;
2002 
2003         if (!PTE_EQUIV(newpte, pte)) {
2004 
2005 #ifdef __xpv
2006                 if (!IN_XPV_PANIC()) {
2007                         xen_map(newpte, PWIN_VA(x));
2008                 } else
2009 #endif
2010                 {
2011                         XPV_ALLOW_PAGETABLE_UPDATES();
2012                         if (mmu.pae_hat)
2013                                 *pteptr = newpte;
2014                         else
2015                                 *(x86pte32_t *)pteptr = newpte;
2016                         XPV_DISALLOW_PAGETABLE_UPDATES();
2017                         mmu_tlbflush_entry((caddr_t)(PWIN_VA(x)));
2018                 }
2019         }
2020         return (PT_INDEX_PTR(PWIN_VA(x), index));
2021 }
2022 
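x86pte_mapin() returns a pointer either into kpm or into the per-CPU pagetable window (taken with preemption disabled and hci_mutex held), and x86pte_mapout() undoes whichever was used. A minimal sketch of the pairing, assuming the declarations above; the helper itself is hypothetical:

/* copy out one PTE from a pagetable page, then drop the window */
static x86pte_t
pte_peek(htable_t *ht, uint_t entry)
{
        x86pte_t *ptep;
        x86pte_t pte;

        ptep = x86pte_mapin(ht->ht_pfn, entry, ht);
        pte = *ptep;            /* read it while the mapping is live */
        x86pte_mapout();        /* no-op under kpm, else unmaps the window */
        return (pte);
}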
2023 /*
2024  * Release access to a page table.
2025  */
2026 static void
2027 x86pte_release_pagetable(htable_t *ht)
2028 {
2029         /*
2030          * nothing to do for VLP htables
2031          */
2032         if (ht->ht_flags & HTABLE_VLP)
2033                 return;
2034 
2035         x86pte_mapout();
2036 }
2037 
2038 void
2039 x86pte_mapout(void)
2040 {
2041         if (kpm_vbase != NULL || !khat_running)
2042                 return;
2043 
2044         /*
2045          * Drop the CPU's hci_mutex and restore preemption.
2046          */
2047 #ifdef __xpv
2048         if (!IN_XPV_PANIC()) {
2049                 uintptr_t va;
2050 
2051                 /*
2052                  * We need to always clear the mapping in case a page


2113          * Install the new PTE. If remapping the same PFN, then
2114          * copy existing REF/MOD bits to new mapping.
2115          */
2116         do {
2117                 prev = GET_PTE(ptep);
2118                 n = new;
2119                 if (PTE_ISVALID(n) && (prev & pfn_mask) == (new & pfn_mask))
2120                         n |= prev & (PT_REF | PT_MOD);
2121 
2122                 /*
2123                  * Another thread may have installed this mapping already,
2124                  * flush the local TLB and be done.
2125                  */
2126                 if (prev == n) {
2127                         old = new;
2128 #ifdef __xpv
2129                         if (!IN_XPV_PANIC())
2130                                 xen_flush_va((caddr_t)addr);
2131                         else
2132 #endif
2133                                 mmu_tlbflush_entry((caddr_t)addr);
2134                         goto done;
2135                 }
2136 
2137                 /*
2138                  * Detect a collision: installing a large page mapping
2139                  * where a lower page table already exists.
2140                  */
2141                 if (l > 0 && (prev & PT_VALID) && !(prev & PT_PAGESIZE)) {
2142                         old = LPAGE_ERROR;
2143                         goto done;
2144                 }
2145 
2146                 XPV_ALLOW_PAGETABLE_UPDATES();
2147                 old = CAS_PTE(ptep, prev, n);
2148                 XPV_DISALLOW_PAGETABLE_UPDATES();
2149         } while (old != prev);
2150 
2151         /*
2152          * Do a TLB demap if needed, i.e. the old pte was valid.
2153          *


2172  * Atomic compare and swap of a page table entry. No TLB invalidates are done.
2173  * This is used for links between pagetables of different levels.
2174  * Note we always create these links with dirty/access set, so they should
2175  * never change.
2176  */
2177 x86pte_t
2178 x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old, x86pte_t new)
2179 {
2180         x86pte_t        pte;
2181         x86pte_t        *ptep;
2182 #ifdef __xpv
2183         /*
2184          * We can't use writable pagetables for upper level tables, so fake it.
2185          */
2186         mmu_update_t t[2];
2187         int cnt = 1;
2188         int count;
2189         maddr_t ma;
2190 
2191         if (!IN_XPV_PANIC()) {
2192                 ASSERT(!(ht->ht_flags & HTABLE_VLP));    /* no VLP yet */
2193                 ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry));
2194                 t[0].ptr = ma | MMU_NORMAL_PT_UPDATE;
2195                 t[0].val = new;
2196 
2197 #if defined(__amd64)
2198                 /*
2199                  * On the 64-bit hypervisor we need to maintain the user mode
2200                  * top page table too.
2201                  */
2202                 if (ht->ht_level == mmu.max_level && ht->ht_hat != kas.a_hat) {
2203                         ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(
2204                             ht->ht_hat->hat_user_ptable), entry));
2205                         t[1].ptr = ma | MMU_NORMAL_PT_UPDATE;
2206                         t[1].val = new;
2207                         ++cnt;
2208                 }
2209 #endif  /* __amd64 */
2210 
2211                 if (HYPERVISOR_mmu_update(t, cnt, &count, DOMID_SELF))
2212                         panic("HYPERVISOR_mmu_update() failed");


2329                 if ((expect & (PT_WRITABLE | PT_MOD)) == PT_WRITABLE &&
2330                     (new & (PT_WRITABLE | PT_MOD)) == 0 &&
2331                     (GET_PTE(ptep) & PT_MOD) != 0) {
2332                         do {
2333                                 found = GET_PTE(ptep);
2334                                 XPV_ALLOW_PAGETABLE_UPDATES();
2335                                 found =
2336                                     CAS_PTE(ptep, found, found | PT_WRITABLE);
2337                                 XPV_DISALLOW_PAGETABLE_UPDATES();
2338                         } while ((found & PT_WRITABLE) == 0);
2339                 }
2340         }
2341         x86pte_release_pagetable(ht);
2342         return (found);
2343 }
2344 
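The PTE install path above follows the standard compare-and-swap retry shape: reload the current entry, build the candidate (folding in REF/MOD when the PFN is unchanged), and retry the CAS until it succeeds against the value the candidate was based on. A self-contained illustration of that shape with C11 atomics (generic; this is not the kernel's CAS_PTE):

#include <stdatomic.h>
#include <stdint.h>

#define REF_MOD_BITS    0x60ULL         /* stand-ins for PT_REF | PT_MOD */

/* install "new" into *slot, carrying forward REF/MOD from the old value */
static uint64_t
install_entry(_Atomic uint64_t *slot, uint64_t new)
{
        uint64_t prev, n;

        do {
                prev = atomic_load(slot);
                n = new | (prev & REF_MOD_BITS);
                if (prev == n)
                        return (prev);  /* another thread already installed it */
        } while (!atomic_compare_exchange_weak(slot, &prev, n));

        return (prev);                  /* the value we replaced */
}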
2345 #ifndef __xpv
2346 /*
2347  * Copy page tables - this is just a little more complicated than the
2348  * previous routines. Note that it's also not atomic! It also is never
2349  * used for VLP pagetables.
2350  */
2351 void
2352 x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
2353 {
2354         caddr_t src_va;
2355         caddr_t dst_va;
2356         size_t size;
2357         x86pte_t *pteptr;
2358         x86pte_t pte;
2359 
2360         ASSERT(khat_running);
2361         ASSERT(!(dest->ht_flags & HTABLE_VLP));
2362         ASSERT(!(src->ht_flags & HTABLE_VLP));
2363         ASSERT(!(src->ht_flags & HTABLE_SHARED_PFN));
2364         ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
2365 
2366         /*
2367          * Acquire access to the CPU pagetable windows for the dest and source.
2368          */
2369         dst_va = (caddr_t)x86pte_access_pagetable(dest, entry);
2370         if (kpm_vbase) {
2371                 src_va = (caddr_t)
2372                     PT_INDEX_PTR(hat_kpm_pfn2va(src->ht_pfn), entry);
2373         } else {
2374                 uint_t x = PWIN_SRC(CPU->cpu_id);
2375 


2376                 /*
2377                  * Finish defining the src pagetable mapping
2378                  */
2379                 src_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry);
2380                 pte = MAKEPTE(src->ht_pfn, 0) | mmu.pt_global | mmu.pt_nx;
2381                 pteptr = (x86pte_t *)PWIN_PTE_VA(x);
2382                 if (mmu.pae_hat)
2383                         *pteptr = pte;
2384                 else
2385                         *(x86pte32_t *)pteptr = pte;
2386                 mmu_tlbflush_entry((caddr_t)(PWIN_VA(x)));
2387         }
2388 
2389         /*
2390          * now do the copy
2391          */
2392         size = count << mmu.pte_size_shift;
2393         bcopy(src_va, dst_va, size);
2394 
2395         x86pte_release_pagetable(dest);
2396 }
2397 
2398 #else /* __xpv */
2399 
2400 /*
2401  * The hypervisor only supports writable pagetables at level 0, so we have
2402  * to install these 1 by 1 the slow way.
2403  */
2404 void
2405 x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
2406 {


2433 }
2434 #endif /* __xpv */
2435 
2436 /*
2437  * Zero page table entries - Note this doesn't use atomic stores!
2438  */
2439 static void
2440 x86pte_zero(htable_t *dest, uint_t entry, uint_t count)
2441 {
2442         caddr_t dst_va;
2443         size_t size;
2444 #ifdef __xpv
2445         int x;
2446         x86pte_t newpte;
2447 #endif
2448 
2449         /*
2450          * Map in the page table to be zeroed.
2451          */
2452         ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
2453         ASSERT(!(dest->ht_flags & HTABLE_VLP));
2454 
2455         /*
2456          * On the hypervisor we don't use x86pte_access_pagetable() since
2457          * in this case the page is not pinned yet.
2458          */
2459 #ifdef __xpv
2460         if (kpm_vbase == NULL) {
2461                 kpreempt_disable();
2462                 ASSERT(CPU->cpu_hat_info != NULL);
2463                 mutex_enter(&CPU->cpu_hat_info->hci_mutex);
2464                 x = PWIN_TABLE(CPU->cpu_id);
2465                 newpte = MAKEPTE(dest->ht_pfn, 0) | PT_WRITABLE;
2466                 xen_map(newpte, PWIN_VA(x));
2467                 dst_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry);
2468         } else
2469 #endif
2470                 dst_va = (caddr_t)x86pte_access_pagetable(dest, entry);
2471 
2472         size = count << mmu.pte_size_shift;
2473         ASSERT(size > BLOCKZEROALIGN);


2487 #endif
2488                 x86pte_release_pagetable(dest);
2489 }
2490 
2491 /*
2492  * Called to ensure that all pagetables are in the system dump
2493  */
2494 void
2495 hat_dump(void)
2496 {
2497         hat_t *hat;
2498         uint_t h;
2499         htable_t *ht;
2500 
2501         /*
2502          * Dump all page tables
2503          */
2504         for (hat = kas.a_hat; hat != NULL; hat = hat->hat_next) {
2505                 for (h = 0; h < hat->hat_num_hash; ++h) {
2506                         for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
2507                                 if ((ht->ht_flags & HTABLE_VLP) == 0)
2508                                         dump_page(ht->ht_pfn);
2509                         }
2510                 }
2511         }
2512 }


   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2014 by Delphix. All rights reserved.
  25  * Copyright 2018 Joyent, Inc.
  26  */
  27 
  28 #include <sys/types.h>
  29 #include <sys/sysmacros.h>
  30 #include <sys/kmem.h>
  31 #include <sys/atomic.h>
  32 #include <sys/bitmap.h>
  33 #include <sys/machparam.h>
  34 #include <sys/machsystm.h>
  35 #include <sys/mman.h>
  36 #include <sys/systm.h>
  37 #include <sys/cpuvar.h>
  38 #include <sys/thread.h>
  39 #include <sys/proc.h>
  40 #include <sys/cpu.h>
  41 #include <sys/kmem.h>
  42 #include <sys/disp.h>
  43 #include <sys/vmem.h>
  44 #include <sys/vmsystm.h>
  45 #include <sys/promif.h>


 120  * instead of putting them in a hat's htable cache.
 121  */
 122 uint32_t htable_dont_cache = 0;
 123 
 124 /*
 125  * Track the number of active pagetables, so we can know how many to reap
 126  */
 127 static uint32_t active_ptables = 0;
 128 
 129 #ifdef __xpv
 130 /*
 131  * Deal with hypervisor complications.
 132  */
 133 void
 134 xen_flush_va(caddr_t va)
 135 {
 136         struct mmuext_op t;
 137         uint_t count;
 138 
 139         if (IN_XPV_PANIC()) {
 140                 mmu_flush_tlb_page((uintptr_t)va);
 141         } else {
 142                 t.cmd = MMUEXT_INVLPG_LOCAL;
 143                 t.arg1.linear_addr = (uintptr_t)va;
 144                 if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
 145                         panic("HYPERVISOR_mmuext_op() failed");
 146                 ASSERT(count == 1);
 147         }
 148 }
 149 
 150 void
 151 xen_gflush_va(caddr_t va, cpuset_t cpus)
 152 {
 153         struct mmuext_op t;
 154         uint_t count;
 155 
 156         if (IN_XPV_PANIC()) {
 157                 mmu_flush_tlb_page((uintptr_t)va);
 158                 return;
 159         }
 160 
 161         t.cmd = MMUEXT_INVLPG_MULTI;
 162         t.arg1.linear_addr = (uintptr_t)va;
 163         /*LINTED: constant in conditional context*/
 164         set_xen_guest_handle(t.arg2.vcpumask, &cpus);
 165         if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
 166                 panic("HYPERVISOR_mmuext_op() failed");
 167         ASSERT(count == 1);
 168 }
 169 
 170 void
 171 xen_flush_tlb()
 172 {
 173         struct mmuext_op t;
 174         uint_t count;
 175 
 176         if (IN_XPV_PANIC()) {
 177                 xpv_panic_reload_cr3();


 604         atomic_inc_32(&htable_dont_cache);
 605         for (pass = 0; pass <= passes && stolen < cnt; ++pass) {
 606                 threshold = pass * mmu.ptes_per_table / htable_steal_passes;
 607 
 608                 mutex_enter(&hat_list_lock);
 609 
 610                 /* skip the first hat (kernel) */
 611                 hat = kas.a_hat->hat_next;
 612                 for (;;) {
 613                         /*
 614                          * Skip any hat that is already being stolen from.
 615                          *
 616                          * We skip SHARED hats, as these are dummy
 617                          * hats that host ISM shared page tables.
 618                          *
 619                          * We also skip if HAT_FREEING because hat_pte_unmap()
 620                          * won't zero out the PTEs. That would lead to hitting
 621                          * stale PTEs either here or under hat_unload() when we
 622                          * steal and unload the same page table in competing
 623                          * threads.
 624                          *
 625                          * We skip HATs that belong to CPUs, to make our lives
 626                          * simpler.
 627                          */
 628                         while (hat != NULL && (hat->hat_flags &
 629                             (HAT_VICTIM | HAT_SHARED | HAT_FREEING |
 630                             HAT_PCP)) != 0) {
 631                                 hat = hat->hat_next;
 632                         }
 633 
 634                         if (hat == NULL)
 635                                 break;
 636 
 637                         /*
 638                          * Mark the HAT as a stealing victim so that it is
 639                          * not freed from under us, e.g. in as_free()
 640                          */
 641                         hat->hat_flags |= HAT_VICTIM;
 642                         mutex_exit(&hat_list_lock);
 643 
 644                         /*
 645                          * Take any htables from the hat's cached "free" list.
 646                          */
 647                         hat_enter(hat);
 648                         while ((ht = hat->hat_ht_cached) != NULL &&
 649                             stolen < cnt) {
 650                                 hat->hat_ht_cached = ht->ht_next;
 651                                 ht->ht_next = list;
 652                                 list = ht;


 655                         hat_exit(hat);
 656 
 657                         /*
 658                          * Don't steal active htables on first pass.
 659                          */
 660                         if (pass != 0 && (stolen < cnt))
 661                                 htable_steal_active(hat, cnt, threshold,
 662                                     &stolen, &list);
 663 
 664                         /*
 665                          * do synchronous teardown for the reap case so that
 666                          * we can forget hat; at this time, hat is
 667                          * guaranteed to be around because HAT_VICTIM is set
 668                          * (see htable_free() for similar code)
 669                          */
 670                         for (ht = list; (ht) && (reap); ht = ht->ht_next) {
 671                                 if (ht->ht_hat == NULL)
 672                                         continue;
 673                                 ASSERT(ht->ht_hat == hat);
 674 #if defined(__xpv) && defined(__amd64)
 675                                 ASSERT(!(ht->ht_flags & HTABLE_COPIED));
 676                                 if (ht->ht_level == mmu.max_level) {
 677                                         ptable_free(hat->hat_user_ptable);
 678                                         hat->hat_user_ptable = PFN_INVALID;
 679                                 }
 680 #endif
 681                                 /*
 682                                  * forget the hat
 683                                  */
 684                                 ht->ht_hat = NULL;
 685                         }
 686 
 687                         mutex_enter(&hat_list_lock);
 688 
 689                         /*
 690                          * Are we finished?
 691                          */
 692                         if (stolen == cnt) {
 693                                 /*
 694                                  * Try to spread the pain of stealing by moving
 695                                  * the victim HAT to the end of the HAT list.
 696                                  */


 766         atomic_dec_32(&htable_dont_cache);
 767 
 768         /*
 769          * Free up excess reserves
 770          */
 771         htable_adjust_reserve();
 772         hment_adjust_reserve();
 773 }
 774 
 775 /*
 776  * Allocate an htable, stealing one or using the reserve if necessary
 777  */
 778 static htable_t *
 779 htable_alloc(
 780         hat_t           *hat,
 781         uintptr_t       vaddr,
 782         level_t         level,
 783         htable_t        *shared)
 784 {
 785         htable_t        *ht = NULL;
 786         uint_t          is_copied;
 787         uint_t          is_bare = 0;
 788         uint_t          need_to_zero = 1;
 789         int             kmflags = (can_steal_post_boot ? KM_NOSLEEP : KM_SLEEP);
 790 
 791         if (level < 0 || level > TOP_LEVEL(hat))
 792                 panic("htable_alloc(): level %d out of range\n", level);
 793 
 794         is_copied = (hat->hat_flags & HAT_COPIED) &&
 795             level == hat->hat_max_level;
 796         if (is_copied || shared != NULL)
 797                 is_bare = 1;
 798 
 799         /*
 800          * First reuse a cached htable from the hat_ht_cached field, this
 801          * avoids unnecessary trips through kmem/page allocators.
 802          */
 803         if (hat->hat_ht_cached != NULL && !is_bare) {
 804                 hat_enter(hat);
 805                 ht = hat->hat_ht_cached;
 806                 if (ht != NULL) {
 807                         hat->hat_ht_cached = ht->ht_next;
 808                         need_to_zero = 0;
 809                         /* XX64 ASSERT() they're all zero somehow */
 810                         ASSERT(ht->ht_pfn != PFN_INVALID);
 811                 }
 812                 hat_exit(hat);
 813         }
 814 
 815         if (ht == NULL) {
 816                 /*


 918         /*
 919          * Shared page tables have all entries locked and entries may not
 920          * be added or deleted.
 921          */
 922         ht->ht_flags = 0;
 923         if (shared != NULL) {
 924                 ASSERT(shared->ht_valid_cnt > 0);
 925                 ht->ht_flags |= HTABLE_SHARED_PFN;
 926                 ht->ht_pfn = shared->ht_pfn;
 927                 ht->ht_lock_cnt = 0;
 928                 ht->ht_valid_cnt = 0;                /* updated in hat_share() */
 929                 ht->ht_shares = shared;
 930                 need_to_zero = 0;
 931         } else {
 932                 ht->ht_shares = NULL;
 933                 ht->ht_lock_cnt = 0;
 934                 ht->ht_valid_cnt = 0;
 935         }
 936 
 937         /*
 938          * setup flags, etc. for copied page tables.
 939          */
 940         if (is_copied) {
 941                 ht->ht_flags |= HTABLE_COPIED;
 942                 ASSERT(ht->ht_pfn == PFN_INVALID);
 943                 need_to_zero = 0;
 944         }
 945 
 946         /*
 947          * fill in the htable
 948          */
 949         ht->ht_hat = hat;
 950         ht->ht_parent = NULL;
 951         ht->ht_vaddr = vaddr;
 952         ht->ht_level = level;
 953         ht->ht_busy = 1;
 954         ht->ht_next = NULL;
 955         ht->ht_prev = NULL;
 956 
 957         /*
 958          * Zero out any freshly allocated page table
 959          */
 960         if (need_to_zero)
 961                 x86pte_zero(ht, 0, mmu.ptes_per_table);

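In the KPTI version, HTABLE_COPIED takes over the role HTABLE_VLP used to play: a top-level table of a HAT_COPIED hat, like a shared table, is "bare" and never gets a pagetable page of its own. A hypothetical helper restating the decision made at the top of htable_alloc(), using only fields shown in this file:

/* would this allocation be bare, i.e. own no pagetable page? */
static uint_t
htable_would_be_bare(hat_t *hat, level_t level, htable_t *shared)
{
        if (shared != NULL)
                return (1);     /* reuses the pfn of an existing shared table */
        if ((hat->hat_flags & HAT_COPIED) && level == hat->hat_max_level)
                return (1);     /* copied top level lives inside the hat_t */
        return (0);
}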

 972 }
 973 
 974 /*
 975  * Free up an htable, either to a hat's cached list, the reserves or
 976  * back to kmem.
 977  */
 978 static void
 979 htable_free(htable_t *ht)
 980 {
 981         hat_t *hat = ht->ht_hat;
 982 
 983         /*
 984          * If the process isn't exiting, cache the free htable in the hat
 985          * structure. We always do this for the boot time reserve. We don't
 986          * do this if the hat is exiting or we are stealing/reaping htables.
 987          */
 988         if (hat != NULL &&
 989             !(ht->ht_flags & HTABLE_SHARED_PFN) &&
 990             (use_boot_reserve ||
 991             (!(hat->hat_flags & HAT_FREEING) && !htable_dont_cache))) {
 992                 ASSERT((ht->ht_flags & HTABLE_COPIED) == 0);
 993                 ASSERT(ht->ht_pfn != PFN_INVALID);
 994                 hat_enter(hat);
 995                 ht->ht_next = hat->hat_ht_cached;
 996                 hat->hat_ht_cached = ht;
 997                 hat_exit(hat);
 998                 return;
 999         }
1000 
1001         /*
1002          * If we have a hardware page table, free it.
1003          * We don't free page tables that are accessed by sharing.
1004          */
1005         if (ht->ht_flags & HTABLE_SHARED_PFN) {
1006                 ASSERT(ht->ht_pfn != PFN_INVALID);
1007         } else if (!(ht->ht_flags & HTABLE_COPIED)) {
1008                 ptable_free(ht->ht_pfn);
1009 #if defined(__amd64) && defined(__xpv)
1010                 if (ht->ht_level == mmu.max_level && hat != NULL) {
1011                         ptable_free(hat->hat_user_ptable);
1012                         hat->hat_user_ptable = PFN_INVALID;
1013                 }
1014 #endif
1015         }
1016         ht->ht_pfn = PFN_INVALID;
1017 
1018         /*
1019          * Free it or put into reserves.
1020          */
1021         if (USE_HAT_RESERVES() || htable_reserve_cnt < htable_reserve_amount) {
1022                 htable_put_reserve(ht);
1023         } else {
1024                 kmem_cache_free(htable_cache, ht);
1025                 htable_adjust_reserve();
1026         }
1027 }

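htable_free() prefers the per-hat cache, then the bounded global reserve (htable_put_reserve() while htable_reserve_cnt is below htable_reserve_amount), and only then kmem_cache_free(). A self-contained sketch of the bounded-reserve idea in plain user-level C (generic names):

#include <stdlib.h>
#include <pthread.h>

struct obj {
        struct obj *next;
};

static struct obj *reserve_list;
static unsigned int reserve_cnt;
static const unsigned int reserve_max = 32;     /* cf. htable_reserve_amount */
static pthread_mutex_t reserve_lock = PTHREAD_MUTEX_INITIALIZER;

static void
obj_free(struct obj *o)
{
        pthread_mutex_lock(&reserve_lock);
        if (reserve_cnt < reserve_max) {        /* top up the reserve first */
                o->next = reserve_list;
                reserve_list = o;
                reserve_cnt++;
                o = NULL;
        }
        pthread_mutex_unlock(&reserve_lock);

        if (o != NULL)
                free(o);                        /* reserve full; really free it */
}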

1099         x86pte_t        found;
1100         hat_t           *hat = old->ht_hat;
1101 
1102         ASSERT(higher->ht_busy > 0);
1103         ASSERT(higher->ht_valid_cnt > 0);
1104         ASSERT(old->ht_valid_cnt == 0);
1105         found = x86pte_cas(higher, entry, expect, 0);
1106 #ifdef __xpv
1107         /*
1108          * This is weird, but Xen apparently automatically unlinks empty
1109          * pagetables from the upper page table. So allow PTP to be 0 already.
1110          */
1111         if (found != expect && found != 0)
1112 #else
1113         if (found != expect)
1114 #endif
1115                 panic("Bad PTP found=" FMT_PTE ", expected=" FMT_PTE,
1116                     found, expect);
1117 
1118         /*
1119          * When a top level PTE changes for a copied htable, we must trigger a
1120          * hat_pcp_update() on all HAT CPUs.
1121          *
1122          * If we don't need to do that, then we still have to INVLPG against an
1123          * address covered by the inner page table, as the latest processors
1124          * have TLB-like caches for non-leaf page table entries.
1125          */
1126         if (!(hat->hat_flags & HAT_FREEING)) {
1127                 hat_tlb_inval(hat, (higher->ht_flags & HTABLE_COPIED) ?
1128                     DEMAP_ALL_ADDR : old->ht_vaddr);
1129         }
1130 
1131         HTABLE_DEC(higher->ht_valid_cnt);
1132 }
1133 
1134 /*
1135  * Link an entry for a new table at vaddr and level into the existing table
1136  * one level higher. We are always holding the HASH_ENTER() when doing this.
1137  */
1138 static void
1139 link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr)
1140 {
1141         uint_t          entry = htable_va2entry(vaddr, higher);
1142         x86pte_t        newptp = MAKEPTP(new->ht_pfn, new->ht_level);
1143         x86pte_t        found;
1144 
1145         ASSERT(higher->ht_busy > 0);
1146 
1147         ASSERT(new->ht_level != mmu.max_level);
1148 
1149         HTABLE_INC(higher->ht_valid_cnt);
1150 
1151         found = x86pte_cas(higher, entry, 0, newptp);
1152         if ((found & ~PT_REF) != 0)
1153                 panic("HAT: ptp not 0, found=" FMT_PTE, found);
1154 
1155         /*
1156          * When a top level PTE changes for a copied htable, we must trigger a
1157          * hat_pcp_update() on all HAT CPUs.
1158          *
1159          * We also need to do this for the kernel hat on PAE 32 bit kernel.
1160          */
1161         if (
1162 #ifdef __i386
1163             (higher->ht_hat == kas.a_hat &&
1164             higher->ht_level == higher->ht_hat->hat_max_level) ||
1165 #endif
1166             (higher->ht_flags & HTABLE_COPIED))
1167                 hat_tlb_inval(higher->ht_hat, DEMAP_ALL_ADDR);
1168 }
1169 
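link_ptp() and unlink_ptp() now share one rule: a change to a top-level PTE of an HTABLE_COPIED htable demaps everything (so hat_pcp_update() runs on the CPUs using the hat), while any other PTP change only needs an INVLPG of an address the inner table covers. A hypothetical helper expressing that choice with names from this file:

/* pick the demap that a PTP change at "higher" requires */
static void
ptp_demap(htable_t *higher, htable_t *lower)
{
        hat_tlb_inval(higher->ht_hat,
            (higher->ht_flags & HTABLE_COPIED) ?
            DEMAP_ALL_ADDR : lower->ht_vaddr);
}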
1170 /*
1171  * Release a hold on an htable. If this is the last use and the pagetable
1172  * is empty we may want to free it, then recursively look at the pagetable
1173  * above it. The recursion is handled by the outer while() loop.
1174  *
1175  * On the metal, during process exit, we don't bother unlinking the tables from
1176  * upper level pagetables. They are instead handled in bulk by hat_free_end().
1177  * We can't do this on the hypervisor as we need the page table to be
1178  * implicitly unpinned before it goes to the free page lists. This can't
1179  * happen unless we fully unlink it from the page table hierarchy.
1180  */
1181 void
1182 htable_release(htable_t *ht)
1183 {
1184         uint_t          hashval;
1185         htable_t        *shared;
1186         htable_t        *higher;


1285 /*
1286  * Find the htable for the pagetable at the given level for the given address.
1287  * If found, it acquires a hold that eventually needs to be htable_release()d.
1288  */
1289 htable_t *
1290 htable_lookup(hat_t *hat, uintptr_t vaddr, level_t level)
1291 {
1292         uintptr_t       base;
1293         uint_t          hashval;
1294         htable_t        *ht = NULL;
1295 
1296         ASSERT(level >= 0);
1297         ASSERT(level <= TOP_LEVEL(hat));
1298 
1299         if (level == TOP_LEVEL(hat)) {
1300 #if defined(__amd64)
1301                 /*
1302                  * 32 bit address spaces on 64 bit kernels need to check
1303                  * for overflow of the 32 bit address space
1304                  */
1305                 if ((hat->hat_flags & HAT_COPIED_32) &&
1306                     vaddr >= ((uint64_t)1 << 32))
1307                         return (NULL);
1308 #endif
1309                 base = 0;
1310         } else {
1311                 base = vaddr & LEVEL_MASK(level + 1);
1312         }
1313 
1314         hashval = HTABLE_HASH(hat, base, level);
1315         HTABLE_ENTER(hashval);
1316         for (ht = hat->hat_ht_hash[hashval]; ht; ht = ht->ht_next) {
1317                 if (ht->ht_hat == hat &&
1318                     ht->ht_vaddr == base &&
1319                     ht->ht_level == level)
1320                         break;
1321         }
1322         if (ht)
1323                 ++ht->ht_busy;
1324 
1325         HTABLE_EXIT(hashval);
1326         return (ht);


1934 
1935         ASSERT(mmu.pae_hat != 0);
1936         for (;;) {
1937                 t = p[0];
1938                 t |= (uint64_t)p[1] << 32;
1939                 if ((t & 0xffffffff) == p[0])
1940                         return (t);
1941         }
1942 }
1943 #endif /* __i386 */
1944 
1945 /*
1946  * Disable preemption and establish a mapping to the pagetable with the
1947  * given pfn. This is optimized for the case where it's the same
1948  * pfn we last referenced from this CPU.
1949  */
1950 static x86pte_t *
1951 x86pte_access_pagetable(htable_t *ht, uint_t index)
1952 {
1953         /*
1954          * HTABLE_COPIED pagetables are contained in the hat_t
1955          */
1956         if (ht->ht_flags & HTABLE_COPIED) {
1957                 ASSERT3U(index, <, ht->ht_hat->hat_num_copied);
1958                 return (PT_INDEX_PTR(ht->ht_hat->hat_copied_ptes, index));
1959         }
1960         return (x86pte_mapin(ht->ht_pfn, index, ht));
1961 }
1962 
1963 /*
1964  * map the given pfn into the page table window.
1965  */
1966 /*ARGSUSED*/
1967 x86pte_t *
1968 x86pte_mapin(pfn_t pfn, uint_t index, htable_t *ht)
1969 {
1970         x86pte_t *pteptr;
1971         x86pte_t pte = 0;
1972         x86pte_t newpte;
1973         int x;
1974 
1975         ASSERT(pfn != PFN_INVALID);
1976 
1977         if (!khat_running) {
1978                 caddr_t va = kbm_remap_window(pfn_to_pa(pfn), 1);
1979                 return (PT_INDEX_PTR(va, index));
1980         }
1981 
1982         /*
1983          * If kpm is available, use it.
1984          */
1985         if (kpm_vbase)
1986                 return (PT_INDEX_PTR(hat_kpm_pfn2va(pfn), index));
1987 
1988         /*
1989          * Disable preemption and grab the CPU's hci_mutex
1990          */
1991         kpreempt_disable();
1992 
1993         ASSERT(CPU->cpu_hat_info != NULL);
1994         ASSERT(!(getcr4() & CR4_PCIDE));
1995 
1996         mutex_enter(&CPU->cpu_hat_info->hci_mutex);
1997         x = PWIN_TABLE(CPU->cpu_id);
1998         pteptr = (x86pte_t *)PWIN_PTE_VA(x);
1999 #ifndef __xpv
2000         if (mmu.pae_hat)
2001                 pte = *pteptr;
2002         else
2003                 pte = *(x86pte32_t *)pteptr;
2004 #endif
2005 
2006         newpte = MAKEPTE(pfn, 0) | mmu.pt_global | mmu.pt_nx;
2007 
2008         /*
2009          * For hardware we can use a writable mapping.
2010          */
2011 #ifdef __xpv
2012         if (IN_XPV_PANIC())
2013 #endif
2014                 newpte |= PT_WRITABLE;
2015 
2016         if (!PTE_EQUIV(newpte, pte)) {
2017 
2018 #ifdef __xpv
2019                 if (!IN_XPV_PANIC()) {
2020                         xen_map(newpte, PWIN_VA(x));
2021                 } else
2022 #endif
2023                 {
2024                         XPV_ALLOW_PAGETABLE_UPDATES();
2025                         if (mmu.pae_hat)
2026                                 *pteptr = newpte;
2027                         else
2028                                 *(x86pte32_t *)pteptr = newpte;
2029                         XPV_DISALLOW_PAGETABLE_UPDATES();
2030                         mmu_flush_tlb_kpage((uintptr_t)PWIN_VA(x));
2031                 }
2032         }
2033         return (PT_INDEX_PTR(PWIN_VA(x), index));
2034 }
2035 
2036 /*
2037  * Release access to a page table.
2038  */
2039 static void
2040 x86pte_release_pagetable(htable_t *ht)
2041 {
2042         if (ht->ht_flags & HTABLE_COPIED)



2043                 return;
2044 
2045         x86pte_mapout();
2046 }
2047 
2048 void
2049 x86pte_mapout(void)
2050 {
2051         if (kpm_vbase != NULL || !khat_running)
2052                 return;
2053 
2054         /*
2055          * Drop the CPU's hci_mutex and restore preemption.
2056          */
2057 #ifdef __xpv
2058         if (!IN_XPV_PANIC()) {
2059                 uintptr_t va;
2060 
2061                 /*
2062                  * We need to always clear the mapping in case a page


2123          * Install the new PTE. If remapping the same PFN, then
2124          * copy existing REF/MOD bits to new mapping.
2125          */
2126         do {
2127                 prev = GET_PTE(ptep);
2128                 n = new;
2129                 if (PTE_ISVALID(n) && (prev & pfn_mask) == (new & pfn_mask))
2130                         n |= prev & (PT_REF | PT_MOD);
2131 
2132                 /*
2133                  * Another thread may have installed this mapping already,
2134                  * flush the local TLB and be done.
2135                  */
2136                 if (prev == n) {
2137                         old = new;
2138 #ifdef __xpv
2139                         if (!IN_XPV_PANIC())
2140                                 xen_flush_va((caddr_t)addr);
2141                         else
2142 #endif
2143                                 mmu_flush_tlb_page(addr);
2144                         goto done;
2145                 }
2146 
2147                 /*
2148                  * Detect a collision: installing a large page mapping
2149                  * where a lower page table already exists.
2150                  */
2151                 if (l > 0 && (prev & PT_VALID) && !(prev & PT_PAGESIZE)) {
2152                         old = LPAGE_ERROR;
2153                         goto done;
2154                 }
2155 
2156                 XPV_ALLOW_PAGETABLE_UPDATES();
2157                 old = CAS_PTE(ptep, prev, n);
2158                 XPV_DISALLOW_PAGETABLE_UPDATES();
2159         } while (old != prev);
2160 
2161         /*
2162          * Do a TLB demap if needed, i.e. the old pte was valid.
2163          *


2182  * Atomic compare and swap of a page table entry. No TLB invalidates are done.
2183  * This is used for links between pagetables of different levels.
2184  * Note we always create these links with dirty/access set, so they should
2185  * never change.
2186  */
2187 x86pte_t
2188 x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old, x86pte_t new)
2189 {
2190         x86pte_t        pte;
2191         x86pte_t        *ptep;
2192 #ifdef __xpv
2193         /*
2194          * We can't use writable pagetables for upper level tables, so fake it.
2195          */
2196         mmu_update_t t[2];
2197         int cnt = 1;
2198         int count;
2199         maddr_t ma;
2200 
2201         if (!IN_XPV_PANIC()) {
2202                 ASSERT(!(ht->ht_flags & HTABLE_COPIED));
2203                 ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry));
2204                 t[0].ptr = ma | MMU_NORMAL_PT_UPDATE;
2205                 t[0].val = new;
2206 
2207 #if defined(__amd64)
2208                 /*
2209                  * On the 64-bit hypervisor we need to maintain the user mode
2210                  * top page table too.
2211                  */
2212                 if (ht->ht_level == mmu.max_level && ht->ht_hat != kas.a_hat) {
2213                         ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(
2214                             ht->ht_hat->hat_user_ptable), entry));
2215                         t[1].ptr = ma | MMU_NORMAL_PT_UPDATE;
2216                         t[1].val = new;
2217                         ++cnt;
2218                 }
2219 #endif  /* __amd64 */
2220 
2221                 if (HYPERVISOR_mmu_update(t, cnt, &count, DOMID_SELF))
2222                         panic("HYPERVISOR_mmu_update() failed");


2339                 if ((expect & (PT_WRITABLE | PT_MOD)) == PT_WRITABLE &&
2340                     (new & (PT_WRITABLE | PT_MOD)) == 0 &&
2341                     (GET_PTE(ptep) & PT_MOD) != 0) {
2342                         do {
2343                                 found = GET_PTE(ptep);
2344                                 XPV_ALLOW_PAGETABLE_UPDATES();
2345                                 found =
2346                                     CAS_PTE(ptep, found, found | PT_WRITABLE);
2347                                 XPV_DISALLOW_PAGETABLE_UPDATES();
2348                         } while ((found & PT_WRITABLE) == 0);
2349                 }
2350         }
2351         x86pte_release_pagetable(ht);
2352         return (found);
2353 }
2354 
2355 #ifndef __xpv
2356 /*
2357  * Copy page tables - this is just a little more complicated than the
2358  * previous routines. Note that it's also not atomic! It also is never
2359  * used for HTABLE_COPIED pagetables.
2360  */
2361 void
2362 x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
2363 {
2364         caddr_t src_va;
2365         caddr_t dst_va;
2366         size_t size;
2367         x86pte_t *pteptr;
2368         x86pte_t pte;
2369 
2370         ASSERT(khat_running);
2371         ASSERT(!(dest->ht_flags & HTABLE_COPIED));
2372         ASSERT(!(src->ht_flags & HTABLE_COPIED));
2373         ASSERT(!(src->ht_flags & HTABLE_SHARED_PFN));
2374         ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
2375 
2376         /*
2377          * Acquire access to the CPU pagetable windows for the dest and source.
2378          */
2379         dst_va = (caddr_t)x86pte_access_pagetable(dest, entry);
2380         if (kpm_vbase) {
2381                 src_va = (caddr_t)
2382                     PT_INDEX_PTR(hat_kpm_pfn2va(src->ht_pfn), entry);
2383         } else {
2384                 uint_t x = PWIN_SRC(CPU->cpu_id);
2385 
2386                 ASSERT(!(getcr4() & CR4_PCIDE));
2387 
2388                 /*
2389                  * Finish defining the src pagetable mapping
2390                  */
2391                 src_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry);
2392                 pte = MAKEPTE(src->ht_pfn, 0) | mmu.pt_global | mmu.pt_nx;
2393                 pteptr = (x86pte_t *)PWIN_PTE_VA(x);
2394                 if (mmu.pae_hat)
2395                         *pteptr = pte;
2396                 else
2397                         *(x86pte32_t *)pteptr = pte;
2398                 mmu_flush_tlb_kpage((uintptr_t)PWIN_VA(x));
2399         }
2400 
2401         /*
2402          * now do the copy
2403          */
2404         size = count << mmu.pte_size_shift;
2405         bcopy(src_va, dst_va, size);
2406 
2407         x86pte_release_pagetable(dest);
2408 }
2409 
2410 #else /* __xpv */
2411 
2412 /*
2413  * The hypervisor only supports writable pagetables at level 0, so we have
2414  * to install these 1 by 1 the slow way.
2415  */
2416 void
2417 x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
2418 {


2445 }
2446 #endif /* __xpv */
2447 
2448 /*
2449  * Zero page table entries - Note this doesn't use atomic stores!
2450  */
2451 static void
2452 x86pte_zero(htable_t *dest, uint_t entry, uint_t count)
2453 {
2454         caddr_t dst_va;
2455         size_t size;
2456 #ifdef __xpv
2457         int x;
2458         x86pte_t newpte;
2459 #endif
2460 
2461         /*
2462          * Map in the page table to be zeroed.
2463          */
2464         ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
2465         ASSERT(!(dest->ht_flags & HTABLE_COPIED));
2466 
2467         /*
2468          * On the hypervisor we don't use x86pte_access_pagetable() since
2469          * in this case the page is not pinned yet.
2470          */
2471 #ifdef __xpv
2472         if (kpm_vbase == NULL) {
2473                 kpreempt_disable();
2474                 ASSERT(CPU->cpu_hat_info != NULL);
2475                 mutex_enter(&CPU->cpu_hat_info->hci_mutex);
2476                 x = PWIN_TABLE(CPU->cpu_id);
2477                 newpte = MAKEPTE(dest->ht_pfn, 0) | PT_WRITABLE;
2478                 xen_map(newpte, PWIN_VA(x));
2479                 dst_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry);
2480         } else
2481 #endif
2482                 dst_va = (caddr_t)x86pte_access_pagetable(dest, entry);
2483 
2484         size = count << mmu.pte_size_shift;
2485         ASSERT(size > BLOCKZEROALIGN);


2499 #endif
2500                 x86pte_release_pagetable(dest);
2501 }
2502 
2503 /*
2504  * Called to ensure that all pagetables are in the system dump
2505  */
2506 void
2507 hat_dump(void)
2508 {
2509         hat_t *hat;
2510         uint_t h;
2511         htable_t *ht;
2512 
2513         /*
2514          * Dump all page tables
2515          */
2516         for (hat = kas.a_hat; hat != NULL; hat = hat->hat_next) {
2517                 for (h = 0; h < hat->hat_num_hash; ++h) {
2518                         for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
2519                                 if ((ht->ht_flags & HTABLE_COPIED) == 0)
2520                                         dump_page(ht->ht_pfn);
2521                         }
2522                 }
2523         }
2524 }
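hat_dump() walks every hat on the kas.a_hat list and every hash bucket, dumping the backing page of each table that is not HTABLE_COPIED (copied top levels live inside the hat_t rather than in a page of their own). The same walk could, for instance, recount the pagetables that own real pages; a hypothetical sketch using only fields shown in this file (locking against a changing hat list is ignored here):

/* count the pagetables that own a real page, i.e. the set hat_dump() dumps */
static uint32_t
count_hardware_ptables(void)
{
        hat_t *hat;
        uint_t h;
        htable_t *ht;
        uint32_t n = 0;

        for (hat = kas.a_hat; hat != NULL; hat = hat->hat_next) {
                for (h = 0; h < hat->hat_num_hash; ++h) {
                        for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
                                if ((ht->ht_flags & HTABLE_COPIED) == 0)
                                        n++;
                        }
                }
        }
        return (n);
}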