5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2014 by Delphix. All rights reserved.
25 * Copyright 2015 Joyent, Inc.
26 */
27
28 #include <sys/types.h>
29 #include <sys/sysmacros.h>
30 #include <sys/kmem.h>
31 #include <sys/atomic.h>
32 #include <sys/bitmap.h>
33 #include <sys/machparam.h>
34 #include <sys/machsystm.h>
35 #include <sys/mman.h>
36 #include <sys/systm.h>
37 #include <sys/cpuvar.h>
38 #include <sys/thread.h>
39 #include <sys/proc.h>
40 #include <sys/cpu.h>
41 #include <sys/kmem.h>
42 #include <sys/disp.h>
43 #include <sys/vmem.h>
44 #include <sys/vmsystm.h>
45 #include <sys/promif.h>
120 * instead of putting them in a hat's htable cache.
121 */
122 uint32_t htable_dont_cache = 0;
123
124 /*
125 * Track the number of active pagetables, so we can know how many to reap
126 */
127 static uint32_t active_ptables = 0;
128
129 #ifdef __xpv
130 /*
131 * Deal with hypervisor complications.
132 */
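/*
 * Invalidate the TLB entry for a single virtual address on this CPU:
 * directly when panicking out from under the hypervisor, otherwise via
 * an MMUEXT_INVLPG_LOCAL hypercall.
 */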
133 void
134 xen_flush_va(caddr_t va)
135 {
136 struct mmuext_op t;
137 uint_t count;
138
139 if (IN_XPV_PANIC()) {
140 mmu_tlbflush_entry((caddr_t)va);
141 } else {
142 t.cmd = MMUEXT_INVLPG_LOCAL;
143 t.arg1.linear_addr = (uintptr_t)va;
144 if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
145 panic("HYPERVISOR_mmuext_op() failed");
146 ASSERT(count == 1);
147 }
148 }
149
150 void
151 xen_gflush_va(caddr_t va, cpuset_t cpus)
152 {
153 struct mmuext_op t;
154 uint_t count;
155
156 if (IN_XPV_PANIC()) {
157 mmu_tlbflush_entry((caddr_t)va);
158 return;
159 }
160
161 t.cmd = MMUEXT_INVLPG_MULTI;
162 t.arg1.linear_addr = (uintptr_t)va;
163 /*LINTED: constant in conditional context*/
164 set_xen_guest_handle(t.arg2.vcpumask, &cpus);
165 if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
166 panic("HYPERVISOR_mmuext_op() failed");
167 ASSERT(count == 1);
168 }
169
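/*
 * Flush all TLB entries on the current CPU.
 */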
170 void
171 xen_flush_tlb()
172 {
173 struct mmuext_op t;
174 uint_t count;
175
176 if (IN_XPV_PANIC()) {
177 xpv_panic_reload_cr3();
604 atomic_inc_32(&htable_dont_cache);
605 for (pass = 0; pass <= passes && stolen < cnt; ++pass) {
606 threshold = pass * mmu.ptes_per_table / htable_steal_passes;
607
608 mutex_enter(&hat_list_lock);
609
610 /* skip the first hat (kernel) */
611 hat = kas.a_hat->hat_next;
612 for (;;) {
613 /*
614 * Skip any hat that is already being stolen from.
615 *
616 * We skip SHARED hats, as these are dummy
617 * hats that host ISM shared page tables.
618 *
619 * We also skip if HAT_FREEING because hat_pte_unmap()
620 * won't zero out the PTEs. That would lead to hitting
621 * stale PTEs either here or under hat_unload() when we
622 * steal and unload the same page table in competing
623 * threads.
624 */
625 while (hat != NULL &&
626 (hat->hat_flags &
627 (HAT_VICTIM | HAT_SHARED | HAT_FREEING)) != 0)
628 hat = hat->hat_next;
629
630 if (hat == NULL)
631 break;
632
633 /*
634 * Mark the HAT as a stealing victim so that it is
635 * not freed from under us, e.g. in as_free()
636 */
637 hat->hat_flags |= HAT_VICTIM;
638 mutex_exit(&hat_list_lock);
639
640 /*
641 * Take any htables from the hat's cached "free" list.
642 */
643 hat_enter(hat);
644 while ((ht = hat->hat_ht_cached) != NULL &&
645 stolen < cnt) {
646 hat->hat_ht_cached = ht->ht_next;
647 ht->ht_next = list;
648 list = ht;
651 hat_exit(hat);
652
653 /*
654 * Don't steal active htables on first pass.
655 */
656 if (pass != 0 && (stolen < cnt))
657 htable_steal_active(hat, cnt, threshold,
658 &stolen, &list);
659
660 /*
661 * do synchronous teardown for the reap case so that
662 * we can forget hat; at this time, hat is
663 * guaranteed to be around because HAT_VICTIM is set
664 * (see htable_free() for similar code)
665 */
666 for (ht = list; (ht) && (reap); ht = ht->ht_next) {
667 if (ht->ht_hat == NULL)
668 continue;
669 ASSERT(ht->ht_hat == hat);
670 #if defined(__xpv) && defined(__amd64)
671 if (!(ht->ht_flags & HTABLE_VLP) &&
672 ht->ht_level == mmu.max_level) {
673 ptable_free(hat->hat_user_ptable);
674 hat->hat_user_ptable = PFN_INVALID;
675 }
676 #endif
677 /*
678 * forget the hat
679 */
680 ht->ht_hat = NULL;
681 }
682
683 mutex_enter(&hat_list_lock);
684
685 /*
686 * Are we finished?
687 */
688 if (stolen == cnt) {
689 /*
690 * Try to spread the pain of stealing,
691 * move victim HAT to the end of the HAT list.
692 */
762 atomic_dec_32(&htable_dont_cache);
763
764 /*
765 * Free up excess reserves
766 */
767 htable_adjust_reserve();
768 hment_adjust_reserve();
769 }
770
771 /*
772 * Allocate an htable, stealing one or using the reserve if necessary
773 */
774 static htable_t *
775 htable_alloc(
776 hat_t *hat,
777 uintptr_t vaddr,
778 level_t level,
779 htable_t *shared)
780 {
781 htable_t *ht = NULL;
782 uint_t is_vlp;
783 uint_t is_bare = 0;
784 uint_t need_to_zero = 1;
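/*
 * Once stealing is possible (after boot), use KM_NOSLEEP so a failed
 * allocation can presumably fall back to stealing an htable rather than
 * blocking here.
 */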
785 int kmflags = (can_steal_post_boot ? KM_NOSLEEP : KM_SLEEP);
786
787 if (level < 0 || level > TOP_LEVEL(hat))
788 panic("htable_alloc(): level %d out of range\n", level);
789
790 is_vlp = (hat->hat_flags & HAT_VLP) && level == VLP_LEVEL;
791 if (is_vlp || shared != NULL)
792 is_bare = 1;
793
794 /*
795 * First reuse a cached htable from the hat_ht_cached field; this
796 * avoids unnecessary trips through kmem/page allocators.
797 */
798 if (hat->hat_ht_cached != NULL && !is_bare) {
799 hat_enter(hat);
800 ht = hat->hat_ht_cached;
801 if (ht != NULL) {
802 hat->hat_ht_cached = ht->ht_next;
803 need_to_zero = 0;
804 /* XX64 ASSERT() they're all zero somehow */
805 ASSERT(ht->ht_pfn != PFN_INVALID);
806 }
807 hat_exit(hat);
808 }
809
810 if (ht == NULL) {
811 /*
913 /*
914 * Shared page tables have all entries locked and entries may not
915 * be added or deleted.
916 */
917 ht->ht_flags = 0;
918 if (shared != NULL) {
919 ASSERT(shared->ht_valid_cnt > 0);
920 ht->ht_flags |= HTABLE_SHARED_PFN;
921 ht->ht_pfn = shared->ht_pfn;
922 ht->ht_lock_cnt = 0;
923 ht->ht_valid_cnt = 0; /* updated in hat_share() */
924 ht->ht_shares = shared;
925 need_to_zero = 0;
926 } else {
927 ht->ht_shares = NULL;
928 ht->ht_lock_cnt = 0;
929 ht->ht_valid_cnt = 0;
930 }
931
932 /*
933 * set up flags, etc. for VLP htables
934 */
935 if (is_vlp) {
936 ht->ht_flags |= HTABLE_VLP;
937 ASSERT(ht->ht_pfn == PFN_INVALID);
938 need_to_zero = 0;
939 }
940
941 /*
942 * fill in the htable
943 */
944 ht->ht_hat = hat;
945 ht->ht_parent = NULL;
946 ht->ht_vaddr = vaddr;
947 ht->ht_level = level;
948 ht->ht_busy = 1;
949 ht->ht_next = NULL;
950 ht->ht_prev = NULL;
951
952 /*
953 * Zero out any freshly allocated page table
954 */
955 if (need_to_zero)
956 x86pte_zero(ht, 0, mmu.ptes_per_table);
967 }
968
969 /*
970 * Free up an htable, either to a hat's cached list, the reserves or
971 * back to kmem.
972 */
973 static void
974 htable_free(htable_t *ht)
975 {
976 hat_t *hat = ht->ht_hat;
977
978 /*
979 * If the process isn't exiting, cache the free htable in the hat
980 * structure. We always do this for the boot time reserve. We don't
981 * do this if the hat is exiting or we are stealing/reaping htables.
982 */
983 if (hat != NULL &&
984 !(ht->ht_flags & HTABLE_SHARED_PFN) &&
985 (use_boot_reserve ||
986 (!(hat->hat_flags & HAT_FREEING) && !htable_dont_cache))) {
987 ASSERT((ht->ht_flags & HTABLE_VLP) == 0);
988 ASSERT(ht->ht_pfn != PFN_INVALID);
989 hat_enter(hat);
990 ht->ht_next = hat->hat_ht_cached;
991 hat->hat_ht_cached = ht;
992 hat_exit(hat);
993 return;
994 }
995
996 /*
997 * If we have a hardware page table, free it.
998 * We don't free page tables that are accessed by sharing.
999 */
1000 if (ht->ht_flags & HTABLE_SHARED_PFN) {
1001 ASSERT(ht->ht_pfn != PFN_INVALID);
1002 } else if (!(ht->ht_flags & HTABLE_VLP)) {
1003 ptable_free(ht->ht_pfn);
1004 #if defined(__amd64) && defined(__xpv)
1005 if (ht->ht_level == mmu.max_level && hat != NULL) {
1006 ptable_free(hat->hat_user_ptable);
1007 hat->hat_user_ptable = PFN_INVALID;
1008 }
1009 #endif
1010 }
1011 ht->ht_pfn = PFN_INVALID;
1012
1013 /*
1014 * Free it or put into reserves.
1015 */
1016 if (USE_HAT_RESERVES() || htable_reserve_cnt < htable_reserve_amount) {
1017 htable_put_reserve(ht);
1018 } else {
1019 kmem_cache_free(htable_cache, ht);
1020 htable_adjust_reserve();
1021 }
1022 }
1094 x86pte_t found;
1095 hat_t *hat = old->ht_hat;
1096
1097 ASSERT(higher->ht_busy > 0);
1098 ASSERT(higher->ht_valid_cnt > 0);
1099 ASSERT(old->ht_valid_cnt == 0);
1100 found = x86pte_cas(higher, entry, expect, 0);
1101 #ifdef __xpv
1102 /*
1103 * This is weird, but Xen apparently automatically unlinks empty
1104 * pagetables from the upper page table. So allow PTP to be 0 already.
1105 */
1106 if (found != expect && found != 0)
1107 #else
1108 if (found != expect)
1109 #endif
1110 panic("Bad PTP found=" FMT_PTE ", expected=" FMT_PTE,
1111 found, expect);
1112
1113 /*
1114 * When a top level VLP page table entry changes, we must issue
1115 * a reload of cr3 on all processors.
1116 *
1117 * If we don't need to do that, then we still have to INVLPG against
1118 * an address covered by the inner page table, as the latest processors
1119 * have TLB-like caches for non-leaf page table entries.
1120 */
1121 if (!(hat->hat_flags & HAT_FREEING)) {
1122 hat_tlb_inval(hat, (higher->ht_flags & HTABLE_VLP) ?
1123 DEMAP_ALL_ADDR : old->ht_vaddr);
1124 }
1125
1126 HTABLE_DEC(higher->ht_valid_cnt);
1127 }
1128
1129 /*
1130 * Link an entry for a new table at vaddr and level into the existing table
1131 * one level higher. We are always holding the HASH_ENTER() when doing this.
1132 */
1133 static void
1134 link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr)
1135 {
1136 uint_t entry = htable_va2entry(vaddr, higher);
1137 x86pte_t newptp = MAKEPTP(new->ht_pfn, new->ht_level);
1138 x86pte_t found;
1139
1140 ASSERT(higher->ht_busy > 0);
1141
1142 ASSERT(new->ht_level != mmu.max_level);
1143
1144 HTABLE_INC(higher->ht_valid_cnt);
1145
1146 found = x86pte_cas(higher, entry, 0, newptp);
1147 if ((found & ~PT_REF) != 0)
1148 panic("HAT: ptp not 0, found=" FMT_PTE, found);
1149
1150 /*
1151 * When any top level VLP page table entry changes, we must issue
1152 * a reload of cr3 on all processors using it.
1153 * We also need to do this for the kernel hat on PAE 32 bit kernel.
1154 */
1155 if (
1156 #ifdef __i386
1157 (higher->ht_hat == kas.a_hat && higher->ht_level == VLP_LEVEL) ||
1158 #endif
1159 (higher->ht_flags & HTABLE_VLP))
1160 hat_tlb_inval(higher->ht_hat, DEMAP_ALL_ADDR);
1161 }
1162
1163 /*
1164 * Release a hold on an htable. If this is the last use and the pagetable
1165 * is empty we may want to free it, then recursively look at the pagetable
1166 * above it. The recursion is handled by the outer while() loop.
1167 *
1168 * On the metal, during process exit, we don't bother unlinking the tables from
1169 * upper level pagetables. They are instead handled in bulk by hat_free_end().
1170 * We can't do this on the hypervisor as we need the page table to be
1171 * implicitly unpinned before it goes to the free page lists. This can't
1172 * happen unless we fully unlink it from the page table hierarchy.
1173 */
1174 void
1175 htable_release(htable_t *ht)
1176 {
1177 uint_t hashval;
1178 htable_t *shared;
1179 htable_t *higher;
1278 /*
1279 * Find the htable for the pagetable at the given level for the given address.
1280 * If found, acquires a hold that eventually needs to be htable_release()d
1281 */
1282 htable_t *
1283 htable_lookup(hat_t *hat, uintptr_t vaddr, level_t level)
1284 {
1285 uintptr_t base;
1286 uint_t hashval;
1287 htable_t *ht = NULL;
1288
1289 ASSERT(level >= 0);
1290 ASSERT(level <= TOP_LEVEL(hat));
1291
1292 if (level == TOP_LEVEL(hat)) {
1293 #if defined(__amd64)
1294 /*
1295 * 32 bit address spaces on 64 bit kernels need to check
1296 * for overflow of the 32 bit address space
1297 */
1298 if ((hat->hat_flags & HAT_VLP) && vaddr >= ((uint64_t)1 << 32))
1299 return (NULL);
1300 #endif
1301 base = 0;
1302 } else {
1303 base = vaddr & LEVEL_MASK(level + 1);
1304 }
1305
1306 hashval = HTABLE_HASH(hat, base, level);
1307 HTABLE_ENTER(hashval);
1308 for (ht = hat->hat_ht_hash[hashval]; ht; ht = ht->ht_next) {
1309 if (ht->ht_hat == hat &&
1310 ht->ht_vaddr == base &&
1311 ht->ht_level == level)
1312 break;
1313 }
1314 if (ht)
1315 ++ht->ht_busy;
1316
1317 HTABLE_EXIT(hashval);
1318 return (ht);
1926
1927 ASSERT(mmu.pae_hat != 0);
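/*
 * Assemble the 64-bit PTE from two 32-bit reads, retrying until the
 * low word is unchanged so the two halves are known to be consistent.
 */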
1928 for (;;) {
1929 t = p[0];
1930 t |= (uint64_t)p[1] << 32;
1931 if ((t & 0xffffffff) == p[0])
1932 return (t);
1933 }
1934 }
1935 #endif /* __i386 */
1936
1937 /*
1938 * Disable preemption and establish a mapping to the pagetable with the
1939 * given pfn. This is optimized for the case where it's the same
1940 * pfn we last referenced from this CPU.
1941 */
1942 static x86pte_t *
1943 x86pte_access_pagetable(htable_t *ht, uint_t index)
1944 {
1945 /*
1946 * VLP pagetables are contained in the hat_t
1947 */
1948 if (ht->ht_flags & HTABLE_VLP)
1949 return (PT_INDEX_PTR(ht->ht_hat->hat_vlp_ptes, index));
1950 return (x86pte_mapin(ht->ht_pfn, index, ht));
1951 }
1952
1953 /*
1954 * map the given pfn into the page table window.
1955 */
1956 /*ARGSUSED*/
1957 x86pte_t *
1958 x86pte_mapin(pfn_t pfn, uint_t index, htable_t *ht)
1959 {
1960 x86pte_t *pteptr;
1961 x86pte_t pte = 0;
1962 x86pte_t newpte;
1963 int x;
1964
1965 ASSERT(pfn != PFN_INVALID);
1966
1967 if (!khat_running) {
1968 caddr_t va = kbm_remap_window(pfn_to_pa(pfn), 1);
1969 return (PT_INDEX_PTR(va, index));
1970 }
1971
1972 /*
1973 * If kpm is available, use it.
1974 */
1975 if (kpm_vbase)
1976 return (PT_INDEX_PTR(hat_kpm_pfn2va(pfn), index));
1977
1978 /*
1979 * Disable preemption and grab the CPU's hci_mutex
1980 */
1981 kpreempt_disable();
1982 ASSERT(CPU->cpu_hat_info != NULL);
1983 mutex_enter(&CPU->cpu_hat_info->hci_mutex);
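/*
 * Each CPU has a private pagetable mapping window (PWIN); pick this
 * CPU's slot and read the PTE currently mapping it so the mapping can
 * be reused if it already points at the desired pfn.
 */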
1984 x = PWIN_TABLE(CPU->cpu_id);
1985 pteptr = (x86pte_t *)PWIN_PTE_VA(x);
1986 #ifndef __xpv
1987 if (mmu.pae_hat)
1988 pte = *pteptr;
1989 else
1990 pte = *(x86pte32_t *)pteptr;
1991 #endif
1992
1993 newpte = MAKEPTE(pfn, 0) | mmu.pt_global | mmu.pt_nx;
1994
1995 /*
1996 * For hardware we can use a writable mapping.
1997 */
1998 #ifdef __xpv
1999 if (IN_XPV_PANIC())
2000 #endif
2001 newpte |= PT_WRITABLE;
2002
2003 if (!PTE_EQUIV(newpte, pte)) {
2004
2005 #ifdef __xpv
2006 if (!IN_XPV_PANIC()) {
2007 xen_map(newpte, PWIN_VA(x));
2008 } else
2009 #endif
2010 {
2011 XPV_ALLOW_PAGETABLE_UPDATES();
2012 if (mmu.pae_hat)
2013 *pteptr = newpte;
2014 else
2015 *(x86pte32_t *)pteptr = newpte;
2016 XPV_DISALLOW_PAGETABLE_UPDATES();
2017 mmu_tlbflush_entry((caddr_t)(PWIN_VA(x)));
2018 }
2019 }
2020 return (PT_INDEX_PTR(PWIN_VA(x), index));
2021 }
2022
2023 /*
2024 * Release access to a page table.
2025 */
2026 static void
2027 x86pte_release_pagetable(htable_t *ht)
2028 {
2029 /*
2030 * nothing to do for VLP htables
2031 */
2032 if (ht->ht_flags & HTABLE_VLP)
2033 return;
2034
2035 x86pte_mapout();
2036 }
2037
2038 void
2039 x86pte_mapout(void)
2040 {
2041 if (kpm_vbase != NULL || !khat_running)
2042 return;
2043
2044 /*
2045 * Drop the CPU's hci_mutex and restore preemption.
2046 */
2047 #ifdef __xpv
2048 if (!IN_XPV_PANIC()) {
2049 uintptr_t va;
2050
2051 /*
2052 * We need to always clear the mapping in case a page
2113 * Install the new PTE. If remapping the same PFN, then
2114 * copy existing REF/MOD bits to new mapping.
2115 */
2116 do {
2117 prev = GET_PTE(ptep);
2118 n = new;
2119 if (PTE_ISVALID(n) && (prev & pfn_mask) == (new & pfn_mask))
2120 n |= prev & (PT_REF | PT_MOD);
2121
2122 /*
2123 * Another thread may have installed this mapping already;
2124 * flush the local TLB and be done.
2125 */
2126 if (prev == n) {
2127 old = new;
2128 #ifdef __xpv
2129 if (!IN_XPV_PANIC())
2130 xen_flush_va((caddr_t)addr);
2131 else
2132 #endif
2133 mmu_tlbflush_entry((caddr_t)addr);
2134 goto done;
2135 }
2136
2137 /*
2138 * Detect a collision where we are installing a large page
2139 * mapping but a lower level page table already exists there.
2140 */
2141 if (l > 0 && (prev & PT_VALID) && !(prev & PT_PAGESIZE)) {
2142 old = LPAGE_ERROR;
2143 goto done;
2144 }
2145
2146 XPV_ALLOW_PAGETABLE_UPDATES();
2147 old = CAS_PTE(ptep, prev, n);
2148 XPV_DISALLOW_PAGETABLE_UPDATES();
2149 } while (old != prev);
2150
2151 /*
2152 * Do a TLB demap if needed, i.e. the old pte was valid.
2153 *
2172 * Atomic compare and swap of a page table entry. No TLB invalidates are done.
2173 * This is used for links between pagetables of different levels.
2174 * Note we always create these links with dirty/access set, so they should
2175 * never change.
2176 */
2177 x86pte_t
2178 x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old, x86pte_t new)
2179 {
2180 x86pte_t pte;
2181 x86pte_t *ptep;
2182 #ifdef __xpv
2183 /*
2184 * We can't use writable pagetables for upper level tables, so fake it.
2185 */
2186 mmu_update_t t[2];
2187 int cnt = 1;
2188 int count;
2189 maddr_t ma;
2190
2191 if (!IN_XPV_PANIC()) {
2192 ASSERT(!(ht->ht_flags & HTABLE_VLP)); /* no VLP yet */
2193 ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry));
2194 t[0].ptr = ma | MMU_NORMAL_PT_UPDATE;
2195 t[0].val = new;
2196
2197 #if defined(__amd64)
2198 /*
2199 * On the 64-bit hypervisor we need to maintain the user mode
2200 * top page table too.
2201 */
2202 if (ht->ht_level == mmu.max_level && ht->ht_hat != kas.a_hat) {
2203 ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(
2204 ht->ht_hat->hat_user_ptable), entry));
2205 t[1].ptr = ma | MMU_NORMAL_PT_UPDATE;
2206 t[1].val = new;
2207 ++cnt;
2208 }
2209 #endif /* __amd64 */
2210
2211 if (HYPERVISOR_mmu_update(t, cnt, &count, DOMID_SELF))
2212 panic("HYPERVISOR_mmu_update() failed");
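/*
 * If this update would drop write access from a previously clean,
 * writable entry but the hardware has since set PT_MOD, put PT_WRITABLE
 * back (retrying the CAS until it takes), presumably so the now-dirty
 * page is not left mapped read-only.
 */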
2329 if ((expect & (PT_WRITABLE | PT_MOD)) == PT_WRITABLE &&
2330 (new & (PT_WRITABLE | PT_MOD)) == 0 &&
2331 (GET_PTE(ptep) & PT_MOD) != 0) {
2332 do {
2333 found = GET_PTE(ptep);
2334 XPV_ALLOW_PAGETABLE_UPDATES();
2335 found =
2336 CAS_PTE(ptep, found, found | PT_WRITABLE);
2337 XPV_DISALLOW_PAGETABLE_UPDATES();
2338 } while ((found & PT_WRITABLE) == 0);
2339 }
2340 }
2341 x86pte_release_pagetable(ht);
2342 return (found);
2343 }
2344
2345 #ifndef __xpv
2346 /*
2347 * Copy page tables - this is just a little more complicated than the
2348 * previous routines. Note that it's also not atomic! It also is never
2349 * used for VLP pagetables.
2350 */
2351 void
2352 x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
2353 {
2354 caddr_t src_va;
2355 caddr_t dst_va;
2356 size_t size;
2357 x86pte_t *pteptr;
2358 x86pte_t pte;
2359
2360 ASSERT(khat_running);
2361 ASSERT(!(dest->ht_flags & HTABLE_VLP));
2362 ASSERT(!(src->ht_flags & HTABLE_VLP));
2363 ASSERT(!(src->ht_flags & HTABLE_SHARED_PFN));
2364 ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
2365
2366 /*
2367 * Acquire access to the CPU pagetable windows for the dest and source.
2368 */
2369 dst_va = (caddr_t)x86pte_access_pagetable(dest, entry);
2370 if (kpm_vbase) {
2371 src_va = (caddr_t)
2372 PT_INDEX_PTR(hat_kpm_pfn2va(src->ht_pfn), entry);
2373 } else {
2374 uint_t x = PWIN_SRC(CPU->cpu_id);
2375
2376 /*
2377 * Finish defining the src pagetable mapping
2378 */
2379 src_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry);
2380 pte = MAKEPTE(src->ht_pfn, 0) | mmu.pt_global | mmu.pt_nx;
2381 pteptr = (x86pte_t *)PWIN_PTE_VA(x);
2382 if (mmu.pae_hat)
2383 *pteptr = pte;
2384 else
2385 *(x86pte32_t *)pteptr = pte;
2386 mmu_tlbflush_entry((caddr_t)(PWIN_VA(x)));
2387 }
2388
2389 /*
2390 * now do the copy
2391 */
2392 size = count << mmu.pte_size_shift;
2393 bcopy(src_va, dst_va, size);
2394
2395 x86pte_release_pagetable(dest);
2396 }
2397
2398 #else /* __xpv */
2399
2400 /*
2401 * The hypervisor only supports writable pagetables at level 0, so we have
2402 * to install these one by one, the slow way.
2403 */
2404 void
2405 x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
2406 {
2433 }
2434 #endif /* __xpv */
2435
2436 /*
2437 * Zero page table entries - Note this doesn't use atomic stores!
2438 */
2439 static void
2440 x86pte_zero(htable_t *dest, uint_t entry, uint_t count)
2441 {
2442 caddr_t dst_va;
2443 size_t size;
2444 #ifdef __xpv
2445 int x;
2446 x86pte_t newpte;
2447 #endif
2448
2449 /*
2450 * Map in the page table to be zeroed.
2451 */
2452 ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
2453 ASSERT(!(dest->ht_flags & HTABLE_VLP));
2454
2455 /*
2456 * On the hypervisor we don't use x86pte_access_pagetable() since
2457 * in this case the page is not pinned yet.
2458 */
2459 #ifdef __xpv
2460 if (kpm_vbase == NULL) {
2461 kpreempt_disable();
2462 ASSERT(CPU->cpu_hat_info != NULL);
2463 mutex_enter(&CPU->cpu_hat_info->hci_mutex);
2464 x = PWIN_TABLE(CPU->cpu_id);
2465 newpte = MAKEPTE(dest->ht_pfn, 0) | PT_WRITABLE;
2466 xen_map(newpte, PWIN_VA(x));
2467 dst_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry);
2468 } else
2469 #endif
2470 dst_va = (caddr_t)x86pte_access_pagetable(dest, entry);
2471
2472 size = count << mmu.pte_size_shift;
2473 ASSERT(size > BLOCKZEROALIGN);
2487 #endif
2488 x86pte_release_pagetable(dest);
2489 }
2490
2491 /*
2492 * Called to ensure that all pagetables are in the system dump
2493 */
2494 void
2495 hat_dump(void)
2496 {
2497 hat_t *hat;
2498 uint_t h;
2499 htable_t *ht;
2500
2501 /*
2502 * Dump all page tables
2503 */
2504 for (hat = kas.a_hat; hat != NULL; hat = hat->hat_next) {
2505 for (h = 0; h < hat->hat_num_hash; ++h) {
2506 for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
2507 if ((ht->ht_flags & HTABLE_VLP) == 0)
2508 dump_page(ht->ht_pfn);
2509 }
2510 }
2511 }
2512 }
|
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2014 by Delphix. All rights reserved.
25 * Copyright 2018 Joyent, Inc.
26 */
27
28 #include <sys/types.h>
29 #include <sys/sysmacros.h>
30 #include <sys/kmem.h>
31 #include <sys/atomic.h>
32 #include <sys/bitmap.h>
33 #include <sys/machparam.h>
34 #include <sys/machsystm.h>
35 #include <sys/mman.h>
36 #include <sys/systm.h>
37 #include <sys/cpuvar.h>
38 #include <sys/thread.h>
39 #include <sys/proc.h>
40 #include <sys/cpu.h>
41 #include <sys/kmem.h>
42 #include <sys/disp.h>
43 #include <sys/vmem.h>
44 #include <sys/vmsystm.h>
45 #include <sys/promif.h>
120 * instead of putting them in a hat's htable cache.
121 */
122 uint32_t htable_dont_cache = 0;
123
124 /*
125 * Track the number of active pagetables, so we can know how many to reap
126 */
127 static uint32_t active_ptables = 0;
128
129 #ifdef __xpv
130 /*
131 * Deal with hypervisor complications.
132 */
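/*
 * Invalidate the TLB entry for a single virtual address on this CPU:
 * directly when panicking out from under the hypervisor, otherwise via
 * an MMUEXT_INVLPG_LOCAL hypercall.
 */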
133 void
134 xen_flush_va(caddr_t va)
135 {
136 struct mmuext_op t;
137 uint_t count;
138
139 if (IN_XPV_PANIC()) {
140 mmu_flush_tlb_page((uintptr_t)va);
141 } else {
142 t.cmd = MMUEXT_INVLPG_LOCAL;
143 t.arg1.linear_addr = (uintptr_t)va;
144 if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
145 panic("HYPERVISOR_mmuext_op() failed");
146 ASSERT(count == 1);
147 }
148 }
149
150 void
151 xen_gflush_va(caddr_t va, cpuset_t cpus)
152 {
153 struct mmuext_op t;
154 uint_t count;
155
156 if (IN_XPV_PANIC()) {
157 mmu_flush_tlb_page((uintptr_t)va);
158 return;
159 }
160
161 t.cmd = MMUEXT_INVLPG_MULTI;
162 t.arg1.linear_addr = (uintptr_t)va;
163 /*LINTED: constant in conditional context*/
164 set_xen_guest_handle(t.arg2.vcpumask, &cpus);
165 if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
166 panic("HYPERVISOR_mmuext_op() failed");
167 ASSERT(count == 1);
168 }
169
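/*
 * Flush all TLB entries on the current CPU.
 */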
170 void
171 xen_flush_tlb()
172 {
173 struct mmuext_op t;
174 uint_t count;
175
176 if (IN_XPV_PANIC()) {
177 xpv_panic_reload_cr3();
604 atomic_inc_32(&htable_dont_cache);
605 for (pass = 0; pass <= passes && stolen < cnt; ++pass) {
606 threshold = pass * mmu.ptes_per_table / htable_steal_passes;
607
608 mutex_enter(&hat_list_lock);
609
610 /* skip the first hat (kernel) */
611 hat = kas.a_hat->hat_next;
612 for (;;) {
613 /*
614 * Skip any hat that is already being stolen from.
615 *
616 * We skip SHARED hats, as these are dummy
617 * hats that host ISM shared page tables.
618 *
619 * We also skip if HAT_FREEING because hat_pte_unmap()
620 * won't zero out the PTEs. That would lead to hitting
621 * stale PTEs either here or under hat_unload() when we
622 * steal and unload the same page table in competing
623 * threads.
624 *
625 * We skip HATs that belong to CPUs, to make our lives
626 * simpler.
627 */
628 while (hat != NULL && (hat->hat_flags &
629 (HAT_VICTIM | HAT_SHARED | HAT_FREEING |
630 HAT_PCP)) != 0) {
631 hat = hat->hat_next;
632 }
633
634 if (hat == NULL)
635 break;
636
637 /*
638 * Mark the HAT as a stealing victim so that it is
639 * not freed from under us, e.g. in as_free()
640 */
641 hat->hat_flags |= HAT_VICTIM;
642 mutex_exit(&hat_list_lock);
643
644 /*
645 * Take any htables from the hat's cached "free" list.
646 */
647 hat_enter(hat);
648 while ((ht = hat->hat_ht_cached) != NULL &&
649 stolen < cnt) {
650 hat->hat_ht_cached = ht->ht_next;
651 ht->ht_next = list;
652 list = ht;
655 hat_exit(hat);
656
657 /*
658 * Don't steal active htables on first pass.
659 */
660 if (pass != 0 && (stolen < cnt))
661 htable_steal_active(hat, cnt, threshold,
662 &stolen, &list);
663
664 /*
665 * do synchronous teardown for the reap case so that
666 * we can forget hat; at this time, hat is
667 * guaranteed to be around because HAT_VICTIM is set
668 * (see htable_free() for similar code)
669 */
670 for (ht = list; (ht) && (reap); ht = ht->ht_next) {
671 if (ht->ht_hat == NULL)
672 continue;
673 ASSERT(ht->ht_hat == hat);
674 #if defined(__xpv) && defined(__amd64)
675 ASSERT(!(ht->ht_flags & HTABLE_COPIED));
676 if (ht->ht_level == mmu.max_level) {
677 ptable_free(hat->hat_user_ptable);
678 hat->hat_user_ptable = PFN_INVALID;
679 }
680 #endif
681 /*
682 * forget the hat
683 */
684 ht->ht_hat = NULL;
685 }
686
687 mutex_enter(&hat_list_lock);
688
689 /*
690 * Are we finished?
691 */
692 if (stolen == cnt) {
693 /*
694 * Try to spread the pain of stealing,
695 * move victim HAT to the end of the HAT list.
696 */
766 atomic_dec_32(&htable_dont_cache);
767
768 /*
769 * Free up excess reserves
770 */
771 htable_adjust_reserve();
772 hment_adjust_reserve();
773 }
774
775 /*
776 * Allocate an htable, stealing one or using the reserve if necessary
777 */
778 static htable_t *
779 htable_alloc(
780 hat_t *hat,
781 uintptr_t vaddr,
782 level_t level,
783 htable_t *shared)
784 {
785 htable_t *ht = NULL;
786 uint_t is_copied;
787 uint_t is_bare = 0;
788 uint_t need_to_zero = 1;
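/*
 * Once stealing is possible (after boot), use KM_NOSLEEP so a failed
 * allocation can presumably fall back to stealing an htable rather than
 * blocking here.
 */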
789 int kmflags = (can_steal_post_boot ? KM_NOSLEEP : KM_SLEEP);
790
791 if (level < 0 || level > TOP_LEVEL(hat))
792 panic("htable_alloc(): level %d out of range\n", level);
793
794 is_copied = (hat->hat_flags & HAT_COPIED) &&
795 level == hat->hat_max_level;
796 if (is_copied || shared != NULL)
797 is_bare = 1;
798
799 /*
800 * First reuse a cached htable from the hat_ht_cached field; this
801 * avoids unnecessary trips through kmem/page allocators.
802 */
803 if (hat->hat_ht_cached != NULL && !is_bare) {
804 hat_enter(hat);
805 ht = hat->hat_ht_cached;
806 if (ht != NULL) {
807 hat->hat_ht_cached = ht->ht_next;
808 need_to_zero = 0;
809 /* XX64 ASSERT() they're all zero somehow */
810 ASSERT(ht->ht_pfn != PFN_INVALID);
811 }
812 hat_exit(hat);
813 }
814
815 if (ht == NULL) {
816 /*
918 /*
919 * Shared page tables have all entries locked and entries may not
920 * be added or deleted.
921 */
922 ht->ht_flags = 0;
923 if (shared != NULL) {
924 ASSERT(shared->ht_valid_cnt > 0);
925 ht->ht_flags |= HTABLE_SHARED_PFN;
926 ht->ht_pfn = shared->ht_pfn;
927 ht->ht_lock_cnt = 0;
928 ht->ht_valid_cnt = 0; /* updated in hat_share() */
929 ht->ht_shares = shared;
930 need_to_zero = 0;
931 } else {
932 ht->ht_shares = NULL;
933 ht->ht_lock_cnt = 0;
934 ht->ht_valid_cnt = 0;
935 }
936
937 /*
938 * set up flags, etc. for copied page tables.
939 */
940 if (is_copied) {
941 ht->ht_flags |= HTABLE_COPIED;
942 ASSERT(ht->ht_pfn == PFN_INVALID);
943 need_to_zero = 0;
944 }
945
946 /*
947 * fill in the htable
948 */
949 ht->ht_hat = hat;
950 ht->ht_parent = NULL;
951 ht->ht_vaddr = vaddr;
952 ht->ht_level = level;
953 ht->ht_busy = 1;
954 ht->ht_next = NULL;
955 ht->ht_prev = NULL;
956
957 /*
958 * Zero out any freshly allocated page table
959 */
960 if (need_to_zero)
961 x86pte_zero(ht, 0, mmu.ptes_per_table);
972 }
973
974 /*
975 * Free up an htable, either to a hat's cached list, the reserves or
976 * back to kmem.
977 */
978 static void
979 htable_free(htable_t *ht)
980 {
981 hat_t *hat = ht->ht_hat;
982
983 /*
984 * If the process isn't exiting, cache the free htable in the hat
985 * structure. We always do this for the boot time reserve. We don't
986 * do this if the hat is exiting or we are stealing/reaping htables.
987 */
988 if (hat != NULL &&
989 !(ht->ht_flags & HTABLE_SHARED_PFN) &&
990 (use_boot_reserve ||
991 (!(hat->hat_flags & HAT_FREEING) && !htable_dont_cache))) {
992 ASSERT((ht->ht_flags & HTABLE_COPIED) == 0);
993 ASSERT(ht->ht_pfn != PFN_INVALID);
994 hat_enter(hat);
995 ht->ht_next = hat->hat_ht_cached;
996 hat->hat_ht_cached = ht;
997 hat_exit(hat);
998 return;
999 }
1000
1001 /*
1002 * If we have a hardware page table, free it.
1003 * We don't free page tables that are accessed by sharing.
1004 */
1005 if (ht->ht_flags & HTABLE_SHARED_PFN) {
1006 ASSERT(ht->ht_pfn != PFN_INVALID);
1007 } else if (!(ht->ht_flags & HTABLE_COPIED)) {
1008 ptable_free(ht->ht_pfn);
1009 #if defined(__amd64) && defined(__xpv)
1010 if (ht->ht_level == mmu.max_level && hat != NULL) {
1011 ptable_free(hat->hat_user_ptable);
1012 hat->hat_user_ptable = PFN_INVALID;
1013 }
1014 #endif
1015 }
1016 ht->ht_pfn = PFN_INVALID;
1017
1018 /*
1019 * Free it or put into reserves.
1020 */
1021 if (USE_HAT_RESERVES() || htable_reserve_cnt < htable_reserve_amount) {
1022 htable_put_reserve(ht);
1023 } else {
1024 kmem_cache_free(htable_cache, ht);
1025 htable_adjust_reserve();
1026 }
1027 }
1099 x86pte_t found;
1100 hat_t *hat = old->ht_hat;
1101
1102 ASSERT(higher->ht_busy > 0);
1103 ASSERT(higher->ht_valid_cnt > 0);
1104 ASSERT(old->ht_valid_cnt == 0);
1105 found = x86pte_cas(higher, entry, expect, 0);
1106 #ifdef __xpv
1107 /*
1108 * This is weird, but Xen apparently automatically unlinks empty
1109 * pagetables from the upper page table. So allow PTP to be 0 already.
1110 */
1111 if (found != expect && found != 0)
1112 #else
1113 if (found != expect)
1114 #endif
1115 panic("Bad PTP found=" FMT_PTE ", expected=" FMT_PTE,
1116 found, expect);
1117
1118 /*
1119 * When a top level PTE changes for a copied htable, we must trigger a
1120 * hat_pcp_update() on all HAT CPUs.
1121 *
1122 * If we don't need to do that, then we still have to INVLPG against an
1123 * address covered by the inner page table, as the latest processors
1124 * have TLB-like caches for non-leaf page table entries.
1125 */
1126 if (!(hat->hat_flags & HAT_FREEING)) {
1127 hat_tlb_inval(hat, (higher->ht_flags & HTABLE_COPIED) ?
1128 DEMAP_ALL_ADDR : old->ht_vaddr);
1129 }
1130
1131 HTABLE_DEC(higher->ht_valid_cnt);
1132 }
1133
1134 /*
1135 * Link an entry for a new table at vaddr and level into the existing table
1136 * one level higher. We are always holding the HASH_ENTER() when doing this.
1137 */
1138 static void
1139 link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr)
1140 {
1141 uint_t entry = htable_va2entry(vaddr, higher);
1142 x86pte_t newptp = MAKEPTP(new->ht_pfn, new->ht_level);
1143 x86pte_t found;
1144
1145 ASSERT(higher->ht_busy > 0);
1146
1147 ASSERT(new->ht_level != mmu.max_level);
1148
1149 HTABLE_INC(higher->ht_valid_cnt);
1150
1151 found = x86pte_cas(higher, entry, 0, newptp);
1152 if ((found & ~PT_REF) != 0)
1153 panic("HAT: ptp not 0, found=" FMT_PTE, found);
1154
1155 /*
1156 * When a top level PTE changes for a copied htable, we must trigger a
1157 * hat_pcp_update() on all HAT CPUs.
1158 *
1159 * We also need to do this for the kernel hat on PAE 32 bit kernel.
1160 */
1161 if (
1162 #ifdef __i386
1163 (higher->ht_hat == kas.a_hat &&
1164 higher->ht_level == higher->ht_hat->hat_max_level) ||
1165 #endif
1166 (higher->ht_flags & HTABLE_COPIED))
1167 hat_tlb_inval(higher->ht_hat, DEMAP_ALL_ADDR);
1168 }
1169
1170 /*
1171 * Release a hold on an htable. If this is the last use and the pagetable
1172 * is empty we may want to free it, then recursively look at the pagetable
1173 * above it. The recursion is handled by the outer while() loop.
1174 *
1175 * On the metal, during process exit, we don't bother unlinking the tables from
1176 * upper level pagetables. They are instead handled in bulk by hat_free_end().
1177 * We can't do this on the hypervisor as we need the page table to be
1178 * implicitly unpinned before it goes to the free page lists. This can't
1179 * happen unless we fully unlink it from the page table hierarchy.
1180 */
1181 void
1182 htable_release(htable_t *ht)
1183 {
1184 uint_t hashval;
1185 htable_t *shared;
1186 htable_t *higher;
1285 /*
1286 * Find the htable for the pagetable at the given level for the given address.
1287 * If found, acquires a hold that eventually needs to be htable_release()d
1288 */
1289 htable_t *
1290 htable_lookup(hat_t *hat, uintptr_t vaddr, level_t level)
1291 {
1292 uintptr_t base;
1293 uint_t hashval;
1294 htable_t *ht = NULL;
1295
1296 ASSERT(level >= 0);
1297 ASSERT(level <= TOP_LEVEL(hat));
1298
1299 if (level == TOP_LEVEL(hat)) {
1300 #if defined(__amd64)
1301 /*
1302 * 32 bit address spaces on 64 bit kernels need to check
1303 * for overflow of the 32 bit address space
1304 */
1305 if ((hat->hat_flags & HAT_COPIED_32) &&
1306 vaddr >= ((uint64_t)1 << 32))
1307 return (NULL);
1308 #endif
1309 base = 0;
1310 } else {
1311 base = vaddr & LEVEL_MASK(level + 1);
1312 }
1313
1314 hashval = HTABLE_HASH(hat, base, level);
1315 HTABLE_ENTER(hashval);
1316 for (ht = hat->hat_ht_hash[hashval]; ht; ht = ht->ht_next) {
1317 if (ht->ht_hat == hat &&
1318 ht->ht_vaddr == base &&
1319 ht->ht_level == level)
1320 break;
1321 }
1322 if (ht)
1323 ++ht->ht_busy;
1324
1325 HTABLE_EXIT(hashval);
1326 return (ht);
1934
1935 ASSERT(mmu.pae_hat != 0);
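/*
 * Assemble the 64-bit PTE from two 32-bit reads, retrying until the
 * low word is unchanged so the two halves are known to be consistent.
 */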
1936 for (;;) {
1937 t = p[0];
1938 t |= (uint64_t)p[1] << 32;
1939 if ((t & 0xffffffff) == p[0])
1940 return (t);
1941 }
1942 }
1943 #endif /* __i386 */
1944
1945 /*
1946 * Disable preemption and establish a mapping to the pagetable with the
1946 * given pfn. This is optimized for the case where it's the same
1947 * pfn we last referenced from this CPU.
1949 */
1950 static x86pte_t *
1951 x86pte_access_pagetable(htable_t *ht, uint_t index)
1952 {
1953 /*
1954 * HTABLE_COPIED pagetables are contained in the hat_t
1955 */
1956 if (ht->ht_flags & HTABLE_COPIED) {
1957 ASSERT3U(index, <, ht->ht_hat->hat_num_copied);
1958 return (PT_INDEX_PTR(ht->ht_hat->hat_copied_ptes, index));
1959 }
1960 return (x86pte_mapin(ht->ht_pfn, index, ht));
1961 }
1962
1963 /*
1964 * map the given pfn into the page table window.
1965 */
1966 /*ARGSUSED*/
1967 x86pte_t *
1968 x86pte_mapin(pfn_t pfn, uint_t index, htable_t *ht)
1969 {
1970 x86pte_t *pteptr;
1971 x86pte_t pte = 0;
1972 x86pte_t newpte;
1973 int x;
1974
1975 ASSERT(pfn != PFN_INVALID);
1976
1977 if (!khat_running) {
1978 caddr_t va = kbm_remap_window(pfn_to_pa(pfn), 1);
1979 return (PT_INDEX_PTR(va, index));
1980 }
1981
1982 /*
1983 * If kpm is available, use it.
1984 */
1985 if (kpm_vbase)
1986 return (PT_INDEX_PTR(hat_kpm_pfn2va(pfn), index));
1987
1988 /*
1989 * Disable preemption and grab the CPU's hci_mutex
1990 */
1991 kpreempt_disable();
1992
1993 ASSERT(CPU->cpu_hat_info != NULL);
1994 ASSERT(!(getcr4() & CR4_PCIDE));
1995
1996 mutex_enter(&CPU->cpu_hat_info->hci_mutex);
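/*
 * Each CPU has a private pagetable mapping window (PWIN); pick this
 * CPU's slot and read the PTE currently mapping it so the mapping can
 * be reused if it already points at the desired pfn.
 */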
1997 x = PWIN_TABLE(CPU->cpu_id);
1998 pteptr = (x86pte_t *)PWIN_PTE_VA(x);
1999 #ifndef __xpv
2000 if (mmu.pae_hat)
2001 pte = *pteptr;
2002 else
2003 pte = *(x86pte32_t *)pteptr;
2004 #endif
2005
2006 newpte = MAKEPTE(pfn, 0) | mmu.pt_global | mmu.pt_nx;
2007
2008 /*
2009 * For hardware we can use a writable mapping.
2010 */
2011 #ifdef __xpv
2012 if (IN_XPV_PANIC())
2013 #endif
2014 newpte |= PT_WRITABLE;
2015
2016 if (!PTE_EQUIV(newpte, pte)) {
2017
2018 #ifdef __xpv
2019 if (!IN_XPV_PANIC()) {
2020 xen_map(newpte, PWIN_VA(x));
2021 } else
2022 #endif
2023 {
2024 XPV_ALLOW_PAGETABLE_UPDATES();
2025 if (mmu.pae_hat)
2026 *pteptr = newpte;
2027 else
2028 *(x86pte32_t *)pteptr = newpte;
2029 XPV_DISALLOW_PAGETABLE_UPDATES();
2030 mmu_flush_tlb_kpage((uintptr_t)PWIN_VA(x));
2031 }
2032 }
2033 return (PT_INDEX_PTR(PWIN_VA(x), index));
2034 }
2035
2036 /*
2037 * Release access to a page table.
2038 */
2039 static void
2040 x86pte_release_pagetable(htable_t *ht)
2041 {
2042 if (ht->ht_flags & HTABLE_COPIED)
2043 return;
2044
2045 x86pte_mapout();
2046 }
2047
2048 void
2049 x86pte_mapout(void)
2050 {
2051 if (kpm_vbase != NULL || !khat_running)
2052 return;
2053
2054 /*
2055 * Drop the CPU's hci_mutex and restore preemption.
2056 */
2057 #ifdef __xpv
2058 if (!IN_XPV_PANIC()) {
2059 uintptr_t va;
2060
2061 /*
2062 * We need to always clear the mapping in case a page
2123 * Install the new PTE. If remapping the same PFN, then
2124 * copy existing REF/MOD bits to new mapping.
2125 */
2126 do {
2127 prev = GET_PTE(ptep);
2128 n = new;
2129 if (PTE_ISVALID(n) && (prev & pfn_mask) == (new & pfn_mask))
2130 n |= prev & (PT_REF | PT_MOD);
2131
2132 /*
2133 * Another thread may have installed this mapping already;
2134 * flush the local TLB and be done.
2135 */
2136 if (prev == n) {
2137 old = new;
2138 #ifdef __xpv
2139 if (!IN_XPV_PANIC())
2140 xen_flush_va((caddr_t)addr);
2141 else
2142 #endif
2143 mmu_flush_tlb_page(addr);
2144 goto done;
2145 }
2146
2147 /*
2148 * Detect a collision where we are installing a large page
2149 * mapping but a lower level page table already exists there.
2150 */
2151 if (l > 0 && (prev & PT_VALID) && !(prev & PT_PAGESIZE)) {
2152 old = LPAGE_ERROR;
2153 goto done;
2154 }
2155
2156 XPV_ALLOW_PAGETABLE_UPDATES();
2157 old = CAS_PTE(ptep, prev, n);
2158 XPV_DISALLOW_PAGETABLE_UPDATES();
2159 } while (old != prev);
2160
2161 /*
2162 * Do a TLB demap if needed, i.e. the old pte was valid.
2163 *
2182 * Atomic compare and swap of a page table entry. No TLB invalidates are done.
2183 * This is used for links between pagetables of different levels.
2184 * Note we always create these links with dirty/access set, so they should
2185 * never change.
2186 */
2187 x86pte_t
2188 x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old, x86pte_t new)
2189 {
2190 x86pte_t pte;
2191 x86pte_t *ptep;
2192 #ifdef __xpv
2193 /*
2194 * We can't use writable pagetables for upper level tables, so fake it.
2195 */
2196 mmu_update_t t[2];
2197 int cnt = 1;
2198 int count;
2199 maddr_t ma;
2200
2201 if (!IN_XPV_PANIC()) {
2202 ASSERT(!(ht->ht_flags & HTABLE_COPIED));
2203 ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry));
2204 t[0].ptr = ma | MMU_NORMAL_PT_UPDATE;
2205 t[0].val = new;
2206
2207 #if defined(__amd64)
2208 /*
2209 * On the 64-bit hypervisor we need to maintain the user mode
2210 * top page table too.
2211 */
2212 if (ht->ht_level == mmu.max_level && ht->ht_hat != kas.a_hat) {
2213 ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(
2214 ht->ht_hat->hat_user_ptable), entry));
2215 t[1].ptr = ma | MMU_NORMAL_PT_UPDATE;
2216 t[1].val = new;
2217 ++cnt;
2218 }
2219 #endif /* __amd64 */
2220
2221 if (HYPERVISOR_mmu_update(t, cnt, &count, DOMID_SELF))
2222 panic("HYPERVISOR_mmu_update() failed");
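/*
 * If this update would drop write access from a previously clean,
 * writable entry but the hardware has since set PT_MOD, put PT_WRITABLE
 * back (retrying the CAS until it takes), presumably so the now-dirty
 * page is not left mapped read-only.
 */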
2339 if ((expect & (PT_WRITABLE | PT_MOD)) == PT_WRITABLE &&
2340 (new & (PT_WRITABLE | PT_MOD)) == 0 &&
2341 (GET_PTE(ptep) & PT_MOD) != 0) {
2342 do {
2343 found = GET_PTE(ptep);
2344 XPV_ALLOW_PAGETABLE_UPDATES();
2345 found =
2346 CAS_PTE(ptep, found, found | PT_WRITABLE);
2347 XPV_DISALLOW_PAGETABLE_UPDATES();
2348 } while ((found & PT_WRITABLE) == 0);
2349 }
2350 }
2351 x86pte_release_pagetable(ht);
2352 return (found);
2353 }
2354
2355 #ifndef __xpv
2356 /*
2357 * Copy page tables - this is just a little more complicated than the
2358 * previous routines. Note that it's also not atomic! It also is never
2359 * used for HTABLE_COPIED pagetables.
2360 */
2361 void
2362 x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
2363 {
2364 caddr_t src_va;
2365 caddr_t dst_va;
2366 size_t size;
2367 x86pte_t *pteptr;
2368 x86pte_t pte;
2369
2370 ASSERT(khat_running);
2371 ASSERT(!(dest->ht_flags & HTABLE_COPIED));
2372 ASSERT(!(src->ht_flags & HTABLE_COPIED));
2373 ASSERT(!(src->ht_flags & HTABLE_SHARED_PFN));
2374 ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
2375
2376 /*
2377 * Acquire access to the CPU pagetable windows for the dest and source.
2378 */
2379 dst_va = (caddr_t)x86pte_access_pagetable(dest, entry);
2380 if (kpm_vbase) {
2381 src_va = (caddr_t)
2382 PT_INDEX_PTR(hat_kpm_pfn2va(src->ht_pfn), entry);
2383 } else {
2384 uint_t x = PWIN_SRC(CPU->cpu_id);
2385
2386 ASSERT(!(getcr4() & CR4_PCIDE));
2387
2388 /*
2389 * Finish defining the src pagetable mapping
2390 */
2391 src_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry);
2392 pte = MAKEPTE(src->ht_pfn, 0) | mmu.pt_global | mmu.pt_nx;
2393 pteptr = (x86pte_t *)PWIN_PTE_VA(x);
2394 if (mmu.pae_hat)
2395 *pteptr = pte;
2396 else
2397 *(x86pte32_t *)pteptr = pte;
2398 mmu_flush_tlb_kpage((uintptr_t)PWIN_VA(x));
2399 }
2400
2401 /*
2402 * now do the copy
2403 */
2404 size = count << mmu.pte_size_shift;
2405 bcopy(src_va, dst_va, size);
2406
2407 x86pte_release_pagetable(dest);
2408 }
2409
2410 #else /* __xpv */
2411
2412 /*
2413 * The hypervisor only supports writable pagetables at level 0, so we have
2414 * to install these one by one, the slow way.
2415 */
2416 void
2417 x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
2418 {
2445 }
2446 #endif /* __xpv */
2447
2448 /*
2449 * Zero page table entries - Note this doesn't use atomic stores!
2450 */
2451 static void
2452 x86pte_zero(htable_t *dest, uint_t entry, uint_t count)
2453 {
2454 caddr_t dst_va;
2455 size_t size;
2456 #ifdef __xpv
2457 int x;
2458 x86pte_t newpte;
2459 #endif
2460
2461 /*
2462 * Map in the page table to be zeroed.
2463 */
2464 ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
2465 ASSERT(!(dest->ht_flags & HTABLE_COPIED));
2466
2467 /*
2468 * On the hypervisor we don't use x86pte_access_pagetable() since
2469 * in this case the page is not pinned yet.
2470 */
2471 #ifdef __xpv
2472 if (kpm_vbase == NULL) {
2473 kpreempt_disable();
2474 ASSERT(CPU->cpu_hat_info != NULL);
2475 mutex_enter(&CPU->cpu_hat_info->hci_mutex);
2476 x = PWIN_TABLE(CPU->cpu_id);
2477 newpte = MAKEPTE(dest->ht_pfn, 0) | PT_WRITABLE;
2478 xen_map(newpte, PWIN_VA(x));
2479 dst_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry);
2480 } else
2481 #endif
2482 dst_va = (caddr_t)x86pte_access_pagetable(dest, entry);
2483
2484 size = count << mmu.pte_size_shift;
2485 ASSERT(size > BLOCKZEROALIGN);
2499 #endif
2500 x86pte_release_pagetable(dest);
2501 }
2502
2503 /*
2504 * Called to ensure that all pagetables are in the system dump
2505 */
2506 void
2507 hat_dump(void)
2508 {
2509 hat_t *hat;
2510 uint_t h;
2511 htable_t *ht;
2512
2513 /*
2514 * Dump all page tables
2515 */
2516 for (hat = kas.a_hat; hat != NULL; hat = hat->hat_next) {
2517 for (h = 0; h < hat->hat_num_hash; ++h) {
2518 for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
2519 if ((ht->ht_flags & HTABLE_COPIED) == 0)
2520 dump_page(ht->ht_pfn);
2521 }
2522 }
2523 }
2524 }
|