8956 Implement KPTI
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
--- old/usr/src/uts/i86pc/vm/i86_mmu.c
+++ new/usr/src/uts/i86pc/vm/i86_mmu.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 + *
25 + * Copyright 2018 Joyent, Inc.
24 26 */
25 27
26 28 #include <sys/t_lock.h>
27 29 #include <sys/memlist.h>
28 30 #include <sys/cpuvar.h>
29 31 #include <sys/vmem.h>
30 32 #include <sys/mman.h>
31 33 #include <sys/vm.h>
32 34 #include <sys/kmem.h>
33 35 #include <sys/cmn_err.h>
34 36 #include <sys/debug.h>
35 37 #include <sys/vm_machparam.h>
36 38 #include <sys/tss.h>
37 39 #include <sys/vnode.h>
38 40 #include <vm/hat.h>
39 41 #include <vm/anon.h>
40 42 #include <vm/as.h>
41 43 #include <vm/page.h>
42 44 #include <vm/seg.h>
43 45 #include <vm/seg_kmem.h>
44 46 #include <vm/seg_map.h>
45 47 #include <vm/hat_i86.h>
46 48 #include <sys/promif.h>
47 49 #include <sys/x86_archext.h>
48 50 #include <sys/systm.h>
49 51 #include <sys/archsystm.h>
50 52 #include <sys/sunddi.h>
51 53 #include <sys/ddidmareq.h>
52 54 #include <sys/controlregs.h>
53 55 #include <sys/reboot.h>
54 56 #include <sys/kdi.h>
55 57 #include <sys/bootconf.h>
56 58 #include <sys/bootsvcs.h>
57 59 #include <sys/bootinfo.h>
58 60 #include <vm/kboot_mmu.h>
59 61
60 62 #ifdef __xpv
61 63 #include <sys/hypervisor.h>
62 64 #endif
63 65
64 -caddr_t
65 -i86devmap(pfn_t pf, pgcnt_t pgcnt, uint_t prot)
66 -{
67 - caddr_t addr;
68 - caddr_t addr1;
69 - page_t *pp;
66 +#define ON_USER_HAT(cpu) \
67 + ((cpu)->cpu_m.mcpu_current_hat != NULL && \
68 + (cpu)->cpu_m.mcpu_current_hat != kas.a_hat)
70 69
71 - addr1 = addr = vmem_alloc(heap_arena, mmu_ptob(pgcnt), VM_SLEEP);
72 -
73 - for (; pgcnt != 0; addr += MMU_PAGESIZE, ++pf, --pgcnt) {
74 - pp = page_numtopp_nolock(pf);
75 - if (pp == NULL) {
76 - hat_devload(kas.a_hat, addr, MMU_PAGESIZE, pf,
77 - prot | HAT_NOSYNC, HAT_LOAD_LOCK);
78 - } else {
79 - hat_memload(kas.a_hat, addr, pp,
80 - prot | HAT_NOSYNC, HAT_LOAD_LOCK);
81 - }
82 - }
83 -
84 - return (addr1);
85 -}
86 -
87 70 /*
88 - * This routine is like page_numtopp, but accepts only free pages, which
89 - * it allocates (unfrees) and returns with the exclusive lock held.
90 - * It is used by machdep.c/dma_init() to find contiguous free pages.
91 - *
92 - * XXX this and some others should probably be in vm_machdep.c
93 - */
94 -page_t *
95 -page_numtopp_alloc(pfn_t pfnum)
96 -{
97 - page_t *pp;
98 -
99 -retry:
100 - pp = page_numtopp_nolock(pfnum);
101 - if (pp == NULL) {
102 - return (NULL);
103 - }
104 -
105 - if (!page_trylock(pp, SE_EXCL)) {
106 - return (NULL);
107 - }
108 -
109 - if (page_pptonum(pp) != pfnum) {
110 - page_unlock(pp);
111 - goto retry;
112 - }
113 -
114 - if (!PP_ISFREE(pp)) {
115 - page_unlock(pp);
116 - return (NULL);
117 - }
118 - if (pp->p_szc) {
119 - page_demote_free_pages(pp);
120 - page_unlock(pp);
121 - goto retry;
122 - }
123 -
124 - /* If associated with a vnode, destroy mappings */
125 -
126 - if (pp->p_vnode) {
127 -
128 - page_destroy_free(pp);
129 -
130 - if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_NO_RECLAIM)) {
131 - return (NULL);
132 - }
133 -
134 - if (page_pptonum(pp) != pfnum) {
135 - page_unlock(pp);
136 - goto retry;
137 - }
138 - }
139 -
140 - if (!PP_ISFREE(pp)) {
141 - page_unlock(pp);
142 - return (NULL);
143 - }
144 -
145 - if (!page_reclaim(pp, (kmutex_t *)NULL))
146 - return (NULL);
147 -
148 - return (pp);
149 -}
150 -
151 -/*
152 71 * Flag is not set early in boot. Once it is set we are no longer
153 72 * using boot's page tables.
154 73 */
155 74 uint_t khat_running = 0;
156 75
157 76 /*
158 77 * This procedure is callable only while the boot loader is in charge of the
159 78 * MMU. It assumes that PA == VA for page table pointers. It doesn't live in
160 79 * kboot_mmu.c since it's used from common code.
161 80 */
162 81 pfn_t
163 82 va_to_pfn(void *vaddr)
164 83 {
165 84 uintptr_t des_va = ALIGN2PAGE(vaddr);
166 85 uintptr_t va = des_va;
167 86 size_t len;
168 87 uint_t prot;
169 88 pfn_t pfn;
170 89
171 90 if (khat_running)
172 91 panic("va_to_pfn(): called too late\n");
173 92
174 93 if (kbm_probe(&va, &len, &pfn, &prot) == 0)
175 94 return (PFN_INVALID);
176 95 if (va > des_va)
177 96 return (PFN_INVALID);
178 97 if (va < des_va)
179 98 pfn += mmu_btop(des_va - va);
180 99 return (pfn);
181 100 }
182 101
183 102 /*
184 103 * Initialize a special area in the kernel that always holds some PTEs for
185 104 * faster performance. This always holds segmap's PTEs.
186 105 * In the 32 bit kernel this maps the kernel heap too.
187 106 */
188 107 void
189 108 hat_kmap_init(uintptr_t base, size_t len)
190 109 {
191 110 uintptr_t map_addr; /* base rounded down to large page size */
192 111 uintptr_t map_eaddr; /* base + len rounded up */
193 112 size_t map_len;
194 113 caddr_t ptes; /* mapping area in kernel for kmap ptes */
195 114 size_t window_size; /* size of mapping area for ptes */
196 115 ulong_t htable_cnt; /* # of page tables to cover map_len */
197 116 ulong_t i;
198 117 htable_t *ht;
199 118 uintptr_t va;
200 119
201 120 /*
202 121 * We have to map in an area that matches an entire page table.
203 122 * The PTEs are large page aligned to avoid spurious pagefaults
204 123 * on the hypervisor.
205 124 */
206 125 map_addr = base & LEVEL_MASK(1);
207 126 map_eaddr = (base + len + LEVEL_SIZE(1) - 1) & LEVEL_MASK(1);
208 127 map_len = map_eaddr - map_addr;
209 128 window_size = mmu_btop(map_len) * mmu.pte_size;
210 129 window_size = (window_size + LEVEL_SIZE(1)) & LEVEL_MASK(1);
211 130 htable_cnt = map_len >> LEVEL_SHIFT(1);
212 131
213 132 /*
214 133 * allocate vmem for the kmap_ptes
215 134 */
216 135 ptes = vmem_xalloc(heap_arena, window_size, LEVEL_SIZE(1), 0,
217 136 0, NULL, NULL, VM_SLEEP);
218 137 mmu.kmap_htables =
219 138 kmem_alloc(htable_cnt * sizeof (htable_t *), KM_SLEEP);
220 139
221 140 /*
222 141 * Map the page tables that cover kmap into the allocated range.
223 142 * Note we don't ever htable_release() the kmap page tables - they
224 143 * can't ever be stolen, freed, etc.
225 144 */
226 145 for (va = map_addr, i = 0; i < htable_cnt; va += LEVEL_SIZE(1), ++i) {
227 146 ht = htable_create(kas.a_hat, va, 0, NULL);
228 147 if (ht == NULL)
229 148 panic("hat_kmap_init: ht == NULL");
230 149 mmu.kmap_htables[i] = ht;
231 150
232 151 hat_devload(kas.a_hat, ptes + i * MMU_PAGESIZE,
233 152 MMU_PAGESIZE, ht->ht_pfn,
234 153 #ifdef __xpv
235 154 PROT_READ | HAT_NOSYNC | HAT_UNORDERED_OK,
236 155 #else
237 156 PROT_READ | PROT_WRITE | HAT_NOSYNC | HAT_UNORDERED_OK,
238 157 #endif
239 158 HAT_LOAD | HAT_LOAD_NOCONSIST);
240 159 }
241 160
242 161 /*
243 162 * set information in mmu to activate handling of kmap
244 163 */
245 164 mmu.kmap_addr = map_addr;
246 165 mmu.kmap_eaddr = map_eaddr;
247 166 mmu.kmap_ptes = (x86pte_t *)ptes;
248 167 }
249 168
250 169 extern caddr_t kpm_vbase;
251 170 extern size_t kpm_size;
252 171
253 172 #ifdef __xpv
254 173 /*
255 174 * Create the initial segkpm mappings for the hypervisor. To avoid having
256 175 * to deal with page tables being read only, we make all mappings
257 176 * read only at first.
258 177 */
259 178 static void
260 179 xen_kpm_create(paddr_t paddr, level_t lvl)
261 180 {
262 181 ulong_t pg_off;
263 182
264 183 for (pg_off = 0; pg_off < LEVEL_SIZE(lvl); pg_off += MMU_PAGESIZE) {
265 184 kbm_map((uintptr_t)kpm_vbase + paddr, (paddr_t)0, 0, 1);
266 185 kbm_read_only((uintptr_t)kpm_vbase + paddr + pg_off,
267 186 paddr + pg_off);
268 187 }
269 188 }
270 189
271 190 /*
272 191 * Try to make all kpm mappings writable. Failures are ok, as those
273 192 * are just pagetable, GDT, etc. pages.
274 193 */
275 194 static void
276 195 xen_kpm_finish_init(void)
277 196 {
278 197 pfn_t gdtpfn = mmu_btop(CPU->cpu_m.mcpu_gdtpa);
279 198 pfn_t pfn;
280 199 page_t *pp;
281 200
282 201 for (pfn = 0; pfn < mfn_count; ++pfn) {
283 202 /*
284 203 * skip gdt
285 204 */
286 205 if (pfn == gdtpfn)
287 206 continue;
288 207
289 208 /*
290 209 * p_index is a hint that this is a pagetable
291 210 */
292 211 pp = page_numtopp_nolock(pfn);
293 212 if (pp && pp->p_index) {
294 213 pp->p_index = 0;
295 214 continue;
296 215 }
297 216 (void) xen_kpm_page(pfn, PT_VALID | PT_WRITABLE);
298 217 }
299 218 }
300 219 #endif
301 220
302 221 /*
303 222 * Routine to pre-allocate data structures for hat_kern_setup(). It computes
304 223 * how many pagetables it needs by walking the boot loader's page tables.
305 224 */
306 225 /*ARGSUSED*/
307 226 void
308 227 hat_kern_alloc(
309 228 caddr_t segmap_base,
310 229 size_t segmap_size,
311 230 caddr_t ekernelheap)
312 231 {
313 232 uintptr_t last_va = (uintptr_t)-1; /* catch 1st time */
314 233 uintptr_t va = 0;
315 234 size_t size;
316 235 pfn_t pfn;
317 236 uint_t prot;
318 237 uint_t table_cnt = 1;
319 238 uint_t mapping_cnt;
320 239 level_t start_level;
321 240 level_t l;
322 241 struct memlist *pmem;
323 242 level_t lpagel = mmu.max_page_level;
324 243 uint64_t paddr;
325 244 int64_t psize;
326 245 int nwindows;
327 246
328 247 if (kpm_size > 0) {
329 248 /*
330 249 * Create the kpm page tables. When running on the
331 250 * hypervisor these are made read/only at first.
332 251 * Later we'll add write permission where possible.
333 252 */
334 253 for (pmem = phys_install; pmem; pmem = pmem->ml_next) {
335 254 paddr = pmem->ml_address;
336 255 psize = pmem->ml_size;
337 256 while (psize >= MMU_PAGESIZE) {
338 257 /* find the largest page size */
339 258 for (l = lpagel; l > 0; l--) {
340 259 if ((paddr & LEVEL_OFFSET(l)) == 0 &&
341 260 psize > LEVEL_SIZE(l))
342 261 break;
343 262 }
344 263
345 264 #if defined(__xpv)
346 265 /*
347 266 * Create read/only mappings to avoid
348 267 * conflicting with pagetable usage
349 268 */
350 269 xen_kpm_create(paddr, l);
351 270 #else
352 271 kbm_map((uintptr_t)kpm_vbase + paddr, paddr,
353 272 l, 1);
354 273 #endif
355 274 paddr += LEVEL_SIZE(l);
356 275 psize -= LEVEL_SIZE(l);
357 276 }
358 277 }
359 278 }
360 279
361 280 /*
362 281 * If this machine doesn't have a kpm segment, we need to allocate
363 282 * a small number of 'windows' which can be used to map pagetables.
364 283 */
365 284 nwindows = (kpm_size == 0) ? 2 * NCPU : 0;
366 285
367 286 #if defined(__xpv)
368 287 /*
369 288 * On a hypervisor, these windows are also used by the xpv_panic
370 289 * code, where we need one window for each level of the pagetable
371 290 * hierarchy.
372 291 */
373 292 nwindows = MAX(nwindows, mmu.max_level);
374 293 #endif
375 294
376 295 if (nwindows != 0) {
377 296 /*
378 297 * Create the page windows and 1 page of VA in
379 298 * which we map the PTEs of those windows.
380 299 */
381 300 mmu.pwin_base = vmem_xalloc(heap_arena, nwindows * MMU_PAGESIZE,
382 301 LEVEL_SIZE(1), 0, 0, NULL, NULL, VM_SLEEP);
383 302 ASSERT(nwindows <= MMU_PAGESIZE / mmu.pte_size);
384 303 mmu.pwin_pte_va = vmem_xalloc(heap_arena, MMU_PAGESIZE,
385 304 MMU_PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);
386 305
387 306 /*
388 307 * Find/Create the page table window mappings.
389 308 */
390 309 paddr = 0;
391 310 (void) find_pte((uintptr_t)mmu.pwin_base, &paddr, 0, 0);
392 311 ASSERT(paddr != 0);
393 312 ASSERT((paddr & MMU_PAGEOFFSET) == 0);
394 313 mmu.pwin_pte_pa = paddr;
395 314 #ifdef __xpv
396 315 (void) find_pte((uintptr_t)mmu.pwin_pte_va, NULL, 0, 0);
397 316 kbm_read_only((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa);
398 317 #else
399 318 kbm_map((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa, 0, 1);
400 319 #endif
401 320 }
402 321
403 322 /*
404 323 * Walk the boot loader's page tables and figure out
405 324 * how many tables and page mappings there will be.
406 325 */
407 326 while (kbm_probe(&va, &size, &pfn, &prot) != 0) {
408 327 /*
409 328 * At each level, if the last_va falls into a new htable,
410 329 * increment table_cnt. We can stop at the 1st level where
411 330 * they are in the same htable.
412 331 */
413 332 start_level = 0;
414 333 while (start_level <= mmu.max_page_level) {
415 334 if (size == LEVEL_SIZE(start_level))
416 335 break;
417 336 start_level++;
418 337 }
419 338
420 339 for (l = start_level; l < mmu.max_level; ++l) {
421 340 if (va >> LEVEL_SHIFT(l + 1) ==
422 341 last_va >> LEVEL_SHIFT(l + 1))
423 342 break;
424 343 ++table_cnt;
425 344 }
426 345 last_va = va;
427 346 l = (start_level == 0) ? 1 : start_level;
428 347 va = (va & LEVEL_MASK(l)) + LEVEL_SIZE(l);
429 348 }
430 349
431 350 /*
432 351 * Besides the boot loader mappings, we're going to fill in
433 352 * the entire top level page table for the kernel. Make sure there's
434 353 * enough reserve for that too.
435 354 */
436 355 table_cnt += mmu.top_level_count - ((kernelbase >>
437 356 LEVEL_SHIFT(mmu.max_level)) & (mmu.top_level_count - 1));
438 357
439 -#if defined(__i386)
440 358 /*
441 - * The 32 bit PAE hat allocates tables one level below the top when
442 - * kernelbase isn't 1 Gig aligned. We'll just be sloppy and allocate
443 - * a bunch more to the reserve. Any unused will be returned later.
444 - * Note we've already counted these mappings, just not the extra
445 - * pagetables.
446 - */
447 - if (mmu.pae_hat != 0 && (kernelbase & LEVEL_OFFSET(mmu.max_level)) != 0)
448 - table_cnt += mmu.ptes_per_table -
449 - ((kernelbase & LEVEL_OFFSET(mmu.max_level)) >>
450 - LEVEL_SHIFT(mmu.max_level - 1));
451 -#endif
452 -
453 - /*
454 359 * Add 1/4 more into table_cnt for extra slop. The unused
455 360 * slop is freed back when we htable_adjust_reserve() later.
456 361 */
457 362 table_cnt += table_cnt >> 2;
458 363
459 364 /*
460 365 * We only need mapping entries (hments) for shared pages.
461 366 * This should be far, far fewer than the total possible,
462 367 * We'll allocate enough for 1/16 of all possible PTEs.
463 368 */
464 369 mapping_cnt = (table_cnt * mmu.ptes_per_table) >> 4;
465 370
466 371 /*
467 372 * Now create the initial htable/hment reserves
468 373 */
469 374 htable_initial_reserve(table_cnt);
470 375 hment_reserve(mapping_cnt);
471 376 x86pte_cpu_init(CPU);
472 377 }
473 378
474 379
475 380 /*
476 381 * This routine handles the work of creating the kernel's initial mappings
477 382 * by deciphering the mappings in the page tables created by the boot program.
478 383 *
479 384 * We maintain large page mappings, but only to a level 1 pagesize.
480 385 * The boot loader can only add new mappings once this function starts.
481 386 * In particular it can not change the pagesize used for any existing
482 387 * mappings or this code breaks!
483 388 */
484 389
485 390 void
486 391 hat_kern_setup(void)
487 392 {
488 393 /*
489 394 * Attach htables to the existing pagetables
490 395 */
491 396 /* BEGIN CSTYLED */
492 397 htable_attach(kas.a_hat, 0, mmu.max_level, NULL,
493 398 #ifdef __xpv
494 399 mmu_btop(xen_info->pt_base - ONE_GIG));
495 400 #else
496 - mmu_btop(getcr3()));
401 + mmu_btop(getcr3_pa()));
497 402 #endif
498 403 /* END CSTYLED */
499 404
500 -#if defined(__i386) && !defined(__xpv)
501 - CPU->cpu_tss->tss_cr3 = dftss0->tss_cr3 = getcr3();
502 -#endif /* __i386 */
503 -
504 -#if defined(__xpv) && defined(__amd64)
405 +#if defined(__xpv)
505 406 /*
506 407 * Try to make the kpm mappings r/w. Failures here are OK, as
507 408 * it's probably just a pagetable
508 409 */
509 410 xen_kpm_finish_init();
510 411 #endif
511 412
512 413 /*
513 414 * The kernel HAT is now officially open for business.
514 415 */
515 416 khat_running = 1;
516 417
517 418 CPUSET_ATOMIC_ADD(kas.a_hat->hat_cpus, CPU->cpu_id);
518 419 CPU->cpu_current_hat = kas.a_hat;
519 420 }
421 +
422 +#ifndef __xpv
423 +
424 +/*
425 + * Note that the INVPCID_ALL* variants can be used even in the !PCIDE case, but
426 + * INVPCID_ADDR isn't.
427 + */
428 +static void
429 +invpcid(uint64_t type, uint64_t pcid, uintptr_t addr)
430 +{
431 + ulong_t flag;
432 + uint64_t cr4;
433 +
434 + if (x86_use_invpcid == 1) {
435 + ASSERT(is_x86_feature(x86_featureset, X86FSET_INVPCID));
436 + invpcid_insn(type, pcid, addr);
437 + return;
438 + }
439 +
440 + switch (type) {
441 + case INVPCID_ALL_GLOBAL:
442 + flag = intr_clear();
443 + cr4 = getcr4();
444 + setcr4(cr4 & ~(ulong_t)CR4_PGE);
445 + setcr4(cr4 | CR4_PGE);
446 + intr_restore(flag);
447 + break;
448 +
449 + case INVPCID_ALL_NONGLOBAL:
450 + if (!(getcr4() & CR4_PCIDE)) {
451 + reload_cr3();
452 + } else {
453 + flag = intr_clear();
454 + cr4 = getcr4();
455 + setcr4(cr4 & ~(ulong_t)CR4_PGE);
456 + setcr4(cr4 | CR4_PGE);
457 + intr_restore(flag);
458 + }
459 + break;
460 +
461 + case INVPCID_ADDR:
462 + if (pcid == PCID_USER) {
463 + flag = intr_clear();
464 + ASSERT(addr < kernelbase);
465 + ASSERT(ON_USER_HAT(CPU));
466 + ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0);
467 + tr_mmu_flush_user_range(addr, MMU_PAGESIZE,
468 + MMU_PAGESIZE, CPU->cpu_m.mcpu_kpti.kf_user_cr3);
469 + intr_restore(flag);
470 + } else {
471 + mmu_invlpg((caddr_t)addr);
472 + }
473 + break;
474 +
475 + default:
476 + panic("unsupported invpcid(%lu)", type);
477 + break;
478 + }
479 +}
480 +
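
[Note: the invpcid_insn() used above wraps the INVPCID instruction, whose memory operand is a 128-bit descriptor: the PCID in bits 11:0 of the low quadword and the target linear address in the high quadword. The sketch below is purely illustrative and is not part of this change; the real invpcid_insn() is provided elsewhere in the tree, and the name invpcid_insn_sketch and the inline-asm form here are assumptions.]

	static inline void
	invpcid_insn_sketch(uint64_t type, uint64_t pcid, uintptr_t addr)
	{
		/*
		 * INVPCID descriptor: PCID in bits 11:0 of the low quadword
		 * (remaining low bits must be zero), linear address in the
		 * high quadword.
		 */
		struct {
			uint64_t ipd_pcid;
			uint64_t ipd_addr;
		} desc = { pcid, (uint64_t)addr };

		__asm__ __volatile__("invpcid %0, %1"
		    : : "m" (desc), "r" (type) : "memory");
	}
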
481 +/*
482 + * Flush one kernel mapping.
483 + *
484 + * We want to assert on kernel space here mainly for reasoning about the PCIDE
485 + * case: namely, this flush should never need to flush a non-current PCID
486 + * mapping. This presumes we never have reason to flush the kernel regions
487 + * available to PCID_USER (the trampolines and so on). It also relies on
488 + * PCID_KERNEL == PCID_NONE.
489 + */
490 +void
491 +mmu_flush_tlb_kpage(uintptr_t va)
492 +{
493 + ASSERT(va >= kernelbase);
494 + ASSERT(getpcid() == PCID_KERNEL);
495 + mmu_invlpg((caddr_t)va);
496 +}
497 +
498 +/*
499 + * Flush one mapping: local CPU version of hat_tlb_inval().
500 + *
501 + * If this is a userspace address in the PCIDE case, we need two invalidations,
502 + * one for any potentially stale PCID_USER mapping, as well as any established
503 + * while in the kernel.
504 + */
505 +void
506 +mmu_flush_tlb_page(uintptr_t va)
507 +{
508 + ASSERT(getpcid() == PCID_KERNEL);
509 +
510 + if (va >= kernelbase) {
511 + mmu_flush_tlb_kpage(va);
512 + return;
513 + }
514 +
515 + if (!(getcr4() & CR4_PCIDE)) {
516 + mmu_invlpg((caddr_t)va);
517 + return;
518 + }
519 +
520 + /*
521 + * Yes, kas will need to flush below kernelspace, at least during boot.
522 + * But there's no PCID_USER context.
523 + */
524 + if (ON_USER_HAT(CPU))
525 + invpcid(INVPCID_ADDR, PCID_USER, va);
526 + invpcid(INVPCID_ADDR, PCID_KERNEL, va);
527 +}
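
[Note: as a usage illustration only, not part of the diff: a caller that has just rewritten a single user-level PTE on the current CPU flushes it with mmu_flush_tlb_page(), which covers both the kernel-visible translation and, when running with PCIDs on a user HAT, the stale PCID_USER copy. The function name below is a hypothetical placeholder.]

	/*
	 * Hypothetical caller, for illustration: flush the TLB entry for a
	 * user VA whose PTE was just changed on this CPU.
	 */
	static void
	example_flush_user_pte(uintptr_t uva)
	{
		ASSERT(uva < kernelbase);
		/* ... the PTE for uva has just been updated ... */
		mmu_flush_tlb_page(uva);
	}
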
528 +
529 +static void
530 +mmu_flush_tlb_range(uintptr_t addr, size_t len, size_t pgsz)
531 +{
532 + EQUIV(addr < kernelbase, (addr + len - 1) < kernelbase);
533 + ASSERT(len > 0);
534 + ASSERT(pgsz != 0);
535 +
536 + if (!(getcr4() & CR4_PCIDE) || x86_use_invpcid == 1) {
537 + for (uintptr_t va = addr; va < (addr + len); va += pgsz)
538 + mmu_flush_tlb_page(va);
539 + return;
540 + }
541 +
542 + /*
543 + * As an emulated invpcid() in the PCIDE case requires jumping
544 + * cr3s, we batch the invalidations. We should only need to flush the
545 + * user range if we're on a user-space HAT.
546 + */
547 + if (addr < kernelbase && ON_USER_HAT(CPU)) {
548 + ulong_t flag = intr_clear();
549 + ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0);
550 + tr_mmu_flush_user_range(addr, len, pgsz,
551 + CPU->cpu_m.mcpu_kpti.kf_user_cr3);
552 + intr_restore(flag);
553 + }
554 +
555 + for (uintptr_t va = addr; va < (addr + len); va += pgsz)
556 + mmu_invlpg((caddr_t)va);
557 +}
558 +
559 +/*
560 + * MMU TLB (and PT cache) flushing on this CPU.
561 + *
562 + * FLUSH_TLB_ALL: invalidate everything, all PCIDs, all PT_GLOBAL.
563 + * FLUSH_TLB_NONGLOBAL: invalidate all PCIDs, excluding PT_GLOBAL
564 + * FLUSH_TLB_RANGE: invalidate the given range, including PCID_USER
565 + * mappings as appropriate. If using invpcid, PT_GLOBAL mappings are not
566 + * invalidated.
567 + */
568 +void
569 +mmu_flush_tlb(flush_tlb_type_t type, tlb_range_t *range)
570 +{
571 + ASSERT(getpcid() == PCID_KERNEL);
572 +
573 + switch (type) {
574 + case FLUSH_TLB_ALL:
575 + ASSERT(range == NULL);
576 + invpcid(INVPCID_ALL_GLOBAL, 0, 0);
577 + break;
578 +
579 + case FLUSH_TLB_NONGLOBAL:
580 + ASSERT(range == NULL);
581 + invpcid(INVPCID_ALL_NONGLOBAL, 0, 0);
582 + break;
583 +
584 + case FLUSH_TLB_RANGE: {
585 + mmu_flush_tlb_range(range->tr_va, TLB_RANGE_LEN(range),
586 + LEVEL_SIZE(range->tr_level));
587 + break;
588 + }
589 +
590 + default:
591 + panic("invalid call mmu_flush_tlb(%d)", type);
592 + break;
593 + }
594 +}
595 +
596 +#endif /* ! __xpv */
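
[Note: to show how the new mmu_flush_tlb() entry point is meant to be driven, here is a hedged sketch of a local range flush. tr_va and tr_level appear in the code above; the tr_cnt field name and the assumption that TLB_RANGE_LEN() yields tr_cnt pages of LEVEL_SIZE(tr_level) bytes are taken from elsewhere in this change and are not confirmed by this file.]

	/*
	 * Illustrative sketch only: invalidate npages level-0 (4K) pages
	 * starting at va on the local CPU. Cross-CPU shootdowns still go
	 * through hat_tlb_inval() and the xcall machinery; this is just the
	 * local leg.
	 */
	static void
	example_local_range_flush(uintptr_t va, pgcnt_t npages)
	{
		tlb_range_t range = {
			.tr_va = va,
			.tr_cnt = npages,	/* assumed field name */
			.tr_level = 0,
		};

		mmu_flush_tlb(FLUSH_TLB_RANGE, &range);
	}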