1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 * 25 * Copyright 2018 Joyent, Inc. 26 */ 27 28 #include <sys/t_lock.h> 29 #include <sys/memlist.h> 30 #include <sys/cpuvar.h> 31 #include <sys/vmem.h> 32 #include <sys/mman.h> 33 #include <sys/vm.h> 34 #include <sys/kmem.h> 35 #include <sys/cmn_err.h> 36 #include <sys/debug.h> 37 #include <sys/vm_machparam.h> 38 #include <sys/tss.h> 39 #include <sys/vnode.h> 40 #include <vm/hat.h> 41 #include <vm/anon.h> 42 #include <vm/as.h> 43 #include <vm/page.h> 44 #include <vm/seg.h> 45 #include <vm/seg_kmem.h> 46 #include <vm/seg_map.h> 47 #include <vm/hat_i86.h> 48 #include <sys/promif.h> 49 #include <sys/x86_archext.h> 50 #include <sys/systm.h> 51 #include <sys/archsystm.h> 52 #include <sys/sunddi.h> 53 #include <sys/ddidmareq.h> 54 #include <sys/controlregs.h> 55 #include <sys/reboot.h> 56 #include <sys/kdi.h> 57 #include <sys/bootconf.h> 58 #include <sys/bootsvcs.h> 59 #include <sys/bootinfo.h> 60 #include <vm/kboot_mmu.h> 61 62 #ifdef __xpv 63 #include <sys/hypervisor.h> 64 #endif 65 66 #define ON_USER_HAT(cpu) \ 67 ((cpu)->cpu_m.mcpu_current_hat != NULL && \ 68 (cpu)->cpu_m.mcpu_current_hat != kas.a_hat) 69 70 /* 71 * Flag is not set early in boot. Once it is set we are no longer 72 * using boot's page tables. 73 */ 74 uint_t khat_running = 0; 75 76 /* 77 * This procedure is callable only while the boot loader is in charge of the 78 * MMU. It assumes that PA == VA for page table pointers. It doesn't live in 79 * kboot_mmu.c since it's used from common code. 80 */ 81 pfn_t 82 va_to_pfn(void *vaddr) 83 { 84 uintptr_t des_va = ALIGN2PAGE(vaddr); 85 uintptr_t va = des_va; 86 size_t len; 87 uint_t prot; 88 pfn_t pfn; 89 90 if (khat_running) 91 panic("va_to_pfn(): called too late\n"); 92 93 if (kbm_probe(&va, &len, &pfn, &prot) == 0) 94 return (PFN_INVALID); 95 if (va > des_va) 96 return (PFN_INVALID); 97 if (va < des_va) 98 pfn += mmu_btop(des_va - va); 99 return (pfn); 100 } 101 102 /* 103 * Initialize a special area in the kernel that always holds some PTEs for 104 * faster performance. This always holds segmap's PTEs. 105 * In the 32 bit kernel this maps the kernel heap too. 106 */ 107 void 108 hat_kmap_init(uintptr_t base, size_t len) 109 { 110 uintptr_t map_addr; /* base rounded down to large page size */ 111 uintptr_t map_eaddr; /* base + len rounded up */ 112 size_t map_len; 113 caddr_t ptes; /* mapping area in kernel for kmap ptes */ 114 size_t window_size; /* size of mapping area for ptes */ 115 ulong_t htable_cnt; /* # of page tables to cover map_len */ 116 ulong_t i; 117 htable_t *ht; 118 uintptr_t va; 119 120 /* 121 * We have to map in an area that matches an entire page table. 122 * The PTEs are large page aligned to avoid spurious pagefaults 123 * on the hypervisor. 124 */ 125 map_addr = base & LEVEL_MASK(1); 126 map_eaddr = (base + len + LEVEL_SIZE(1) - 1) & LEVEL_MASK(1); 127 map_len = map_eaddr - map_addr; 128 window_size = mmu_btop(map_len) * mmu.pte_size; 129 window_size = (window_size + LEVEL_SIZE(1)) & LEVEL_MASK(1); 130 htable_cnt = map_len >> LEVEL_SHIFT(1); 131 132 /* 133 * allocate vmem for the kmap_ptes 134 */ 135 ptes = vmem_xalloc(heap_arena, window_size, LEVEL_SIZE(1), 0, 136 0, NULL, NULL, VM_SLEEP); 137 mmu.kmap_htables = 138 kmem_alloc(htable_cnt * sizeof (htable_t *), KM_SLEEP); 139 140 /* 141 * Map the page tables that cover kmap into the allocated range. 142 * Note we don't ever htable_release() the kmap page tables - they 143 * can't ever be stolen, freed, etc. 144 */ 145 for (va = map_addr, i = 0; i < htable_cnt; va += LEVEL_SIZE(1), ++i) { 146 ht = htable_create(kas.a_hat, va, 0, NULL); 147 if (ht == NULL) 148 panic("hat_kmap_init: ht == NULL"); 149 mmu.kmap_htables[i] = ht; 150 151 hat_devload(kas.a_hat, ptes + i * MMU_PAGESIZE, 152 MMU_PAGESIZE, ht->ht_pfn, 153 #ifdef __xpv 154 PROT_READ | HAT_NOSYNC | HAT_UNORDERED_OK, 155 #else 156 PROT_READ | PROT_WRITE | HAT_NOSYNC | HAT_UNORDERED_OK, 157 #endif 158 HAT_LOAD | HAT_LOAD_NOCONSIST); 159 } 160 161 /* 162 * set information in mmu to activate handling of kmap 163 */ 164 mmu.kmap_addr = map_addr; 165 mmu.kmap_eaddr = map_eaddr; 166 mmu.kmap_ptes = (x86pte_t *)ptes; 167 } 168 169 extern caddr_t kpm_vbase; 170 extern size_t kpm_size; 171 172 #ifdef __xpv 173 /* 174 * Create the initial segkpm mappings for the hypervisor. To avoid having 175 * to deal with page tables being read only, we make all mappings 176 * read only at first. 177 */ 178 static void 179 xen_kpm_create(paddr_t paddr, level_t lvl) 180 { 181 ulong_t pg_off; 182 183 for (pg_off = 0; pg_off < LEVEL_SIZE(lvl); pg_off += MMU_PAGESIZE) { 184 kbm_map((uintptr_t)kpm_vbase + paddr, (paddr_t)0, 0, 1); 185 kbm_read_only((uintptr_t)kpm_vbase + paddr + pg_off, 186 paddr + pg_off); 187 } 188 } 189 190 /* 191 * Try to make all kpm mappings writable. Failures are ok, as those 192 * are just pagetable, GDT, etc. pages. 193 */ 194 static void 195 xen_kpm_finish_init(void) 196 { 197 pfn_t gdtpfn = mmu_btop(CPU->cpu_m.mcpu_gdtpa); 198 pfn_t pfn; 199 page_t *pp; 200 201 for (pfn = 0; pfn < mfn_count; ++pfn) { 202 /* 203 * skip gdt 204 */ 205 if (pfn == gdtpfn) 206 continue; 207 208 /* 209 * p_index is a hint that this is a pagetable 210 */ 211 pp = page_numtopp_nolock(pfn); 212 if (pp && pp->p_index) { 213 pp->p_index = 0; 214 continue; 215 } 216 (void) xen_kpm_page(pfn, PT_VALID | PT_WRITABLE); 217 } 218 } 219 #endif 220 221 /* 222 * Routine to pre-allocate data structures for hat_kern_setup(). It computes 223 * how many pagetables it needs by walking the boot loader's page tables. 224 */ 225 /*ARGSUSED*/ 226 void 227 hat_kern_alloc( 228 caddr_t segmap_base, 229 size_t segmap_size, 230 caddr_t ekernelheap) 231 { 232 uintptr_t last_va = (uintptr_t)-1; /* catch 1st time */ 233 uintptr_t va = 0; 234 size_t size; 235 pfn_t pfn; 236 uint_t prot; 237 uint_t table_cnt = 1; 238 uint_t mapping_cnt; 239 level_t start_level; 240 level_t l; 241 struct memlist *pmem; 242 level_t lpagel = mmu.max_page_level; 243 uint64_t paddr; 244 int64_t psize; 245 int nwindows; 246 247 if (kpm_size > 0) { 248 /* 249 * Create the kpm page tables. When running on the 250 * hypervisor these are made read/only at first. 251 * Later we'll add write permission where possible. 252 */ 253 for (pmem = phys_install; pmem; pmem = pmem->ml_next) { 254 paddr = pmem->ml_address; 255 psize = pmem->ml_size; 256 while (psize >= MMU_PAGESIZE) { 257 /* find the largest page size */ 258 for (l = lpagel; l > 0; l--) { 259 if ((paddr & LEVEL_OFFSET(l)) == 0 && 260 psize > LEVEL_SIZE(l)) 261 break; 262 } 263 264 #if defined(__xpv) 265 /* 266 * Create read/only mappings to avoid 267 * conflicting with pagetable usage 268 */ 269 xen_kpm_create(paddr, l); 270 #else 271 kbm_map((uintptr_t)kpm_vbase + paddr, paddr, 272 l, 1); 273 #endif 274 paddr += LEVEL_SIZE(l); 275 psize -= LEVEL_SIZE(l); 276 } 277 } 278 } 279 280 /* 281 * If this machine doesn't have a kpm segment, we need to allocate 282 * a small number of 'windows' which can be used to map pagetables. 283 */ 284 nwindows = (kpm_size == 0) ? 2 * NCPU : 0; 285 286 #if defined(__xpv) 287 /* 288 * On a hypervisor, these windows are also used by the xpv_panic 289 * code, where we need one window for each level of the pagetable 290 * hierarchy. 291 */ 292 nwindows = MAX(nwindows, mmu.max_level); 293 #endif 294 295 if (nwindows != 0) { 296 /* 297 * Create the page windows and 1 page of VA in 298 * which we map the PTEs of those windows. 299 */ 300 mmu.pwin_base = vmem_xalloc(heap_arena, nwindows * MMU_PAGESIZE, 301 LEVEL_SIZE(1), 0, 0, NULL, NULL, VM_SLEEP); 302 ASSERT(nwindows <= MMU_PAGESIZE / mmu.pte_size); 303 mmu.pwin_pte_va = vmem_xalloc(heap_arena, MMU_PAGESIZE, 304 MMU_PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP); 305 306 /* 307 * Find/Create the page table window mappings. 308 */ 309 paddr = 0; 310 (void) find_pte((uintptr_t)mmu.pwin_base, &paddr, 0, 0); 311 ASSERT(paddr != 0); 312 ASSERT((paddr & MMU_PAGEOFFSET) == 0); 313 mmu.pwin_pte_pa = paddr; 314 #ifdef __xpv 315 (void) find_pte((uintptr_t)mmu.pwin_pte_va, NULL, 0, 0); 316 kbm_read_only((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa); 317 #else 318 kbm_map((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa, 0, 1); 319 #endif 320 } 321 322 /* 323 * Walk the boot loader's page tables and figure out 324 * how many tables and page mappings there will be. 325 */ 326 while (kbm_probe(&va, &size, &pfn, &prot) != 0) { 327 /* 328 * At each level, if the last_va falls into a new htable, 329 * increment table_cnt. We can stop at the 1st level where 330 * they are in the same htable. 331 */ 332 start_level = 0; 333 while (start_level <= mmu.max_page_level) { 334 if (size == LEVEL_SIZE(start_level)) 335 break; 336 start_level++; 337 } 338 339 for (l = start_level; l < mmu.max_level; ++l) { 340 if (va >> LEVEL_SHIFT(l + 1) == 341 last_va >> LEVEL_SHIFT(l + 1)) 342 break; 343 ++table_cnt; 344 } 345 last_va = va; 346 l = (start_level == 0) ? 1 : start_level; 347 va = (va & LEVEL_MASK(l)) + LEVEL_SIZE(l); 348 } 349 350 /* 351 * Besides the boot loader mappings, we're going to fill in 352 * the entire top level page table for the kernel. Make sure there's 353 * enough reserve for that too. 354 */ 355 table_cnt += mmu.top_level_count - ((kernelbase >> 356 LEVEL_SHIFT(mmu.max_level)) & (mmu.top_level_count - 1)); 357 358 /* 359 * Add 1/4 more into table_cnt for extra slop. The unused 360 * slop is freed back when we htable_adjust_reserve() later. 361 */ 362 table_cnt += table_cnt >> 2; 363 364 /* 365 * We only need mapping entries (hments) for shared pages. 366 * This should be far, far fewer than the total possible, 367 * We'll allocate enough for 1/16 of all possible PTEs. 368 */ 369 mapping_cnt = (table_cnt * mmu.ptes_per_table) >> 4; 370 371 /* 372 * Now create the initial htable/hment reserves 373 */ 374 htable_initial_reserve(table_cnt); 375 hment_reserve(mapping_cnt); 376 x86pte_cpu_init(CPU); 377 } 378 379 380 /* 381 * This routine handles the work of creating the kernel's initial mappings 382 * by deciphering the mappings in the page tables created by the boot program. 383 * 384 * We maintain large page mappings, but only to a level 1 pagesize. 385 * The boot loader can only add new mappings once this function starts. 386 * In particular it can not change the pagesize used for any existing 387 * mappings or this code breaks! 388 */ 389 390 void 391 hat_kern_setup(void) 392 { 393 /* 394 * Attach htables to the existing pagetables 395 */ 396 /* BEGIN CSTYLED */ 397 htable_attach(kas.a_hat, 0, mmu.max_level, NULL, 398 #ifdef __xpv 399 mmu_btop(xen_info->pt_base - ONE_GIG)); 400 #else 401 mmu_btop(getcr3_pa())); 402 #endif 403 /* END CSTYLED */ 404 405 #if defined(__xpv) 406 /* 407 * Try to make the kpm mappings r/w. Failures here are OK, as 408 * it's probably just a pagetable 409 */ 410 xen_kpm_finish_init(); 411 #endif 412 413 /* 414 * The kernel HAT is now officially open for business. 415 */ 416 khat_running = 1; 417 418 CPUSET_ATOMIC_ADD(kas.a_hat->hat_cpus, CPU->cpu_id); 419 CPU->cpu_current_hat = kas.a_hat; 420 } 421 422 #ifndef __xpv 423 424 /* 425 * Note that the INVPCID_ALL* variants can be used even in the !PCIDE case, but 426 * INVPCID_ADDR isn't. 427 */ 428 static void 429 invpcid(uint64_t type, uint64_t pcid, uintptr_t addr) 430 { 431 ulong_t flag; 432 uint64_t cr4; 433 434 if (x86_use_invpcid == 1) { 435 ASSERT(is_x86_feature(x86_featureset, X86FSET_INVPCID)); 436 invpcid_insn(type, pcid, addr); 437 return; 438 } 439 440 switch (type) { 441 case INVPCID_ALL_GLOBAL: 442 flag = intr_clear(); 443 cr4 = getcr4(); 444 setcr4(cr4 & ~(ulong_t)CR4_PGE); 445 setcr4(cr4 | CR4_PGE); 446 intr_restore(flag); 447 break; 448 449 case INVPCID_ALL_NONGLOBAL: 450 if (!(getcr4() & CR4_PCIDE)) { 451 reload_cr3(); 452 } else { 453 flag = intr_clear(); 454 cr4 = getcr4(); 455 setcr4(cr4 & ~(ulong_t)CR4_PGE); 456 setcr4(cr4 | CR4_PGE); 457 intr_restore(flag); 458 } 459 break; 460 461 case INVPCID_ADDR: 462 if (pcid == PCID_USER) { 463 flag = intr_clear(); 464 ASSERT(addr < kernelbase); 465 ASSERT(ON_USER_HAT(CPU)); 466 ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0); 467 tr_mmu_flush_user_range(addr, MMU_PAGESIZE, 468 MMU_PAGESIZE, CPU->cpu_m.mcpu_kpti.kf_user_cr3); 469 intr_restore(flag); 470 } else { 471 mmu_invlpg((caddr_t)addr); 472 } 473 break; 474 475 default: 476 panic("unsupported invpcid(%lu)", type); 477 break; 478 } 479 } 480 481 /* 482 * Flush one kernel mapping. 483 * 484 * We want to assert on kernel space here mainly for reasoning about the PCIDE 485 * case: namely, this flush should never need to flush a non-current PCID 486 * mapping. This presumes we never have reason to flush the kernel regions 487 * available to PCID_USER (the trampolines and so on). It also relies on 488 * PCID_KERNEL == PCID_NONE. 489 */ 490 void 491 mmu_flush_tlb_kpage(uintptr_t va) 492 { 493 ASSERT(va >= kernelbase); 494 ASSERT(getpcid() == PCID_KERNEL); 495 mmu_invlpg((caddr_t)va); 496 } 497 498 /* 499 * Flush one mapping: local CPU version of hat_tlb_inval(). 500 * 501 * If this is a userspace address in the PCIDE case, we need two invalidations, 502 * one for any potentially stale PCID_USER mapping, as well as any established 503 * while in the kernel. 504 */ 505 void 506 mmu_flush_tlb_page(uintptr_t va) 507 { 508 ASSERT(getpcid() == PCID_KERNEL); 509 510 if (va >= kernelbase) { 511 mmu_flush_tlb_kpage(va); 512 return; 513 } 514 515 if (!(getcr4() & CR4_PCIDE)) { 516 mmu_invlpg((caddr_t)va); 517 return; 518 } 519 520 /* 521 * Yes, kas will need to flush below kernelspace, at least during boot. 522 * But there's no PCID_USER context. 523 */ 524 if (ON_USER_HAT(CPU)) 525 invpcid(INVPCID_ADDR, PCID_USER, va); 526 invpcid(INVPCID_ADDR, PCID_KERNEL, va); 527 } 528 529 static void 530 mmu_flush_tlb_range(uintptr_t addr, size_t len, size_t pgsz) 531 { 532 EQUIV(addr < kernelbase, (addr + len - 1) < kernelbase); 533 ASSERT(len > 0); 534 ASSERT(pgsz != 0); 535 536 if (!(getcr4() & CR4_PCIDE) || x86_use_invpcid == 1) { 537 for (uintptr_t va = addr; va < (addr + len); va += pgsz) 538 mmu_flush_tlb_page(va); 539 return; 540 } 541 542 /* 543 * As an emulated invpcid() in the PCIDE case requires jumping 544 * cr3s, we batch the invalidations. We should only need to flush the 545 * user range if we're on a user-space HAT. 546 */ 547 if (addr < kernelbase && ON_USER_HAT(CPU)) { 548 ulong_t flag = intr_clear(); 549 ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0); 550 tr_mmu_flush_user_range(addr, len, pgsz, 551 CPU->cpu_m.mcpu_kpti.kf_user_cr3); 552 intr_restore(flag); 553 } 554 555 for (uintptr_t va = addr; va < (addr + len); va += pgsz) 556 mmu_invlpg((caddr_t)va); 557 } 558 559 /* 560 * MMU TLB (and PT cache) flushing on this CPU. 561 * 562 * FLUSH_TLB_ALL: invalidate everything, all PCIDs, all PT_GLOBAL. 563 * FLUSH_TLB_NONGLOBAL: invalidate all PCIDs, excluding PT_GLOBAL 564 * FLUSH_TLB_RANGE: invalidate the given range, including PCID_USER 565 * mappings as appropriate. If using invpcid, PT_GLOBAL mappings are not 566 * invalidated. 567 */ 568 void 569 mmu_flush_tlb(flush_tlb_type_t type, tlb_range_t *range) 570 { 571 ASSERT(getpcid() == PCID_KERNEL); 572 573 switch (type) { 574 case FLUSH_TLB_ALL: 575 ASSERT(range == NULL); 576 invpcid(INVPCID_ALL_GLOBAL, 0, 0); 577 break; 578 579 case FLUSH_TLB_NONGLOBAL: 580 ASSERT(range == NULL); 581 invpcid(INVPCID_ALL_NONGLOBAL, 0, 0); 582 break; 583 584 case FLUSH_TLB_RANGE: { 585 mmu_flush_tlb_range(range->tr_va, TLB_RANGE_LEN(range), 586 LEVEL_SIZE(range->tr_level)); 587 break; 588 } 589 590 default: 591 panic("invalid call mmu_flush_tlb(%d)", type); 592 break; 593 } 594 } 595 596 #endif /* ! __xpv */