/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/t_lock.h>
#include <sys/memlist.h>
#include <sys/cpuvar.h>
#include <sys/vmem.h>
#include <sys/mman.h>
#include <sys/vm.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vm_machparam.h>
#include <sys/tss.h>
#include <sys/vnode.h>
#include <vm/hat.h>
#include <vm/anon.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>
#include <vm/hat_i86.h>
#include <sys/promif.h>
#include <sys/x86_archext.h>
#include <sys/systm.h>
#include <sys/archsystm.h>
#include <sys/sunddi.h>
#include <sys/ddidmareq.h>
#include <sys/controlregs.h>
#include <sys/reboot.h>
#include <sys/kdi.h>
#include <sys/bootconf.h>
#include <sys/bootsvcs.h>
#include <sys/bootinfo.h>
#include <vm/kboot_mmu.h>

#ifdef __xpv
#include <sys/hypervisor.h>
#endif

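/*
 * Map a run of physical pages into kernel virtual address space with the
 * given protections and return the base virtual address.  Virtual space is
 * taken from heap_arena.  Frames with no page_t (e.g. device memory) are
 * entered with hat_devload(); ordinary memory pages are entered with
 * hat_memload().  All mappings are loaded locked and HAT_NOSYNC.
 */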
caddr_t
i86devmap(pfn_t pf, pgcnt_t pgcnt, uint_t prot)
{
	caddr_t addr;
	caddr_t addr1;
	page_t *pp;

	addr1 = addr = vmem_alloc(heap_arena, mmu_ptob(pgcnt), VM_SLEEP);

	for (; pgcnt != 0; addr += MMU_PAGESIZE, ++pf, --pgcnt) {
		pp = page_numtopp_nolock(pf);
		if (pp == NULL) {
			hat_devload(kas.a_hat, addr, MMU_PAGESIZE, pf,
			    prot | HAT_NOSYNC, HAT_LOAD_LOCK);
		} else {
			hat_memload(kas.a_hat, addr, pp,
			    prot | HAT_NOSYNC, HAT_LOAD_LOCK);
		}
	}

	return (addr1);
}

/*
 * This routine is like page_numtopp, but accepts only free pages, which
 * it allocates (unfrees) and returns with the exclusive lock held.
 * It is used by machdep.c/dma_init() to find contiguous free pages.
 *
 * XXX this and some others should probably be in vm_machdep.c
 */
page_t *
page_numtopp_alloc(pfn_t pfnum)
{
	page_t *pp;

retry:
	pp = page_numtopp_nolock(pfnum);
	if (pp == NULL) {
		return (NULL);
	}

	if (!page_trylock(pp, SE_EXCL)) {
		return (NULL);
	}

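	/*
	 * The page_t no longer corresponds to our pfn (it may have changed
	 * while we were acquiring the lock), so look it up again.
	 */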
	if (page_pptonum(pp) != pfnum) {
		page_unlock(pp);
		goto retry;
	}

	if (!PP_ISFREE(pp)) {
		page_unlock(pp);
		return (NULL);
	}
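	/*
	 * The pfn is part of a free large page; demote it to base pages and
	 * retry so that just this one page can be claimed.
	 */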
	if (pp->p_szc) {
		page_demote_free_pages(pp);
		page_unlock(pp);
		goto retry;
	}

	/* If associated with a vnode, destroy mappings */

	if (pp->p_vnode) {

		page_destroy_free(pp);

		if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_NO_RECLAIM)) {
			return (NULL);
		}

		if (page_pptonum(pp) != pfnum) {
			page_unlock(pp);
			goto retry;
		}
	}

	if (!PP_ISFREE(pp)) {
		page_unlock(pp);
		return (NULL);
	}

	if (!page_reclaim(pp, (kmutex_t *)NULL))
		return (NULL);

	return (pp);
}

/*
 * Flag is not set early in boot. Once it is set we are no longer
 * using boot's page tables.
 */
uint_t khat_running = 0;

/*
 * This procedure is callable only while the boot loader is in charge of the
 * MMU. It assumes that PA == VA for page table pointers. It doesn't live in
 * kboot_mmu.c since it's used from common code.
 */
pfn_t
va_to_pfn(void *vaddr)
{
	uintptr_t des_va = ALIGN2PAGE(vaddr);
	uintptr_t va = des_va;
	size_t len;
	uint_t prot;
	pfn_t pfn;

	if (khat_running)

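		/*
		 * Count an additional pagetable at each level where this
		 * mapping does not fall within the same table as the
		 * previous mapping.
		 */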
		for (l = start_level; l < mmu.max_level; ++l) {
			if (va >> LEVEL_SHIFT(l + 1) ==
			    last_va >> LEVEL_SHIFT(l + 1))
				break;
			++table_cnt;
		}
		last_va = va;
		l = (start_level == 0) ? 1 : start_level;
		va = (va & LEVEL_MASK(l)) + LEVEL_SIZE(l);
	}

	/*
	 * Besides the boot loader mappings, we're going to fill in
	 * the entire top level page table for the kernel. Make sure there's
	 * enough reserve for that too.
	 */
	table_cnt += mmu.top_level_count - ((kernelbase >>
	    LEVEL_SHIFT(mmu.max_level)) & (mmu.top_level_count - 1));

#if defined(__i386)
	/*
	 * The 32 bit PAE hat allocates tables one level below the top when
	 * kernelbase isn't 1 Gig aligned. We'll just be sloppy and allocate
	 * a bunch more to the reserve. Any unused will be returned later.
	 * Note we've already counted these mappings, just not the extra
	 * pagetables.
	 */
	if (mmu.pae_hat != 0 && (kernelbase & LEVEL_OFFSET(mmu.max_level)) != 0)
		table_cnt += mmu.ptes_per_table -
		    ((kernelbase & LEVEL_OFFSET(mmu.max_level)) >>
		    LEVEL_SHIFT(mmu.max_level - 1));
#endif

	/*
	 * Add 1/4 more into table_cnt for extra slop. The unused
	 * slop is freed back later by htable_adjust_reserve().
	 */
	table_cnt += table_cnt >> 2;

	/*
	 * We only need mapping entries (hments) for shared pages.
	 * This should be far, far fewer than the total possible.
	 * We'll allocate enough for 1/16 of all possible PTEs.
	 */
	mapping_cnt = (table_cnt * mmu.ptes_per_table) >> 4;

	/*
	 * Now create the initial htable/hment reserves
	 */
	htable_initial_reserve(table_cnt);
	hment_reserve(mapping_cnt);
	x86pte_cpu_init(CPU);
}

/*
 * This routine handles the work of creating the kernel's initial mappings
 * by deciphering the mappings in the page tables created by the boot program.
 *
 * We maintain large page mappings, but only to a level 1 pagesize.
 * The boot loader can only add new mappings once this function starts.
 * In particular it can not change the pagesize used for any existing
 * mappings or this code breaks!
 */
void
hat_kern_setup(void)
{
	/*
	 * Attach htables to the existing pagetables
	 */
	/* BEGIN CSTYLED */
	htable_attach(kas.a_hat, 0, mmu.max_level, NULL,
#ifdef __xpv
	    mmu_btop(xen_info->pt_base - ONE_GIG));
#else
	    mmu_btop(getcr3()));
#endif
	/* END CSTYLED */

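	/*
	 * On 32-bit non-Xen kernels, record the current %cr3 in the per-CPU
	 * TSS and in the double-fault TSS (dftss0).
	 */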
#if defined(__i386) && !defined(__xpv)
	CPU->cpu_tss->tss_cr3 = dftss0->tss_cr3 = getcr3();
#endif /* __i386 */

#if defined(__xpv) && defined(__amd64)
	/*
	 * Try to make the kpm mappings r/w. Failures here are OK, as
	 * it's probably just a pagetable
	 */
	xen_kpm_finish_init();
#endif

	/*
	 * The kernel HAT is now officially open for business.
	 */
	khat_running = 1;

	CPUSET_ATOMIC_ADD(kas.a_hat->hat_cpus, CPU->cpu_id);
	CPU->cpu_current_hat = kas.a_hat;
}
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright 2018 Joyent, Inc.
 */

#include <sys/t_lock.h>
#include <sys/memlist.h>
#include <sys/cpuvar.h>
#include <sys/vmem.h>
#include <sys/mman.h>
#include <sys/vm.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vm_machparam.h>
#include <sys/tss.h>
#include <sys/vnode.h>
#include <vm/hat.h>
#include <vm/anon.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>
#include <vm/hat_i86.h>
#include <sys/promif.h>
#include <sys/x86_archext.h>
#include <sys/systm.h>
#include <sys/archsystm.h>
#include <sys/sunddi.h>
#include <sys/ddidmareq.h>
#include <sys/controlregs.h>
#include <sys/reboot.h>
#include <sys/kdi.h>
#include <sys/bootconf.h>
#include <sys/bootsvcs.h>
#include <sys/bootinfo.h>
#include <vm/kboot_mmu.h>

#ifdef __xpv
#include <sys/hypervisor.h>
#endif

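/*
 * True when the given CPU is currently running with a user process's HAT
 * rather than the kernel's HAT (kas.a_hat).
 */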
#define	ON_USER_HAT(cpu) \
	((cpu)->cpu_m.mcpu_current_hat != NULL && \
	(cpu)->cpu_m.mcpu_current_hat != kas.a_hat)

/*
 * Flag is not set early in boot. Once it is set we are no longer
 * using boot's page tables.
 */
uint_t khat_running = 0;

/*
 * This procedure is callable only while the boot loader is in charge of the
 * MMU. It assumes that PA == VA for page table pointers. It doesn't live in
 * kboot_mmu.c since it's used from common code.
 */
pfn_t
va_to_pfn(void *vaddr)
{
	uintptr_t des_va = ALIGN2PAGE(vaddr);
	uintptr_t va = des_va;
	size_t len;
	uint_t prot;
	pfn_t pfn;

	if (khat_running)

		for (l = start_level; l < mmu.max_level; ++l) {
			if (va >> LEVEL_SHIFT(l + 1) ==
			    last_va >> LEVEL_SHIFT(l + 1))
				break;
			++table_cnt;
		}
		last_va = va;
		l = (start_level == 0) ? 1 : start_level;
		va = (va & LEVEL_MASK(l)) + LEVEL_SIZE(l);
	}

	/*
	 * Besides the boot loader mappings, we're going to fill in
	 * the entire top level page table for the kernel. Make sure there's
	 * enough reserve for that too.
	 */
	table_cnt += mmu.top_level_count - ((kernelbase >>
	    LEVEL_SHIFT(mmu.max_level)) & (mmu.top_level_count - 1));

	/*
	 * Add 1/4 more into table_cnt for extra slop. The unused
	 * slop is freed back later by htable_adjust_reserve().
	 */
	table_cnt += table_cnt >> 2;

	/*
	 * We only need mapping entries (hments) for shared pages.
	 * This should be far, far fewer than the total possible.
	 * We'll allocate enough for 1/16 of all possible PTEs.
	 */
	mapping_cnt = (table_cnt * mmu.ptes_per_table) >> 4;

	/*
	 * Now create the initial htable/hment reserves
	 */
	htable_initial_reserve(table_cnt);
	hment_reserve(mapping_cnt);
	x86pte_cpu_init(CPU);
}

/*
 * This routine handles the work of creating the kernel's initial mappings
 * by deciphering the mappings in the page tables created by the boot program.
 *
 * We maintain large page mappings, but only to a level 1 pagesize.
 * The boot loader can only add new mappings once this function starts.
 * In particular it can not change the pagesize used for any existing
 * mappings or this code breaks!
 */
void
hat_kern_setup(void)
{
	/*
	 * Attach htables to the existing pagetables
	 */
	/* BEGIN CSTYLED */
	htable_attach(kas.a_hat, 0, mmu.max_level, NULL,
#ifdef __xpv
	    mmu_btop(xen_info->pt_base - ONE_GIG));
#else
	    mmu_btop(getcr3_pa()));
#endif
	/* END CSTYLED */

#if defined(__xpv)
	/*
	 * Try to make the kpm mappings r/w. Failures here are OK, as
	 * it's probably just a pagetable
	 */
	xen_kpm_finish_init();
#endif

	/*
	 * The kernel HAT is now officially open for business.
	 */
	khat_running = 1;

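	/*
	 * Record that this CPU is now running on the kernel HAT.
	 */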
	CPUSET_ATOMIC_ADD(kas.a_hat->hat_cpus, CPU->cpu_id);
	CPU->cpu_current_hat = kas.a_hat;
}

#ifndef __xpv

/*
 * Note that the INVPCID_ALL* variants can be used even in the !PCIDE case,
 * but INVPCID_ADDR cannot.
 */
static void
invpcid(uint64_t type, uint64_t pcid, uintptr_t addr)
{
	ulong_t flag;
	uint64_t cr4;

	if (x86_use_invpcid == 1) {
		ASSERT(is_x86_feature(x86_featureset, X86FSET_INVPCID));
		invpcid_insn(type, pcid, addr);
		return;
	}

	switch (type) {
	case INVPCID_ALL_GLOBAL:
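		/*
		 * Toggling CR4.PGE invalidates the entire TLB, including
		 * global entries, which is what INVPCID_ALL_GLOBAL asks for.
		 */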
		flag = intr_clear();
		cr4 = getcr4();
		setcr4(cr4 & ~(ulong_t)CR4_PGE);
		setcr4(cr4 | CR4_PGE);
		intr_restore(flag);
		break;

	case INVPCID_ALL_NONGLOBAL:
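		/*
		 * Without PCIDs enabled, reloading %cr3 flushes all
		 * non-global entries.  With PCIDs enabled, a %cr3 reload
		 * only affects the current PCID, so fall back to the
		 * CR4.PGE toggle (a superset of what was asked for).
		 */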
		if (!(getcr4() & CR4_PCIDE)) {
			reload_cr3();
		} else {
			flag = intr_clear();
			cr4 = getcr4();
			setcr4(cr4 & ~(ulong_t)CR4_PGE);
			setcr4(cr4 | CR4_PGE);
			intr_restore(flag);
		}
		break;

	case INVPCID_ADDR:
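		/*
		 * A single-address flush for PCID_USER has to happen while
		 * the user page tables (and hence the user PCID) are active,
		 * so it is handed off to the KPTI trampoline code.
		 */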
		if (pcid == PCID_USER) {
			flag = intr_clear();
			ASSERT(addr < kernelbase);
			ASSERT(ON_USER_HAT(CPU));
			ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0);
			tr_mmu_flush_user_range(addr, MMU_PAGESIZE,
			    MMU_PAGESIZE, CPU->cpu_m.mcpu_kpti.kf_user_cr3);
			intr_restore(flag);
		} else {
			mmu_invlpg((caddr_t)addr);
		}
		break;

	default:
		panic("unsupported invpcid(%lu)", type);
		break;
	}
}

/*
 * Flush one kernel mapping.
 *
 * We want to assert on kernel space here mainly for reasoning about the PCIDE
 * case: namely, this flush should never need to flush a non-current PCID
 * mapping. This presumes we never have reason to flush the kernel regions
 * available to PCID_USER (the trampolines and so on). It also relies on
 * PCID_KERNEL == PCID_NONE.
 */
void
mmu_flush_tlb_kpage(uintptr_t va)
{
	ASSERT(va >= kernelbase);
	ASSERT(getpcid() == PCID_KERNEL);
	mmu_invlpg((caddr_t)va);
}

/*
 * Flush one mapping: local CPU version of hat_tlb_inval().
 *
 * If this is a userspace address in the PCIDE case, we need two invalidations:
 * one for any potentially stale PCID_USER mapping, and one for any mapping
 * established while in the kernel.
 */
void
mmu_flush_tlb_page(uintptr_t va)
{
	ASSERT(getpcid() == PCID_KERNEL);

	if (va >= kernelbase) {
		mmu_flush_tlb_kpage(va);
		return;
	}

	if (!(getcr4() & CR4_PCIDE)) {
		mmu_invlpg((caddr_t)va);
		return;
	}

	/*
	 * Yes, kas will need to flush below kernelspace, at least during boot.
	 * But there's no PCID_USER context.
	 */
	if (ON_USER_HAT(CPU))
		invpcid(INVPCID_ADDR, PCID_USER, va);
	invpcid(INVPCID_ADDR, PCID_KERNEL, va);
}

static void
mmu_flush_tlb_range(uintptr_t addr, size_t len, size_t pgsz)
{
	EQUIV(addr < kernelbase, (addr + len - 1) < kernelbase);
	ASSERT(len > 0);
	ASSERT(pgsz != 0);

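	/*
	 * With the real invpcid instruction available (or with PCIDs off
	 * entirely), per-page invalidation through mmu_flush_tlb_page() is
	 * cheap enough to use directly.
	 */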
	if (!(getcr4() & CR4_PCIDE) || x86_use_invpcid == 1) {
		for (uintptr_t va = addr; va < (addr + len); va += pgsz)
			mmu_flush_tlb_page(va);
		return;
	}

	/*
	 * As an emulated invpcid() in the PCIDE case requires jumping
	 * cr3s, we batch the invalidations. We should only need to flush the
	 * user range if we're on a user-space HAT.
	 */
	if (addr < kernelbase && ON_USER_HAT(CPU)) {
		ulong_t flag = intr_clear();
		ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0);
		tr_mmu_flush_user_range(addr, len, pgsz,
		    CPU->cpu_m.mcpu_kpti.kf_user_cr3);
		intr_restore(flag);
	}

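	/*
	 * Flush the kernel-side mappings (the currently active PCID) with
	 * plain invlpg.
	 */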
	for (uintptr_t va = addr; va < (addr + len); va += pgsz)
		mmu_invlpg((caddr_t)va);
}

/*
 * MMU TLB (and PT cache) flushing on this CPU.
 *
 * FLUSH_TLB_ALL: invalidate everything, all PCIDs, all PT_GLOBAL.
 * FLUSH_TLB_NONGLOBAL: invalidate all PCIDs, excluding PT_GLOBAL.
 * FLUSH_TLB_RANGE: invalidate the given range, including PCID_USER
 * mappings as appropriate. If using invpcid, PT_GLOBAL mappings are not
 * invalidated.
 */
void
mmu_flush_tlb(flush_tlb_type_t type, tlb_range_t *range)
{
	ASSERT(getpcid() == PCID_KERNEL);

	switch (type) {
	case FLUSH_TLB_ALL:
		ASSERT(range == NULL);
		invpcid(INVPCID_ALL_GLOBAL, 0, 0);
		break;

	case FLUSH_TLB_NONGLOBAL:
		ASSERT(range == NULL);
		invpcid(INVPCID_ALL_NONGLOBAL, 0, 0);
		break;

	case FLUSH_TLB_RANGE: {
		mmu_flush_tlb_range(range->tr_va, TLB_RANGE_LEN(range),
		    LEVEL_SIZE(range->tr_level));
		break;
	}

	default:
		panic("invalid call mmu_flush_tlb(%d)", type);
		break;
	}
}

#endif /* ! __xpv */