/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright 2018 Joyent, Inc.
 */

#include <sys/t_lock.h>
#include <sys/memlist.h>
#include <sys/cpuvar.h>
#include <sys/vmem.h>
#include <sys/mman.h>
#include <sys/vm.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vm_machparam.h>
#include <sys/tss.h>
#include <sys/vnode.h>
#include <vm/hat.h>
#include <vm/anon.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>
#include <vm/hat_i86.h>
#include <sys/promif.h>
#include <sys/x86_archext.h>
#include <sys/systm.h>
#include <sys/archsystm.h>
#include <sys/sunddi.h>
#include <sys/ddidmareq.h>
#include <sys/controlregs.h>
#include <sys/reboot.h>
#include <sys/kdi.h>
#include <sys/bootconf.h>
#include <sys/bootsvcs.h>
#include <sys/bootinfo.h>
#include <vm/kboot_mmu.h>

#ifdef __xpv
#include <sys/hypervisor.h>
#endif

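/*
 * True when this CPU is currently running on a user process's HAT rather
 * than the kernel's (kas).
 */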
#define	ON_USER_HAT(cpu) \
	((cpu)->cpu_m.mcpu_current_hat != NULL && \
	(cpu)->cpu_m.mcpu_current_hat != kas.a_hat)

/*
 * Flag is not set early in boot. Once it is set we are no longer
 * using boot's page tables.
 */
uint_t khat_running = 0;

/*
 * This procedure is callable only while the boot loader is in charge of the
 * MMU. It assumes that PA == VA for page table pointers.  It doesn't live in
 * kboot_mmu.c since it's used from common code.
 */
pfn_t
va_to_pfn(void *vaddr)
{
	uintptr_t	des_va = ALIGN2PAGE(vaddr);
	uintptr_t	va = des_va;
	size_t		len;
	uint_t		prot;
	pfn_t		pfn;

	if (khat_running)
		panic("va_to_pfn(): called too late\n");

	if (kbm_probe(&va, &len, &pfn, &prot) == 0)
		return (PFN_INVALID);
	if (va > des_va)
		return (PFN_INVALID);
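	/*
	 * If des_va falls inside a larger mapping, kbm_probe() reports the
	 * start of that mapping; adjust the pfn by des_va's offset into it.
	 */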
	if (va < des_va)
		pfn += mmu_btop(des_va - va);
	return (pfn);
}

/*
 * Initialize a special area in the kernel that always holds some PTEs for
 * faster performance. This always holds segmap's PTEs.
 * In the 32 bit kernel this maps the kernel heap too.
 */
void
hat_kmap_init(uintptr_t base, size_t len)
{
	uintptr_t map_addr;	/* base rounded down to large page size */
	uintptr_t map_eaddr;	/* base + len rounded up */
	size_t map_len;
	caddr_t ptes;		/* mapping area in kernel for kmap ptes */
	size_t window_size;	/* size of mapping area for ptes */
	ulong_t htable_cnt;	/* # of page tables to cover map_len */
	ulong_t i;
	htable_t *ht;
	uintptr_t va;

	/*
	 * We have to map in an area that matches an entire page table.
	 * The PTEs are large page aligned to avoid spurious pagefaults
	 * on the hypervisor.
	 */
	map_addr = base & LEVEL_MASK(1);
	map_eaddr = (base + len + LEVEL_SIZE(1) - 1) & LEVEL_MASK(1);
	map_len = map_eaddr - map_addr;
	window_size = mmu_btop(map_len) * mmu.pte_size;
	window_size = (window_size + LEVEL_SIZE(1)) & LEVEL_MASK(1);
	htable_cnt = map_len >> LEVEL_SHIFT(1);
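	/*
	 * Illustrative sizing, assuming 4K base pages, 2M level 1 pages and
	 * 8 byte PTEs: a 64MB kmap area needs 16K PTEs (128K of window),
	 * which rounds up to a single 2M large page and 32 page tables.
	 */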

	/*
	 * allocate vmem for the kmap_ptes
	 */
	ptes = vmem_xalloc(heap_arena, window_size, LEVEL_SIZE(1), 0,
	    0, NULL, NULL, VM_SLEEP);
	mmu.kmap_htables =
	    kmem_alloc(htable_cnt * sizeof (htable_t *), KM_SLEEP);

	/*
	 * Map the page tables that cover kmap into the allocated range.
	 * Note we don't ever htable_release() the kmap page tables - they
	 * can't ever be stolen, freed, etc.
	 */
	for (va = map_addr, i = 0; i < htable_cnt; va += LEVEL_SIZE(1), ++i) {
		ht = htable_create(kas.a_hat, va, 0, NULL);
		if (ht == NULL)
			panic("hat_kmap_init: ht == NULL");
		mmu.kmap_htables[i] = ht;

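		/*
		 * Map each page table page itself into the PTE window, so
		 * the kmap PTEs can later be accessed directly through
		 * mmu.kmap_ptes.  On the hypervisor the window stays read
		 * only, since active pagetables may not be writable.
		 */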
		hat_devload(kas.a_hat, ptes + i * MMU_PAGESIZE,
		    MMU_PAGESIZE, ht->ht_pfn,
#ifdef __xpv
		    PROT_READ | HAT_NOSYNC | HAT_UNORDERED_OK,
#else
		    PROT_READ | PROT_WRITE | HAT_NOSYNC | HAT_UNORDERED_OK,
#endif
		    HAT_LOAD | HAT_LOAD_NOCONSIST);
	}

	/*
	 * set information in mmu to activate handling of kmap
	 */
	mmu.kmap_addr = map_addr;
	mmu.kmap_eaddr = map_eaddr;
	mmu.kmap_ptes = (x86pte_t *)ptes;
}

extern caddr_t	kpm_vbase;
extern size_t	kpm_size;

#ifdef __xpv
/*
 * Create the initial segkpm mappings for the hypervisor. To avoid having
 * to deal with page tables being read only, we make all mappings
 * read only at first.
 */
static void
xen_kpm_create(paddr_t paddr, level_t lvl)
{
	ulong_t pg_off;

	for (pg_off = 0; pg_off < LEVEL_SIZE(lvl); pg_off += MMU_PAGESIZE) {
		kbm_map((uintptr_t)kpm_vbase + paddr, (paddr_t)0, 0, 1);
		kbm_read_only((uintptr_t)kpm_vbase + paddr + pg_off,
		    paddr + pg_off);
	}
}

/*
 * Try to make all kpm mappings writable. Failures are ok, as those
 * are just pagetable, GDT, etc. pages.
 */
static void
xen_kpm_finish_init(void)
{
	pfn_t gdtpfn = mmu_btop(CPU->cpu_m.mcpu_gdtpa);
	pfn_t pfn;
	page_t *pp;

	for (pfn = 0; pfn < mfn_count; ++pfn) {
		/*
		 * skip gdt
		 */
		if (pfn == gdtpfn)
			continue;

		/*
		 * p_index is a hint that this is a pagetable
		 */
		pp = page_numtopp_nolock(pfn);
		if (pp && pp->p_index) {
			pp->p_index = 0;
			continue;
		}
		(void) xen_kpm_page(pfn, PT_VALID | PT_WRITABLE);
	}
}
#endif

/*
 * Routine to pre-allocate data structures for hat_kern_setup(). It computes
 * how many pagetables it needs by walking the boot loader's page tables.
 */
/*ARGSUSED*/
void
hat_kern_alloc(
	caddr_t	segmap_base,
	size_t	segmap_size,
	caddr_t	ekernelheap)
{
	uintptr_t	last_va = (uintptr_t)-1;	/* catch 1st time */
	uintptr_t	va = 0;
	size_t		size;
	pfn_t		pfn;
	uint_t		prot;
	uint_t		table_cnt = 1;
	uint_t		mapping_cnt;
	level_t		start_level;
	level_t		l;
	struct memlist	*pmem;
	level_t		lpagel = mmu.max_page_level;
	uint64_t	paddr;
	int64_t		psize;
	int		nwindows;

	if (kpm_size > 0) {
		/*
		 * Create the kpm page tables.  When running on the
		 * hypervisor these are made read/only at first.
		 * Later we'll add write permission where possible.
		 */
		for (pmem = phys_install; pmem; pmem = pmem->ml_next) {
			paddr = pmem->ml_address;
			psize = pmem->ml_size;
			while (psize >= MMU_PAGESIZE) {
				/* find the largest page size */
				for (l = lpagel; l > 0; l--) {
					if ((paddr & LEVEL_OFFSET(l)) == 0 &&
					    psize > LEVEL_SIZE(l))
						break;
				}

#if defined(__xpv)
				/*
				 * Create read/only mappings to avoid
				 * conflicting with pagetable usage
				 */
				xen_kpm_create(paddr, l);
#else
				kbm_map((uintptr_t)kpm_vbase + paddr, paddr,
				    l, 1);
#endif
				paddr += LEVEL_SIZE(l);
				psize -= LEVEL_SIZE(l);
			}
		}
	}

	/*
	 * If this machine doesn't have a kpm segment, we need to allocate
	 * a small number of 'windows' which can be used to map pagetables.
	 */
	nwindows = (kpm_size == 0) ? 2 * NCPU : 0;

#if defined(__xpv)
	/*
	 * On a hypervisor, these windows are also used by the xpv_panic
	 * code, where we need one window for each level of the pagetable
	 * hierarchy.
	 */
	nwindows = MAX(nwindows, mmu.max_level);
#endif

	if (nwindows != 0) {
		/*
		 * Create the page windows and 1 page of VA in
		 * which we map the PTEs of those windows.
		 */
		mmu.pwin_base = vmem_xalloc(heap_arena, nwindows * MMU_PAGESIZE,
		    LEVEL_SIZE(1), 0, 0, NULL, NULL, VM_SLEEP);
		ASSERT(nwindows <= MMU_PAGESIZE / mmu.pte_size);
		mmu.pwin_pte_va = vmem_xalloc(heap_arena, MMU_PAGESIZE,
		    MMU_PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);

		/*
		 * Find/Create the page table window mappings.
		 */
		paddr = 0;
		(void) find_pte((uintptr_t)mmu.pwin_base, &paddr, 0, 0);
		ASSERT(paddr != 0);
		ASSERT((paddr & MMU_PAGEOFFSET) == 0);
		mmu.pwin_pte_pa = paddr;
#ifdef __xpv
		(void) find_pte((uintptr_t)mmu.pwin_pte_va, NULL, 0, 0);
		kbm_read_only((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa);
#else
		kbm_map((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa, 0, 1);
#endif
	}

	/*
	 * Walk the boot loader's page tables and figure out
	 * how many tables and page mappings there will be.
	 */
	while (kbm_probe(&va, &size, &pfn, &prot) != 0) {
		/*
		 * At each level, if the last_va falls into a new htable,
		 * increment table_cnt. We can stop at the 1st level where
		 * they are in the same htable.
		 */
		start_level = 0;
		while (start_level <= mmu.max_page_level) {
			if (size == LEVEL_SIZE(start_level))
				break;
			start_level++;
		}

		for (l = start_level; l < mmu.max_level; ++l) {
			if (va >> LEVEL_SHIFT(l + 1) ==
			    last_va >> LEVEL_SHIFT(l + 1))
				break;
			++table_cnt;
		}
		last_va = va;
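
		/*
		 * Skip ahead to the next boundary of this mapping's size (or
		 * the next level 1 boundary for 4K mappings); any further
		 * mappings inside that region live in htables that have
		 * already been counted above.
		 */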
		l = (start_level == 0) ? 1 : start_level;
		va = (va & LEVEL_MASK(l)) + LEVEL_SIZE(l);
	}

	/*
	 * Besides the boot loader mappings, we're going to fill in
	 * the entire top level page table for the kernel. Make sure there's
	 * enough reserve for that too.
	 */
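	/*
	 * The subtracted term below is kernelbase's index in the top level
	 * table, so this reserves one lower level table for each top level
	 * slot from kernelbase to the end of the address space.
	 */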
	table_cnt += mmu.top_level_count - ((kernelbase >>
	    LEVEL_SHIFT(mmu.max_level)) & (mmu.top_level_count - 1));

	/*
	 * Add 1/4 more into table_cnt for extra slop.  The unused
	 * slop is freed back when we htable_adjust_reserve() later.
	 */
	table_cnt += table_cnt >> 2;

	/*
	 * We only need mapping entries (hments) for shared pages.
	 * This should be far, far fewer than the total possible;
	 * we'll allocate enough for 1/16 of all possible PTEs.
	 */
	mapping_cnt = (table_cnt * mmu.ptes_per_table) >> 4;

	/*
	 * Now create the initial htable/hment reserves
	 */
	htable_initial_reserve(table_cnt);
	hment_reserve(mapping_cnt);
	x86pte_cpu_init(CPU);
}


/*
 * This routine handles the work of creating the kernel's initial mappings
 * by deciphering the mappings in the page tables created by the boot program.
 *
 * We maintain large page mappings, but only to a level 1 pagesize.
 * The boot loader can only add new mappings once this function starts.
 * In particular it can not change the pagesize used for any existing
 * mappings or this code breaks!
 */

void
hat_kern_setup(void)
{
	/*
	 * Attach htables to the existing pagetables
	 */
	/* BEGIN CSTYLED */
	htable_attach(kas.a_hat, 0, mmu.max_level, NULL,
#ifdef __xpv
	    mmu_btop(xen_info->pt_base - ONE_GIG));
#else
	    mmu_btop(getcr3_pa()));
#endif
	/* END CSTYLED */

#if defined(__xpv)
	/*
	 * Try to make the kpm mappings r/w. Failures here are OK, as
	 * it's probably just a pagetable.
	 */
	xen_kpm_finish_init();
#endif

	/*
	 * The kernel HAT is now officially open for business.
	 */
	khat_running = 1;

	CPUSET_ATOMIC_ADD(kas.a_hat->hat_cpus, CPU->cpu_id);
	CPU->cpu_current_hat = kas.a_hat;
}

#ifndef __xpv

/*
 * Note that the INVPCID_ALL* variants can be used even in the !PCIDE case,
 * but INVPCID_ADDR cannot be.
 */
static void
invpcid(uint64_t type, uint64_t pcid, uintptr_t addr)
{
	ulong_t	flag;
	uint64_t cr4;

	if (x86_use_invpcid == 1) {
		ASSERT(is_x86_feature(x86_featureset, X86FSET_INVPCID));
		invpcid_insn(type, pcid, addr);
		return;
	}

	switch (type) {
	case INVPCID_ALL_GLOBAL:
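		/*
		 * Toggling CR4.PGE forces the CPU to flush the entire TLB,
		 * including global entries, which stands in for a global
		 * invalidation when the INVPCID instruction isn't available.
		 */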
		flag = intr_clear();
		cr4 = getcr4();
		setcr4(cr4 & ~(ulong_t)CR4_PGE);
		setcr4(cr4 | CR4_PGE);
		intr_restore(flag);
		break;

	case INVPCID_ALL_NONGLOBAL:
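		/*
		 * Without PCIDs enabled, reloading %cr3 is enough to flush
		 * all non-global TLB entries; otherwise fall back to the
		 * CR4.PGE toggle, which over-flushes but is always correct.
		 */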
		if (!(getcr4() & CR4_PCIDE)) {
			reload_cr3();
		} else {
			flag = intr_clear();
			cr4 = getcr4();
			setcr4(cr4 & ~(ulong_t)CR4_PGE);
			setcr4(cr4 | CR4_PGE);
			intr_restore(flag);
		}
		break;

	case INVPCID_ADDR:
		if (pcid == PCID_USER) {
			flag = intr_clear();
			ASSERT(addr < kernelbase);
			ASSERT(ON_USER_HAT(CPU));
			ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0);
			tr_mmu_flush_user_range(addr, MMU_PAGESIZE,
			    MMU_PAGESIZE, CPU->cpu_m.mcpu_kpti.kf_user_cr3);
			intr_restore(flag);
		} else {
			mmu_invlpg((caddr_t)addr);
		}
		break;

	default:
		panic("unsupported invpcid(%lu)", type);
		break;
	}
}

/*
 * Flush one kernel mapping.
 *
 * We want to assert on kernel space here mainly for reasoning about the PCIDE
 * case: namely, this flush should never need to flush a non-current PCID
 * mapping.  This presumes we never have reason to flush the kernel regions
 * available to PCID_USER (the trampolines and so on).  It also relies on
 * PCID_KERNEL == PCID_NONE.
 */
void
mmu_flush_tlb_kpage(uintptr_t va)
{
	ASSERT(va >= kernelbase);
	ASSERT(getpcid() == PCID_KERNEL);
	mmu_invlpg((caddr_t)va);
}

/*
 * Flush one mapping: local CPU version of hat_tlb_inval().
 *
 * If this is a userspace address in the PCIDE case, we need two
 * invalidations: one for any potentially stale PCID_USER mapping, and one
 * for any mapping established while in the kernel.
 */
void
mmu_flush_tlb_page(uintptr_t va)
{
	ASSERT(getpcid() == PCID_KERNEL);

	if (va >= kernelbase) {
		mmu_flush_tlb_kpage(va);
		return;
	}

	if (!(getcr4() & CR4_PCIDE)) {
		mmu_invlpg((caddr_t)va);
		return;
	}

	/*
	 * Yes, kas will need to flush below kernelspace, at least during boot.
	 * But there's no PCID_USER context.
	 */
	if (ON_USER_HAT(CPU))
		invpcid(INVPCID_ADDR, PCID_USER, va);
	invpcid(INVPCID_ADDR, PCID_KERNEL, va);
}

static void
mmu_flush_tlb_range(uintptr_t addr, size_t len, size_t pgsz)
{
	EQUIV(addr < kernelbase, (addr + len - 1) < kernelbase);
	ASSERT(len > 0);
	ASSERT(pgsz != 0);

	if (!(getcr4() & CR4_PCIDE) || x86_use_invpcid == 1) {
		for (uintptr_t va = addr; va < (addr + len); va += pgsz)
			mmu_flush_tlb_page(va);
		return;
	}

	/*
	 * As an emulated invpcid() in the PCIDE case requires jumping
	 * cr3s, we batch the invalidations.  We should only need to flush the
	 * user range if we're on a user-space HAT.
	 */
	if (addr < kernelbase && ON_USER_HAT(CPU)) {
		ulong_t flag = intr_clear();
		ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0);
		tr_mmu_flush_user_range(addr, len, pgsz,
		    CPU->cpu_m.mcpu_kpti.kf_user_cr3);
		intr_restore(flag);
	}

	for (uintptr_t va = addr; va < (addr + len); va += pgsz)
		mmu_invlpg((caddr_t)va);
}

/*
 * MMU TLB (and PT cache) flushing on this CPU.
 *
 * FLUSH_TLB_ALL: invalidate everything, all PCIDs, all PT_GLOBAL.
 * FLUSH_TLB_NONGLOBAL: invalidate all PCIDs, excluding PT_GLOBAL.
 * FLUSH_TLB_RANGE: invalidate the given range, including PCID_USER
 * mappings as appropriate.  If using invpcid, PT_GLOBAL mappings are not
 * invalidated.
 */
void
mmu_flush_tlb(flush_tlb_type_t type, tlb_range_t *range)
{
	ASSERT(getpcid() == PCID_KERNEL);

	switch (type) {
	case FLUSH_TLB_ALL:
		ASSERT(range == NULL);
		invpcid(INVPCID_ALL_GLOBAL, 0, 0);
		break;

	case FLUSH_TLB_NONGLOBAL:
		ASSERT(range == NULL);
		invpcid(INVPCID_ALL_NONGLOBAL, 0, 0);
		break;

	case FLUSH_TLB_RANGE: {
		mmu_flush_tlb_range(range->tr_va, TLB_RANGE_LEN(range),
		    LEVEL_SIZE(range->tr_level));
		break;
	}

	default:
		panic("invalid call mmu_flush_tlb(%d)", type);
		break;
	}
}

#endif /* ! __xpv */