/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/t_lock.h>
#include <sys/memlist.h>
#include <sys/cpuvar.h>
#include <sys/vmem.h>
#include <sys/mman.h>
#include <sys/vm.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vm_machparam.h>
#include <sys/tss.h>
#include <sys/vnode.h>
#include <vm/hat.h>
#include <vm/anon.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>
#include <vm/hat_i86.h>
#include <sys/promif.h>
#include <sys/x86_archext.h>
#include <sys/systm.h>
#include <sys/archsystm.h>
#include <sys/sunddi.h>
#include <sys/ddidmareq.h>
#include <sys/controlregs.h>
#include <sys/reboot.h>
#include <sys/kdi.h>
#include <sys/bootconf.h>
#include <sys/bootsvcs.h>
#include <sys/bootinfo.h>
#include <vm/kboot_mmu.h>

#ifdef __xpv
#include <sys/hypervisor.h>
#endif

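/*
 * Map a run of physical pages starting at pfn "pf" into kernel VA and
 * return the starting address. Pages that have a page_t are loaded with
 * hat_memload(); device pages without one use hat_devload(). All mappings
 * are created locked and with HAT_NOSYNC.
 */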
caddr_t
i86devmap(pfn_t pf, pgcnt_t pgcnt, uint_t prot)
{
        caddr_t addr;
        caddr_t addr1;
        page_t *pp;

        addr1 = addr = vmem_alloc(heap_arena, mmu_ptob(pgcnt), VM_SLEEP);

        for (; pgcnt != 0; addr += MMU_PAGESIZE, ++pf, --pgcnt) {
                pp = page_numtopp_nolock(pf);
                if (pp == NULL) {
                        hat_devload(kas.a_hat, addr, MMU_PAGESIZE, pf,
                            prot | HAT_NOSYNC, HAT_LOAD_LOCK);
                } else {
                        hat_memload(kas.a_hat, addr, pp,
                            prot | HAT_NOSYNC, HAT_LOAD_LOCK);
                }
        }

        return (addr1);
}

/*
 * This routine is like page_numtopp, but accepts only free pages, which
 * it allocates (unfrees) and returns with the exclusive lock held.
 * It is used by machdep.c/dma_init() to find contiguous free pages.
 *
 * XXX this and some others should probably be in vm_machdep.c
 */
page_t *
page_numtopp_alloc(pfn_t pfnum)
{
        page_t *pp;

retry:
        pp = page_numtopp_nolock(pfnum);
        if (pp == NULL) {
                return (NULL);
        }

        if (!page_trylock(pp, SE_EXCL)) {
                return (NULL);
        }

        if (page_pptonum(pp) != pfnum) {
                page_unlock(pp);
                goto retry;
        }

        if (!PP_ISFREE(pp)) {
                page_unlock(pp);
                return (NULL);
        }
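        /*
         * A non-zero p_szc means this page is part of a free large page;
         * demote it to base pages and retry the lookup.
         */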
        if (pp->p_szc) {
                page_demote_free_pages(pp);
                page_unlock(pp);
                goto retry;
        }

        /* If associated with a vnode, destroy mappings */

        if (pp->p_vnode) {

                page_destroy_free(pp);

                if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_NO_RECLAIM)) {
                        return (NULL);
                }

                if (page_pptonum(pp) != pfnum) {
                        page_unlock(pp);
                        goto retry;
                }
        }

        if (!PP_ISFREE(pp)) {
                page_unlock(pp);
                return (NULL);
        }

        if (!page_reclaim(pp, (kmutex_t *)NULL))
                return (NULL);

        return (pp);
}

/*
 * This flag is not set early in boot. Once it is set, we are no longer
 * using the boot loader's page tables.
 */
uint_t khat_running = 0;

/*
 * This procedure is callable only while the boot loader is in charge of the
 * MMU. It assumes that PA == VA for page table pointers.  It doesn't live in
 * kboot_mmu.c since it's used from common code.
 */
pfn_t
va_to_pfn(void *vaddr)
{
        uintptr_t       des_va = ALIGN2PAGE(vaddr);
        uintptr_t       va = des_va;
        size_t          len;
        uint_t          prot;
        pfn_t           pfn;

        if (khat_running)
                panic("va_to_pfn(): called too late\n");

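        /*
         * kbm_probe() finds the next boot mapping at or above *va. If the
         * desired VA lies inside a larger mapping, adjust the pfn by the
         * offset from the start of that mapping.
         */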
        if (kbm_probe(&va, &len, &pfn, &prot) == 0)
                return (PFN_INVALID);
        if (va > des_va)
                return (PFN_INVALID);
        if (va < des_va)
                pfn += mmu_btop(des_va - va);
        return (pfn);
}

/*
 * Initialize a special area in the kernel that keeps a window of PTEs
 * permanently mapped for fast access. This always holds segmap's PTEs;
 * in the 32 bit kernel it covers the kernel heap as well.
 */
void
hat_kmap_init(uintptr_t base, size_t len)
{
        uintptr_t map_addr;     /* base rounded down to large page size */
        uintptr_t map_eaddr;    /* base + len rounded up */
        size_t map_len;
        caddr_t ptes;           /* mapping area in kernel for kmap ptes */
        size_t window_size;     /* size of mapping area for ptes */
        ulong_t htable_cnt;     /* # of page tables to cover map_len */
        ulong_t i;
        htable_t *ht;
        uintptr_t va;

        /*
         * We have to map in an area that matches an entire page table.
         * The PTEs are large page aligned to avoid spurious pagefaults
         * on the hypervisor.
         */
        map_addr = base & LEVEL_MASK(1);
        map_eaddr = (base + len + LEVEL_SIZE(1) - 1) & LEVEL_MASK(1);
        map_len = map_eaddr - map_addr;
        window_size = mmu_btop(map_len) * mmu.pte_size;
        window_size = (window_size + LEVEL_SIZE(1)) & LEVEL_MASK(1);
        htable_cnt = map_len >> LEVEL_SHIFT(1);
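        /*
         * For example (assuming 8 byte PTEs, 4K base pages and a 2MB level
         * 1 size): a 64MB kmap range needs 16384 PTEs, i.e. 128KB of PTE
         * space, which rounds up to a single 2MB window covered by 32
         * level 0 page tables.
         */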

        /*
         * allocate vmem for the kmap_ptes
         */
        ptes = vmem_xalloc(heap_arena, window_size, LEVEL_SIZE(1), 0,
            0, NULL, NULL, VM_SLEEP);
        mmu.kmap_htables =
            kmem_alloc(htable_cnt * sizeof (htable_t *), KM_SLEEP);

        /*
         * Map the page tables that cover kmap into the allocated range.
         * Note we don't ever htable_release() the kmap page tables - they
         * can't ever be stolen, freed, etc.
         */
        for (va = map_addr, i = 0; i < htable_cnt; va += LEVEL_SIZE(1), ++i) {
                ht = htable_create(kas.a_hat, va, 0, NULL);
                if (ht == NULL)
                        panic("hat_kmap_init: ht == NULL");
                mmu.kmap_htables[i] = ht;

                hat_devload(kas.a_hat, ptes + i * MMU_PAGESIZE,
                    MMU_PAGESIZE, ht->ht_pfn,
#ifdef __xpv
                    PROT_READ | HAT_NOSYNC | HAT_UNORDERED_OK,
#else
                    PROT_READ | PROT_WRITE | HAT_NOSYNC | HAT_UNORDERED_OK,
#endif
                    HAT_LOAD | HAT_LOAD_NOCONSIST);
        }

        /*
         * set information in mmu to activate handling of kmap
         */
        mmu.kmap_addr = map_addr;
        mmu.kmap_eaddr = map_eaddr;
        mmu.kmap_ptes = (x86pte_t *)ptes;
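
        /*
         * With kmap_addr/kmap_eaddr/kmap_ptes set, the HAT can locate the
         * PTE for any kmap VA directly, e.g. at
         * mmu.kmap_ptes[mmu_btop(va - mmu.kmap_addr)].
         */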
}

extern caddr_t  kpm_vbase;
extern size_t   kpm_size;

#ifdef __xpv
/*
 * Create the initial segkpm mappings for the hypervisor. Pages that are
 * in use as page tables must be mapped read only, so rather than track
 * which ones those are we make all kpm mappings read only at first.
 */
static void
xen_kpm_create(paddr_t paddr, level_t lvl)
{
        ulong_t pg_off;

        for (pg_off = 0; pg_off < LEVEL_SIZE(lvl); pg_off += MMU_PAGESIZE) {
                kbm_map((uintptr_t)kpm_vbase + paddr, (paddr_t)0, 0, 1);
                kbm_read_only((uintptr_t)kpm_vbase + paddr + pg_off,
                    paddr + pg_off);
        }
}

/*
 * Try to make all kpm mappings writable. Failures are ok, as those
 * are just pagetable, GDT, etc. pages.
 */
static void
xen_kpm_finish_init(void)
{
        pfn_t gdtpfn = mmu_btop(CPU->cpu_m.mcpu_gdtpa);
        pfn_t pfn;
        page_t *pp;

        for (pfn = 0; pfn < mfn_count; ++pfn) {
                /*
                 * skip gdt
                 */
                if (pfn == gdtpfn)
                        continue;

                /*
                 * p_index is a hint that this is a pagetable
                 */
                pp = page_numtopp_nolock(pfn);
                if (pp && pp->p_index) {
                        pp->p_index = 0;
                        continue;
                }
                (void) xen_kpm_page(pfn, PT_VALID | PT_WRITABLE);
        }
}
#endif

/*
 * Routine to pre-allocate data structures for hat_kern_setup(). It computes
 * how many pagetables it needs by walking the boot loader's page tables.
 */
/*ARGSUSED*/
void
hat_kern_alloc(
        caddr_t segmap_base,
        size_t  segmap_size,
        caddr_t ekernelheap)
{
        uintptr_t       last_va = (uintptr_t)-1;        /* catch 1st time */
        uintptr_t       va = 0;
        size_t          size;
        pfn_t           pfn;
        uint_t          prot;
        uint_t          table_cnt = 1;
        uint_t          mapping_cnt;
        level_t         start_level;
        level_t         l;
        struct memlist  *pmem;
        level_t         lpagel = mmu.max_page_level;
        uint64_t        paddr;
        int64_t         psize;
        int             nwindows;

        if (kpm_size > 0) {
                /*
                 * Create the kpm page tables.  When running on the
                 * hypervisor these are made read/only at first.
                 * Later we'll add write permission where possible.
                 */
                for (pmem = phys_install; pmem; pmem = pmem->ml_next) {
                        paddr = pmem->ml_address;
                        psize = pmem->ml_size;
                        while (psize >= MMU_PAGESIZE) {
                                /* find the largest page size */
                                for (l = lpagel; l > 0; l--) {
                                        if ((paddr & LEVEL_OFFSET(l)) == 0 &&
                                            psize > LEVEL_SIZE(l))
                                                break;
                                }

#if defined(__xpv)
                                /*
                                 * Create read/only mappings to avoid
                                 * conflicting with pagetable usage
                                 */
                                xen_kpm_create(paddr, l);
#else
                                kbm_map((uintptr_t)kpm_vbase + paddr, paddr,
                                    l, 1);
#endif
                                paddr += LEVEL_SIZE(l);
                                psize -= LEVEL_SIZE(l);
                        }
                }
        }

        /*
         * If this machine doesn't have a kpm segment, we need to allocate
         * a small number of 'windows' which can be used to map pagetables.
         */
        nwindows = (kpm_size == 0) ? 2 * NCPU : 0;
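        /*
         * The windows are used in pairs by the pagetable access code in
         * hat_i86.c: one window maps the pagetable being modified and the
         * other serves as the source for x86pte_copy(), hence two per CPU.
         */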

#if defined(__xpv)
        /*
         * On a hypervisor, these windows are also used by the xpv_panic
         * code, where we need one window for each level of the pagetable
         * hierarchy.
         */
        nwindows = MAX(nwindows, mmu.max_level);
#endif

        if (nwindows != 0) {
                /*
                 * Create the page windows and 1 page of VA in
                 * which we map the PTEs of those windows.
                 */
                mmu.pwin_base = vmem_xalloc(heap_arena, nwindows * MMU_PAGESIZE,
                    LEVEL_SIZE(1), 0, 0, NULL, NULL, VM_SLEEP);
                ASSERT(nwindows <= MMU_PAGESIZE / mmu.pte_size);
                mmu.pwin_pte_va = vmem_xalloc(heap_arena, MMU_PAGESIZE,
                    MMU_PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);

                /*
                 * Find/Create the page table window mappings.
                 */
                paddr = 0;
                (void) find_pte((uintptr_t)mmu.pwin_base, &paddr, 0, 0);
                ASSERT(paddr != 0);
                ASSERT((paddr & MMU_PAGEOFFSET) == 0);
                mmu.pwin_pte_pa = paddr;
#ifdef __xpv
                (void) find_pte((uintptr_t)mmu.pwin_pte_va, NULL, 0, 0);
                kbm_read_only((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa);
#else
                kbm_map((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa, 0, 1);
#endif
        }

        /*
         * Walk the boot loader's page tables and figure out
         * how many tables and page mappings there will be.
         */
        while (kbm_probe(&va, &size, &pfn, &prot) != 0) {
                /*
                 * At each level, if the last_va falls into a new htable,
                 * increment table_cnt. We can stop at the 1st level where
                 * they are in the same htable.
                 */
                start_level = 0;
                while (start_level <= mmu.max_page_level) {
                        if (size == LEVEL_SIZE(start_level))
                                break;
                        start_level++;
                }

                for (l = start_level; l < mmu.max_level; ++l) {
                        if (va >> LEVEL_SHIFT(l + 1) ==
                            last_va >> LEVEL_SHIFT(l + 1))
                                break;
                        ++table_cnt;
                }
                last_va = va;
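                /*
                 * Advance va to the next chunk. Level 0 mappings step by a
                 * full level 1 size, since the level 0 pagetable covering
                 * them has already been counted; larger mappings step by
                 * their own page size.
                 */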
                l = (start_level == 0) ? 1 : start_level;
                va = (va & LEVEL_MASK(l)) + LEVEL_SIZE(l);
        }

        /*
         * Besides the boot loader mappings, we're going to fill in
         * the entire top level page table for the kernel. Make sure there's
         * enough reserve for that too.
         */
        table_cnt += mmu.top_level_count - ((kernelbase >>
            LEVEL_SHIFT(mmu.max_level)) & (mmu.top_level_count - 1));
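        /*
         * The expression above counts the top level entries from the one
         * containing kernelbase through the end of the table, reserving a
         * pagetable for each of them.
         */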

#if defined(__i386)
        /*
         * The 32 bit PAE hat allocates tables one level below the top when
         * kernelbase isn't 1 Gig aligned. We'll just be sloppy and allocate
         * a bunch more to the reserve. Any unused will be returned later.
         * Note we've already counted these mappings, just not the extra
         * pagetables.
         */
        if (mmu.pae_hat != 0 && (kernelbase & LEVEL_OFFSET(mmu.max_level)) != 0)
                table_cnt += mmu.ptes_per_table -
                    ((kernelbase & LEVEL_OFFSET(mmu.max_level)) >>
                    LEVEL_SHIFT(mmu.max_level - 1));
#endif

        /*
         * Add 1/4 more into table_cnt for extra slop.  Any unused
         * slop is freed back later by htable_adjust_reserve().
         */
        table_cnt += table_cnt >> 2;

        /*
         * We only need mapping entries (hments) for shared pages.
         * This should be far, far fewer than the total possible;
         * we'll allocate enough for 1/16 of all possible PTEs.
         */
        mapping_cnt = (table_cnt * mmu.ptes_per_table) >> 4;

        /*
         * Now create the initial htable/hment reserves
         */
        htable_initial_reserve(table_cnt);
        hment_reserve(mapping_cnt);
        x86pte_cpu_init(CPU);
}


/*
 * This routine handles the work of creating the kernel's initial mappings
 * by deciphering the mappings in the page tables created by the boot program.
 *
 * We maintain large page mappings, but only up to the level 1 pagesize.
 * Once this function starts, the boot loader may only add new mappings.
 * In particular, it cannot change the pagesize used for any existing
 * mappings or this code breaks!
 */

void
hat_kern_setup(void)
{
        /*
         * Attach htables to the existing pagetables
         */
        /* BEGIN CSTYLED */
        htable_attach(kas.a_hat, 0, mmu.max_level, NULL,
#ifdef __xpv
            mmu_btop(xen_info->pt_base - ONE_GIG));
#else
            mmu_btop(getcr3()));
#endif
        /* END CSTYLED */

#if defined(__i386) && !defined(__xpv)
        CPU->cpu_tss->tss_cr3 = dftss0->tss_cr3 = getcr3();
#endif /* __i386 */

#if defined(__xpv) && defined(__amd64)
        /*
         * Try to make the kpm mappings r/w. Failures here are OK, as
         * the page in question is probably just a pagetable.
         */
        xen_kpm_finish_init();
#endif

        /*
         * The kernel HAT is now officially open for business.
         */
        khat_running = 1;

        CPUSET_ATOMIC_ADD(kas.a_hat->hat_cpus, CPU->cpu_id);
        CPU->cpu_current_hat = kas.a_hat;
}