/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/t_lock.h>
#include <sys/memlist.h>
#include <sys/cpuvar.h>
#include <sys/vmem.h>
#include <sys/mman.h>
#include <sys/vm.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vm_machparam.h>
#include <sys/tss.h>
#include <sys/vnode.h>
#include <vm/hat.h>
#include <vm/anon.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>
#include <vm/hat_i86.h>
#include <sys/promif.h>
#include <sys/x86_archext.h>
#include <sys/systm.h>
#include <sys/archsystm.h>
#include <sys/sunddi.h>
#include <sys/ddidmareq.h>
#include <sys/controlregs.h>
#include <sys/reboot.h>
#include <sys/kdi.h>
#include <sys/bootconf.h>
#include <sys/bootsvcs.h>
#include <sys/bootinfo.h>
#include <vm/kboot_mmu.h>

#ifdef __xpv
#include <sys/hypervisor.h>
#endif

/*
 * Map a run of pgcnt physical pages starting at pfn pf into kernel
 * virtual address space with the given protection, and return the base
 * virtual address. Pages backed by a page_t are loaded with
 * hat_memload(); raw device pages are loaded with hat_devload().
 */
caddr_t
i86devmap(pfn_t pf, pgcnt_t pgcnt, uint_t prot)
{
	caddr_t addr;
	caddr_t addr1;
	page_t *pp;

	addr1 = addr = vmem_alloc(heap_arena, mmu_ptob(pgcnt), VM_SLEEP);

	for (; pgcnt != 0; addr += MMU_PAGESIZE, ++pf, --pgcnt) {
		pp = page_numtopp_nolock(pf);
		if (pp == NULL) {
			hat_devload(kas.a_hat, addr, MMU_PAGESIZE, pf,
			    prot | HAT_NOSYNC, HAT_LOAD_LOCK);
		} else {
			hat_memload(kas.a_hat, addr, pp,
			    prot | HAT_NOSYNC, HAT_LOAD_LOCK);
		}
	}

	return (addr1);
}
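
/*
 * Illustrative sketch (not part of the original source): a driver that
 * needs a kernel mapping for four pages of device memory at pfn 'pf'
 * could do
 *
 *	caddr_t va = i86devmap(pf, 4, PROT_READ | PROT_WRITE);
 *
 * Since the translations are loaded with HAT_LOAD_LOCK, they remain in
 * place until explicitly unloaded.
 */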

/*
 * This routine is like page_numtopp, but accepts only free pages, which
 * it allocates (unfrees) and returns with the exclusive lock held.
 * It is used by machdep.c/dma_init() to find contiguous free pages.
 *
 * XXX this and some others should probably be in vm_machdep.c
 */
page_t *
page_numtopp_alloc(pfn_t pfnum)
{
	page_t *pp;

retry:
	pp = page_numtopp_nolock(pfnum);
	if (pp == NULL) {
		return (NULL);
	}

	if (!page_trylock(pp, SE_EXCL)) {
		return (NULL);
	}

	/*
	 * The pfn/page_t binding may have changed while we waited for
	 * the lock; verify it and retry if necessary.
	 */
	if (page_pptonum(pp) != pfnum) {
		page_unlock(pp);
		goto retry;
	}

	if (!PP_ISFREE(pp)) {
		page_unlock(pp);
		return (NULL);
	}

	/* part of a large free page; demote it to single pages first */
	if (pp->p_szc) {
		page_demote_free_pages(pp);
		page_unlock(pp);
		goto retry;
	}

	/* If associated with a vnode, destroy mappings */
	if (pp->p_vnode) {
		page_destroy_free(pp);

		if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_NO_RECLAIM)) {
			return (NULL);
		}

		if (page_pptonum(pp) != pfnum) {
			page_unlock(pp);
			goto retry;
		}
	}

	if (!PP_ISFREE(pp)) {
		page_unlock(pp);
		return (NULL);
	}

	if (!page_reclaim(pp, (kmutex_t *)NULL))
		return (NULL);

	return (pp);
}

/*
 * Flag is not set early in boot. Once it is set we are no longer
 * using boot's page tables.
 */
uint_t khat_running = 0;

/*
 * This procedure is callable only while the boot loader is in charge of the
 * MMU. It assumes that PA == VA for page table pointers. It doesn't live in
 * kboot_mmu.c since it's used from common code.
 */
pfn_t
va_to_pfn(void *vaddr)
{
	uintptr_t des_va = ALIGN2PAGE(vaddr);
	uintptr_t va = des_va;
	size_t len;
	uint_t prot;
	pfn_t pfn;

	if (khat_running)
		panic("va_to_pfn(): called too late\n");

	if (kbm_probe(&va, &len, &pfn, &prot) == 0)
		return (PFN_INVALID);

	/*
	 * kbm_probe() returns the next mapping at or above *va; if it
	 * starts above the desired address, that address isn't mapped.
	 * If it starts below (a large page), adjust the pfn accordingly.
	 */
	if (va > des_va)
		return (PFN_INVALID);
	if (va < des_va)
		pfn += mmu_btop(des_va - va);
	return (pfn);
}
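
/*
 * A hypothetical sketch (not in the original file) of how va_to_pfn()
 * is used during early boot, before khat_running is set, to turn a
 * boot-mapped virtual address 'bufp' into a physical address 'paddr'
 * (both names are illustrative):
 *
 *	pfn_t pfn = va_to_pfn(bufp);
 *	if (pfn == PFN_INVALID)
 *		panic("no boot mapping for buffer");
 *	paddr = mmu_ptob(pfn) + ((uintptr_t)bufp & MMU_PAGEOFFSET);
 *
 * Note that va_to_pfn() returns a page frame number; the caller must
 * add back the offset within the page.
 */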

/*
 * Initialize a special area in the kernel that always holds some PTEs for
 * faster performance. This always holds segmap's PTEs.
 * In the 32 bit kernel this maps the kernel heap too.
 */
void
hat_kmap_init(uintptr_t base, size_t len)
{
	uintptr_t map_addr;	/* base rounded down to large page size */
	uintptr_t map_eaddr;	/* base + len rounded up */
	size_t map_len;
	caddr_t ptes;		/* mapping area in kernel for kmap ptes */
	size_t window_size;	/* size of mapping area for ptes */
	ulong_t htable_cnt;	/* # of page tables to cover map_len */
	ulong_t i;
	htable_t *ht;
	uintptr_t va;

	/*
	 * We have to map in an area that matches an entire page table.
	 * The PTEs are large page aligned to avoid spurious pagefaults
	 * on the hypervisor.
	 */
	map_addr = base & LEVEL_MASK(1);
	map_eaddr = (base + len + LEVEL_SIZE(1) - 1) & LEVEL_MASK(1);
	map_len = map_eaddr - map_addr;
	window_size = mmu_btop(map_len) * mmu.pte_size;
	window_size = (window_size + LEVEL_SIZE(1)) & LEVEL_MASK(1);
	htable_cnt = map_len >> LEVEL_SHIFT(1);

	/*
	 * allocate vmem for the kmap_ptes
	 */
	ptes = vmem_xalloc(heap_arena, window_size, LEVEL_SIZE(1), 0,
	    0, NULL, NULL, VM_SLEEP);
	mmu.kmap_htables =
	    kmem_alloc(htable_cnt * sizeof (htable_t *), KM_SLEEP);

	/*
	 * Map the page tables that cover kmap into the allocated range.
	 * Note we don't ever htable_release() the kmap page tables - they
	 * can't ever be stolen, freed, etc.
	 */
	for (va = map_addr, i = 0; i < htable_cnt; va += LEVEL_SIZE(1), ++i) {
		ht = htable_create(kas.a_hat, va, 0, NULL);
		if (ht == NULL)
			panic("hat_kmap_init: ht == NULL");
		mmu.kmap_htables[i] = ht;

		hat_devload(kas.a_hat, ptes + i * MMU_PAGESIZE,
		    MMU_PAGESIZE, ht->ht_pfn,
#ifdef __xpv
		    PROT_READ | HAT_NOSYNC | HAT_UNORDERED_OK,
#else
		    PROT_READ | PROT_WRITE | HAT_NOSYNC | HAT_UNORDERED_OK,
#endif
		    HAT_LOAD | HAT_LOAD_NOCONSIST);
	}

	/*
	 * set information in mmu to activate handling of kmap
	 */
	mmu.kmap_addr = map_addr;
	mmu.kmap_eaddr = map_eaddr;
	mmu.kmap_ptes = (x86pte_t *)ptes;
}

extern caddr_t kpm_vbase;
extern size_t kpm_size;

#ifdef __xpv
/*
 * Create the initial segkpm mappings for the hypervisor. To avoid having
 * to deal with page tables being read only, we make all mappings
 * read only at first.
 */
static void
xen_kpm_create(paddr_t paddr, level_t lvl)
{
	ulong_t pg_off;

	for (pg_off = 0; pg_off < LEVEL_SIZE(lvl); pg_off += MMU_PAGESIZE) {
		kbm_map((uintptr_t)kpm_vbase + paddr, (paddr_t)0, 0, 1);
		kbm_read_only((uintptr_t)kpm_vbase + paddr + pg_off,
		    paddr + pg_off);
	}
}

/*
 * Try to make all kpm mappings writable. Failures are ok, as those
 * are just pagetable, GDT, etc. pages.
 */
static void
xen_kpm_finish_init(void)
{
	pfn_t gdtpfn = mmu_btop(CPU->cpu_m.mcpu_gdtpa);
	pfn_t pfn;
	page_t *pp;

	for (pfn = 0; pfn < mfn_count; ++pfn) {
		/*
		 * skip gdt
		 */
		if (pfn == gdtpfn)
			continue;

		/*
		 * p_index is a hint that this is a pagetable
		 */
		pp = page_numtopp_nolock(pfn);
		if (pp && pp->p_index) {
			pp->p_index = 0;
			continue;
		}
		(void) xen_kpm_page(pfn, PT_VALID | PT_WRITABLE);
	}
}
#endif
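
/*
 * An illustrative walk-through (not in the original source) of the
 * largest-page selection in hat_kern_alloc() below: on a 64-bit kernel
 * where mmu.max_page_level permits 2MB level 1 pages, a phys_install
 * chunk of 5MB starting at paddr 0x200000 is mapped as two 2MB pages
 * followed by 256 4K pages, because the inner loop only picks level l
 * when paddr is aligned to LEVEL_SIZE(l) and more than LEVEL_SIZE(l)
 * bytes remain.
 */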

/*
 * Routine to pre-allocate data structures for hat_kern_setup(). It computes
 * how many pagetables it needs by walking the boot loader's page tables.
 */
/*ARGSUSED*/
void
hat_kern_alloc(
	caddr_t	segmap_base,
	size_t	segmap_size,
	caddr_t	ekernelheap)
{
	uintptr_t	last_va = (uintptr_t)-1;	/* catch 1st time */
	uintptr_t	va = 0;
	size_t		size;
	pfn_t		pfn;
	uint_t		prot;
	uint_t		table_cnt = 1;
	uint_t		mapping_cnt;
	level_t		start_level;
	level_t		l;
	struct memlist	*pmem;
	level_t		lpagel = mmu.max_page_level;
	uint64_t	paddr;
	int64_t		psize;
	int		nwindows;

	if (kpm_size > 0) {
		/*
		 * Create the kpm page tables. When running on the
		 * hypervisor these are made read-only at first.
		 * Later we'll add write permission where possible.
		 */
		for (pmem = phys_install; pmem; pmem = pmem->ml_next) {
			paddr = pmem->ml_address;
			psize = pmem->ml_size;
			while (psize >= MMU_PAGESIZE) {
				/* find the largest page size */
				for (l = lpagel; l > 0; l--) {
					if ((paddr & LEVEL_OFFSET(l)) == 0 &&
					    psize > LEVEL_SIZE(l))
						break;
				}

#if defined(__xpv)
				/*
				 * Create read-only mappings to avoid
				 * conflicting with pagetable usage
				 */
				xen_kpm_create(paddr, l);
#else
				kbm_map((uintptr_t)kpm_vbase + paddr, paddr,
				    l, 1);
#endif
				paddr += LEVEL_SIZE(l);
				psize -= LEVEL_SIZE(l);
			}
		}
	}

	/*
	 * If this machine doesn't have a kpm segment, we need to allocate
	 * a small number of 'windows' which can be used to map pagetables.
	 */
	nwindows = (kpm_size == 0) ? 2 * NCPU : 0;

#if defined(__xpv)
	/*
	 * On a hypervisor, these windows are also used by the xpv_panic
	 * code, where we need one window for each level of the pagetable
	 * hierarchy.
	 */
	nwindows = MAX(nwindows, mmu.max_level);
#endif

	if (nwindows != 0) {
		/*
		 * Create the page windows and 1 page of VA in
		 * which we map the PTEs of those windows.
		 */
		mmu.pwin_base = vmem_xalloc(heap_arena,
		    nwindows * MMU_PAGESIZE, LEVEL_SIZE(1), 0, 0,
		    NULL, NULL, VM_SLEEP);
		ASSERT(nwindows <= MMU_PAGESIZE / mmu.pte_size);
		mmu.pwin_pte_va = vmem_xalloc(heap_arena, MMU_PAGESIZE,
		    MMU_PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);

		/*
		 * Find/Create the page table window mappings.
		 */
		paddr = 0;
		(void) find_pte((uintptr_t)mmu.pwin_base, &paddr, 0, 0);
		ASSERT(paddr != 0);
		ASSERT((paddr & MMU_PAGEOFFSET) == 0);
		mmu.pwin_pte_pa = paddr;
#ifdef __xpv
		(void) find_pte((uintptr_t)mmu.pwin_pte_va, NULL, 0, 0);
		kbm_read_only((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa);
#else
		kbm_map((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa, 0, 1);
#endif
	}

	/*
	 * Walk the boot loader's page tables and figure out
	 * how many tables and page mappings there will be.
	 */
	while (kbm_probe(&va, &size, &pfn, &prot) != 0) {
		/*
		 * At each level, if the last_va falls into a new htable,
		 * increment table_cnt. We can stop at the 1st level where
		 * they are in the same htable.
		 */
		start_level = 0;
		while (start_level <= mmu.max_page_level) {
			if (size == LEVEL_SIZE(start_level))
				break;
			start_level++;
		}

		for (l = start_level; l < mmu.max_level; ++l) {
			if (va >> LEVEL_SHIFT(l + 1) ==
			    last_va >> LEVEL_SHIFT(l + 1))
				break;
			++table_cnt;
		}
		last_va = va;
		l = (start_level == 0) ? 1 : start_level;
		va = (va & LEVEL_MASK(l)) + LEVEL_SIZE(l);
	}

	/*
	 * Besides the boot loader mappings, we're going to fill in
	 * the entire top level page table for the kernel. Make sure there's
	 * enough reserve for that too.
	 */
	table_cnt += mmu.top_level_count - ((kernelbase >>
	    LEVEL_SHIFT(mmu.max_level)) & (mmu.top_level_count - 1));

#if defined(__i386)
	/*
	 * The 32 bit PAE hat allocates tables one level below the top when
	 * kernelbase isn't 1 Gig aligned. We'll just be sloppy and allocate
	 * a bunch more to the reserve. Any unused will be returned later.
	 * Note we've already counted these mappings, just not the extra
	 * pagetables.
	 */
	if (mmu.pae_hat != 0 && (kernelbase & LEVEL_OFFSET(mmu.max_level)) != 0)
		table_cnt += mmu.ptes_per_table -
		    ((kernelbase & LEVEL_OFFSET(mmu.max_level)) >>
		    LEVEL_SHIFT(mmu.max_level - 1));
#endif

	/*
	 * Add 1/4 more into table_cnt for extra slop. The unused
	 * slop is freed back when we htable_adjust_reserve() later.
	 */
	table_cnt += table_cnt >> 2;

	/*
	 * We only need mapping entries (hments) for shared pages.
	 * This should be far, far fewer than the total possible;
	 * we'll allocate enough for 1/16 of all possible PTEs.
	 */
	mapping_cnt = (table_cnt * mmu.ptes_per_table) >> 4;

	/*
	 * Now create the initial htable/hment reserves
	 */
	htable_initial_reserve(table_cnt);
	hment_reserve(mapping_cnt);
	x86pte_cpu_init(CPU);
}
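
/*
 * A rough sizing sketch for the reserves above (hypothetical numbers,
 * not from the original source): if walking the boot tables yields
 * table_cnt = 40, the quarter slop raises it to 50; with 512
 * ptes_per_table on a 64-bit kernel, mapping_cnt = (50 * 512) >> 4 =
 * 1600 hments. Whatever goes unused is handed back later through
 * htable_adjust_reserve().
 */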

/*
 * This routine handles the work of creating the kernel's initial mappings
 * by deciphering the mappings in the page tables created by the boot program.
 *
 * We maintain large page mappings, but only to a level 1 pagesize.
 * The boot loader can only add new mappings once this function starts.
 * In particular it cannot change the pagesize used for any existing
 * mappings or this code breaks!
 */
void
hat_kern_setup(void)
{
	/*
	 * Attach htables to the existing pagetables
	 */
	/* BEGIN CSTYLED */
	htable_attach(kas.a_hat, 0, mmu.max_level, NULL,
#ifdef __xpv
	    mmu_btop(xen_info->pt_base - ONE_GIG));
#else
	    mmu_btop(getcr3()));
#endif
	/* END CSTYLED */

#if defined(__i386) && !defined(__xpv)
	CPU->cpu_tss->tss_cr3 = dftss0->tss_cr3 = getcr3();
#endif /* __i386 */

#if defined(__xpv) && defined(__amd64)
	/*
	 * Try to make the kpm mappings r/w. Failures here are OK, as
	 * it's probably just a pagetable.
	 */
	xen_kpm_finish_init();
#endif

	/*
	 * The kernel HAT is now officially open for business.
	 */
	khat_running = 1;

	CPUSET_ATOMIC_ADD(kas.a_hat->hat_cpus, CPU->cpu_id);
	CPU->cpu_current_hat = kas.a_hat;
}