8956 Implement KPTI
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.


  25  */
  26 
  27 #include <sys/types.h>
  28 #include <sys/systm.h>
  29 #include <sys/archsystm.h>
  30 #include <sys/debug.h>
  31 #include <sys/bootconf.h>
  32 #include <sys/bootsvcs.h>
  33 #include <sys/bootinfo.h>
  34 #include <sys/mman.h>
  35 #include <sys/cmn_err.h>
  36 #include <sys/param.h>
  37 #include <sys/machparam.h>
  38 #include <sys/machsystm.h>
  39 #include <sys/promif.h>
  40 #include <sys/kobj.h>
  41 #ifdef __xpv
  42 #include <sys/hypervisor.h>
  43 #endif
  44 #include <vm/kboot_mmu.h>
  45 #include <vm/hat_pte.h>
  46 #include <vm/hat_i86.h>
  47 #include <vm/seg_kmem.h>
  48 
  49 #if 0
  50 /*
  51  * Joe's debug printing
  52  */
  53 #define DBG(x)    \
  54         bop_printf(NULL, "kboot_mmu.c: %s is %" PRIx64 "\n", #x, (uint64_t)(x));
  55 #else
  56 #define DBG(x)  /* naught */
  57 #endif
  58 
/*
 * Page table and memory stuff.
 *
 * Both values are copied from the xboot_info in kbm_init().  "window" is
 * the virtual address handed back by kbm_remap_window(); "pte_to_window"
 * is the address of the PTE that kbm_remap_window() rewrites in order to
 * point that window at a different physical page.
 */
static caddr_t window;
static caddr_t pte_to_window;

/*
 * These are needed by mmu_init(); all of them are filled in by kbm_init().
 */
int kbm_nx_support = 0;         /* NX bit in PTEs is in use */
int kbm_pae_support = 0;        /* PAE is 64 bit Page table entries */
int kbm_pge_support = 0;        /* PGE is Page table global bit enabled */
int kbm_largepage_support = 0;
uint_t kbm_nucleus_size = 0;

/*
 * Per-level geometry of the boot page tables, all derived from the
 * shift_amt[] table selected in kbm_init(): the address shift for level l,
 * the number of bytes one entry at that level maps, the mask of the offset
 * bits within such a mapping, and the mask of its base address bits.
 */
#define BOOT_SHIFT(l)   (shift_amt[l])
#define BOOT_SZ(l)      ((size_t)1 << BOOT_SHIFT(l))
#define BOOT_OFFSET(l)  (BOOT_SZ(l) - 1)
#define BOOT_MASK(l)    (~BOOT_OFFSET(l))
  78 
  79 /*
  80  * Initialize memory management parameters for boot time page table management
  81  */
  82 void
  83 kbm_init(struct xboot_info *bi)
  84 {
  85         /*
  86          * configure mmu information
  87          */
  88         kbm_nucleus_size = (uintptr_t)bi->bi_kseg_size;
  89         kbm_largepage_support = bi->bi_use_largepage;
  90         kbm_nx_support = bi->bi_use_nx;
  91         kbm_pae_support = bi->bi_use_pae;
  92         kbm_pge_support = bi->bi_use_pge;
  93         window = bi->bi_pt_window;
  94         DBG(window);
  95         pte_to_window = bi->bi_pte_to_pt_window;
  96         DBG(pte_to_window);
  97         if (kbm_pae_support) {
  98                 shift_amt = shift_amt_pae;
  99                 ptes_per_table = 512;
 100                 pte_size = 8;
 101                 lpagesize = TWO_MEG;
 102 #ifdef __amd64
 103                 top_level = 3;
 104 #else
 105                 top_level = 2;
 106 #endif
 107         } else {
 108                 shift_amt = shift_amt_nopae;
 109                 ptes_per_table = 1024;
 110                 pte_size = 4;
 111                 lpagesize = FOUR_MEG;
 112                 top_level = 1;
 113         }
 114 
 115 #ifdef __xpv
 116         xen_info = bi->bi_xen_start_info;
 117         mfn_list = (mfn_t *)xen_info->mfn_list;
 118         DBG(mfn_list);
 119         mfn_count = xen_info->nr_pages;
 120         DBG(mfn_count);
 121 #endif
 122         top_page_table = bi->bi_top_page_table;
 123         DBG(top_page_table);
 124 }
 125 
 126 /*
 127  * Change the addressible page table window to point at a given page
 128  */
 129 /*ARGSUSED*/
 130 void *
 131 kbm_remap_window(paddr_t physaddr, int writeable)
 132 {
 133         x86pte_t pt_bits = PT_NOCONSIST | PT_VALID | PT_WRITABLE;
 134 
 135         DBG(physaddr);
 136 
 137 #ifdef __xpv
 138         if (!writeable)
 139                 pt_bits &= ~PT_WRITABLE;
 140         if (HYPERVISOR_update_va_mapping((uintptr_t)window,
 141             pa_to_ma(physaddr) | pt_bits, UVMF_INVLPG | UVMF_LOCAL) < 0)
 142                 bop_panic("HYPERVISOR_update_va_mapping() failed");
 143 #else
 144         if (kbm_pae_support)
 145                 *((x86pte_t *)pte_to_window) = physaddr | pt_bits;
 146         else
 147                 *((x86pte32_t *)pte_to_window) = physaddr | pt_bits;
 148         mmu_tlbflush_entry(window);
 149 #endif
 150         DBG(window);
 151         return (window);
 152 }
 153 
 154 /*
 155  * Add a mapping for the physical page at the given virtual address.
 156  */
 157 void
 158 kbm_map(uintptr_t va, paddr_t pa, uint_t level, uint_t is_kernel)
 159 {
 160         x86pte_t *ptep;
 161         paddr_t pte_physaddr;
 162         x86pte_t pteval;
 163 
 164         if (khat_running)
 165                 panic("kbm_map() called too late");
 166 
 167         pteval = pa_to_ma(pa) | PT_NOCONSIST | PT_VALID | PT_WRITABLE;
 168         if (level >= 1)
 169                 pteval |= PT_PAGESIZE;
 170         if (kbm_pge_support && is_kernel)
 171                 pteval |= PT_GLOBAL;
 172 
 173 #ifdef __xpv
 174         /*
 175          * try update_va_mapping first - fails if page table is missing.
 176          */
 177         if (HYPERVISOR_update_va_mapping(va, pteval,
 178             UVMF_INVLPG | UVMF_LOCAL) == 0)
 179                 return;
 180 #endif
 181 
 182         /*
 183          * Find the pte that will map this address. This creates any
 184          * missing intermediate level page tables.
 185          */
 186         ptep = find_pte(va, &pte_physaddr, level, 0);
 187         if (ptep == NULL)
 188                 bop_panic("kbm_map: find_pte returned NULL");
 189 
 190 #ifdef __xpv
 191         if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
 192                 bop_panic("HYPERVISOR_update_va_mapping() failed");
 193 #else
 194         if (kbm_pae_support)
 195                 *ptep = pteval;
 196         else
 197                 *((x86pte32_t *)ptep) = pteval;
 198         mmu_tlbflush_entry((caddr_t)va);
 199 #endif
 200 }
 201 
 202 #ifdef __xpv
 203 
 204 /*
 205  * Add a mapping for the machine page at the given virtual address.
 206  */
 207 void
 208 kbm_map_ma(maddr_t ma, uintptr_t va, uint_t level)
 209 {
 210         paddr_t pte_physaddr;
 211         x86pte_t pteval;
 212 
 213         pteval = ma | PT_NOCONSIST | PT_VALID | PT_REF | PT_WRITABLE;
 214         if (level == 1)
 215                 pteval |= PT_PAGESIZE;
 216 
 217         /*
 218          * try update_va_mapping first - fails if page table is missing.
 219          */
 220         if (HYPERVISOR_update_va_mapping(va,
 221             pteval, UVMF_INVLPG | UVMF_LOCAL) == 0)
 222                 return;
 223 
 224         /*
 225          * Find the pte that will map this address. This creates any
 226          * missing intermediate level page tables
 227          */
 228         (void) find_pte(va, &pte_physaddr, level, 0);
 229 
 230         if (HYPERVISOR_update_va_mapping(va,
 231             pteval, UVMF_INVLPG | UVMF_LOCAL) != 0)
 232                 bop_panic("HYPERVISOR_update_va_mapping failed");
 233 }
 234 
 235 #endif /* __xpv */
 236 
 237 
 238 /*
 239  * Probe the boot time page tables to find the first mapping
 240  * including va (or higher) and return non-zero if one is found.
 241  * va is updated to the starting address and len to the pagesize.
 242  * pp will be set to point to the 1st page_t of the mapped page(s).
 243  *
 244  * Note that if va is in the middle of a large page, the returned va
 245  * will be less than what was asked for.
 246  */
 247 int
 248 kbm_probe(uintptr_t *va, size_t *len, pfn_t *pfn, uint_t *prot)
 249 {
 250         uintptr_t       probe_va;
 251         x86pte_t        *ptep;
 252         paddr_t         pte_physaddr;
 253         x86pte_t        pte_val;
 254         level_t         l;
 255 
 256         if (khat_running)
 257                 panic("kbm_probe() called too late");
 258         *len = 0;
 259         *pfn = PFN_INVALID;
 260         *prot = 0;
 261         probe_va = *va;
 262 restart_new_va:
 263         l = top_level;
 264         for (;;) {
 265                 if (IN_VA_HOLE(probe_va))
 266                         probe_va = mmu.hole_end;
 267 
 268                 if (IN_HYPERVISOR_VA(probe_va))
 269 #if defined(__amd64) && defined(__xpv)
 270                         probe_va = HYPERVISOR_VIRT_END;
 271 #else
 272                         return (0);
 273 #endif
 274 
 275                 /*
 276                  * If we don't have a valid PTP/PTE at this level
 277                  * then we can bump VA by this level's pagesize and try again.
 278                  * When the probe_va wraps around, we are done.
 279                  */
 280                 ptep = find_pte(probe_va, &pte_physaddr, l, 1);
 281                 if (ptep == NULL)
 282                         bop_panic("kbm_probe: find_pte returned NULL");
 283                 if (kbm_pae_support)
 284                         pte_val = *ptep;
 285                 else
 286                         pte_val = *((x86pte32_t *)ptep);
 287                 if (!PTE_ISVALID(pte_val)) {
 288                         probe_va = (probe_va & BOOT_MASK(l)) + BOOT_SZ(l);
 289                         if (probe_va <= *va)
 290                                 return (0);
 291                         goto restart_new_va;
 292                 }
 293 
 294                 /*
 295                  * If this entry is a pointer to a lower level page table
 296                  * go down to it.
 297                  */
 298                 if (!PTE_ISPAGE(pte_val, l)) {
 299                         ASSERT(l > 0);
 300                         --l;
 301                         continue;
 302                 }
 303 
 304                 /*
 305                  * We found a boot level page table entry
 306                  */
 307                 *len = BOOT_SZ(l);
 308                 *va = probe_va & ~(*len - 1);
 309                 *pfn = PTE2PFN(pte_val, l);
 310 
 311 
 312                 *prot = PROT_READ | PROT_EXEC;
 313                 if (PTE_GET(pte_val, PT_WRITABLE))
 314                         *prot |= PROT_WRITE;
 315 
 316                 /*
 317                  * pt_nx is cleared if processor doesn't support NX bit
 318                  */
 319                 if (PTE_GET(pte_val, mmu.pt_nx))
 320                         *prot &= ~PROT_EXEC;
 321 
 322                 return (1);
 323         }
 324 }
 325 
 326 
 327 /*
 328  * Destroy a boot loader page table 4K mapping.
 329  */
 330 void
 331 kbm_unmap(uintptr_t va)
 332 {
 333         if (khat_running)
 334                 panic("kbm_unmap() called too late");
 335         else {
 336 #ifdef __xpv
 337                 (void) HYPERVISOR_update_va_mapping(va, 0,
 338                     UVMF_INVLPG | UVMF_LOCAL);
 339 #else
 340                 x86pte_t *ptep;
 341                 level_t level = 0;
 342                 uint_t  probe_only = 1;
 343 
 344                 ptep = find_pte(va, NULL, level, probe_only);
 345                 if (ptep == NULL)
 346                         return;
 347 
 348                 if (kbm_pae_support)
 349                         *ptep = 0;
 350                 else
 351                         *((x86pte32_t *)ptep) = 0;
 352                 mmu_tlbflush_entry((caddr_t)va);
 353 #endif
 354         }
 355 }
 356 
 357 
 358 /*
 359  * Change a boot loader page table 4K mapping.
 360  * Returns the pfn of the old mapping.
 361  */
 362 pfn_t
 363 kbm_remap(uintptr_t va, pfn_t pfn)
 364 {
 365         x86pte_t *ptep;
 366         level_t level = 0;
 367         uint_t  probe_only = 1;
 368         x86pte_t pte_val = pa_to_ma(pfn_to_pa(pfn)) | PT_WRITABLE |
 369             PT_NOCONSIST | PT_VALID;
 370         x86pte_t old_pte;
 371 
 372         if (khat_running)
 373                 panic("kbm_remap() called too late");
 374         ptep = find_pte(va, NULL, level, probe_only);
 375         if (ptep == NULL)
 376                 bop_panic("kbm_remap: find_pte returned NULL");
 377 
 378         if (kbm_pae_support)
 379                 old_pte = *ptep;
 380         else
 381                 old_pte = *((x86pte32_t *)ptep);
 382 
 383 #ifdef __xpv
 384         if (HYPERVISOR_update_va_mapping(va, pte_val, UVMF_INVLPG | UVMF_LOCAL))
 385                 bop_panic("HYPERVISOR_update_va_mapping() failed");
 386 #else
 387         if (kbm_pae_support)
 388                 *((x86pte_t *)ptep) = pte_val;
 389         else
 390                 *((x86pte32_t *)ptep) = pte_val;
 391         mmu_tlbflush_entry((caddr_t)va);
 392 #endif
 393 
 394         if (!(old_pte & PT_VALID) || ma_to_pa(old_pte) == -1)
 395                 return (PFN_INVALID);
 396         return (mmu_btop(ma_to_pa(old_pte)));
 397 }
 398 
 399 
 400 /*
 401  * Change a boot loader page table 4K mapping to read only.
 402  */
 403 void
 404 kbm_read_only(uintptr_t va, paddr_t pa)
 405 {
 406         x86pte_t pte_val = pa_to_ma(pa) |
 407             PT_NOCONSIST | PT_REF | PT_MOD | PT_VALID;
 408 
 409 #ifdef __xpv
 410         if (HYPERVISOR_update_va_mapping(va, pte_val, UVMF_INVLPG | UVMF_LOCAL))
 411                 bop_panic("HYPERVISOR_update_va_mapping() failed");
 412 #else
 413         x86pte_t *ptep;
 414         level_t level = 0;
 415 
 416         ptep = find_pte(va, NULL, level, 0);
 417         if (ptep == NULL)
 418                 bop_panic("kbm_read_only: find_pte returned NULL");
 419 
 420         if (kbm_pae_support)
 421                 *ptep = pte_val;
 422         else
 423                 *((x86pte32_t *)ptep) = pte_val;
 424         mmu_tlbflush_entry((caddr_t)va);
 425 #endif
 426 }
 427 
 428 /*
 429  * interfaces for kernel debugger to access physical memory
 430  */
 431 static x86pte_t save_pte;
 432 
 433 void *
 434 kbm_push(paddr_t pa)
 435 {
 436         static int first_time = 1;
 437 
 438         if (first_time) {
 439                 first_time = 0;
 440                 return (window);
 441         }
 442 
 443         if (kbm_pae_support)
 444                 save_pte = *((x86pte_t *)pte_to_window);
 445         else
 446                 save_pte = *((x86pte32_t *)pte_to_window);
 447         return (kbm_remap_window(pa, 0));
 448 }
 449 
 450 void
 451 kbm_pop(void)
 452 {
 453 #ifdef __xpv
 454         if (HYPERVISOR_update_va_mapping((uintptr_t)window, save_pte,
 455             UVMF_INVLPG | UVMF_LOCAL) < 0)
 456                 bop_panic("HYPERVISOR_update_va_mapping() failed");
 457 #else
 458         if (kbm_pae_support)
 459                 *((x86pte_t *)pte_to_window) = save_pte;
 460         else
 461                 *((x86pte32_t *)pte_to_window) = save_pte;
 462         mmu_tlbflush_entry(window);
 463 #endif
 464 }
 465 
 466 x86pte_t
 467 get_pteval(paddr_t table, uint_t index)
 468 {
 469         void *table_ptr = kbm_remap_window(table, 0);
 470 
 471         if (kbm_pae_support)
 472                 return (((x86pte_t *)table_ptr)[index]);
 473         return (((x86pte32_t *)table_ptr)[index]);
 474 }
 475 
 476 #ifndef __xpv
 477 void
 478 set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
 479 {
 480         void *table_ptr = kbm_remap_window(table, 0);
 481         if (kbm_pae_support)
 482                 ((x86pte_t *)table_ptr)[index] = pteval;
 483         else
 484                 ((x86pte32_t *)table_ptr)[index] = pteval;
 485         if (level == top_level && level == 2)
 486                 reload_cr3();
 487 }
 488 #endif
 489 
 490 paddr_t
 491 make_ptable(x86pte_t *pteval, uint_t level)
 492 {
 493         paddr_t new_table;
 494         void *table_ptr;
 495 
 496         new_table = do_bop_phys_alloc(MMU_PAGESIZE, MMU_PAGESIZE);
 497         table_ptr = kbm_remap_window(new_table, 1);
 498         bzero(table_ptr, MMU_PAGESIZE);
 499 #ifdef __xpv
 500         /* Remove write permission to the new page table.  */
 501         (void) kbm_remap_window(new_table, 0);
 502 #endif
 503 
 504         if (level == top_level && level == 2)
 505                 *pteval = pa_to_ma(new_table) | PT_VALID;
 506         else
 507                 *pteval = pa_to_ma(new_table) |
 508                     PT_VALID | PT_REF | PT_USER | PT_WRITABLE;
 509 
 510         return (new_table);
 511 }
 512 
 513 x86pte_t *
 514 map_pte(paddr_t table, uint_t index)
 515 {
 516         void *table_ptr = kbm_remap_window(table, 0);
 517         return ((x86pte_t *)((caddr_t)table_ptr + index * pte_size));
 518 }
--- EOF ---