8956 Implement KPTI
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright 2018 Joyent, Inc.
  27  */
  28 
  29 #include <sys/types.h>
  30 #include <sys/systm.h>
  31 #include <sys/archsystm.h>
  32 #include <sys/debug.h>
  33 #include <sys/bootconf.h>
  34 #include <sys/bootsvcs.h>
  35 #include <sys/bootinfo.h>
  36 #include <sys/mman.h>
  37 #include <sys/cmn_err.h>
  38 #include <sys/param.h>
  39 #include <sys/machparam.h>
  40 #include <sys/machsystm.h>
  41 #include <sys/promif.h>
  42 #include <sys/kobj.h>
  43 #ifdef __xpv
  44 #include <sys/hypervisor.h>
  45 #endif
  46 #include <vm/kboot_mmu.h>
  47 #include <vm/hat_pte.h>
  48 #include <vm/hat_i86.h>
  49 #include <vm/seg_kmem.h>
  50 
  51 #if 0
  52 /*
  53  * Joe's debug printing
  54  */
  55 #define DBG(x)    \
  56         bop_printf(NULL, "kboot_mmu.c: %s is %" PRIx64 "\n", #x, (uint64_t)(x));
  57 #else
  58 #define DBG(x)  /* naught */
  59 #endif
  60 
  61 /*
  62  * Page table window state; both values are copied from xboot_info in kbm_init().
  63  */
  64 static caddr_t window;             /* VA that kbm_remap_window() re-points */
  65 static caddr_t pte_to_window;      /* address of the PTE that maps window */
  66 
  67 /*
  68  * These are needed by mmu_init(); all are filled in by kbm_init().
  69  */
  70 int kbm_nx_support = 0;         /* NX bit in PTEs is in use */
  71 int kbm_pae_support = 0;        /* PAE is 64 bit Page table entries */
  72 int kbm_pge_support = 0;        /* PGE is Page table global bit enabled */
  73 int kbm_largepage_support = 0;  /* copied from bi_use_largepage */
  74 uint_t kbm_nucleus_size = 0;    /* copied from bi_kseg_size */
  75 
  76 #define BOOT_SHIFT(l)   (shift_amt[l])  /* shift count for page table level l */
  77 #define BOOT_SZ(l)      ((size_t)1 << BOOT_SHIFT(l))  /* bytes covered by one level l entry */
  78 #define BOOT_OFFSET(l)  (BOOT_SZ(l) - 1)  /* mask of the offset bits below BOOT_SZ(l) */
  79 #define BOOT_MASK(l)    (~BOOT_OFFSET(l))  /* mask that rounds an address down to BOOT_SZ(l) */
  80 
  81 /*
  82  * Initialize memory management parameters for boot time page table management
  83  */
  84 void
  85 kbm_init(struct xboot_info *bi)
  86 {
  87         /*
  88          * configure mmu information
  89          */
  90         kbm_nucleus_size = (uintptr_t)bi->bi_kseg_size;
  91         kbm_largepage_support = bi->bi_use_largepage;
  92         kbm_nx_support = bi->bi_use_nx;
  93         kbm_pae_support = bi->bi_use_pae;
  94         kbm_pge_support = bi->bi_use_pge;
  95         window = bi->bi_pt_window;
  96         DBG(window);
  97         pte_to_window = bi->bi_pte_to_pt_window;
  98         DBG(pte_to_window);
  99         if (kbm_pae_support) {
 100                 shift_amt = shift_amt_pae;
 101                 ptes_per_table = 512;
 102                 pte_size = 8;
 103                 lpagesize = TWO_MEG;
 104 #ifdef __amd64
 105                 top_level = 3;
 106 #else
 107                 top_level = 2;
 108 #endif
 109         } else {
 110                 shift_amt = shift_amt_nopae;
 111                 ptes_per_table = 1024;
 112                 pte_size = 4;
 113                 lpagesize = FOUR_MEG;
 114                 top_level = 1;
 115         }
 116 
 117 #ifdef __xpv
 118         xen_info = bi->bi_xen_start_info;
 119         mfn_list = (mfn_t *)xen_info->mfn_list;
 120         DBG(mfn_list);
 121         mfn_count = xen_info->nr_pages;
 122         DBG(mfn_count);
 123 #endif
 124         top_page_table = bi->bi_top_page_table;
 125         DBG(top_page_table);
 126 }
 127 
 128 /*
 129  * Change the addressible page table window to point at a given page
 130  */
 131 /*ARGSUSED*/
 132 void *
 133 kbm_remap_window(paddr_t physaddr, int writeable)
 134 {
 135         x86pte_t pt_bits = PT_NOCONSIST | PT_VALID | PT_WRITABLE;
 136 
 137         DBG(physaddr);
 138 
 139 #ifdef __xpv
 140         if (!writeable)
 141                 pt_bits &= ~PT_WRITABLE;
 142         if (HYPERVISOR_update_va_mapping((uintptr_t)window,
 143             pa_to_ma(physaddr) | pt_bits, UVMF_INVLPG | UVMF_LOCAL) < 0)
 144                 bop_panic("HYPERVISOR_update_va_mapping() failed");
 145 #else
 146         if (kbm_pae_support)
 147                 *((x86pte_t *)pte_to_window) = physaddr | pt_bits;
 148         else
 149                 *((x86pte32_t *)pte_to_window) = physaddr | pt_bits;
 150         mmu_invlpg(window);
 151 #endif
 152         DBG(window);
 153         return (window);
 154 }
 155 
 156 /*
 157  * Add a mapping for the physical page at the given virtual address.
 158  */
 159 void
 160 kbm_map(uintptr_t va, paddr_t pa, uint_t level, uint_t is_kernel)
 161 {
 162         x86pte_t *ptep;
 163         paddr_t pte_physaddr;
 164         x86pte_t pteval;
 165 
 166         if (khat_running)
 167                 panic("kbm_map() called too late");
 168 
 169         pteval = pa_to_ma(pa) | PT_NOCONSIST | PT_VALID | PT_WRITABLE;
 170         if (level >= 1)
 171                 pteval |= PT_PAGESIZE;
 172         if (kbm_pge_support && is_kernel)
 173                 pteval |= PT_GLOBAL;
 174 
 175 #ifdef __xpv
 176         /*
 177          * try update_va_mapping first - fails if page table is missing.
 178          */
 179         if (HYPERVISOR_update_va_mapping(va, pteval,
 180             UVMF_INVLPG | UVMF_LOCAL) == 0)
 181                 return;
 182 #endif
 183 
 184         /*
 185          * Find the pte that will map this address. This creates any
 186          * missing intermediate level page tables.
 187          */
 188         ptep = find_pte(va, &pte_physaddr, level, 0);
 189         if (ptep == NULL)
 190                 bop_panic("kbm_map: find_pte returned NULL");
 191 
 192 #ifdef __xpv
 193         if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
 194                 bop_panic("HYPERVISOR_update_va_mapping() failed");
 195 #else
 196         if (kbm_pae_support)
 197                 *ptep = pteval;
 198         else
 199                 *((x86pte32_t *)ptep) = pteval;
 200         mmu_invlpg((caddr_t)va);
 201 #endif
 202 }
 203 
 204 #ifdef __xpv
 205 
 206 /*
 207  * Add a mapping for the machine page at the given virtual address.
 208  */
 209 void
 210 kbm_map_ma(maddr_t ma, uintptr_t va, uint_t level)
 211 {
 212         paddr_t pte_physaddr;
 213         x86pte_t pteval;
 214 
 215         pteval = ma | PT_NOCONSIST | PT_VALID | PT_REF | PT_WRITABLE;
 216         if (level == 1)
 217                 pteval |= PT_PAGESIZE;
 218 
 219         /*
 220          * try update_va_mapping first - fails if page table is missing.
 221          */
 222         if (HYPERVISOR_update_va_mapping(va,
 223             pteval, UVMF_INVLPG | UVMF_LOCAL) == 0)
 224                 return;
 225 
 226         /*
 227          * Find the pte that will map this address. This creates any
 228          * missing intermediate level page tables
 229          */
 230         (void) find_pte(va, &pte_physaddr, level, 0);
 231 
 232         if (HYPERVISOR_update_va_mapping(va,
 233             pteval, UVMF_INVLPG | UVMF_LOCAL) != 0)
 234                 bop_panic("HYPERVISOR_update_va_mapping failed");
 235 }
 236 
 237 #endif /* __xpv */
 238 
 239 
 240 /*
 241  * Probe the boot time page tables to find the first mapping
 242  * including va (or higher) and return non-zero if one is found.
 243  * va is updated to the starting address and len to the pagesize.
 244  * pp will be set to point to the 1st page_t of the mapped page(s).
 245  *
 246  * Note that if va is in the middle of a large page, the returned va
 247  * will be less than what was asked for.
 248  */
 249 int
 250 kbm_probe(uintptr_t *va, size_t *len, pfn_t *pfn, uint_t *prot)
 251 {
 252         uintptr_t       probe_va;
 253         x86pte_t        *ptep;
 254         paddr_t         pte_physaddr;
 255         x86pte_t        pte_val;
 256         level_t         l;
 257 
 258         if (khat_running)
 259                 panic("kbm_probe() called too late");
 260         *len = 0;
 261         *pfn = PFN_INVALID;
 262         *prot = 0;
 263         probe_va = *va;
 264 restart_new_va:
 265         l = top_level;
 266         for (;;) {
 267                 if (IN_VA_HOLE(probe_va))
 268                         probe_va = mmu.hole_end;
 269 
 270                 if (IN_HYPERVISOR_VA(probe_va))
 271 #if defined(__amd64) && defined(__xpv)
 272                         probe_va = HYPERVISOR_VIRT_END;
 273 #else
 274                         return (0);
 275 #endif
 276 
 277                 /*
 278                  * If we don't have a valid PTP/PTE at this level
 279                  * then we can bump VA by this level's pagesize and try again.
 280                  * When the probe_va wraps around, we are done.
 281                  */
 282                 ptep = find_pte(probe_va, &pte_physaddr, l, 1);
 283                 if (ptep == NULL)
 284                         bop_panic("kbm_probe: find_pte returned NULL");
 285                 if (kbm_pae_support)
 286                         pte_val = *ptep;
 287                 else
 288                         pte_val = *((x86pte32_t *)ptep);
 289                 if (!PTE_ISVALID(pte_val)) {
 290                         probe_va = (probe_va & BOOT_MASK(l)) + BOOT_SZ(l);
 291                         if (probe_va <= *va)
 292                                 return (0);
 293                         goto restart_new_va;
 294                 }
 295 
 296                 /*
 297                  * If this entry is a pointer to a lower level page table
 298                  * go down to it.
 299                  */
 300                 if (!PTE_ISPAGE(pte_val, l)) {
 301                         ASSERT(l > 0);
 302                         --l;
 303                         continue;
 304                 }
 305 
 306                 /*
 307                  * We found a boot level page table entry
 308                  */
 309                 *len = BOOT_SZ(l);
 310                 *va = probe_va & ~(*len - 1);
 311                 *pfn = PTE2PFN(pte_val, l);
 312 
 313 
 314                 *prot = PROT_READ | PROT_EXEC;
 315                 if (PTE_GET(pte_val, PT_WRITABLE))
 316                         *prot |= PROT_WRITE;
 317 
 318                 /*
 319                  * pt_nx is cleared if processor doesn't support NX bit
 320                  */
 321                 if (PTE_GET(pte_val, mmu.pt_nx))
 322                         *prot &= ~PROT_EXEC;
 323 
 324                 return (1);
 325         }
 326 }
 327 
 328 
 329 /*
 330  * Destroy a boot loader page table 4K mapping.
 331  */
 332 void
 333 kbm_unmap(uintptr_t va)
 334 {
 335         if (khat_running)
 336                 panic("kbm_unmap() called too late");
 337         else {
 338 #ifdef __xpv
 339                 (void) HYPERVISOR_update_va_mapping(va, 0,
 340                     UVMF_INVLPG | UVMF_LOCAL);
 341 #else
 342                 x86pte_t *ptep;
 343                 level_t level = 0;
 344                 uint_t  probe_only = 1;
 345 
 346                 ptep = find_pte(va, NULL, level, probe_only);
 347                 if (ptep == NULL)
 348                         return;
 349 
 350                 if (kbm_pae_support)
 351                         *ptep = 0;
 352                 else
 353                         *((x86pte32_t *)ptep) = 0;
 354                 mmu_invlpg((caddr_t)va);
 355 #endif
 356         }
 357 }
 358 
 359 
 360 /*
 361  * Change a boot loader page table 4K mapping.
 362  * Returns the pfn of the old mapping.
 363  */
 364 pfn_t
 365 kbm_remap(uintptr_t va, pfn_t pfn)
 366 {
 367         x86pte_t *ptep;
 368         level_t level = 0;
 369         uint_t  probe_only = 1;
 370         x86pte_t pte_val = pa_to_ma(pfn_to_pa(pfn)) | PT_WRITABLE |
 371             PT_NOCONSIST | PT_VALID;
 372         x86pte_t old_pte;
 373 
 374         if (khat_running)
 375                 panic("kbm_remap() called too late");
 376         ptep = find_pte(va, NULL, level, probe_only);
 377         if (ptep == NULL)
 378                 bop_panic("kbm_remap: find_pte returned NULL");
 379 
 380         if (kbm_pae_support)
 381                 old_pte = *ptep;
 382         else
 383                 old_pte = *((x86pte32_t *)ptep);
 384 
 385 #ifdef __xpv
 386         if (HYPERVISOR_update_va_mapping(va, pte_val, UVMF_INVLPG | UVMF_LOCAL))
 387                 bop_panic("HYPERVISOR_update_va_mapping() failed");
 388 #else
 389         if (kbm_pae_support)
 390                 *((x86pte_t *)ptep) = pte_val;
 391         else
 392                 *((x86pte32_t *)ptep) = pte_val;
 393         mmu_invlpg((caddr_t)va);
 394 #endif
 395 
 396         if (!(old_pte & PT_VALID) || ma_to_pa(old_pte) == -1)
 397                 return (PFN_INVALID);
 398         return (mmu_btop(ma_to_pa(old_pte)));
 399 }
 400 
 401 
 402 /*
 403  * Change a boot loader page table 4K mapping to read only.
 404  */
 405 void
 406 kbm_read_only(uintptr_t va, paddr_t pa)
 407 {
 408         x86pte_t pte_val = pa_to_ma(pa) |
 409             PT_NOCONSIST | PT_REF | PT_MOD | PT_VALID;
 410 
 411 #ifdef __xpv
 412         if (HYPERVISOR_update_va_mapping(va, pte_val, UVMF_INVLPG | UVMF_LOCAL))
 413                 bop_panic("HYPERVISOR_update_va_mapping() failed");
 414 #else
 415         x86pte_t *ptep;
 416         level_t level = 0;
 417 
 418         ptep = find_pte(va, NULL, level, 0);
 419         if (ptep == NULL)
 420                 bop_panic("kbm_read_only: find_pte returned NULL");
 421 
 422         if (kbm_pae_support)
 423                 *ptep = pte_val;
 424         else
 425                 *((x86pte32_t *)ptep) = pte_val;
 426         mmu_invlpg((caddr_t)va);
 427 #endif
 428 }
 429 
 430 /*
 431  * interfaces for kernel debugger to access physical memory
 432  */
 433 static x86pte_t save_pte;
 434 
 435 void *
 436 kbm_push(paddr_t pa)
 437 {
 438         static int first_time = 1;
 439 
 440         if (first_time) {
 441                 first_time = 0;
 442                 return (window);
 443         }
 444 
 445         if (kbm_pae_support)
 446                 save_pte = *((x86pte_t *)pte_to_window);
 447         else
 448                 save_pte = *((x86pte32_t *)pte_to_window);
 449         return (kbm_remap_window(pa, 0));
 450 }
 451 
 452 void
 453 kbm_pop(void)
 454 {
 455 #ifdef __xpv
 456         if (HYPERVISOR_update_va_mapping((uintptr_t)window, save_pte,
 457             UVMF_INVLPG | UVMF_LOCAL) < 0)
 458                 bop_panic("HYPERVISOR_update_va_mapping() failed");
 459 #else
 460         if (kbm_pae_support)
 461                 *((x86pte_t *)pte_to_window) = save_pte;
 462         else
 463                 *((x86pte32_t *)pte_to_window) = save_pte;
 464         mmu_invlpg(window);
 465 #endif
 466 }
 467 
 468 x86pte_t
 469 get_pteval(paddr_t table, uint_t index)
 470 {
 471         void *table_ptr = kbm_remap_window(table, 0);
 472 
 473         if (kbm_pae_support)
 474                 return (((x86pte_t *)table_ptr)[index]);
 475         return (((x86pte32_t *)table_ptr)[index]);
 476 }
 477 
 478 #ifndef __xpv
 479 void
 480 set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
 481 {
 482         void *table_ptr = kbm_remap_window(table, 0);
 483         if (kbm_pae_support)
 484                 ((x86pte_t *)table_ptr)[index] = pteval;
 485         else
 486                 ((x86pte32_t *)table_ptr)[index] = pteval;
 487         if (level == top_level && level == 2)
 488                 reload_cr3();
 489 }
 490 #endif
 491 
 492 paddr_t
 493 make_ptable(x86pte_t *pteval, uint_t level)
 494 {
 495         paddr_t new_table;
 496         void *table_ptr;
 497 
 498         new_table = do_bop_phys_alloc(MMU_PAGESIZE, MMU_PAGESIZE);
 499         table_ptr = kbm_remap_window(new_table, 1);
 500         bzero(table_ptr, MMU_PAGESIZE);
 501 #ifdef __xpv
 502         /* Remove write permission to the new page table.  */
 503         (void) kbm_remap_window(new_table, 0);
 504 #endif
 505 
 506         if (level == top_level && level == 2)
 507                 *pteval = pa_to_ma(new_table) | PT_VALID;
 508         else
 509                 *pteval = pa_to_ma(new_table) |
 510                     PT_VALID | PT_REF | PT_USER | PT_WRITABLE;
 511 
 512         return (new_table);
 513 }
 514 
 515 x86pte_t *
 516 map_pte(paddr_t table, uint_t index)
 517 {
 518         void *table_ptr = kbm_remap_window(table, 0);
 519         return ((x86pte_t *)((caddr_t)table_ptr + index * pte_size));
 520 }
--- EOF ---