    
8956 Implement KPTI
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
    
      
    
    
          --- old/usr/src/uts/i86pc/vm/hat_kdi.c
          +++ new/usr/src/uts/i86pc/vm/hat_kdi.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  
  
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  24   24   * Use is subject to license terms.
       25 + *
       26 + * Copyright 2018 Joyent, Inc.
  25   27   */
  26   28  
  27   29  /*
  28   30   * HAT interfaces used by the kernel debugger to interact with the VM system.
  29   31   * These interfaces are invoked when the world is stopped.  As such, no blocking
  30   32   * operations may be performed.
  31   33   */
  32   34  
  33   35  #include <sys/cpuvar.h>
  34   36  #include <sys/kdi_impl.h>
  35   37  #include <sys/errno.h>
  36   38  #include <sys/systm.h>
  37   39  #include <sys/sysmacros.h>
  38   40  #include <sys/mman.h>
  39   41  #include <sys/bootconf.h>
  40   42  #include <sys/cmn_err.h>
  41   43  #include <vm/seg_kmem.h>
  42   44  #include <vm/hat_i86.h>
  43   45  #if defined(__xpv)
  44   46  #include <sys/hypervisor.h>
  45   47  #endif
  46   48  #include <sys/bootinfo.h>
  47   49  #include <vm/kboot_mmu.h>
  48   50  #include <sys/machsystm.h>
  49   51  
  50   52  /*
   51   53   * The debugger needs direct access to a single page table entry
   52   54   * in order to implement vtop and physical reads/writes.
  53   55   */
   54   56  static uintptr_t hat_kdi_page = 0;      /* vaddr for physical page accesses */
  55   57  static uint_t use_kbm = 1;
  56   58  uint_t hat_kdi_use_pae;                 /* if 0, use x86pte32_t for pte type */
  57   59  
  58   60  #if !defined(__xpv)
  59   61  static x86pte_t *hat_kdi_pte = NULL;    /* vaddr of pte for hat_kdi_page */
  60   62  #endif
  61   63  
  62   64  /*
  63   65   * Get the address for remapping physical pages during boot
  64   66   */
  65   67  void
  66   68  hat_boot_kdi_init(void)
  67   69  {
  68   70          hat_kdi_page = (uintptr_t)kbm_push(0);  /* first call gets address... */
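                    /*
                     * That first call presumably just establishes the window
                     * VA; the kbm_push(pa) calls in kdi_prw() below remap the
                     * same window at whatever physical page is being accessed.
                     */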
  69   71  }
  70   72  
  71   73  /*
  72   74   * Switch to using a page in the kernel's va range for physical memory access.
   73   75   * We need to allocate a virtual page, then permanently map in the pagetable
   74   76   * page that contains its PTE.
  75   77   */
  76   78  void
  77   79  hat_kdi_init(void)
  78   80  {
  79   81          /*LINTED:set but not used in function*/
  80   82          htable_t *ht __unused;
  81   83  
  82   84          /*
   83   85           * Get a kernel page VA to use for physical memory access. Then make sure
  84   86           * the VA has a page table.
  85   87           */
  86   88          hat_kdi_use_pae = mmu.pae_hat;
  87   89          hat_kdi_page = (uintptr_t)vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
  88   90          ht = htable_create(kas.a_hat, hat_kdi_page, 0, NULL);
  89   91          use_kbm = 0;
  90   92  
  91   93  #ifndef __xpv
  92   94          /*
   93   95           * Get a VA at which to map the pagetable page, and devload it there.
  94   96           */
  95   97          hat_kdi_pte = vmem_xalloc(heap_arena, MMU_PAGESIZE, MMU_PAGESIZE, 0,
  96   98              0, NULL, NULL, VM_SLEEP);
  97   99          hat_devload(kas.a_hat, (caddr_t)hat_kdi_pte, MMU_PAGESIZE, ht->ht_pfn,
  98  100              PROT_READ | PROT_WRITE | HAT_NOSYNC | HAT_UNORDERED_OK,
  99  101              HAT_LOAD | HAT_LOAD_NOCONSIST);
 100  102          hat_kdi_pte =
 101  103              PT_INDEX_PTR(hat_kdi_pte, htable_va2entry(hat_kdi_page, ht));
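                    /*
                     * hat_kdi_pte now points, via the devload mapping of the
                     * pagetable page ht->ht_pfn above, at the PTE that maps
                     * hat_kdi_page itself.  kdi_prw() can thus retarget
                     * hat_kdi_page at an arbitrary physical page with a single
                     * PTE store, avoiding the normal HAT paths entirely; that
                     * matters, since it runs while the world is stopped.
                     */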
 102  104  
 103  105          HTABLE_INC(ht->ht_valid_cnt);
 104  106          htable_release(ht);
 105  107  #endif
 106  108  }
 107  109  
 108  110  #ifdef __xpv
 109  111  
 110  112  /*
  111  113   * translate physical address to machine address
 112  114   */
 113  115  static uint64_t
 114  116  kdi_ptom(uint64_t pa)
 115  117  {
 116  118          extern pfn_t *mfn_list;
 117  119          ulong_t mfn = mfn_list[mmu_btop(pa)];
 118  120  
 119  121          return (pfn_to_pa(mfn) | (pa & MMU_PAGEOFFSET));
 120  122  }
 121  123  
 122  124  /*
 123  125   * This is like mfn_to_pfn(), but we can't use ontrap() from kmdb.
 124  126   * Instead we let the fault happen and kmdb deals with it.
 125  127   */
 126  128  static uint64_t
 127  129  kdi_mtop(uint64_t ma)
 128  130  {
 129  131          pfn_t pfn;
 130  132          mfn_t mfn = ma >> MMU_PAGESHIFT;
 131  133  
 132  134          if (HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL) < mfn)
 133  135                  return (ma | PFN_IS_FOREIGN_MFN);
 134  136  
 135  137          pfn = mfn_to_pfn_mapping[mfn];
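                    /*
                     * The round trip through pfn_to_mfn() catches MFNs that do
                     * not belong to this domain, for which the m2p entry cannot
                     * be trusted.
                     */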
 136  138          if (pfn >= mfn_count || pfn_to_mfn(pfn) != mfn)
 137  139                  return (ma | PFN_IS_FOREIGN_MFN);
 138  140          return (pfn_to_pa(pfn) | (ma & MMU_PAGEOFFSET));
 139  141  }
 140  142  
 141  143  #else
 142  144  #define kdi_mtop(m)     (m)
 143  145  #define kdi_ptom(p)     (p)
 144  146  #endif
 145  147  
 146  148  /*ARGSUSED*/
 147  149  int
 148  150  kdi_vtop(uintptr_t va, uint64_t *pap)
 149  151  {
 150  152          uintptr_t vaddr = va;
 151  153          size_t  len;
 152  154          pfn_t   pfn;
 153  155          uint_t  prot;
 154  156          int     level;
 155  157          x86pte_t pte;
 156  158          int     index;
 157  159  
 158  160          /*
  159  161           * If the kernel HAT isn't running yet, the mmu struct may not be
  160  162           * set up; probe the boot loader's pagetables instead.
 161  163           */
 162  164          if (!khat_running) {
 163  165                  if (kbm_probe(&vaddr, &len, &pfn, &prot) == 0)
 164  166                          return (ENOENT);
 165  167                  if (vaddr > va)
 166  168                          return (ENOENT);
 167  169                  if (vaddr < va)
 168  170                          pfn += mmu_btop(va - vaddr);
 169  171                  *pap = pfn_to_pa(pfn) + (vaddr & MMU_PAGEOFFSET);
  
  
 170  172                  return (0);
 171  173          }
 172  174  
 173  175          /*
 174  176           * We can't go through normal hat routines, so we'll use
 175  177           * kdi_pread() to walk the page tables
 176  178           */
 177  179  #if defined(__xpv)
 178  180          *pap = pfn_to_pa(CPU->cpu_current_hat->hat_htable->ht_pfn);
 179  181  #else
 180      -        *pap = getcr3() & MMU_PAGEMASK;
      182 +        *pap = getcr3_pa();
 181  183  #endif
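                    /*
                     * With KPTI, CR3 can carry more than the pagetable base
                     * (e.g. PCID bits), so a bare MMU_PAGEMASK mask no longer
                     * suffices; getcr3_pa(), new with this change, extracts
                     * just the base.  From there, each level of the walk
                     * consumes log2(ptes_per_table) bits of the VA: 9 bits at
                     * shifts 39/30/21/12 for a normal 4-level 64-bit walk.
                     */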
 182  184          for (level = mmu.max_level; ; --level) {
 183  185                  index = (va >> LEVEL_SHIFT(level)) & (mmu.ptes_per_table - 1);
 184  186                  *pap += index << mmu.pte_size_shift;
 185  187                  pte = 0;
 186  188                  if (kdi_pread((caddr_t)&pte, mmu.pte_size, *pap, &len) != 0)
 187  189                          return (ENOENT);
 188  190                  if (pte == 0)
 189  191                          return (ENOENT);
 190  192                  if (level > 0 && level <= mmu.max_page_level &&
 191  193                      (pte & PT_PAGESIZE)) {
 192  194                          *pap = kdi_mtop(pte & PT_PADDR_LGPG);
 193  195                          break;
 194  196                  } else {
 195  197                          *pap = kdi_mtop(pte & PT_PADDR);
 196  198                          if (level == 0)
 197  199                                  break;
 198  200                  }
 199  201          }
 200  202          *pap += va & LEVEL_OFFSET(level);
 201  203          return (0);
 202  204  }
 203  205  
 204  206  static int
 205  207  kdi_prw(caddr_t buf, size_t nbytes, uint64_t pa, size_t *ncopiedp, int doread)
 206  208  {
 207  209          size_t  ncopied = 0;
 208  210          off_t   pgoff;
 209  211          size_t  sz;
 210  212          caddr_t va;
 211  213          caddr_t from;
 212  214          caddr_t to;
 213  215          x86pte_t pte;
 214  216  
 215  217          /*
  216  218           * if this is called before any initialization, fail
 217  219           */
 218  220          if (hat_kdi_page == 0)
 219  221                  return (EAGAIN);
 220  222  
 221  223          while (nbytes > 0) {
 222  224                  /*
 223  225                   * figure out the addresses and construct a minimal PTE
 224  226                   */
 225  227                  pgoff = pa & MMU_PAGEOFFSET;
 226  228                  sz = MIN(nbytes, MMU_PAGESIZE - pgoff);
 227  229                  va = (caddr_t)hat_kdi_page + pgoff;
 228  230                  pte = kdi_ptom(mmu_ptob(mmu_btop(pa))) | PT_VALID;
 229  231                  if (doread) {
 230  232                          from = va;
 231  233                          to = buf;
 232  234                  } else {
 233  235                          PTE_SET(pte, PT_WRITABLE);
 234  236                          from = buf;
 235  237                          to = va;
 236  238                  }
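                            /*
                             * pte now maps the page containing pa, read-only
                             * unless this is a write: mmu_ptob(mmu_btop(pa))
                             * truncates pa to a page boundary, and kdi_ptom()
                             * converts it to a machine address under the
                             * hypervisor.
                             */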
 237  239  
 238  240                  /*
 239  241                   * map the physical page
 240  242                   */
 241  243                  if (use_kbm)
  
  
 242  244                          (void) kbm_push(pa);
 243  245  #if defined(__xpv)
 244  246                  else
 245  247                          (void) HYPERVISOR_update_va_mapping(
 246  248                              (uintptr_t)va, pte, UVMF_INVLPG);
 247  249  #else
 248  250                  else if (hat_kdi_use_pae)
 249  251                          *hat_kdi_pte = pte;
 250  252                  else
 251  253                          *(x86pte32_t *)hat_kdi_pte = pte;
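                            /*
                             * The CPU may still hold a stale translation for
                             * hat_kdi_page, so flush it after the PTE store.
                             * mmu_flush_tlb_kpage() is the KPTI-era
                             * replacement for mmu_tlbflush_entry(), taking
                             * the kernel VA directly.
                             */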
 252      -                mmu_tlbflush_entry((caddr_t)hat_kdi_page);
      254 +                mmu_flush_tlb_kpage(hat_kdi_page);
 253  255  #endif
 254  256  
 255  257                  bcopy(from, to, sz);
 256  258  
 257  259                  /*
 258  260                   * erase the mapping
 259  261                   */
 260  262                  if (use_kbm)
 261  263                          kbm_pop();
 262  264  #if defined(__xpv)
 263  265                  else
 264  266                          (void) HYPERVISOR_update_va_mapping(
 265  267                              (uintptr_t)va, 0, UVMF_INVLPG);
 266  268  #else
 267  269                  else if (hat_kdi_use_pae)
 268  270                          *hat_kdi_pte = 0;
 269  271                  else
 270  272                          *(x86pte32_t *)hat_kdi_pte = 0;
 271      -                mmu_tlbflush_entry((caddr_t)hat_kdi_page);
      273 +                mmu_flush_tlb_kpage(hat_kdi_page);
 272  274  #endif
 273  275  
 274  276                  buf += sz;
 275  277                  pa += sz;
 276  278                  nbytes -= sz;
 277  279                  ncopied += sz;
 278  280          }
 279  281  
 280  282          if (ncopied == 0)
 281  283                  return (ENOENT);
 282  284  
 283  285          *ncopiedp = ncopied;
 284  286          return (0);
 285  287  }
 286  288  
 287  289  int
 288  290  kdi_pread(caddr_t buf, size_t nbytes, uint64_t addr, size_t *ncopiedp)
  
  
 289  291  {
 290  292          return (kdi_prw(buf, nbytes, addr, ncopiedp, 1));
 291  293  }
 292  294  
 293  295  int
 294  296  kdi_pwrite(caddr_t buf, size_t nbytes, uint64_t addr, size_t *ncopiedp)
 295  297  {
 296  298          return (kdi_prw(buf, nbytes, addr, ncopiedp, 0));
 297  299  }
 298  300  
      301 +#if !defined(__xpv)
      302 +/*
      303 + * This gets used for flushing the TLB on all the slaves just prior to doing a
      304 + * kdi_prw().  It's unclear why this was originally done, since kdi_prw() itself
      305 + * will flush any lingering hat_kdi_page mappings, but let's presume it was a
      306 + * good idea.
      307 + */
      308 +void
      309 +kdi_flush_caches(void)
      310 +{
      311 +        mmu_flush_tlb(FLUSH_TLB_ALL, NULL);
      312 +}
      313 +#endif
 299  314  
 300  315  /*
 301  316   * Return the number of bytes, relative to the beginning of a given range, that
 302  317   * are non-toxic (can be read from and written to with relative impunity).
 303  318   */
 304  319  /*ARGSUSED*/
 305  320  size_t
 306  321  kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write)
 307  322  {
 308  323  #if defined(__amd64)
 309  324          extern uintptr_t toxic_addr;
 310  325          extern size_t   toxic_size;
 311  326  
 312  327          /*
 313  328           * Check 64 bit toxic range.
 314  329           */
 315  330          if (toxic_addr != 0 &&
 316  331              va + sz >= toxic_addr &&
 317  332              va < toxic_addr + toxic_size)
 318  333                  return (va < toxic_addr ? toxic_addr - va : 0);
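                    /*
                     * For example, a request beginning one page below
                     * toxic_addr and running into the toxic range is good for
                     * exactly that leading page, while one beginning inside
                     * the range is good for nothing.
                     */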
 319  334  
 320  335          /*
 321  336           * avoid any Virtual Address hole
 322  337           */
 323  338          if (va + sz >= hole_start && va < hole_end)
 324  339                  return (va < hole_start ? hole_start - va : 0);
 325  340  
 326  341          return (sz);
 327  342  
 328  343  #elif defined(__i386)
 329  344          extern void *device_arena_contains(void *, size_t, size_t *);
 330  345          uintptr_t v;
 331  346  
 332  347          v = (uintptr_t)device_arena_contains((void *)va, sz, NULL);
 333  348          if (v == 0)
 334  349                  return (sz);
 335  350          else if (v <= va)
 336  351                  return (0);
 337  352          else
 338  353                  return (v - va);
 339  354  
 340  355  #endif  /* __i386 */
 341  356  }
  