illumos-gate Wdiff usr/src/uts/i86pc/vm/htable.c

Print this page

8956 Implement KPTI
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/i86pc/vm/htable.c
          +++ new/usr/src/uts/i86pc/vm/htable.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.

↓ open down ↓

14 lines elided

↑ open up ↑

  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright (c) 2014 by Delphix. All rights reserved.
  25      - * Copyright 2015 Joyent, Inc.
       25 + * Copyright 2018 Joyent, Inc.
  26   26   */
  27   27  
  28   28  #include <sys/types.h>
  29   29  #include <sys/sysmacros.h>
  30   30  #include <sys/kmem.h>
  31   31  #include <sys/atomic.h>
  32   32  #include <sys/bitmap.h>
  33   33  #include <sys/machparam.h>
  34   34  #include <sys/machsystm.h>
  35   35  #include <sys/mman.h>

  36   36  #include <sys/systm.h>
  37   37  #include <sys/cpuvar.h>
  38   38  #include <sys/thread.h>
  39   39  #include <sys/proc.h>
  40   40  #include <sys/cpu.h>
  41   41  #include <sys/kmem.h>
  42   42  #include <sys/disp.h>
  43   43  #include <sys/vmem.h>
  44   44  #include <sys/vmsystm.h>
  45   45  #include <sys/promif.h>
  46   46  #include <sys/var.h>
  47   47  #include <sys/x86_archext.h>
  48   48  #include <sys/archsystm.h>
  49   49  #include <sys/bootconf.h>
  50   50  #include <sys/dumphdr.h>
  51   51  #include <vm/seg_kmem.h>
  52   52  #include <vm/seg_kpm.h>
  53   53  #include <vm/hat.h>
  54   54  #include <vm/hat_i86.h>
  55   55  #include <sys/cmn_err.h>
  56   56  #include <sys/panic.h>
  57   57  
  58   58  #ifdef __xpv
  59   59  #include <sys/hypervisor.h>
  60   60  #include <sys/xpv_panic.h>
  61   61  #endif
  62   62  
  63   63  #include <sys/bootinfo.h>
  64   64  #include <vm/kboot_mmu.h>
  65   65  
  66   66  static void x86pte_zero(htable_t *dest, uint_t entry, uint_t count);
  67   67  
  68   68  kmem_cache_t *htable_cache;
  69   69  
  70   70  /*
  71   71   * The variable htable_reserve_amount, rather than HTABLE_RESERVE_AMOUNT,
  72   72   * is used in order to facilitate testing of the htable_steal() code.
  73   73   * By resetting htable_reserve_amount to a lower value, we can force
  74   74   * stealing to occur.  The reserve amount is a guess to get us through boot.
  75   75   */
  76   76  #define HTABLE_RESERVE_AMOUNT   (200)
  77   77  uint_t htable_reserve_amount = HTABLE_RESERVE_AMOUNT;
  78   78  kmutex_t htable_reserve_mutex;
  79   79  uint_t htable_reserve_cnt;
  80   80  htable_t *htable_reserve_pool;
  81   81  
  82   82  /*
  83   83   * Used to hand test htable_steal().
  84   84   */
  85   85  #ifdef DEBUG
  86   86  ulong_t force_steal = 0;
  87   87  ulong_t ptable_cnt = 0;
  88   88  #endif
  89   89  
  90   90  /*
  91   91   * This variable exists so that it can be tuned via /etc/system.
  92   92   * Any value works, but a power of two <= mmu.ptes_per_table is best.
  93   93   */
  94   94  uint_t htable_steal_passes = 8;
  95   95  
  96   96  /*
  97   97   * mutex stuff for access to htable hash
  98   98   */
  99   99  #define NUM_HTABLE_MUTEX 128
 100  100  kmutex_t htable_mutex[NUM_HTABLE_MUTEX];
 101  101  #define HTABLE_MUTEX_HASH(h) ((h) & (NUM_HTABLE_MUTEX - 1))
 102  102  
 103  103  #define HTABLE_ENTER(h) mutex_enter(&htable_mutex[HTABLE_MUTEX_HASH(h)]);
 104  104  #define HTABLE_EXIT(h)  mutex_exit(&htable_mutex[HTABLE_MUTEX_HASH(h)]);
 105  105  
 106  106  /*
 107  107   * forward declarations
 108  108   */
 109  109  static void link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr);
 110  110  static void unlink_ptp(htable_t *higher, htable_t *old, uintptr_t vaddr);
 111  111  static void htable_free(htable_t *ht);
 112  112  static x86pte_t *x86pte_access_pagetable(htable_t *ht, uint_t index);
 113  113  static void x86pte_release_pagetable(htable_t *ht);
 114  114  static x86pte_t x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old,
 115  115          x86pte_t new);
 116  116  
 117  117  /*
 118  118   * A counter to track if we are stealing or reaping htables. When non-zero
 119  119   * htable_free() will directly free htables (either to the reserve or kmem)
 120  120   * instead of putting them in a hat's htable cache.
 121  121   */
 122  122  uint32_t htable_dont_cache = 0;
 123  123  
 124  124  /*
 125  125   * Track the number of active pagetables, so we can know how many to reap
 126  126   */
 127  127  static uint32_t active_ptables = 0;
 128  128  
 129  129  #ifdef __xpv

↓ open down ↓

94 lines elided

↑ open up ↑

 130  130  /*
 131  131   * Deal with hypervisor complications.
 132  132   */
 133  133  void
 134  134  xen_flush_va(caddr_t va)
 135  135  {
 136  136          struct mmuext_op t;
 137  137          uint_t count;
 138  138  
 139  139          if (IN_XPV_PANIC()) {
 140      -                mmu_tlbflush_entry((caddr_t)va);
      140 +                mmu_flush_tlb_page((uintptr_t)va);
 141  141          } else {
 142  142                  t.cmd = MMUEXT_INVLPG_LOCAL;
 143  143                  t.arg1.linear_addr = (uintptr_t)va;
 144  144                  if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
 145  145                          panic("HYPERVISOR_mmuext_op() failed");
 146  146                  ASSERT(count == 1);
 147  147          }
 148  148  }
 149  149  
 150  150  void
 151  151  xen_gflush_va(caddr_t va, cpuset_t cpus)
 152  152  {
 153  153          struct mmuext_op t;
 154  154          uint_t count;
 155  155  
 156  156          if (IN_XPV_PANIC()) {
 157      -                mmu_tlbflush_entry((caddr_t)va);
      157 +                mmu_flush_tlb_page((uintptr_t)va);
 158  158                  return;
 159  159          }
 160  160  
 161  161          t.cmd = MMUEXT_INVLPG_MULTI;
 162  162          t.arg1.linear_addr = (uintptr_t)va;
 163  163          /*LINTED: constant in conditional context*/
 164  164          set_xen_guest_handle(t.arg2.vcpumask, &cpus);
 165  165          if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
 166  166                  panic("HYPERVISOR_mmuext_op() failed");
 167  167          ASSERT(count == 1);

 168  168  }
 169  169  
 170  170  void
 171  171  xen_flush_tlb()
 172  172  {
 173  173          struct mmuext_op t;
 174  174          uint_t count;
 175  175  
 176  176          if (IN_XPV_PANIC()) {
 177  177                  xpv_panic_reload_cr3();
 178  178          } else {
 179  179                  t.cmd = MMUEXT_TLB_FLUSH_LOCAL;
 180  180                  if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
 181  181                          panic("HYPERVISOR_mmuext_op() failed");
 182  182                  ASSERT(count == 1);
 183  183          }
 184  184  }
 185  185  
 186  186  void
 187  187  xen_gflush_tlb(cpuset_t cpus)
 188  188  {
 189  189          struct mmuext_op t;
 190  190          uint_t count;
 191  191  
 192  192          ASSERT(!IN_XPV_PANIC());
 193  193          t.cmd = MMUEXT_TLB_FLUSH_MULTI;
 194  194          /*LINTED: constant in conditional context*/
 195  195          set_xen_guest_handle(t.arg2.vcpumask, &cpus);
 196  196          if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
 197  197                  panic("HYPERVISOR_mmuext_op() failed");
 198  198          ASSERT(count == 1);
 199  199  }
 200  200  
 201  201  /*
 202  202   * Install/Adjust a kpm mapping under the hypervisor.
 203  203   * Value of "how" should be:
 204  204   *      PT_WRITABLE | PT_VALID - regular kpm mapping
 205  205   *      PT_VALID - make mapping read-only
 206  206   *      0       - remove mapping
 207  207   *
 208  208   * Returns 0 on success, non-zero on failure.
 209  209   */
 210  210  int
 211  211  xen_kpm_page(pfn_t pfn, uint_t how)
 212  212  {
 213  213          paddr_t pa = mmu_ptob((paddr_t)pfn);
 214  214          x86pte_t pte = PT_NOCONSIST | PT_REF | PT_MOD;
 215  215  
 216  216          if (kpm_vbase == NULL)
 217  217                  return (0);
 218  218  
 219  219          if (how)
 220  220                  pte |= pa_to_ma(pa) | how;
 221  221          else
 222  222                  pte = 0;
 223  223          return (HYPERVISOR_update_va_mapping((uintptr_t)kpm_vbase + pa,
 224  224              pte, UVMF_INVLPG | UVMF_ALL));
 225  225  }
 226  226  
 227  227  void
 228  228  xen_pin(pfn_t pfn, level_t lvl)
 229  229  {
 230  230          struct mmuext_op t;
 231  231          uint_t count;
 232  232  
 233  233          t.cmd = MMUEXT_PIN_L1_TABLE + lvl;
 234  234          t.arg1.mfn = pfn_to_mfn(pfn);
 235  235          if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
 236  236                  panic("HYPERVISOR_mmuext_op() failed");
 237  237          ASSERT(count == 1);
 238  238  }
 239  239  
 240  240  void
 241  241  xen_unpin(pfn_t pfn)
 242  242  {
 243  243          struct mmuext_op t;
 244  244          uint_t count;
 245  245  
 246  246          t.cmd = MMUEXT_UNPIN_TABLE;
 247  247          t.arg1.mfn = pfn_to_mfn(pfn);
 248  248          if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
 249  249                  panic("HYPERVISOR_mmuext_op() failed");
 250  250          ASSERT(count == 1);
 251  251  }
 252  252  
 253  253  static void
 254  254  xen_map(uint64_t pte, caddr_t va)
 255  255  {
 256  256          if (HYPERVISOR_update_va_mapping((uintptr_t)va, pte,
 257  257              UVMF_INVLPG | UVMF_LOCAL))
 258  258                  panic("HYPERVISOR_update_va_mapping() failed");
 259  259  }
 260  260  #endif /* __xpv */
 261  261  
 262  262  /*
 263  263   * Allocate a memory page for a hardware page table.
 264  264   *
 265  265   * A wrapper around page_get_physical(), with some extra checks.
 266  266   */
 267  267  static pfn_t
 268  268  ptable_alloc(uintptr_t seed)
 269  269  {
 270  270          pfn_t pfn;
 271  271          page_t *pp;
 272  272  
 273  273          pfn = PFN_INVALID;
 274  274  
 275  275          /*
 276  276           * The first check is to see if there is memory in the system. If we
 277  277           * drop to throttlefree, then fail the ptable_alloc() and let the
 278  278           * stealing code kick in. Note that we have to do this test here,
 279  279           * since the test in page_create_throttle() would let the NOSLEEP
 280  280           * allocation go through and deplete the page reserves.
 281  281           *
 282  282           * The !NOMEMWAIT() lets pageout, fsflush, etc. skip this check.
 283  283           */
 284  284          if (!NOMEMWAIT() && freemem <= throttlefree + 1)
 285  285                  return (PFN_INVALID);
 286  286  
 287  287  #ifdef DEBUG
 288  288          /*
 289  289           * This code makes htable_steal() easier to test. By setting
 290  290           * force_steal we force pagetable allocations to fall
 291  291           * into the stealing code. Roughly 1 in every "force_steal"
 292  292           * page table allocations will fail.
 293  293           */
 294  294          if (proc_pageout != NULL && force_steal > 1 &&
 295  295              ++ptable_cnt > force_steal) {
 296  296                  ptable_cnt = 0;
 297  297                  return (PFN_INVALID);
 298  298          }
 299  299  #endif /* DEBUG */
 300  300  
 301  301          pp = page_get_physical(seed);
 302  302          if (pp == NULL)
 303  303                  return (PFN_INVALID);
 304  304          ASSERT(PAGE_SHARED(pp));
 305  305          pfn = pp->p_pagenum;
 306  306          if (pfn == PFN_INVALID)
 307  307                  panic("ptable_alloc(): Invalid PFN!!");
 308  308          atomic_inc_32(&active_ptables);
 309  309          HATSTAT_INC(hs_ptable_allocs);
 310  310          return (pfn);
 311  311  }
 312  312  
 313  313  /*
 314  314   * Free an htable's associated page table page.  See the comments
 315  315   * for ptable_alloc().
 316  316   */
 317  317  static void
 318  318  ptable_free(pfn_t pfn)
 319  319  {
 320  320          page_t *pp = page_numtopp_nolock(pfn);
 321  321  
 322  322          /*
 323  323           * need to destroy the page used for the pagetable
 324  324           */
 325  325          ASSERT(pfn != PFN_INVALID);
 326  326          HATSTAT_INC(hs_ptable_frees);
 327  327          atomic_dec_32(&active_ptables);
 328  328          if (pp == NULL)
 329  329                  panic("ptable_free(): no page for pfn!");
 330  330          ASSERT(PAGE_SHARED(pp));
 331  331          ASSERT(pfn == pp->p_pagenum);
 332  332          ASSERT(!IN_XPV_PANIC());
 333  333  
 334  334          /*
 335  335           * Get an exclusive lock, might have to wait for a kmem reader.
 336  336           */
 337  337          if (!page_tryupgrade(pp)) {
 338  338                  u_offset_t off = pp->p_offset;
 339  339                  page_unlock(pp);
 340  340                  pp = page_lookup(&kvp, off, SE_EXCL);
 341  341                  if (pp == NULL)
 342  342                          panic("page not found");
 343  343          }
 344  344  #ifdef __xpv
 345  345          if (kpm_vbase && xen_kpm_page(pfn, PT_VALID | PT_WRITABLE) < 0)
 346  346                  panic("failure making kpm r/w pfn=0x%lx", pfn);
 347  347  #endif
 348  348          page_hashout(pp, NULL);
 349  349          page_free(pp, 1);
 350  350          page_unresv(1);
 351  351  }
 352  352  
 353  353  /*
 354  354   * Put one htable on the reserve list.
 355  355   */
 356  356  static void
 357  357  htable_put_reserve(htable_t *ht)
 358  358  {
 359  359          ht->ht_hat = NULL;              /* no longer tied to a hat */
 360  360          ASSERT(ht->ht_pfn == PFN_INVALID);
 361  361          HATSTAT_INC(hs_htable_rputs);
 362  362          mutex_enter(&htable_reserve_mutex);
 363  363          ht->ht_next = htable_reserve_pool;
 364  364          htable_reserve_pool = ht;
 365  365          ++htable_reserve_cnt;
 366  366          mutex_exit(&htable_reserve_mutex);
 367  367  }
 368  368  
 369  369  /*
 370  370   * Take one htable from the reserve.
 371  371   */
 372  372  static htable_t *
 373  373  htable_get_reserve(void)
 374  374  {
 375  375          htable_t *ht = NULL;
 376  376  
 377  377          mutex_enter(&htable_reserve_mutex);
 378  378          if (htable_reserve_cnt != 0) {
 379  379                  ht = htable_reserve_pool;
 380  380                  ASSERT(ht != NULL);
 381  381                  ASSERT(ht->ht_pfn == PFN_INVALID);
 382  382                  htable_reserve_pool = ht->ht_next;
 383  383                  --htable_reserve_cnt;
 384  384                  HATSTAT_INC(hs_htable_rgets);
 385  385          }
 386  386          mutex_exit(&htable_reserve_mutex);
 387  387          return (ht);
 388  388  }
 389  389  
 390  390  /*
 391  391   * Allocate initial htables and put them on the reserve list
 392  392   */
 393  393  void
 394  394  htable_initial_reserve(uint_t count)
 395  395  {
 396  396          htable_t *ht;
 397  397  
 398  398          count += HTABLE_RESERVE_AMOUNT;
 399  399          while (count > 0) {
 400  400                  ht = kmem_cache_alloc(htable_cache, KM_NOSLEEP);
 401  401                  ASSERT(ht != NULL);
 402  402  
 403  403                  ASSERT(use_boot_reserve);
 404  404                  ht->ht_pfn = PFN_INVALID;
 405  405                  htable_put_reserve(ht);
 406  406                  --count;
 407  407          }
 408  408  }
 409  409  
 410  410  /*
 411  411   * Readjust the reserves after a thread finishes using them.
 412  412   */
 413  413  void
 414  414  htable_adjust_reserve()
 415  415  {
 416  416          htable_t *ht;
 417  417  
 418  418          /*
 419  419           * Free any excess htables in the reserve list
 420  420           */
 421  421          while (htable_reserve_cnt > htable_reserve_amount &&
 422  422              !USE_HAT_RESERVES()) {
 423  423                  ht = htable_get_reserve();
 424  424                  if (ht == NULL)
 425  425                          return;
 426  426                  ASSERT(ht->ht_pfn == PFN_INVALID);
 427  427                  kmem_cache_free(htable_cache, ht);
 428  428          }
 429  429  }
 430  430  
 431  431  /*
 432  432   * Search the active htables for one to steal. Start at a different hash
 433  433   * bucket every time to help spread the pain of stealing.
 434  434   */
 435  435  static void
 436  436  htable_steal_active(hat_t *hat, uint_t cnt, uint_t threshold,
 437  437      uint_t *stolen, htable_t **list)
 438  438  {
 439  439          static uint_t   h_seed = 0;
 440  440          htable_t        *higher, *ht;
 441  441          uint_t          h, e, h_start;
 442  442          uintptr_t       va;
 443  443          x86pte_t        pte;
 444  444  
 445  445          h = h_start = h_seed++ % hat->hat_num_hash;
 446  446          do {
 447  447                  higher = NULL;
 448  448                  HTABLE_ENTER(h);
 449  449                  for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
 450  450  
 451  451                          /*
 452  452                           * Can we rule out reaping?
 453  453                           */
 454  454                          if (ht->ht_busy != 0 ||
 455  455                              (ht->ht_flags & HTABLE_SHARED_PFN) ||
 456  456                              ht->ht_level > 0 || ht->ht_valid_cnt > threshold ||
 457  457                              ht->ht_lock_cnt != 0)
 458  458                                  continue;
 459  459  
 460  460                          /*
 461  461                           * Increment busy so the htable can't disappear. We
 462  462                           * drop the htable mutex to avoid deadlocks with
 463  463                           * hat_pageunload() and the hment mutex while we
 464  464                           * call hat_pte_unmap()
 465  465                           */
 466  466                          ++ht->ht_busy;
 467  467                          HTABLE_EXIT(h);
 468  468  
 469  469                          /*
 470  470                           * Try stealing.
 471  471                           * - unload and invalidate all PTEs
 472  472                           */
 473  473                          for (e = 0, va = ht->ht_vaddr;
 474  474                              e < HTABLE_NUM_PTES(ht) && ht->ht_valid_cnt > 0 &&
 475  475                              ht->ht_busy == 1 && ht->ht_lock_cnt == 0;
 476  476                              ++e, va += MMU_PAGESIZE) {
 477  477                                  pte = x86pte_get(ht, e);
 478  478                                  if (!PTE_ISVALID(pte))
 479  479                                          continue;
 480  480                                  hat_pte_unmap(ht, e, HAT_UNLOAD, pte, NULL,
 481  481                                      B_TRUE);
 482  482                          }
 483  483  
 484  484                          /*
 485  485                           * Reacquire htable lock. If we didn't remove all
 486  486                           * mappings in the table, or another thread added a new
 487  487                           * mapping behind us, give up on this table.
 488  488                           */
 489  489                          HTABLE_ENTER(h);
 490  490                          if (ht->ht_busy != 1 || ht->ht_valid_cnt != 0 ||
 491  491                              ht->ht_lock_cnt != 0) {
 492  492                                  --ht->ht_busy;
 493  493                                  continue;
 494  494                          }
 495  495  
 496  496                          /*
 497  497                           * Steal it and unlink the page table.
 498  498                           */
 499  499                          higher = ht->ht_parent;
 500  500                          unlink_ptp(higher, ht, ht->ht_vaddr);
 501  501  
 502  502                          /*
 503  503                           * remove from the hash list
 504  504                           */
 505  505                          if (ht->ht_next)
 506  506                                  ht->ht_next->ht_prev = ht->ht_prev;
 507  507  
 508  508                          if (ht->ht_prev) {
 509  509                                  ht->ht_prev->ht_next = ht->ht_next;
 510  510                          } else {
 511  511                                  ASSERT(hat->hat_ht_hash[h] == ht);
 512  512                                  hat->hat_ht_hash[h] = ht->ht_next;
 513  513                          }
 514  514  
 515  515                          /*
 516  516                           * Break to outer loop to release the
 517  517                           * higher (ht_parent) pagetable. This
 518  518                           * spreads out the pain caused by
 519  519                           * pagefaults.
 520  520                           */
 521  521                          ht->ht_next = *list;
 522  522                          *list = ht;
 523  523                          ++*stolen;
 524  524                          break;
 525  525                  }
 526  526                  HTABLE_EXIT(h);
 527  527                  if (higher != NULL)
 528  528                          htable_release(higher);
 529  529                  if (++h == hat->hat_num_hash)
 530  530                          h = 0;
 531  531          } while (*stolen < cnt && h != h_start);
 532  532  }
 533  533  
 534  534  /*
 535  535   * Move hat to the end of the kas list
 536  536   */
 537  537  static void
 538  538  move_victim(hat_t *hat)
 539  539  {
 540  540          ASSERT(MUTEX_HELD(&hat_list_lock));
 541  541  
 542  542          /* unlink victim hat */
 543  543          if (hat->hat_prev)
 544  544                  hat->hat_prev->hat_next = hat->hat_next;
 545  545          else
 546  546                  kas.a_hat->hat_next = hat->hat_next;
 547  547  
 548  548          if (hat->hat_next)
 549  549                  hat->hat_next->hat_prev = hat->hat_prev;
 550  550          else
 551  551                  kas.a_hat->hat_prev = hat->hat_prev;
 552  552          /* relink at end of hat list */
 553  553          hat->hat_next = NULL;
 554  554          hat->hat_prev = kas.a_hat->hat_prev;
 555  555          if (hat->hat_prev)
 556  556                  hat->hat_prev->hat_next = hat;
 557  557          else
 558  558                  kas.a_hat->hat_next = hat;
 559  559  
 560  560          kas.a_hat->hat_prev = hat;
 561  561  }
 562  562  
 563  563  /*
 564  564   * This routine steals htables from user processes.  Called by htable_reap
 565  565   * (reap=TRUE) or htable_alloc (reap=FALSE).
 566  566   */
 567  567  static htable_t *
 568  568  htable_steal(uint_t cnt, boolean_t reap)
 569  569  {
 570  570          hat_t           *hat = kas.a_hat;       /* list starts with khat */
 571  571          htable_t        *list = NULL;
 572  572          htable_t        *ht;
 573  573          uint_t          stolen = 0;
 574  574          uint_t          pass, passes;
 575  575          uint_t          threshold;
 576  576  
 577  577          /*
 578  578           * Limit htable_steal_passes to something reasonable
 579  579           */
 580  580          if (htable_steal_passes == 0)
 581  581                  htable_steal_passes = 1;
 582  582          if (htable_steal_passes > mmu.ptes_per_table)
 583  583                  htable_steal_passes = mmu.ptes_per_table;
 584  584  
 585  585          /*
 586  586           * If we're stealing merely as part of kmem reaping (versus stealing
 587  587           * to assure forward progress), we don't want to actually steal any
 588  588           * active htables.  (Stealing active htables merely to give memory
 589  589           * back to the system can inadvertently kick off an htable crime wave
 590  590           * as active processes repeatedly steal htables from one another,
 591  591           * plummeting the system into a kind of HAT lawlessness that can
 592  592           * become so violent as to impede the one thing that can end it:  the
 593  593           * freeing of memory via ARC reclaim and other means.)  So if we're
 594  594           * reaping, we limit ourselves to the first pass that steals cached
 595  595           * htables that aren't in use -- which gives memory back, but averts
 596  596           * the entire breakdown of social order.
 597  597           */
 598  598          passes = reap ? 0 : htable_steal_passes;
 599  599  
 600  600          /*
 601  601           * Loop through all user hats. The 1st pass takes cached htables that
 602  602           * aren't in use. The later passes steal by removing mappings, too.
 603  603           */
 604  604          atomic_inc_32(&htable_dont_cache);
 605  605          for (pass = 0; pass <= passes && stolen < cnt; ++pass) {
 606  606                  threshold = pass * mmu.ptes_per_table / htable_steal_passes;
 607  607  
 608  608                  mutex_enter(&hat_list_lock);
 609  609  
 610  610                  /* skip the first hat (kernel) */
 611  611                  hat = kas.a_hat->hat_next;
 612  612                  for (;;) {
 613  613                          /*

↓ open down ↓

446 lines elided

↑ open up ↑

 614  614                           * Skip any hat that is already being stolen from.
 615  615                           *
 616  616                           * We skip SHARED hats, as these are dummy
 617  617                           * hats that host ISM shared page tables.
 618  618                           *
 619  619                           * We also skip if HAT_FREEING because hat_pte_unmap()
 620  620                           * won't zero out the PTEs. That would lead to hitting
 621  621                           * stale PTEs either here or under hat_unload() when we
 622  622                           * steal and unload the same page table in competing
 623  623                           * threads.
      624 +                         *
      625 +                         * We skip HATs that belong to CPUs, to make our lives
      626 +                         * simpler.
 624  627                           */
 625      -                        while (hat != NULL &&
 626      -                            (hat->hat_flags &
 627      -                            (HAT_VICTIM | HAT_SHARED | HAT_FREEING)) != 0)
      628 +                        while (hat != NULL && (hat->hat_flags &
      629 +                            (HAT_VICTIM | HAT_SHARED | HAT_FREEING |
      630 +                            HAT_PCP)) != 0) {
 628  631                                  hat = hat->hat_next;
      632 +                        }
 629  633  
 630  634                          if (hat == NULL)
 631  635                                  break;
 632  636  
 633  637                          /*
 634  638                           * Mark the HAT as a stealing victim so that it is
 635  639                           * not freed from under us, e.g. in as_free()
 636  640                           */
 637  641                          hat->hat_flags |= HAT_VICTIM;
 638  642                          mutex_exit(&hat_list_lock);

 639  643  
 640  644                          /*
 641  645                           * Take any htables from the hat's cached "free" list.
 642  646                           */
 643  647                          hat_enter(hat);
 644  648                          while ((ht = hat->hat_ht_cached) != NULL &&
 645  649                              stolen < cnt) {
 646  650                                  hat->hat_ht_cached = ht->ht_next;
 647  651                                  ht->ht_next = list;
 648  652                                  list = ht;
 649  653                                  ++stolen;
 650  654                          }
 651  655                          hat_exit(hat);
 652  656  
 653  657                          /*
 654  658                           * Don't steal active htables on first pass.
 655  659                           */
 656  660                          if (pass != 0 && (stolen < cnt))
 657  661                                  htable_steal_active(hat, cnt, threshold,
 658  662                                      &stolen, &list);
 659  663  
 660  664                          /*

↓ open down ↓

22 lines elided

↑ open up ↑

 661  665                           * do synchronous teardown for the reap case so that
 662  666                           * we can forget hat; at this time, hat is
 663  667                           * guaranteed to be around because HAT_VICTIM is set
 664  668                           * (see htable_free() for similar code)
 665  669                           */
 666  670                          for (ht = list; (ht) && (reap); ht = ht->ht_next) {
 667  671                                  if (ht->ht_hat == NULL)
 668  672                                          continue;
 669  673                                  ASSERT(ht->ht_hat == hat);
 670  674  #if defined(__xpv) && defined(__amd64)
 671      -                                if (!(ht->ht_flags & HTABLE_VLP) &&
 672      -                                    ht->ht_level == mmu.max_level) {
      675 +                                ASSERT(!(ht->ht_flags & HTABLE_COPIED));
      676 +                                if (ht->ht_level == mmu.max_level) {
 673  677                                          ptable_free(hat->hat_user_ptable);
 674  678                                          hat->hat_user_ptable = PFN_INVALID;
 675  679                                  }
 676  680  #endif
 677  681                                  /*
 678  682                                   * forget the hat
 679  683                                   */
 680  684                                  ht->ht_hat = NULL;
 681  685                          }
 682  686

 683  687                          mutex_enter(&hat_list_lock);
 684  688  
 685  689                          /*
 686  690                           * Are we finished?
 687  691                           */
 688  692                          if (stolen == cnt) {
 689  693                                  /*
 690  694                                   * Try to spread the pain of stealing,
 691  695                                   * move victim HAT to the end of the HAT list.
 692  696                                   */
 693  697                                  if (pass >= 1 && cnt == 1 &&
 694  698                                      kas.a_hat->hat_prev != hat)
 695  699                                          move_victim(hat);
 696  700                                  /*
 697  701                                   * We are finished
 698  702                                   */
 699  703                          }
 700  704  
 701  705                          /*
 702  706                           * Clear the victim flag, hat can go away now (once
 703  707                           * the lock is dropped)
 704  708                           */
 705  709                          if (hat->hat_flags & HAT_VICTIM) {
 706  710                                  ASSERT(hat != kas.a_hat);
 707  711                                  hat->hat_flags &= ~HAT_VICTIM;
 708  712                                  cv_broadcast(&hat_list_cv);
 709  713                          }
 710  714  
 711  715                          /* move on to the next hat */
 712  716                          hat = hat->hat_next;
 713  717                  }
 714  718  
 715  719                  mutex_exit(&hat_list_lock);
 716  720  
 717  721          }
 718  722          ASSERT(!MUTEX_HELD(&hat_list_lock));
 719  723  
 720  724          atomic_dec_32(&htable_dont_cache);
 721  725          return (list);
 722  726  }
 723  727  
 724  728  /*
 725  729   * This is invoked from kmem when the system is low on memory.  We try
 726  730   * to free hments, htables, and ptables to improve the memory situation.
 727  731   */
 728  732  /*ARGSUSED*/
 729  733  static void
 730  734  htable_reap(void *handle)
 731  735  {
 732  736          uint_t          reap_cnt;
 733  737          htable_t        *list;
 734  738          htable_t        *ht;
 735  739  
 736  740          HATSTAT_INC(hs_reap_attempts);
 737  741          if (!can_steal_post_boot)
 738  742                  return;
 739  743  
 740  744          /*
 741  745           * Try to reap 5% of the page tables bounded by a maximum of
 742  746           * 5% of physmem and a minimum of 10.
 743  747           */
 744  748          reap_cnt = MAX(MIN(physmem / 20, active_ptables / 20), 10);
 745  749  
 746  750          /*
 747  751           * Note: htable_dont_cache should be set at the time of
 748  752           * invoking htable_free()
 749  753           */
 750  754          atomic_inc_32(&htable_dont_cache);
 751  755          /*
 752  756           * Let htable_steal() do the work, we just call htable_free()
 753  757           */
 754  758          XPV_DISALLOW_MIGRATE();
 755  759          list = htable_steal(reap_cnt, B_TRUE);
 756  760          XPV_ALLOW_MIGRATE();
 757  761          while ((ht = list) != NULL) {
 758  762                  list = ht->ht_next;
 759  763                  HATSTAT_INC(hs_reaped);
 760  764                  htable_free(ht);
 761  765          }
 762  766          atomic_dec_32(&htable_dont_cache);
 763  767  
 764  768          /*
 765  769           * Free up excess reserves
 766  770           */
 767  771          htable_adjust_reserve();
 768  772          hment_adjust_reserve();
 769  773  }
 770  774  
 771  775  /*

↓ open down ↓

89 lines elided

↑ open up ↑

 772  776   * Allocate an htable, stealing one or using the reserve if necessary
 773  777   */
 774  778  static htable_t *
 775  779  htable_alloc(
 776  780          hat_t           *hat,
 777  781          uintptr_t       vaddr,
 778  782          level_t         level,
 779  783          htable_t        *shared)
 780  784  {
 781  785          htable_t        *ht = NULL;
 782      -        uint_t          is_vlp;
      786 +        uint_t          is_copied;
 783  787          uint_t          is_bare = 0;
 784  788          uint_t          need_to_zero = 1;
 785  789          int             kmflags = (can_steal_post_boot ? KM_NOSLEEP : KM_SLEEP);
 786  790  
 787  791          if (level < 0 || level > TOP_LEVEL(hat))
 788  792                  panic("htable_alloc(): level %d out of range\n", level);
 789  793  
 790      -        is_vlp = (hat->hat_flags & HAT_VLP) && level == VLP_LEVEL;
 791      -        if (is_vlp || shared != NULL)
      794 +        is_copied = (hat->hat_flags & HAT_COPIED) &&
      795 +            level == hat->hat_max_level;
      796 +        if (is_copied || shared != NULL)
 792  797                  is_bare = 1;
 793  798  
 794  799          /*
 795  800           * First reuse a cached htable from the hat_ht_cached field, this
 796  801           * avoids unnecessary trips through kmem/page allocators.
 797  802           */
 798  803          if (hat->hat_ht_cached != NULL && !is_bare) {
 799  804                  hat_enter(hat);
 800  805                  ht = hat->hat_ht_cached;
 801  806                  if (ht != NULL) {

 802  807                          hat->hat_ht_cached = ht->ht_next;
 803  808                          need_to_zero = 0;
 804  809                          /* XX64 ASSERT() they're all zero somehow */
 805  810                          ASSERT(ht->ht_pfn != PFN_INVALID);
 806  811                  }
 807  812                  hat_exit(hat);
 808  813          }
 809  814  
 810  815          if (ht == NULL) {
 811  816                  /*
 812  817                   * Allocate an htable, possibly refilling the reserves.
 813  818                   */
 814  819                  if (USE_HAT_RESERVES()) {
 815  820                          ht = htable_get_reserve();
 816  821                  } else {
 817  822                          /*
 818  823                           * Donate successful htable allocations to the reserve.
 819  824                           */
 820  825                          for (;;) {
 821  826                                  ht = kmem_cache_alloc(htable_cache, kmflags);
 822  827                                  if (ht == NULL)
 823  828                                          break;
 824  829                                  ht->ht_pfn = PFN_INVALID;
 825  830                                  if (USE_HAT_RESERVES() ||
 826  831                                      htable_reserve_cnt >= htable_reserve_amount)
 827  832                                          break;
 828  833                                  htable_put_reserve(ht);
 829  834                          }
 830  835                  }
 831  836  
 832  837                  /*
 833  838                   * allocate a page for the hardware page table if needed
 834  839                   */
 835  840                  if (ht != NULL && !is_bare) {
 836  841                          ht->ht_hat = hat;
 837  842                          ht->ht_pfn = ptable_alloc((uintptr_t)ht);
 838  843                          if (ht->ht_pfn == PFN_INVALID) {
 839  844                                  if (USE_HAT_RESERVES())
 840  845                                          htable_put_reserve(ht);
 841  846                                  else
 842  847                                          kmem_cache_free(htable_cache, ht);
 843  848                                  ht = NULL;
 844  849                          }
 845  850                  }
 846  851          }
 847  852  
 848  853          /*
 849  854           * If allocations failed, kick off a kmem_reap() and resort to
 850  855           * htable_steal(). We may spin here if the system is very low on
 851  856           * memory. If the kernel itself has consumed all memory and kmem_reap()
 852  857           * can't free up anything, then we'll really get stuck here.
 853  858           * That should only happen in a system where the administrator has
 854  859           * misconfigured VM parameters via /etc/system.
 855  860           */
 856  861          while (ht == NULL && can_steal_post_boot) {
 857  862                  kmem_reap();
 858  863                  ht = htable_steal(1, B_FALSE);
 859  864                  HATSTAT_INC(hs_steals);
 860  865  
 861  866                  /*
 862  867                   * If we stole for a bare htable, release the pagetable page.
 863  868                   */
 864  869                  if (ht != NULL) {
 865  870                          if (is_bare) {
 866  871                                  ptable_free(ht->ht_pfn);
 867  872                                  ht->ht_pfn = PFN_INVALID;
 868  873  #if defined(__xpv) && defined(__amd64)
 869  874                          /*
 870  875                           * make stolen page table writable again in kpm
 871  876                           */
 872  877                          } else if (kpm_vbase && xen_kpm_page(ht->ht_pfn,
 873  878                              PT_VALID | PT_WRITABLE) < 0) {
 874  879                                  panic("failure making kpm r/w pfn=0x%lx",
 875  880                                      ht->ht_pfn);
 876  881  #endif
 877  882                          }
 878  883                  }
 879  884          }
 880  885  
 881  886          /*
 882  887           * All attempts to allocate or steal failed. This should only happen
 883  888           * if we run out of memory during boot, due perhaps to a huge
 884  889           * boot_archive. At this point there's no way to continue.
 885  890           */
 886  891          if (ht == NULL)
 887  892                  panic("htable_alloc(): couldn't steal\n");
 888  893  
 889  894  #if defined(__amd64) && defined(__xpv)
 890  895          /*
 891  896           * Under the 64-bit hypervisor, we have 2 top level page tables.
 892  897           * If this allocation fails, we'll resort to stealing.
 893  898           * We use the stolen page indirectly, by freeing the
 894  899           * stolen htable first.
 895  900           */
 896  901          if (level == mmu.max_level) {
 897  902                  for (;;) {
 898  903                          htable_t *stolen;
 899  904  
 900  905                          hat->hat_user_ptable = ptable_alloc((uintptr_t)ht + 1);
 901  906                          if (hat->hat_user_ptable != PFN_INVALID)
 902  907                                  break;
 903  908                          stolen = htable_steal(1, B_FALSE);
 904  909                          if (stolen == NULL)
 905  910                                  panic("2nd steal ptable failed\n");
 906  911                          htable_free(stolen);
 907  912                  }
 908  913                  block_zero_no_xmm(kpm_vbase + pfn_to_pa(hat->hat_user_ptable),
 909  914                      MMU_PAGESIZE);
 910  915          }
 911  916  #endif
 912  917  
 913  918          /*
 914  919           * Shared page tables have all entries locked and entries may not
 915  920           * be added or deleted.
 916  921           */
 917  922          ht->ht_flags = 0;
 918  923          if (shared != NULL) {
 919  924                  ASSERT(shared->ht_valid_cnt > 0);
 920  925                  ht->ht_flags |= HTABLE_SHARED_PFN;
 921  926                  ht->ht_pfn = shared->ht_pfn;
 922  927                  ht->ht_lock_cnt = 0;

↓ open down ↓

121 lines elided

↑ open up ↑

 923  928                  ht->ht_valid_cnt = 0;           /* updated in hat_share() */
 924  929                  ht->ht_shares = shared;
 925  930                  need_to_zero = 0;
 926  931          } else {
 927  932                  ht->ht_shares = NULL;
 928  933                  ht->ht_lock_cnt = 0;
 929  934                  ht->ht_valid_cnt = 0;
 930  935          }
 931  936  
 932  937          /*
 933      -         * setup flags, etc. for VLP htables
      938 +         * setup flags, etc. for copied page tables.
 934  939           */
 935      -        if (is_vlp) {
 936      -                ht->ht_flags |= HTABLE_VLP;
      940 +        if (is_copied) {
      941 +                ht->ht_flags |= HTABLE_COPIED;
 937  942                  ASSERT(ht->ht_pfn == PFN_INVALID);
 938  943                  need_to_zero = 0;
 939  944          }
 940  945  
 941  946          /*
 942  947           * fill in the htable
 943  948           */
 944  949          ht->ht_hat = hat;
 945  950          ht->ht_parent = NULL;
 946  951          ht->ht_vaddr = vaddr;

 947  952          ht->ht_level = level;
 948  953          ht->ht_busy = 1;
 949  954          ht->ht_next = NULL;
 950  955          ht->ht_prev = NULL;
 951  956  
 952  957          /*
 953  958           * Zero out any freshly allocated page table
 954  959           */
 955  960          if (need_to_zero)
 956  961                  x86pte_zero(ht, 0, mmu.ptes_per_table);
 957  962  
 958  963  #if defined(__amd64) && defined(__xpv)
 959  964          if (!is_bare && kpm_vbase) {
 960  965                  (void) xen_kpm_page(ht->ht_pfn, PT_VALID);
 961  966                  if (level == mmu.max_level)
 962  967                          (void) xen_kpm_page(hat->hat_user_ptable, PT_VALID);
 963  968          }
 964  969  #endif
 965  970  
 966  971          return (ht);
 967  972  }
 968  973  
 969  974  /*
 970  975   * Free up an htable, either to a hat's cached list, the reserves or
 971  976   * back to kmem.
 972  977   */
 973  978  static void
 974  979  htable_free(htable_t *ht)
 975  980  {
 976  981          hat_t *hat = ht->ht_hat;

↓ open down ↓

30 lines elided

↑ open up ↑

 977  982  
 978  983          /*
 979  984           * If the process isn't exiting, cache the free htable in the hat
 980  985           * structure. We always do this for the boot time reserve. We don't
 981  986           * do this if the hat is exiting or we are stealing/reaping htables.
 982  987           */
 983  988          if (hat != NULL &&
 984  989              !(ht->ht_flags & HTABLE_SHARED_PFN) &&
 985  990              (use_boot_reserve ||
 986  991              (!(hat->hat_flags & HAT_FREEING) && !htable_dont_cache))) {
 987      -                ASSERT((ht->ht_flags & HTABLE_VLP) == 0);
      992 +                ASSERT((ht->ht_flags & HTABLE_COPIED) == 0);
 988  993                  ASSERT(ht->ht_pfn != PFN_INVALID);
 989  994                  hat_enter(hat);
 990  995                  ht->ht_next = hat->hat_ht_cached;
 991  996                  hat->hat_ht_cached = ht;
 992  997                  hat_exit(hat);
 993  998                  return;
 994  999          }
 995 1000  
 996 1001          /*
 997 1002           * If we have a hardware page table, free it.
 998 1003           * We don't free page tables that are accessed by sharing.
 999 1004           */
1000 1005          if (ht->ht_flags & HTABLE_SHARED_PFN) {
1001 1006                  ASSERT(ht->ht_pfn != PFN_INVALID);
1002      -        } else if (!(ht->ht_flags & HTABLE_VLP)) {
     1007 +        } else if (!(ht->ht_flags & HTABLE_COPIED)) {
1003 1008                  ptable_free(ht->ht_pfn);
1004 1009  #if defined(__amd64) && defined(__xpv)
1005 1010                  if (ht->ht_level == mmu.max_level && hat != NULL) {
1006 1011                          ptable_free(hat->hat_user_ptable);
1007 1012                          hat->hat_user_ptable = PFN_INVALID;
1008 1013                  }
1009 1014  #endif
1010 1015          }
1011 1016          ht->ht_pfn = PFN_INVALID;
1012 1017

1013 1018          /*
1014 1019           * Free it or put into reserves.
1015 1020           */
1016 1021          if (USE_HAT_RESERVES() || htable_reserve_cnt < htable_reserve_amount) {
1017 1022                  htable_put_reserve(ht);
1018 1023          } else {
1019 1024                  kmem_cache_free(htable_cache, ht);
1020 1025                  htable_adjust_reserve();
1021 1026          }
1022 1027  }
1023 1028  
1024 1029  
1025 1030  /*
1026 1031   * This is called when a hat is being destroyed or swapped out. We reap all
1027 1032   * the remaining htables in the hat cache. If destroying, all left over
1028 1033   * htables are also destroyed.
1029 1034   *
1030 1035   * We also don't need to invalidate any of the PTPs nor do any demapping.
1031 1036   */
1032 1037  void
1033 1038  htable_purge_hat(hat_t *hat)
1034 1039  {
1035 1040          htable_t *ht;
1036 1041          int h;
1037 1042  
1038 1043          /*
1039 1044           * Purge the htable cache if just reaping.
1040 1045           */
1041 1046          if (!(hat->hat_flags & HAT_FREEING)) {
1042 1047                  atomic_inc_32(&htable_dont_cache);
1043 1048                  for (;;) {
1044 1049                          hat_enter(hat);
1045 1050                          ht = hat->hat_ht_cached;
1046 1051                          if (ht == NULL) {
1047 1052                                  hat_exit(hat);
1048 1053                                  break;
1049 1054                          }
1050 1055                          hat->hat_ht_cached = ht->ht_next;
1051 1056                          hat_exit(hat);
1052 1057                          htable_free(ht);
1053 1058                  }
1054 1059                  atomic_dec_32(&htable_dont_cache);
1055 1060                  return;
1056 1061          }
1057 1062  
1058 1063          /*
1059 1064           * if freeing, no locking is needed
1060 1065           */
1061 1066          while ((ht = hat->hat_ht_cached) != NULL) {
1062 1067                  hat->hat_ht_cached = ht->ht_next;
1063 1068                  htable_free(ht);
1064 1069          }
1065 1070  
1066 1071          /*
1067 1072           * walk thru the htable hash table and free all the htables in it.
1068 1073           */
1069 1074          for (h = 0; h < hat->hat_num_hash; ++h) {
1070 1075                  while ((ht = hat->hat_ht_hash[h]) != NULL) {
1071 1076                          if (ht->ht_next)
1072 1077                                  ht->ht_next->ht_prev = ht->ht_prev;
1073 1078  
1074 1079                          if (ht->ht_prev) {
1075 1080                                  ht->ht_prev->ht_next = ht->ht_next;
1076 1081                          } else {
1077 1082                                  ASSERT(hat->hat_ht_hash[h] == ht);
1078 1083                                  hat->hat_ht_hash[h] = ht->ht_next;
1079 1084                          }
1080 1085                          htable_free(ht);
1081 1086                  }
1082 1087          }
1083 1088  }
1084 1089  
1085 1090  /*
1086 1091   * Unlink an entry for a table at vaddr and level out of the existing table
1087 1092   * one level higher. We are always holding HTABLE_ENTER() when doing this.
1088 1093   */
1089 1094  static void
1090 1095  unlink_ptp(htable_t *higher, htable_t *old, uintptr_t vaddr)
1091 1096  {
1092 1097          uint_t          entry = htable_va2entry(vaddr, higher);
1093 1098          x86pte_t        expect = MAKEPTP(old->ht_pfn, old->ht_level);
1094 1099          x86pte_t        found;
1095 1100          hat_t           *hat = old->ht_hat;
1096 1101  
1097 1102          ASSERT(higher->ht_busy > 0);
1098 1103          ASSERT(higher->ht_valid_cnt > 0);
1099 1104          ASSERT(old->ht_valid_cnt == 0);
1100 1105          found = x86pte_cas(higher, entry, expect, 0);
1101 1106  #ifdef __xpv
1102 1107          /*
1103 1108           * This is weird, but Xen apparently automatically unlinks empty

↓ open down ↓

91 lines elided

↑ open up ↑

1104 1109           * pagetables from the upper page table. So allow PTP to be 0 already.
1105 1110           */
1106 1111          if (found != expect && found != 0)
1107 1112  #else
1108 1113          if (found != expect)
1109 1114  #endif
1110 1115                  panic("Bad PTP found=" FMT_PTE ", expected=" FMT_PTE,
1111 1116                      found, expect);
1112 1117  
1113 1118          /*
1114      -         * When a top level VLP page table entry changes, we must issue
1115      -         * a reload of cr3 on all processors.
     1119 +         * When a top level PTE changes for a copied htable, we must trigger a
     1120 +         * hat_pcp_update() on all HAT CPUs.
1116 1121           *
1117      -         * If we don't need do do that, then we still have to INVLPG against
1118      -         * an address covered by the inner page table, as the latest processors
     1122 +         * If we don't need to do that, then we still have to INVLPG against an
     1123 +         * address covered by the inner page table, as the latest processors
1119 1124           * have TLB-like caches for non-leaf page table entries.
1120 1125           */
1121 1126          if (!(hat->hat_flags & HAT_FREEING)) {
1122      -                hat_tlb_inval(hat, (higher->ht_flags & HTABLE_VLP) ?
     1127 +                hat_tlb_inval(hat, (higher->ht_flags & HTABLE_COPIED) ?
1123 1128                      DEMAP_ALL_ADDR : old->ht_vaddr);
1124 1129          }
1125 1130  
1126 1131          HTABLE_DEC(higher->ht_valid_cnt);
1127 1132  }
1128 1133  
1129 1134  /*
1130 1135   * Link an entry for a new table at vaddr and level into the existing table
1131 1136   * one level higher. We are always holding HTABLE_ENTER() when doing this.
1132 1137   */

1133 1138  static void
1134 1139  link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr)
1135 1140  {
1136 1141          uint_t          entry = htable_va2entry(vaddr, higher);
1137 1142          x86pte_t        newptp = MAKEPTP(new->ht_pfn, new->ht_level);
1138 1143          x86pte_t        found;
1139 1144  
1140 1145          ASSERT(higher->ht_busy > 0);

↓ open down ↓

8 lines elided

↑ open up ↑

1141 1146  
1142 1147          ASSERT(new->ht_level != mmu.max_level);
1143 1148  
1144 1149          HTABLE_INC(higher->ht_valid_cnt);
1145 1150  
1146 1151          found = x86pte_cas(higher, entry, 0, newptp);
1147 1152          if ((found & ~PT_REF) != 0)
1148 1153                  panic("HAT: ptp not 0, found=" FMT_PTE, found);
1149 1154  
1150 1155          /*
1151      -         * When any top level VLP page table entry changes, we must issue
1152      -         * a reload of cr3 on all processors using it.
     1156 +         * When a top level PTE changes for a copied htable, we must trigger a
     1157 +         * hat_pcp_update() on all HAT CPUs.
     1158 +         *
1153 1159           * We also need to do this for the kernel hat on PAE 32 bit kernel.
1154 1160           */
1155 1161          if (
1156 1162  #ifdef __i386
1157      -            (higher->ht_hat == kas.a_hat && higher->ht_level == VLP_LEVEL) ||
     1163 +            (higher->ht_hat == kas.a_hat &&
     1164 +            higher->ht_level == higher->ht_hat->hat_max_level) ||
1158 1165  #endif
1159      -            (higher->ht_flags & HTABLE_VLP))
     1166 +            (higher->ht_flags & HTABLE_COPIED))
1160 1167                  hat_tlb_inval(higher->ht_hat, DEMAP_ALL_ADDR);
1161 1168  }
1162 1169  
1163 1170  /*
1164 1171   * Release of hold on an htable. If this is the last use and the pagetable
1165 1172   * is empty we may want to free it, then recursively look at the pagetable
1166 1173   * above it. The recursion is handled by the outer while() loop.
1167 1174   *
1168 1175   * On the metal, during process exit, we don't bother unlinking the tables from
1169 1176   * upper level pagetables. They are instead handled in bulk by hat_free_end().

1170 1177   * We can't do this on the hypervisor as we need the page table to be
1171 1178   * implicitly unpinned before it goes to the free page lists. This can't
1172 1179   * happen unless we fully unlink it from the page table hierarchy.
1173 1180   */
1174 1181  void
1175 1182  htable_release(htable_t *ht)
1176 1183  {
1177 1184          uint_t          hashval;
1178 1185          htable_t        *shared;
1179 1186          htable_t        *higher;
1180 1187          hat_t           *hat;
1181 1188          uintptr_t       va;
1182 1189          level_t         level;
1183 1190  
1184 1191          while (ht != NULL) {
1185 1192                  shared = NULL;
1186 1193                  for (;;) {
1187 1194                          hat = ht->ht_hat;
1188 1195                          va = ht->ht_vaddr;
1189 1196                          level = ht->ht_level;
1190 1197                          hashval = HTABLE_HASH(hat, va, level);
1191 1198  
1192 1199                          /*
1193 1200                           * The common case is that this isn't the last use of
1194 1201                           * an htable so we don't want to free the htable.
1195 1202                           */
1196 1203                          HTABLE_ENTER(hashval);
1197 1204                          ASSERT(ht->ht_valid_cnt >= 0);
1198 1205                          ASSERT(ht->ht_busy > 0);
1199 1206                          if (ht->ht_valid_cnt > 0)
1200 1207                                  break;
1201 1208                          if (ht->ht_busy > 1)
1202 1209                                  break;
1203 1210                          ASSERT(ht->ht_lock_cnt == 0);
1204 1211  
1205 1212  #if !defined(__xpv)
1206 1213                          /*
1207 1214                           * we always release empty shared htables
1208 1215                           */
1209 1216                          if (!(ht->ht_flags & HTABLE_SHARED_PFN)) {
1210 1217  
1211 1218                                  /*
1212 1219                                   * don't release if in address space tear down
1213 1220                                   */
1214 1221                                  if (hat->hat_flags & HAT_FREEING)
1215 1222                                          break;
1216 1223  
1217 1224                                  /*
1218 1225                                   * At and above max_page_level, free if it's for
1219 1226                                   * a boot-time kernel mapping below kernelbase.
1220 1227                                   */
1221 1228                                  if (level >= mmu.max_page_level &&
1222 1229                                      (hat != kas.a_hat || va >= kernelbase))
1223 1230                                          break;
1224 1231                          }
1225 1232  #endif /* __xpv */
1226 1233  
1227 1234                          /*
1228 1235                           * Remember if we destroy an htable that shares its PFN
1229 1236                           * from elsewhere.
1230 1237                           */
1231 1238                          if (ht->ht_flags & HTABLE_SHARED_PFN) {
1232 1239                                  ASSERT(shared == NULL);
1233 1240                                  shared = ht->ht_shares;
1234 1241                                  HATSTAT_INC(hs_htable_unshared);
1235 1242                          }
1236 1243  
1237 1244                          /*
1238 1245                           * Handle release of a table and freeing the htable_t.
1239 1246                           * Unlink it from the table higher (ie. ht_parent).
1240 1247                           */
1241 1248                          higher = ht->ht_parent;
1242 1249                          ASSERT(higher != NULL);
1243 1250  
1244 1251                          /*
1245 1252                           * Unlink the pagetable.
1246 1253                           */
1247 1254                          unlink_ptp(higher, ht, va);
1248 1255  
1249 1256                          /*
1250 1257                           * remove this htable from its hash list
1251 1258                           */
1252 1259                          if (ht->ht_next)
1253 1260                                  ht->ht_next->ht_prev = ht->ht_prev;
1254 1261  
1255 1262                          if (ht->ht_prev) {
1256 1263                                  ht->ht_prev->ht_next = ht->ht_next;
1257 1264                          } else {
1258 1265                                  ASSERT(hat->hat_ht_hash[hashval] == ht);
1259 1266                                  hat->hat_ht_hash[hashval] = ht->ht_next;
1260 1267                          }
1261 1268                          HTABLE_EXIT(hashval);
1262 1269                          htable_free(ht);
1263 1270                          ht = higher;
1264 1271                  }
1265 1272  
1266 1273                  ASSERT(ht->ht_busy >= 1);
1267 1274                  --ht->ht_busy;
1268 1275                  HTABLE_EXIT(hashval);
1269 1276  
1270 1277                  /*
1271 1278                   * If we released a shared htable, do a release on the htable
1272 1279                   * from which it shared
1273 1280                   */
1274 1281                  ht = shared;
1275 1282          }
1276 1283  }
1277 1284  
1278 1285  /*
1279 1286   * Find the htable for the pagetable at the given level for the given address.
1280 1287   * If found, acquires a hold that eventually needs to be htable_release()d.
1281 1288   */
1282 1289  htable_t *
1283 1290  htable_lookup(hat_t *hat, uintptr_t vaddr, level_t level)
1284 1291  {
1285 1292          uintptr_t       base;
1286 1293          uint_t          hashval;
1287 1294          htable_t        *ht = NULL;

↓ open down ↓

118 lines elided

↑ open up ↑

1288 1295  
1289 1296          ASSERT(level >= 0);
1290 1297          ASSERT(level <= TOP_LEVEL(hat));
1291 1298  
1292 1299          if (level == TOP_LEVEL(hat)) {
1293 1300  #if defined(__amd64)
1294 1301                  /*
1295 1302                   * 32 bit address spaces on 64 bit kernels need to check
1296 1303                   * for overflow of the 32 bit address space
1297 1304                   */
1298      -                if ((hat->hat_flags & HAT_VLP) && vaddr >= ((uint64_t)1 << 32))
     1305 +                if ((hat->hat_flags & HAT_COPIED_32) &&
     1306 +                    vaddr >= ((uint64_t)1 << 32))
1299 1307                          return (NULL);
1300 1308  #endif
1301 1309                  base = 0;
1302 1310          } else {
1303 1311                  base = vaddr & LEVEL_MASK(level + 1);
1304 1312          }
1305 1313  
1306 1314          hashval = HTABLE_HASH(hat, base, level);
1307 1315          HTABLE_ENTER(hashval);
1308 1316          for (ht = hat->hat_ht_hash[hashval]; ht; ht = ht->ht_next) {

1309 1317                  if (ht->ht_hat == hat &&
1310 1318                      ht->ht_vaddr == base &&
1311 1319                      ht->ht_level == level)
1312 1320                          break;
1313 1321          }
1314 1322          if (ht)
1315 1323                  ++ht->ht_busy;
1316 1324  
1317 1325          HTABLE_EXIT(hashval);
1318 1326          return (ht);
1319 1327  }
1320 1328  
1321 1329  /*
1322 1330   * Acquires a hold on a known htable (from a locked hment entry).
1323 1331   */
1324 1332  void
1325 1333  htable_acquire(htable_t *ht)
1326 1334  {
1327 1335          hat_t           *hat = ht->ht_hat;
1328 1336          level_t         level = ht->ht_level;
1329 1337          uintptr_t       base = ht->ht_vaddr;
1330 1338          uint_t          hashval = HTABLE_HASH(hat, base, level);
1331 1339  
1332 1340          HTABLE_ENTER(hashval);
1333 1341  #ifdef DEBUG
1334 1342          /*
1335 1343           * make sure the htable is there
1336 1344           */
1337 1345          {
1338 1346                  htable_t        *h;
1339 1347  
1340 1348                  for (h = hat->hat_ht_hash[hashval];
1341 1349                      h && h != ht;
1342 1350                      h = h->ht_next)
1343 1351                          ;
1344 1352                  ASSERT(h == ht);
1345 1353          }
1346 1354  #endif /* DEBUG */
1347 1355          ++ht->ht_busy;
1348 1356          HTABLE_EXIT(hashval);
1349 1357  }
1350 1358  
1351 1359  /*
1352 1360   * Find the htable for the pagetable at the given level for the given address.
1353 1361   * If found, acquires a hold that eventually needs to be htable_release()d.
1354 1362   * If not found the table is created.
1355 1363   *
1356 1364   * Since we can't hold a hash table mutex during allocation, we have to
1357 1365   * drop it and redo the search on a create. Then we may have to free the newly
1358 1366   * allocated htable if another thread raced in and created it ahead of us.
1359 1367   */
1360 1368  htable_t *
1361 1369  htable_create(
1362 1370          hat_t           *hat,
1363 1371          uintptr_t       vaddr,
1364 1372          level_t         level,
1365 1373          htable_t        *shared)
1366 1374  {
1367 1375          uint_t          h;
1368 1376          level_t         l;
1369 1377          uintptr_t       base;
1370 1378          htable_t        *ht;
1371 1379          htable_t        *higher = NULL;
1372 1380          htable_t        *new = NULL;
1373 1381  
1374 1382          if (level < 0 || level > TOP_LEVEL(hat))
1375 1383                  panic("htable_create(): level %d out of range\n", level);
1376 1384  
1377 1385          /*
1378 1386           * Create the page tables in top down order.
1379 1387           */
1380 1388          for (l = TOP_LEVEL(hat); l >= level; --l) {
1381 1389                  new = NULL;
1382 1390                  if (l == TOP_LEVEL(hat))
1383 1391                          base = 0;
1384 1392                  else
1385 1393                          base = vaddr & LEVEL_MASK(l + 1);
1386 1394  
1387 1395                  h = HTABLE_HASH(hat, base, l);
1388 1396  try_again:
1389 1397                  /*
1390 1398                   * look up the htable at this level
1391 1399                   */
1392 1400                  HTABLE_ENTER(h);
1393 1401                  if (l == TOP_LEVEL(hat)) {
1394 1402                          ht = hat->hat_htable;
1395 1403                  } else {
1396 1404                          for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
1397 1405                                  ASSERT(ht->ht_hat == hat);
1398 1406                                  if (ht->ht_vaddr == base &&
1399 1407                                      ht->ht_level == l)
1400 1408                                          break;
1401 1409                          }
1402 1410                  }
1403 1411  
1404 1412                  /*
1405 1413                   * if we found the htable, increment its busy cnt
1406 1414                   * and if we had allocated a new htable, free it.
1407 1415                   */
1408 1416                  if (ht != NULL) {
1409 1417                          /*
1410 1418                           * If we find a pre-existing shared table, it must
1411 1419                           * share from the same place.
1412 1420                           */
1413 1421                          if (l == level && shared && ht->ht_shares &&
1414 1422                              ht->ht_shares != shared) {
1415 1423                                  panic("htable shared from wrong place "
1416 1424                                      "found htable=%p shared=%p",
1417 1425                                      (void *)ht, (void *)shared);
1418 1426                          }
1419 1427                          ++ht->ht_busy;
1420 1428                          HTABLE_EXIT(h);
1421 1429                          if (new)
1422 1430                                  htable_free(new);
1423 1431                          if (higher != NULL)
1424 1432                                  htable_release(higher);
1425 1433                          higher = ht;
1426 1434  
1427 1435                  /*
1428 1436                   * if we didn't find it on the first search
1429 1437                   * allocate a new one and search again
1430 1438                   */
1431 1439                  } else if (new == NULL) {
1432 1440                          HTABLE_EXIT(h);
1433 1441                          new = htable_alloc(hat, base, l,
1434 1442                              l == level ? shared : NULL);
1435 1443                          goto try_again;
1436 1444  
1437 1445                  /*
1438 1446                   * 2nd search and still not there, use "new" table
1439 1447                   * Link new table into higher, when not at top level.
1440 1448                   */
1441 1449                  } else {
1442 1450                          ht = new;
1443 1451                          if (higher != NULL) {
1444 1452                                  link_ptp(higher, ht, base);
1445 1453                                  ht->ht_parent = higher;
1446 1454                          }
1447 1455                          ht->ht_next = hat->hat_ht_hash[h];
1448 1456                          ASSERT(ht->ht_prev == NULL);
1449 1457                          if (hat->hat_ht_hash[h])
1450 1458                                  hat->hat_ht_hash[h]->ht_prev = ht;
1451 1459                          hat->hat_ht_hash[h] = ht;
1452 1460                          HTABLE_EXIT(h);
1453 1461  
1454 1462                          /*
1455 1463                           * Note we don't do htable_release(higher).
1456 1464                           * That happens recursively when "new" is removed by
1457 1465                           * htable_release() or htable_steal().
1458 1466                           */
1459 1467                          higher = ht;
1460 1468  
1461 1469                          /*
1462 1470                           * If we just created a new shared page table we
1463 1471                           * increment the shared htable's busy count, so that
1464 1472                           * it can't be the victim of a steal even if it's empty.
1465 1473                           */
1466 1474                          if (l == level && shared) {
1467 1475                                  (void) htable_lookup(shared->ht_hat,
1468 1476                                      shared->ht_vaddr, shared->ht_level);
1469 1477                                  HATSTAT_INC(hs_htable_shared);
1470 1478                          }
1471 1479                  }
1472 1480          }
1473 1481  
1474 1482          return (ht);
1475 1483  }
1476 1484  
1477 1485  /*
1478 1486   * Inherit initial pagetables from the boot program. On the 64-bit
1479 1487   * hypervisor we also temporarily mark the p_index field of page table
1480 1488   * pages, so we know not to try making them writable in seg_kpm.
1481 1489   */
1482 1490  void
1483 1491  htable_attach(
1484 1492          hat_t *hat,
1485 1493          uintptr_t base,
1486 1494          level_t level,
1487 1495          htable_t *parent,
1488 1496          pfn_t pfn)
1489 1497  {
1490 1498          htable_t        *ht;
1491 1499          uint_t          h;
1492 1500          uint_t          i;
1493 1501          x86pte_t        pte;
1494 1502          x86pte_t        *ptep;
1495 1503          page_t          *pp;
1496 1504          extern page_t   *boot_claim_page(pfn_t);
1497 1505  
1498 1506          ht = htable_get_reserve();
1499 1507          if (level == mmu.max_level)
1500 1508                  kas.a_hat->hat_htable = ht;
1501 1509          ht->ht_hat = hat;
1502 1510          ht->ht_parent = parent;
1503 1511          ht->ht_vaddr = base;
1504 1512          ht->ht_level = level;
1505 1513          ht->ht_busy = 1;
1506 1514          ht->ht_next = NULL;
1507 1515          ht->ht_prev = NULL;
1508 1516          ht->ht_flags = 0;
1509 1517          ht->ht_pfn = pfn;
1510 1518          ht->ht_lock_cnt = 0;
1511 1519          ht->ht_valid_cnt = 0;
1512 1520          if (parent != NULL)
1513 1521                  ++parent->ht_busy;
1514 1522  
1515 1523          h = HTABLE_HASH(hat, base, level);
1516 1524          HTABLE_ENTER(h);
1517 1525          ht->ht_next = hat->hat_ht_hash[h];
1518 1526          ASSERT(ht->ht_prev == NULL);
1519 1527          if (hat->hat_ht_hash[h])
1520 1528                  hat->hat_ht_hash[h]->ht_prev = ht;
1521 1529          hat->hat_ht_hash[h] = ht;
1522 1530          HTABLE_EXIT(h);
1523 1531  
1524 1532          /*
1525 1533           * make sure the page table physical page is not FREE
1526 1534           */
1527 1535          if (page_resv(1, KM_NOSLEEP) == 0)
1528 1536                  panic("page_resv() failed in ptable alloc");
1529 1537  
1530 1538          pp = boot_claim_page(pfn);
1531 1539          ASSERT(pp != NULL);
1532 1540  
1533 1541          /*
1534 1542           * Page table pages that were allocated by dboot or
1535 1543           * in very early startup didn't go through boot_mapin()
1536 1544           * and so won't have vnode/offsets. Fix that here.
1537 1545           */
1538 1546          if (pp->p_vnode == NULL) {
1539 1547                  /* match offset calculation in page_get_physical() */
1540 1548                  u_offset_t offset = (uintptr_t)ht;
1541 1549                  if (offset > kernelbase)
1542 1550                          offset -= kernelbase;
1543 1551                  offset <<= MMU_PAGESHIFT;
1544 1552  #if defined(__amd64)
1545 1553                  offset += mmu.hole_start;       /* something in VA hole */
1546 1554  #else
1547 1555                  offset += 1ULL << 40;           /* something > 4 Gig */
1548 1556  #endif
1549 1557                  ASSERT(page_exists(&kvp, offset) == NULL);
1550 1558                  (void) page_hashin(pp, &kvp, offset, NULL);
1551 1559          }
1552 1560          page_downgrade(pp);
1553 1561  #if defined(__xpv) && defined(__amd64)
1554 1562          /*
1555 1563           * Record in the page_t that this is a pagetable, for segkpm setup.
1556 1564           */
1557 1565          if (kpm_vbase)
1558 1566                  pp->p_index = 1;
1559 1567  #endif
1560 1568  
1561 1569          /*
1562 1570           * Count valid mappings and recursively attach lower level pagetables.
1563 1571           */
1564 1572          ptep = kbm_remap_window(pfn_to_pa(pfn), 0);
1565 1573          for (i = 0; i < HTABLE_NUM_PTES(ht); ++i) {
1566 1574                  if (mmu.pae_hat)
1567 1575                          pte = ptep[i];
1568 1576                  else
1569 1577                          pte = ((x86pte32_t *)ptep)[i];
1570 1578                  if (!IN_HYPERVISOR_VA(base) && PTE_ISVALID(pte)) {
1571 1579                          ++ht->ht_valid_cnt;
1572 1580                          if (!PTE_ISPAGE(pte, level)) {
1573 1581                                  htable_attach(hat, base, level - 1,
1574 1582                                      ht, PTE2PFN(pte, level));
1575 1583                                  ptep = kbm_remap_window(pfn_to_pa(pfn), 0);
1576 1584                          }
1577 1585                  }
1578 1586                  base += LEVEL_SIZE(level);
1579 1587                  if (base == mmu.hole_start)
1580 1588                          base = (mmu.hole_end + MMU_PAGEOFFSET) & MMU_PAGEMASK;
1581 1589          }
1582 1590  
1583 1591          /*
1584 1592           * As long as all the mappings we had were below kernel base
1585 1593           * we can release the htable.
1586 1594           */
1587 1595          if (base < kernelbase)
1588 1596                  htable_release(ht);
1589 1597  }
1590 1598  
1591 1599  /*
1592 1600   * Walk through a given htable looking for the first valid entry.  This
1593 1601   * routine takes both a starting and ending address.  The starting address
1594 1602   * is required to be within the htable provided by the caller, but there is
1595 1603   * no such restriction on the ending address.
1596 1604   *
1597 1605   * If the routine finds a valid entry in the htable (at or beyond the
1598 1606   * starting address), the PTE (and its address) will be returned.
1599 1607   * This PTE may correspond to either a page or a pagetable - it is the
1600 1608   * caller's responsibility to determine which.  If no valid entry is
1601 1609   * found, 0 (an invalid PTE) and the next unexamined address will be
1602 1610   * returned.
1603 1611   *
1604 1612   * The loop has been carefully coded for optimization.
1605 1613   */
1606 1614  static x86pte_t
1607 1615  htable_scan(htable_t *ht, uintptr_t *vap, uintptr_t eaddr)
1608 1616  {
1609 1617          uint_t e;
1610 1618          x86pte_t found_pte = (x86pte_t)0;
1611 1619          caddr_t pte_ptr;
1612 1620          caddr_t end_pte_ptr;
1613 1621          int l = ht->ht_level;
1614 1622          uintptr_t va = *vap & LEVEL_MASK(l);
1615 1623          size_t pgsize = LEVEL_SIZE(l);
1616 1624  
1617 1625          ASSERT(va >= ht->ht_vaddr);
1618 1626          ASSERT(va <= HTABLE_LAST_PAGE(ht));
1619 1627  
1620 1628          /*
1621 1629           * Compute the starting index and ending virtual address
1622 1630           */
1623 1631          e = htable_va2entry(va, ht);
1624 1632  
1625 1633          /*
1626 1634           * The following page table scan code knows that the valid
1627 1635           * bit of a PTE is in the lowest byte AND that x86 is little endian!!
1628 1636           */
1629 1637          pte_ptr = (caddr_t)x86pte_access_pagetable(ht, 0);
1630 1638          end_pte_ptr = (caddr_t)PT_INDEX_PTR(pte_ptr, HTABLE_NUM_PTES(ht));
1631 1639          pte_ptr = (caddr_t)PT_INDEX_PTR((x86pte_t *)pte_ptr, e);
1632 1640          while (!PTE_ISVALID(*pte_ptr)) {
1633 1641                  va += pgsize;
1634 1642                  if (va >= eaddr)
1635 1643                          break;
1636 1644                  pte_ptr += mmu.pte_size;
1637 1645                  ASSERT(pte_ptr <= end_pte_ptr);
1638 1646                  if (pte_ptr == end_pte_ptr)
1639 1647                          break;
1640 1648          }
1641 1649  
1642 1650          /*
1643 1651           * if we found a valid PTE, load the entire PTE
1644 1652           */
1645 1653          if (va < eaddr && pte_ptr != end_pte_ptr)
1646 1654                  found_pte = GET_PTE((x86pte_t *)pte_ptr);
1647 1655          x86pte_release_pagetable(ht);
1648 1656  
1649 1657  #if defined(__amd64)
1650 1658          /*
1651 1659           * deal with VA hole on amd64
1652 1660           */
1653 1661          if (l == mmu.max_level && va >= mmu.hole_start && va <= mmu.hole_end)
1654 1662                  va = mmu.hole_end + va - mmu.hole_start;
1655 1663  #endif /* __amd64 */
1656 1664  
1657 1665          *vap = va;
1658 1666          return (found_pte);
1659 1667  }
1660 1668  
1661 1669  /*
1662 1670   * Find the address and htable for the first populated translation at or
1663 1671   * above the given virtual address.  The caller may also specify an upper
1664 1672   * limit to the address range to search.  Uses level information to quickly
1665 1673   * skip unpopulated sections of virtual address spaces.
1666 1674   *
1667 1675   * If not found, returns 0 and sets *htp to NULL. When found, returns the PTE,
1668 1676   * sets *htp and *vaddr, and has a hold on the htable.
1669 1677   */
1670 1678  x86pte_t
1671 1679  htable_walk(
1672 1680          struct hat *hat,
1673 1681          htable_t **htp,
1674 1682          uintptr_t *vaddr,
1675 1683          uintptr_t eaddr)
1676 1684  {
1677 1685          uintptr_t va = *vaddr;
1678 1686          htable_t *ht;
1679 1687          htable_t *prev = *htp;
1680 1688          level_t l;
1681 1689          level_t max_mapped_level;
1682 1690          x86pte_t pte;
1683 1691  
1684 1692          ASSERT(eaddr > va);
1685 1693  
1686 1694          /*
1687 1695           * If this is a user address, then we know we need not look beyond
1688 1696           * kernelbase.
1689 1697           */
1690 1698          ASSERT(hat == kas.a_hat || eaddr <= kernelbase ||
1691 1699              eaddr == HTABLE_WALK_TO_END);
1692 1700          if (hat != kas.a_hat && eaddr == HTABLE_WALK_TO_END)
1693 1701                  eaddr = kernelbase;
1694 1702  
1695 1703          /*
1696 1704           * If we're coming in with a previous page table, search it first
1697 1705           * without doing an htable_lookup(), this should be frequent.
1698 1706           */
1699 1707          if (prev) {
1700 1708                  ASSERT(prev->ht_busy > 0);
1701 1709                  ASSERT(prev->ht_vaddr <= va);
1702 1710                  l = prev->ht_level;
1703 1711                  if (va <= HTABLE_LAST_PAGE(prev)) {
1704 1712                          pte = htable_scan(prev, &va, eaddr);
1705 1713  
1706 1714                          if (PTE_ISPAGE(pte, l)) {
1707 1715                                  *vaddr = va;
1708 1716                                  *htp = prev;
1709 1717                                  return (pte);
1710 1718                          }
1711 1719                  }
1712 1720  
1713 1721                  /*
1714 1722                   * We found nothing in the htable provided by the caller,
1715 1723                   * so fall through and do the full search
1716 1724                   */
1717 1725                  htable_release(prev);
1718 1726          }
1719 1727  
1720 1728          /*
1721 1729           * Find the level of the largest pagesize used by this HAT.
1722 1730           */
1723 1731          if (hat->hat_ism_pgcnt > 0) {
1724 1732                  max_mapped_level = mmu.umax_page_level;
1725 1733          } else {
1726 1734                  max_mapped_level = 0;
1727 1735                  for (l = 1; l <= mmu.max_page_level; ++l)
1728 1736                          if (hat->hat_pages_mapped[l] != 0)
1729 1737                                  max_mapped_level = l;
1730 1738          }
1731 1739  
1732 1740          while (va < eaddr && va >= *vaddr) {
1733 1741                  /*
1734 1742                   *  Find lowest table with any entry for given address.
1735 1743                   */
1736 1744                  for (l = 0; l <= TOP_LEVEL(hat); ++l) {
1737 1745                          ht = htable_lookup(hat, va, l);
1738 1746                          if (ht != NULL) {
1739 1747                                  pte = htable_scan(ht, &va, eaddr);
1740 1748                                  if (PTE_ISPAGE(pte, l)) {
1741 1749                                          VERIFY(!IN_VA_HOLE(va));
1742 1750                                          *vaddr = va;
1743 1751                                          *htp = ht;
1744 1752                                          return (pte);
1745 1753                                  }
1746 1754                                  htable_release(ht);
1747 1755                                  break;
1748 1756                          }
1749 1757  
1750 1758                          /*
1751 1759                           * No htable at this level for the address. If there
1752 1760                           * is no larger page size that could cover it, we can
1753 1761                           * skip right to the start of the next page table.
1754 1762                           */
1755 1763                          ASSERT(l < TOP_LEVEL(hat));
1756 1764                          if (l >= max_mapped_level) {
1757 1765                                  va = NEXT_ENTRY_VA(va, l + 1);
1758 1766                                  if (va >= eaddr)
1759 1767                                          break;
1760 1768                          }
1761 1769                  }
1762 1770          }
1763 1771  
1764 1772          *vaddr = 0;
1765 1773          *htp = NULL;
1766 1774          return (0);
1767 1775  }
1768 1776  
1769 1777  /*
1770 1778   * Find the htable and page table entry index of the given virtual address
1771 1779   * with pagesize at or below given level.
1772 1780   * If not found returns NULL. When found, returns the htable, sets
1773 1781   * entry, and has a hold on the htable.
1774 1782   */
1775 1783  htable_t *
1776 1784  htable_getpte(
1777 1785          struct hat *hat,
1778 1786          uintptr_t vaddr,
1779 1787          uint_t *entry,
1780 1788          x86pte_t *pte,
1781 1789          level_t level)
1782 1790  {
1783 1791          htable_t        *ht;
1784 1792          level_t         l;
1785 1793          uint_t          e;
1786 1794  
1787 1795          ASSERT(level <= mmu.max_page_level);
1788 1796  
1789 1797          for (l = 0; l <= level; ++l) {
1790 1798                  ht = htable_lookup(hat, vaddr, l);
1791 1799                  if (ht == NULL)
1792 1800                          continue;
1793 1801                  e = htable_va2entry(vaddr, ht);
1794 1802                  if (entry != NULL)
1795 1803                          *entry = e;
1796 1804                  if (pte != NULL)
1797 1805                          *pte = x86pte_get(ht, e);
1798 1806                  return (ht);
1799 1807          }
1800 1808          return (NULL);
1801 1809  }
1802 1810  
1803 1811  /*
1804 1812   * Find the htable and page table entry index of the given virtual address.
1805 1813   * There must be a valid page mapped at the given address.
1806 1814   * If not found returns NULL. When found, returns the htable, sets
1807 1815   * entry, and has a hold on the htable.
1808 1816   */
1809 1817  htable_t *
1810 1818  htable_getpage(struct hat *hat, uintptr_t vaddr, uint_t *entry)
1811 1819  {
1812 1820          htable_t        *ht;
1813 1821          uint_t          e;
1814 1822          x86pte_t        pte;
1815 1823  
1816 1824          ht = htable_getpte(hat, vaddr, &e, &pte, mmu.max_page_level);
1817 1825          if (ht == NULL)
1818 1826                  return (NULL);
1819 1827  
1820 1828          if (entry)
1821 1829                  *entry = e;
1822 1830  
1823 1831          if (PTE_ISPAGE(pte, ht->ht_level))
1824 1832                  return (ht);
1825 1833          htable_release(ht);
1826 1834          return (NULL);
1827 1835  }
1828 1836  
1829 1837  
1830 1838  void
1831 1839  htable_init()
1832 1840  {
1833 1841          /*
1834 1842           * To save on kernel VA usage, we avoid debug information in 32 bit
1835 1843           * kernels.
1836 1844           */
1837 1845  #if defined(__amd64)
1838 1846          int     kmem_flags = KMC_NOHASH;
1839 1847  #elif defined(__i386)
1840 1848          int     kmem_flags = KMC_NOHASH | KMC_NODEBUG;
1841 1849  #endif
1842 1850  
1843 1851          /*
1844 1852           * initialize kmem caches
1845 1853           */
1846 1854          htable_cache = kmem_cache_create("htable_t",
1847 1855              sizeof (htable_t), 0, NULL, NULL,
1848 1856              htable_reap, NULL, hat_memload_arena, kmem_flags);
1849 1857  }
1850 1858  
1851 1859  /*
1852 1860   * get the pte index for the virtual address in the given htable's pagetable
1853 1861   */
1854 1862  uint_t
1855 1863  htable_va2entry(uintptr_t va, htable_t *ht)
1856 1864  {
1857 1865          level_t l = ht->ht_level;
1858 1866  
1859 1867          ASSERT(va >= ht->ht_vaddr);
1860 1868          ASSERT(va <= HTABLE_LAST_PAGE(ht));
1861 1869          return ((va >> LEVEL_SHIFT(l)) & (HTABLE_NUM_PTES(ht) - 1));
1862 1870  }
1863 1871  
1864 1872  /*
1865 1873   * Given an htable and the index of a pte in it, return the virtual address
1866 1874   * of the page.
1867 1875   */
1868 1876  uintptr_t
1869 1877  htable_e2va(htable_t *ht, uint_t entry)
1870 1878  {
1871 1879          level_t l = ht->ht_level;
1872 1880          uintptr_t va;
1873 1881  
1874 1882          ASSERT(entry < HTABLE_NUM_PTES(ht));
1875 1883          va = ht->ht_vaddr + ((uintptr_t)entry << LEVEL_SHIFT(l));
1876 1884  
1877 1885          /*
1878 1886           * Need to skip over any VA hole in top level table
1879 1887           */
1880 1888  #if defined(__amd64)
1881 1889          if (ht->ht_level == mmu.max_level && va >= mmu.hole_start)
1882 1890                  va += ((mmu.hole_end - mmu.hole_start) + 1);
1883 1891  #endif
1884 1892  
1885 1893          return (va);
1886 1894  }
1887 1895  
1888 1896  /*
1889 1897   * The code uses compare and swap instructions to read/write PTE's to
1890 1898   * avoid atomicity problems, since PTEs can be 8 bytes on 32 bit systems.
1891 1899   * PTE accesses no wider than the native word will naturally be atomic.
1892 1900   *
1893 1901   * The combination of using kpreempt_disable()/_enable() and the hci_mutex
1894 1902   * are used to ensure that an interrupt won't overwrite a temporary mapping
1895 1903   * while it's in use. If an interrupt thread tries to access a PTE, it will
1896 1904   * yield briefly back to the pinned thread which holds the cpu's hci_mutex.
1897 1905   */
1898 1906  void
1899 1907  x86pte_cpu_init(cpu_t *cpu)
1900 1908  {
1901 1909          struct hat_cpu_info *hci;
1902 1910  
1903 1911          hci = kmem_zalloc(sizeof (*hci), KM_SLEEP);
1904 1912          mutex_init(&hci->hci_mutex, NULL, MUTEX_DEFAULT, NULL);
1905 1913          cpu->cpu_hat_info = hci;
1906 1914  }
1907 1915  
1908 1916  void
1909 1917  x86pte_cpu_fini(cpu_t *cpu)
1910 1918  {
1911 1919          struct hat_cpu_info *hci = cpu->cpu_hat_info;
1912 1920  
1913 1921          kmem_free(hci, sizeof (*hci));
1914 1922          cpu->cpu_hat_info = NULL;
1915 1923  }
1916 1924  
1917 1925  #ifdef __i386
1918 1926  /*
1919 1927   * On 32 bit kernels, loading a 64 bit PTE is a little tricky
1920 1928   */
1921 1929  x86pte_t
1922 1930  get_pte64(x86pte_t *ptr)
1923 1931  {
1924 1932          volatile uint32_t *p = (uint32_t *)ptr;
1925 1933          x86pte_t t;
1926 1934  
1927 1935          ASSERT(mmu.pae_hat != 0);
1928 1936          for (;;) {
1929 1937                  t = p[0];
1930 1938                  t |= (uint64_t)p[1] << 32;
1931 1939                  if ((t & 0xffffffff) == p[0])
1932 1940                          return (t);
1933 1941          }
1934 1942  }
1935 1943  #endif /* __i386 */

↓ open down ↓

627 lines elided

↑ open up ↑

1936 1944  
1937 1945  /*
1938 1946   * Disable preemption and establish a mapping to the pagetable with the
1939 1947   * given pfn. This is optimized for the case where it's the same
1940 1948   * pfn as we last referenced from this CPU.
1941 1949   */
1942 1950  static x86pte_t *
1943 1951  x86pte_access_pagetable(htable_t *ht, uint_t index)
1944 1952  {
1945 1953          /*
1946      -         * VLP pagetables are contained in the hat_t
     1954 +         * HTABLE_COPIED pagetables are contained in the hat_t
1947 1955           */
1948      -        if (ht->ht_flags & HTABLE_VLP)
1949      -                return (PT_INDEX_PTR(ht->ht_hat->hat_vlp_ptes, index));
     1956 +        if (ht->ht_flags & HTABLE_COPIED) {
     1957 +                ASSERT3U(index, <, ht->ht_hat->hat_num_copied);
     1958 +                return (PT_INDEX_PTR(ht->ht_hat->hat_copied_ptes, index));
     1959 +        }
1950 1960          return (x86pte_mapin(ht->ht_pfn, index, ht));
1951 1961  }
1952 1962  
1953 1963  /*
1954 1964   * map the given pfn into the page table window.
1955 1965   */
1956 1966  /*ARGSUSED*/
1957 1967  x86pte_t *
1958 1968  x86pte_mapin(pfn_t pfn, uint_t index, htable_t *ht)
1959 1969  {

1960 1970          x86pte_t *pteptr;
1961 1971          x86pte_t pte = 0;
1962 1972          x86pte_t newpte;
1963 1973          int x;
1964 1974  
1965 1975          ASSERT(pfn != PFN_INVALID);
1966 1976  
1967 1977          if (!khat_running) {
1968 1978                  caddr_t va = kbm_remap_window(pfn_to_pa(pfn), 1);
1969 1979                  return (PT_INDEX_PTR(va, index));
1970 1980          }
1971 1981

↓ open down ↓

12 lines elided

↑ open up ↑

1972 1982          /*
1973 1983           * If kpm is available, use it.
1974 1984           */
1975 1985          if (kpm_vbase)
1976 1986                  return (PT_INDEX_PTR(hat_kpm_pfn2va(pfn), index));
1977 1987  
1978 1988          /*
1979 1989           * Disable preemption and grab the CPU's hci_mutex
1980 1990           */
1981 1991          kpreempt_disable();
     1992 +
1982 1993          ASSERT(CPU->cpu_hat_info != NULL);
     1994 +        ASSERT(!(getcr4() & CR4_PCIDE));
     1995 +
1983 1996          mutex_enter(&CPU->cpu_hat_info->hci_mutex);
1984 1997          x = PWIN_TABLE(CPU->cpu_id);
1985 1998          pteptr = (x86pte_t *)PWIN_PTE_VA(x);
1986 1999  #ifndef __xpv
1987 2000          if (mmu.pae_hat)
1988 2001                  pte = *pteptr;
1989 2002          else
1990 2003                  pte = *(x86pte32_t *)pteptr;
1991 2004  #endif
1992 2005

1993 2006          newpte = MAKEPTE(pfn, 0) | mmu.pt_global | mmu.pt_nx;
1994 2007  
1995 2008          /*
1996 2009           * For hardware we can use a writable mapping.
1997 2010           */
1998 2011  #ifdef __xpv
1999 2012          if (IN_XPV_PANIC())
2000 2013  #endif
2001 2014                  newpte |= PT_WRITABLE;
2002 2015  
2003 2016          if (!PTE_EQUIV(newpte, pte)) {
2004 2017  
2005 2018  #ifdef __xpv
2006 2019                  if (!IN_XPV_PANIC()) {

↓ open down ↓

14 lines elided

↑ open up ↑

2007 2020                          xen_map(newpte, PWIN_VA(x));
2008 2021                  } else
2009 2022  #endif
2010 2023                  {
2011 2024                          XPV_ALLOW_PAGETABLE_UPDATES();
2012 2025                          if (mmu.pae_hat)
2013 2026                                  *pteptr = newpte;
2014 2027                          else
2015 2028                                  *(x86pte32_t *)pteptr = newpte;
2016 2029                          XPV_DISALLOW_PAGETABLE_UPDATES();
2017      -                        mmu_tlbflush_entry((caddr_t)(PWIN_VA(x)));
     2030 +                        mmu_flush_tlb_kpage((uintptr_t)PWIN_VA(x));
2018 2031                  }
2019 2032          }
2020 2033          return (PT_INDEX_PTR(PWIN_VA(x), index));
2021 2034  }
2022 2035  
2023 2036  /*
2024 2037   * Release access to a page table.
2025 2038   */
2026 2039  static void
2027 2040  x86pte_release_pagetable(htable_t *ht)
2028 2041  {
2029      -        /*
2030      -         * nothing to do for VLP htables
2031      -         */
2032      -        if (ht->ht_flags & HTABLE_VLP)
     2042 +        if (ht->ht_flags & HTABLE_COPIED)
2033 2043                  return;
2034 2044  
2035 2045          x86pte_mapout();
2036 2046  }
2037 2047  
2038 2048  void
2039 2049  x86pte_mapout(void)
2040 2050  {
2041 2051          if (kpm_vbase != NULL || !khat_running)
2042 2052                  return;

2043 2053  
2044 2054          /*
2045 2055           * Drop the CPU's hci_mutex and restore preemption.
2046 2056           */
2047 2057  #ifdef __xpv
2048 2058          if (!IN_XPV_PANIC()) {
2049 2059                  uintptr_t va;
2050 2060  
2051 2061                  /*
2052 2062                   * We need to always clear the mapping in case a page
2053 2063                   * that was once a page table page is ballooned out.
2054 2064                   */
2055 2065                  va = (uintptr_t)PWIN_VA(PWIN_TABLE(CPU->cpu_id));
2056 2066                  (void) HYPERVISOR_update_va_mapping(va, 0,
2057 2067                      UVMF_INVLPG | UVMF_LOCAL);
2058 2068          }
2059 2069  #endif
2060 2070          mutex_exit(&CPU->cpu_hat_info->hci_mutex);
2061 2071          kpreempt_enable();
2062 2072  }
2063 2073  
2064 2074  /*
2065 2075   * Atomic retrieval of a pagetable entry
2066 2076   */
2067 2077  x86pte_t
2068 2078  x86pte_get(htable_t *ht, uint_t entry)
2069 2079  {
2070 2080          x86pte_t        pte;
2071 2081          x86pte_t        *ptep;
2072 2082  
2073 2083          /*
2074 2084           * Be careful that loading PAE entries in 32 bit kernel is atomic.
2075 2085           */
2076 2086          ASSERT(entry < mmu.ptes_per_table);
2077 2087          ptep = x86pte_access_pagetable(ht, entry);
2078 2088          pte = GET_PTE(ptep);
2079 2089          x86pte_release_pagetable(ht);
2080 2090          return (pte);
2081 2091  }
2082 2092  
2083 2093  /*
2084 2094   * Atomic unconditional set of a page table entry, it returns the previous
2085 2095   * value. For pre-existing mappings if the PFN changes, then we don't care
2086 2096   * about the old pte's REF / MOD bits. If the PFN remains the same, we leave
2087 2097   * the MOD/REF bits unchanged.
2088 2098   *
2089 2099   * If asked to overwrite a link to a lower page table with a large page
2090 2100   * mapping, this routine returns the special value of LPAGE_ERROR. This
2091 2101   * allows the upper HAT layers to retry with a smaller mapping size.
2092 2102   */
2093 2103  x86pte_t
2094 2104  x86pte_set(htable_t *ht, uint_t entry, x86pte_t new, void *ptr)
2095 2105  {
2096 2106          x86pte_t        old;
2097 2107          x86pte_t        prev;
2098 2108          x86pte_t        *ptep;
2099 2109          level_t         l = ht->ht_level;
2100 2110          x86pte_t        pfn_mask = (l != 0) ? PT_PADDR_LGPG : PT_PADDR;
2101 2111          x86pte_t        n;
2102 2112          uintptr_t       addr = htable_e2va(ht, entry);
2103 2113          hat_t           *hat = ht->ht_hat;
2104 2114  
2105 2115          ASSERT(new != 0); /* don't use to invalidate a PTE, see x86pte_update */
2106 2116          ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN));
2107 2117          if (ptr == NULL)
2108 2118                  ptep = x86pte_access_pagetable(ht, entry);
2109 2119          else
2110 2120                  ptep = ptr;
2111 2121  
2112 2122          /*
2113 2123           * Install the new PTE. If remapping the same PFN, then
2114 2124           * copy existing REF/MOD bits to new mapping.
2115 2125           */
2116 2126          do {
2117 2127                  prev = GET_PTE(ptep);
2118 2128                  n = new;
2119 2129                  if (PTE_ISVALID(n) && (prev & pfn_mask) == (new & pfn_mask))
2120 2130                          n |= prev & (PT_REF | PT_MOD);
2121 2131  
2122 2132                  /*

↓ open down ↓

80 lines elided

↑ open up ↑

2123 2133                   * Another thread may have installed this mapping already;
2124 2134                   * flush the local TLB and be done.
2125 2135                   */
2126 2136                  if (prev == n) {
2127 2137                          old = new;
2128 2138  #ifdef __xpv
2129 2139                          if (!IN_XPV_PANIC())
2130 2140                                  xen_flush_va((caddr_t)addr);
2131 2141                          else
2132 2142  #endif
2133      -                                mmu_tlbflush_entry((caddr_t)addr);
     2143 +                                mmu_flush_tlb_page(addr);
2134 2144                          goto done;
2135 2145                  }
2136 2146  
2137 2147                  /*
2138 2148                   * Detect if we have a collision of installing a large
2139 2149                   * page mapping where there already is a lower page table.
2140 2150                   */
2141 2151                  if (l > 0 && (prev & PT_VALID) && !(prev & PT_PAGESIZE)) {
2142 2152                          old = LPAGE_ERROR;
2143 2153                          goto done;

2144 2154                  }
2145 2155  
2146 2156                  XPV_ALLOW_PAGETABLE_UPDATES();
2147 2157                  old = CAS_PTE(ptep, prev, n);
2148 2158                  XPV_DISALLOW_PAGETABLE_UPDATES();
2149 2159          } while (old != prev);
2150 2160  
2151 2161          /*
2152 2162           * Do a TLB demap if needed, i.e. the old pte was valid.
2153 2163           *
2154 2164           * Note that a stale TLB writeback to the PTE here either can't happen
2155 2165           * or doesn't matter. The PFN can only change for NOSYNC|NOCONSIST
2156 2166           * mappings, but they were created with REF and MOD already set, so
2157 2167           * no stale writeback will happen.
2158 2168           *
2159 2169           * Segmap is the only place where remaps happen on the same pfn and for
2160 2170           * that we want to preserve the stale REF/MOD bits.
2161 2171           */
2162 2172          if (old & PT_REF)
2163 2173                  hat_tlb_inval(hat, addr);
2164 2174  
2165 2175  done:
2166 2176          if (ptr == NULL)
2167 2177                  x86pte_release_pagetable(ht);
2168 2178          return (old);
2169 2179  }
2170 2180  
2171 2181  /*
2172 2182   * Atomic compare and swap of a page table entry. No TLB invalidates are done.
2173 2183   * This is used for links between pagetables of different levels.
2174 2184   * Note we always create these links with dirty/access set, so they should
2175 2185   * never change.
2176 2186   */
2177 2187  x86pte_t
2178 2188  x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old, x86pte_t new)
2179 2189  {
2180 2190          x86pte_t        pte;
2181 2191          x86pte_t        *ptep;

↓ open down ↓

38 lines elided

↑ open up ↑

2182 2192  #ifdef __xpv
2183 2193          /*
2184 2194           * We can't use writable pagetables for upper level tables, so fake it.
2185 2195           */
2186 2196          mmu_update_t t[2];
2187 2197          int cnt = 1;
2188 2198          int count;
2189 2199          maddr_t ma;
2190 2200  
2191 2201          if (!IN_XPV_PANIC()) {
2192      -                ASSERT(!(ht->ht_flags & HTABLE_VLP));   /* no VLP yet */
     2202 +                ASSERT(!(ht->ht_flags & HTABLE_COPIED));
2193 2203                  ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry));
2194 2204                  t[0].ptr = ma | MMU_NORMAL_PT_UPDATE;
2195 2205                  t[0].val = new;
2196 2206  
2197 2207  #if defined(__amd64)
2198 2208                  /*
2199 2209                   * On the 64-bit hypervisor we need to maintain the user mode
2200 2210                   * top page table too.
2201 2211                   */
2202 2212                  if (ht->ht_level == mmu.max_level && ht->ht_hat != kas.a_hat) {

2203 2213                          ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(
2204 2214                              ht->ht_hat->hat_user_ptable), entry));
2205 2215                          t[1].ptr = ma | MMU_NORMAL_PT_UPDATE;
2206 2216                          t[1].val = new;
2207 2217                          ++cnt;
2208 2218                  }
2209 2219  #endif  /* __amd64 */
2210 2220  
2211 2221                  if (HYPERVISOR_mmu_update(t, cnt, &count, DOMID_SELF))
2212 2222                          panic("HYPERVISOR_mmu_update() failed");
2213 2223                  ASSERT(count == cnt);
2214 2224                  return (old);
2215 2225          }
2216 2226  #endif
2217 2227          ptep = x86pte_access_pagetable(ht, entry);
2218 2228          XPV_ALLOW_PAGETABLE_UPDATES();
2219 2229          pte = CAS_PTE(ptep, old, new);
2220 2230          XPV_DISALLOW_PAGETABLE_UPDATES();
2221 2231          x86pte_release_pagetable(ht);
2222 2232          return (pte);
2223 2233  }
2224 2234  
2225 2235  /*
2226 2236   * Invalidate a page table entry as long as it currently maps something that
2227 2237   * matches the value determined by expect.
2228 2238   *
2229 2239   * If tlb is set, also invalidates any TLB entries.
2230 2240   *
2231 2241   * Returns the previous value of the PTE.
2232 2242   */
2233 2243  x86pte_t
2234 2244  x86pte_inval(
2235 2245          htable_t *ht,
2236 2246          uint_t entry,
2237 2247          x86pte_t expect,
2238 2248          x86pte_t *pte_ptr,
2239 2249          boolean_t tlb)
2240 2250  {
2241 2251          x86pte_t        *ptep;
2242 2252          x86pte_t        oldpte;
2243 2253          x86pte_t        found;
2244 2254  
2245 2255          ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN));
2246 2256          ASSERT(ht->ht_level <= mmu.max_page_level);
2247 2257  
2248 2258          if (pte_ptr != NULL)
2249 2259                  ptep = pte_ptr;
2250 2260          else
2251 2261                  ptep = x86pte_access_pagetable(ht, entry);
2252 2262  
2253 2263  #if defined(__xpv)
2254 2264          /*
2255 2265           * If exit()ing just use HYPERVISOR_mmu_update(), as we can't be racing
2256 2266           * with anything else.
2257 2267           */
2258 2268          if ((ht->ht_hat->hat_flags & HAT_FREEING) && !IN_XPV_PANIC()) {
2259 2269                  int count;
2260 2270                  mmu_update_t t[1];
2261 2271                  maddr_t ma;
2262 2272  
2263 2273                  oldpte = GET_PTE(ptep);
2264 2274                  if (expect != 0 && (oldpte & PT_PADDR) != (expect & PT_PADDR))
2265 2275                          goto done;
2266 2276                  ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry));
2267 2277                  t[0].ptr = ma | MMU_NORMAL_PT_UPDATE;
2268 2278                  t[0].val = 0;
2269 2279                  if (HYPERVISOR_mmu_update(t, 1, &count, DOMID_SELF))
2270 2280                          panic("HYPERVISOR_mmu_update() failed");
2271 2281                  ASSERT(count == 1);
2272 2282                  goto done;
2273 2283          }
2274 2284  #endif /* __xpv */
2275 2285  
2276 2286          /*
2277 2287           * Note that the loop is needed to handle changes due to h/w updating
2278 2288           * of PT_MOD/PT_REF.
2279 2289           */
2280 2290          do {
2281 2291                  oldpte = GET_PTE(ptep);
2282 2292                  if (expect != 0 && (oldpte & PT_PADDR) != (expect & PT_PADDR))
2283 2293                          goto done;
2284 2294                  XPV_ALLOW_PAGETABLE_UPDATES();
2285 2295                  found = CAS_PTE(ptep, oldpte, 0);
2286 2296                  XPV_DISALLOW_PAGETABLE_UPDATES();
2287 2297          } while (found != oldpte);
2288 2298          if (tlb && (oldpte & (PT_REF | PT_MOD)))
2289 2299                  hat_tlb_inval(ht->ht_hat, htable_e2va(ht, entry));
2290 2300  
2291 2301  done:
2292 2302          if (pte_ptr == NULL)
2293 2303                  x86pte_release_pagetable(ht);
2294 2304          return (oldpte);
2295 2305  }
2296 2306  
2297 2307  /*
2298 2308   * Change a page table entry if it currently matches the value in expect.
2299 2309   */
2300 2310  x86pte_t
2301 2311  x86pte_update(
2302 2312          htable_t *ht,
2303 2313          uint_t entry,
2304 2314          x86pte_t expect,
2305 2315          x86pte_t new)
2306 2316  {
2307 2317          x86pte_t        *ptep;
2308 2318          x86pte_t        found;
2309 2319  
2310 2320          ASSERT(new != 0);
2311 2321          ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN));
2312 2322          ASSERT(ht->ht_level <= mmu.max_page_level);
2313 2323  
2314 2324          ptep = x86pte_access_pagetable(ht, entry);
2315 2325          XPV_ALLOW_PAGETABLE_UPDATES();
2316 2326          found = CAS_PTE(ptep, expect, new);
2317 2327          XPV_DISALLOW_PAGETABLE_UPDATES();
2318 2328          if (found == expect) {
2319 2329                  hat_tlb_inval(ht->ht_hat, htable_e2va(ht, entry));
2320 2330  
2321 2331                  /*
2322 2332                   * When removing write permission *and* clearing the
2323 2333                   * MOD bit, check if a write happened via a stale
2324 2334                   * TLB entry before the TLB shootdown finished.
2325 2335                   *
2326 2336                   * If it did happen, simply re-enable write permission and
2327 2337                   * act like the original CAS failed.
2328 2338                   */
2329 2339                  if ((expect & (PT_WRITABLE | PT_MOD)) == PT_WRITABLE &&
2330 2340                      (new & (PT_WRITABLE | PT_MOD)) == 0 &&
2331 2341                      (GET_PTE(ptep) & PT_MOD) != 0) {
2332 2342                          do {
2333 2343                                  found = GET_PTE(ptep);
2334 2344                                  XPV_ALLOW_PAGETABLE_UPDATES();
2335 2345                                  found =
2336 2346                                      CAS_PTE(ptep, found, found | PT_WRITABLE);
2337 2347                                  XPV_DISALLOW_PAGETABLE_UPDATES();
2338 2348                          } while ((found & PT_WRITABLE) == 0);

↓ open down ↓

136 lines elided

↑ open up ↑

2339 2349                  }
2340 2350          }
2341 2351          x86pte_release_pagetable(ht);
2342 2352          return (found);
2343 2353  }
2344 2354  
2345 2355  #ifndef __xpv
2346 2356  /*
2347 2357   * Copy page tables - this is just a little more complicated than the
2348 2358   * previous routines. Note that it's also not atomic! It also is never
2349      - * used for VLP pagetables.
     2359 + * used for HTABLE_COPIED pagetables.
2350 2360   */
2351 2361  void
2352 2362  x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
2353 2363  {
2354 2364          caddr_t src_va;
2355 2365          caddr_t dst_va;
2356 2366          size_t size;
2357 2367          x86pte_t *pteptr;
2358 2368          x86pte_t pte;
2359 2369  
2360 2370          ASSERT(khat_running);
2361      -        ASSERT(!(dest->ht_flags & HTABLE_VLP));
2362      -        ASSERT(!(src->ht_flags & HTABLE_VLP));
     2371 +        ASSERT(!(dest->ht_flags & HTABLE_COPIED));
     2372 +        ASSERT(!(src->ht_flags & HTABLE_COPIED));
2363 2373          ASSERT(!(src->ht_flags & HTABLE_SHARED_PFN));
2364 2374          ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
2365 2375  
2366 2376          /*
2367 2377           * Acquire access to the CPU pagetable windows for the dest and source.
2368 2378           */
2369 2379          dst_va = (caddr_t)x86pte_access_pagetable(dest, entry);
2370 2380          if (kpm_vbase) {
2371 2381                  src_va = (caddr_t)
2372 2382                      PT_INDEX_PTR(hat_kpm_pfn2va(src->ht_pfn), entry);
2373 2383          } else {
2374 2384                  uint_t x = PWIN_SRC(CPU->cpu_id);
2375 2385  
     2386 +                ASSERT(!(getcr4() & CR4_PCIDE));
     2387 +
2376 2388                  /*
2377 2389                   * Finish defining the src pagetable mapping
2378 2390                   */
2379 2391                  src_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry);
2380 2392                  pte = MAKEPTE(src->ht_pfn, 0) | mmu.pt_global | mmu.pt_nx;
2381 2393                  pteptr = (x86pte_t *)PWIN_PTE_VA(x);
2382 2394                  if (mmu.pae_hat)
2383 2395                          *pteptr = pte;
2384 2396                  else
2385 2397                          *(x86pte32_t *)pteptr = pte;
2386      -                mmu_tlbflush_entry((caddr_t)(PWIN_VA(x)));
     2398 +                mmu_flush_tlb_kpage((uintptr_t)PWIN_VA(x));
2387 2399          }
2388 2400  
2389 2401          /*
2390 2402           * now do the copy
2391 2403           */
2392 2404          size = count << mmu.pte_size_shift;
2393 2405          bcopy(src_va, dst_va, size);
2394 2406  
2395 2407          x86pte_release_pagetable(dest);
2396 2408  }

2397 2409  
2398 2410  #else /* __xpv */
2399 2411  
2400 2412  /*
2401 2413   * The hypervisor only supports writable pagetables at level 0, so we have
2402 2414   * to install these 1 by 1 the slow way.
2403 2415   */
2404 2416  void
2405 2417  x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
2406 2418  {
2407 2419          caddr_t src_va;
2408 2420          x86pte_t pte;
2409 2421  
2410 2422          ASSERT(!IN_XPV_PANIC());
2411 2423          src_va = (caddr_t)x86pte_access_pagetable(src, entry);
2412 2424          while (count) {
2413 2425                  if (mmu.pae_hat)
2414 2426                          pte = *(x86pte_t *)src_va;
2415 2427                  else
2416 2428                          pte = *(x86pte32_t *)src_va;
2417 2429                  if (pte != 0) {
2418 2430                          set_pteval(pfn_to_pa(dest->ht_pfn), entry,
2419 2431                              dest->ht_level, pte);
2420 2432  #ifdef __amd64
2421 2433                          if (dest->ht_level == mmu.max_level &&
2422 2434                              htable_e2va(dest, entry) < HYPERVISOR_VIRT_END)
2423 2435                                  set_pteval(
2424 2436                                      pfn_to_pa(dest->ht_hat->hat_user_ptable),
2425 2437                                      entry, dest->ht_level, pte);
2426 2438  #endif
2427 2439                  }
2428 2440                  --count;
2429 2441                  ++entry;
2430 2442                  src_va += mmu.pte_size;
2431 2443          }
2432 2444          x86pte_release_pagetable(src);
2433 2445  }
2434 2446  #endif /* __xpv */
2435 2447  
2436 2448  /*
2437 2449   * Zero page table entries - Note this doesn't use atomic stores!
2438 2450   */
2439 2451  static void
2440 2452  x86pte_zero(htable_t *dest, uint_t entry, uint_t count)
2441 2453  {
2442 2454          caddr_t dst_va;

↓ open down ↓

46 lines elided

↑ open up ↑

2443 2455          size_t size;
2444 2456  #ifdef __xpv
2445 2457          int x;
2446 2458          x86pte_t newpte;
2447 2459  #endif
2448 2460  
2449 2461          /*
2450 2462           * Map in the page table to be zeroed.
2451 2463           */
2452 2464          ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
2453      -        ASSERT(!(dest->ht_flags & HTABLE_VLP));
     2465 +        ASSERT(!(dest->ht_flags & HTABLE_COPIED));
2454 2466  
2455 2467          /*
2456 2468           * On the hypervisor we don't use x86pte_access_pagetable() since
2457 2469           * in this case the page is not pinned yet.
2458 2470           */
2459 2471  #ifdef __xpv
2460 2472          if (kpm_vbase == NULL) {
2461 2473                  kpreempt_disable();
2462 2474                  ASSERT(CPU->cpu_hat_info != NULL);
2463 2475                  mutex_enter(&CPU->cpu_hat_info->hci_mutex);

2464 2476                  x = PWIN_TABLE(CPU->cpu_id);
2465 2477                  newpte = MAKEPTE(dest->ht_pfn, 0) | PT_WRITABLE;
2466 2478                  xen_map(newpte, PWIN_VA(x));
2467 2479                  dst_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry);
2468 2480          } else
2469 2481  #endif
2470 2482                  dst_va = (caddr_t)x86pte_access_pagetable(dest, entry);
2471 2483  
2472 2484          size = count << mmu.pte_size_shift;
2473 2485          ASSERT(size > BLOCKZEROALIGN);
2474 2486  #ifdef __i386
2475 2487          if (!is_x86_feature(x86_featureset, X86FSET_SSE2))
2476 2488                  bzero(dst_va, size);
2477 2489          else
2478 2490  #endif
2479 2491                  block_zero_no_xmm(dst_va, size);
2480 2492  
2481 2493  #ifdef __xpv
2482 2494          if (kpm_vbase == NULL) {
2483 2495                  xen_map(0, PWIN_VA(x));
2484 2496                  mutex_exit(&CPU->cpu_hat_info->hci_mutex);
2485 2497                  kpreempt_enable();
2486 2498          } else
2487 2499  #endif
2488 2500                  x86pte_release_pagetable(dest);
2489 2501  }
2490 2502  
2491 2503  /*
2492 2504   * Called to ensure that all pagetables are in the system dump
2493 2505   */
2494 2506  void
2495 2507  hat_dump(void)
2496 2508  {

↓ open down ↓

33 lines elided

↑ open up ↑

2497 2509          hat_t *hat;
2498 2510          uint_t h;
2499 2511          htable_t *ht;
2500 2512  
2501 2513          /*
2502 2514           * Dump all page tables
2503 2515           */
2504 2516          for (hat = kas.a_hat; hat != NULL; hat = hat->hat_next) {
2505 2517                  for (h = 0; h < hat->hat_num_hash; ++h) {
2506 2518                          for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
2507      -                                if ((ht->ht_flags & HTABLE_VLP) == 0)
     2519 +                                if ((ht->ht_flags & HTABLE_COPIED) == 0)
2508 2520                                          dump_page(ht->ht_pfn);
2509 2521                          }
2510 2522                  }
2511 2523          }
2512 2524  }

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX