1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2014 by Delphix. All rights reserved.
  25  * Copyright 2015 Joyent, Inc.
  26  */
  27 
  28 #include <sys/types.h>
  29 #include <sys/sysmacros.h>
  30 #include <sys/kmem.h>
  31 #include <sys/atomic.h>
  32 #include <sys/bitmap.h>
  33 #include <sys/machparam.h>
  34 #include <sys/machsystm.h>
  35 #include <sys/mman.h>
  36 #include <sys/systm.h>
  37 #include <sys/cpuvar.h>
  38 #include <sys/thread.h>
  39 #include <sys/proc.h>
  40 #include <sys/cpu.h>
  41 #include <sys/kmem.h>
  42 #include <sys/disp.h>
  43 #include <sys/vmem.h>
  44 #include <sys/vmsystm.h>
  45 #include <sys/promif.h>
  46 #include <sys/var.h>
  47 #include <sys/x86_archext.h>
  48 #include <sys/archsystm.h>
  49 #include <sys/bootconf.h>
  50 #include <sys/dumphdr.h>
  51 #include <vm/seg_kmem.h>
  52 #include <vm/seg_kpm.h>
  53 #include <vm/hat.h>
  54 #include <vm/hat_i86.h>
  55 #include <sys/cmn_err.h>
  56 #include <sys/panic.h>
  57 
  58 #ifdef __xpv
  59 #include <sys/hypervisor.h>
  60 #include <sys/xpv_panic.h>
  61 #endif
  62 
  63 #include <sys/bootinfo.h>
  64 #include <vm/kboot_mmu.h>
  65 
  66 static void x86pte_zero(htable_t *dest, uint_t entry, uint_t count);
  67 
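     /*
      * kmem cache from which htable_t structures are allocated.
      */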
  68 kmem_cache_t *htable_cache;
  69 
  70 /*
  71  * The variable htable_reserve_amount, rather than HTABLE_RESERVE_AMOUNT,
  72  * is used in order to facilitate testing of the htable_steal() code.
  73  * By resetting htable_reserve_amount to a lower value, we can force
  74  * stealing to occur.  The reserve amount is a guess to get us through boot.
  75  */
  76 #define HTABLE_RESERVE_AMOUNT   (200)
  77 uint_t htable_reserve_amount = HTABLE_RESERVE_AMOUNT;
  78 kmutex_t htable_reserve_mutex;
  79 uint_t htable_reserve_cnt;
  80 htable_t *htable_reserve_pool;
  81 
  82 /*
  83  * Used to hand test htable_steal().
  84  */
  85 #ifdef DEBUG
  86 ulong_t force_steal = 0;
  87 ulong_t ptable_cnt = 0;
  88 #endif
  89 
  90 /*
  91  * This variable exists so that we can tune it via /etc/system.
  92  * Any value works, but a power of two <= mmu.ptes_per_table is best.
  93  */
  94 uint_t htable_steal_passes = 8;
  95 
  96 /*
  97  * mutex stuff for access to htable hash
  98  */
  99 #define NUM_HTABLE_MUTEX 128
 100 kmutex_t htable_mutex[NUM_HTABLE_MUTEX];
 101 #define HTABLE_MUTEX_HASH(h) ((h) & (NUM_HTABLE_MUTEX - 1))
 102 
 103 #define HTABLE_ENTER(h) mutex_enter(&htable_mutex[HTABLE_MUTEX_HASH(h)]);
 104 #define HTABLE_EXIT(h)  mutex_exit(&htable_mutex[HTABLE_MUTEX_HASH(h)]);
 105 
 106 /*
 107  * forward declarations
 108  */
 109 static void link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr);
 110 static void unlink_ptp(htable_t *higher, htable_t *old, uintptr_t vaddr);
 111 static void htable_free(htable_t *ht);
 112 static x86pte_t *x86pte_access_pagetable(htable_t *ht, uint_t index);
 113 static void x86pte_release_pagetable(htable_t *ht);
 114 static x86pte_t x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old,
 115         x86pte_t new);
 116 
 117 /*
 118  * A counter to track if we are stealing or reaping htables. When non-zero,
 119  * htable_free() will directly free htables (either to the reserve or kmem)
 120  * instead of putting them in a hat's htable cache.
 121  */
 122 uint32_t htable_dont_cache = 0;
 123 
 124 /*
 125  * Track the number of active pagetables, so we can know how many to reap
 126  */
 127 static uint32_t active_ptables = 0;
 128 
 129 #ifdef __xpv
 130 /*
 131  * Deal with hypervisor complications.
 132  */
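     /*
      * Invalidate any TLB entry for a single va on the local CPU, either
      * directly when in xpv panic context or via a local INVLPG mmuext op.
      */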
 133 void
 134 xen_flush_va(caddr_t va)
 135 {
 136         struct mmuext_op t;
 137         uint_t count;
 138 
 139         if (IN_XPV_PANIC()) {
 140                 mmu_tlbflush_entry((caddr_t)va);
 141         } else {
 142                 t.cmd = MMUEXT_INVLPG_LOCAL;
 143                 t.arg1.linear_addr = (uintptr_t)va;
 144                 if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
 145                         panic("HYPERVISOR_mmuext_op() failed");
 146                 ASSERT(count == 1);
 147         }
 148 }
 149 
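     /*
      * Invalidate any TLB entry for a single va on the given set of CPUs.
      */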
 150 void
 151 xen_gflush_va(caddr_t va, cpuset_t cpus)
 152 {
 153         struct mmuext_op t;
 154         uint_t count;
 155 
 156         if (IN_XPV_PANIC()) {
 157                 mmu_tlbflush_entry((caddr_t)va);
 158                 return;
 159         }
 160 
 161         t.cmd = MMUEXT_INVLPG_MULTI;
 162         t.arg1.linear_addr = (uintptr_t)va;
 163         /*LINTED: constant in conditional context*/
 164         set_xen_guest_handle(t.arg2.vcpumask, &cpus);
 165         if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
 166                 panic("HYPERVISOR_mmuext_op() failed");
 167         ASSERT(count == 1);
 168 }
 169 
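     /*
      * Flush the entire TLB on the local CPU.
      */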
 170 void
 171 xen_flush_tlb()
 172 {
 173         struct mmuext_op t;
 174         uint_t count;
 175 
 176         if (IN_XPV_PANIC()) {
 177                 xpv_panic_reload_cr3();
 178         } else {
 179                 t.cmd = MMUEXT_TLB_FLUSH_LOCAL;
 180                 if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
 181                         panic("HYPERVISOR_mmuext_op() failed");
 182                 ASSERT(count == 1);
 183         }
 184 }
 185 
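     /*
      * Flush the entire TLB on the given set of CPUs.
      */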
 186 void
 187 xen_gflush_tlb(cpuset_t cpus)
 188 {
 189         struct mmuext_op t;
 190         uint_t count;
 191 
 192         ASSERT(!IN_XPV_PANIC());
 193         t.cmd = MMUEXT_TLB_FLUSH_MULTI;
 194         /*LINTED: constant in conditional context*/
 195         set_xen_guest_handle(t.arg2.vcpumask, &cpus);
 196         if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
 197                 panic("HYPERVISOR_mmuext_op() failed");
 198         ASSERT(count == 1);
 199 }
 200 
 201 /*
 202  * Install/Adjust a kpm mapping under the hypervisor.
 203  * Value of "how" should be:
 204  *      PT_WRITABLE | PT_VALID - regular kpm mapping
 205  *      PT_VALID - make mapping read-only
 206  *      0       - remove mapping
 207  *
 208  * Returns 0 on success, non-zero for failure.
 209  */
 210 int
 211 xen_kpm_page(pfn_t pfn, uint_t how)
 212 {
 213         paddr_t pa = mmu_ptob((paddr_t)pfn);
 214         x86pte_t pte = PT_NOCONSIST | PT_REF | PT_MOD;
 215 
 216         if (kpm_vbase == NULL)
 217                 return (0);
 218 
 219         if (how)
 220                 pte |= pa_to_ma(pa) | how;
 221         else
 222                 pte = 0;
 223         return (HYPERVISOR_update_va_mapping((uintptr_t)kpm_vbase + pa,
 224             pte, UVMF_INVLPG | UVMF_ALL));
 225 }
 226 
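     /*
      * Ask the hypervisor to validate and pin the pagetable page at pfn
      * as a level "lvl" pagetable.
      */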
 227 void
 228 xen_pin(pfn_t pfn, level_t lvl)
 229 {
 230         struct mmuext_op t;
 231         uint_t count;
 232 
 233         t.cmd = MMUEXT_PIN_L1_TABLE + lvl;
 234         t.arg1.mfn = pfn_to_mfn(pfn);
 235         if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
 236                 panic("HYPERVISOR_mmuext_op() failed");
 237         ASSERT(count == 1);
 238 }
 239 
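     /*
      * Release the hypervisor's pin on the pagetable page at pfn.
      */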
 240 void
 241 xen_unpin(pfn_t pfn)
 242 {
 243         struct mmuext_op t;
 244         uint_t count;
 245 
 246         t.cmd = MMUEXT_UNPIN_TABLE;
 247         t.arg1.mfn = pfn_to_mfn(pfn);
 248         if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
 249                 panic("HYPERVISOR_mmuext_op() failed");
 250         ASSERT(count == 1);
 251 }
 252 
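     /*
      * Install the given pte mapping at va via the hypervisor, invalidating
      * the local TLB entry for va.
      */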
 253 static void
 254 xen_map(uint64_t pte, caddr_t va)
 255 {
 256         if (HYPERVISOR_update_va_mapping((uintptr_t)va, pte,
 257             UVMF_INVLPG | UVMF_LOCAL))
 258                 panic("HYPERVISOR_update_va_mapping() failed");
 259 }
 260 #endif /* __xpv */
 261 
 262 /*
 263  * Allocate a memory page for a hardware page table.
 264  *
 265  * A wrapper around page_get_physical(), with some extra checks.
 266  */
 267 static pfn_t
 268 ptable_alloc(uintptr_t seed)
 269 {
 270         pfn_t pfn;
 271         page_t *pp;
 272 
 273         pfn = PFN_INVALID;
 274 
 275         /*
 276          * The first check is to see if there is memory in the system. If we
 277          * drop to throttlefree, then fail the ptable_alloc() and let the
 278          * stealing code kick in. Note that we have to do this test here,
 279          * since the test in page_create_throttle() would let the NOSLEEP
 280          * allocation go through and deplete the page reserves.
 281          *
 282          * The !NOMEMWAIT() lets pageout, fsflush, etc. skip this check.
 283          */
 284         if (!NOMEMWAIT() && freemem <= throttlefree + 1)
 285                 return (PFN_INVALID);
 286 
 287 #ifdef DEBUG
 288         /*
 289          * This code makes htable_steal() easier to test. By setting
 290          * force_steal we force pagetable allocations to fall
 291  * into the stealing code. Roughly 1 in every "force_steal"
 292          * page table allocations will fail.
 293          */
 294         if (proc_pageout != NULL && force_steal > 1 &&
 295             ++ptable_cnt > force_steal) {
 296                 ptable_cnt = 0;
 297                 return (PFN_INVALID);
 298         }
 299 #endif /* DEBUG */
 300 
 301         pp = page_get_physical(seed);
 302         if (pp == NULL)
 303                 return (PFN_INVALID);
 304         ASSERT(PAGE_SHARED(pp));
 305         pfn = pp->p_pagenum;
 306         if (pfn == PFN_INVALID)
 307                 panic("ptable_alloc(): Invalid PFN!!");
 308         atomic_inc_32(&active_ptables);
 309         HATSTAT_INC(hs_ptable_allocs);
 310         return (pfn);
 311 }
 312 
 313 /*
 314  * Free an htable's associated page table page.  See the comments
 315  * for ptable_alloc().
 316  */
 317 static void
 318 ptable_free(pfn_t pfn)
 319 {
 320         page_t *pp = page_numtopp_nolock(pfn);
 321 
 322         /*
 323          * need to destroy the page used for the pagetable
 324          */
 325         ASSERT(pfn != PFN_INVALID);
 326         HATSTAT_INC(hs_ptable_frees);
 327         atomic_dec_32(&active_ptables);
 328         if (pp == NULL)
 329                 panic("ptable_free(): no page for pfn!");
 330         ASSERT(PAGE_SHARED(pp));
 331         ASSERT(pfn == pp->p_pagenum);
 332         ASSERT(!IN_XPV_PANIC());
 333 
 334         /*
 335  * Get an exclusive lock; we might have to wait for a kmem reader.
 336          */
 337         if (!page_tryupgrade(pp)) {
 338                 u_offset_t off = pp->p_offset;
 339                 page_unlock(pp);
 340                 pp = page_lookup(&kvp, off, SE_EXCL);
 341                 if (pp == NULL)
 342                         panic("page not found");
 343         }
 344 #ifdef __xpv
 345         if (kpm_vbase && xen_kpm_page(pfn, PT_VALID | PT_WRITABLE) < 0)
 346                 panic("failure making kpm r/w pfn=0x%lx", pfn);
 347 #endif
 348         page_hashout(pp, NULL);
 349         page_free(pp, 1);
 350         page_unresv(1);
 351 }
 352 
 353 /*
 354  * Put one htable on the reserve list.
 355  */
 356 static void
 357 htable_put_reserve(htable_t *ht)
 358 {
 359         ht->ht_hat = NULL;           /* no longer tied to a hat */
 360         ASSERT(ht->ht_pfn == PFN_INVALID);
 361         HATSTAT_INC(hs_htable_rputs);
 362         mutex_enter(&htable_reserve_mutex);
 363         ht->ht_next = htable_reserve_pool;
 364         htable_reserve_pool = ht;
 365         ++htable_reserve_cnt;
 366         mutex_exit(&htable_reserve_mutex);
 367 }
 368 
 369 /*
 370  * Take one htable from the reserve.
 371  */
 372 static htable_t *
 373 htable_get_reserve(void)
 374 {
 375         htable_t *ht = NULL;
 376 
 377         mutex_enter(&htable_reserve_mutex);
 378         if (htable_reserve_cnt != 0) {
 379                 ht = htable_reserve_pool;
 380                 ASSERT(ht != NULL);
 381                 ASSERT(ht->ht_pfn == PFN_INVALID);
 382                 htable_reserve_pool = ht->ht_next;
 383                 --htable_reserve_cnt;
 384                 HATSTAT_INC(hs_htable_rgets);
 385         }
 386         mutex_exit(&htable_reserve_mutex);
 387         return (ht);
 388 }
 389 
 390 /*
 391  * Allocate initial htables and put them on the reserve list
 392  */
 393 void
 394 htable_initial_reserve(uint_t count)
 395 {
 396         htable_t *ht;
 397 
 398         count += HTABLE_RESERVE_AMOUNT;
 399         while (count > 0) {
 400                 ht = kmem_cache_alloc(htable_cache, KM_NOSLEEP);
 401                 ASSERT(ht != NULL);
 402 
 403                 ASSERT(use_boot_reserve);
 404                 ht->ht_pfn = PFN_INVALID;
 405                 htable_put_reserve(ht);
 406                 --count;
 407         }
 408 }
 409 
 410 /*
 411  * Readjust the reserves after a thread finishes using them.
 412  */
 413 void
 414 htable_adjust_reserve()
 415 {
 416         htable_t *ht;
 417 
 418         /*
 419          * Free any excess htables in the reserve list
 420          */
 421         while (htable_reserve_cnt > htable_reserve_amount &&
 422             !USE_HAT_RESERVES()) {
 423                 ht = htable_get_reserve();
 424                 if (ht == NULL)
 425                         return;
 426                 ASSERT(ht->ht_pfn == PFN_INVALID);
 427                 kmem_cache_free(htable_cache, ht);
 428         }
 429 }
 430 
 431 /*
 432  * Search the active htables for one to steal. Start at a different hash
 433  * bucket every time to help spread the pain of stealing.
 434  */
 435 static void
 436 htable_steal_active(hat_t *hat, uint_t cnt, uint_t threshold,
 437     uint_t *stolen, htable_t **list)
 438 {
 439         static uint_t   h_seed = 0;
 440         htable_t        *higher, *ht;
 441         uint_t          h, e, h_start;
 442         uintptr_t       va;
 443         x86pte_t        pte;
 444 
 445         h = h_start = h_seed++ % hat->hat_num_hash;
 446         do {
 447                 higher = NULL;
 448                 HTABLE_ENTER(h);
 449                 for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
 450 
 451                         /*
 452                          * Can we rule out reaping?
 453                          */
 454                         if (ht->ht_busy != 0 ||
 455                             (ht->ht_flags & HTABLE_SHARED_PFN) ||
 456                             ht->ht_level > 0 || ht->ht_valid_cnt > threshold ||
 457                             ht->ht_lock_cnt != 0)
 458                                 continue;
 459 
 460                         /*
 461                          * Increment busy so the htable can't disappear. We
 462                          * drop the htable mutex to avoid deadlocks with
 463                          * hat_pageunload() and the hment mutex while we
 464                          * call hat_pte_unmap()
 465                          */
 466                         ++ht->ht_busy;
 467                         HTABLE_EXIT(h);
 468 
 469                         /*
 470                          * Try stealing.
 471                          * - unload and invalidate all PTEs
 472                          */
 473                         for (e = 0, va = ht->ht_vaddr;
 474                             e < HTABLE_NUM_PTES(ht) && ht->ht_valid_cnt > 0 &&
 475                             ht->ht_busy == 1 && ht->ht_lock_cnt == 0;
 476                             ++e, va += MMU_PAGESIZE) {
 477                                 pte = x86pte_get(ht, e);
 478                                 if (!PTE_ISVALID(pte))
 479                                         continue;
 480                                 hat_pte_unmap(ht, e, HAT_UNLOAD, pte, NULL,
 481                                     B_TRUE);
 482                         }
 483 
 484                         /*
 485                          * Reacquire htable lock. If we didn't remove all
 486                          * mappings in the table, or another thread added a new
 487                          * mapping behind us, give up on this table.
 488                          */
 489                         HTABLE_ENTER(h);
 490                         if (ht->ht_busy != 1 || ht->ht_valid_cnt != 0 ||
 491                             ht->ht_lock_cnt != 0) {
 492                                 --ht->ht_busy;
 493                                 continue;
 494                         }
 495 
 496                         /*
 497                          * Steal it and unlink the page table.
 498                          */
 499                         higher = ht->ht_parent;
 500                         unlink_ptp(higher, ht, ht->ht_vaddr);
 501 
 502                         /*
 503                          * remove from the hash list
 504                          */
 505                         if (ht->ht_next)
 506                                 ht->ht_next->ht_prev = ht->ht_prev;
 507 
 508                         if (ht->ht_prev) {
 509                                 ht->ht_prev->ht_next = ht->ht_next;
 510                         } else {
 511                                 ASSERT(hat->hat_ht_hash[h] == ht);
 512                                 hat->hat_ht_hash[h] = ht->ht_next;
 513                         }
 514 
 515                         /*
 516                          * Break to outer loop to release the
 517                          * higher (ht_parent) pagetable. This
 518                          * spreads out the pain caused by
 519                          * pagefaults.
 520                          */
 521                         ht->ht_next = *list;
 522                         *list = ht;
 523                         ++*stolen;
 524                         break;
 525                 }
 526                 HTABLE_EXIT(h);
 527                 if (higher != NULL)
 528                         htable_release(higher);
 529                 if (++h == hat->hat_num_hash)
 530                         h = 0;
 531         } while (*stolen < cnt && h != h_start);
 532 }
 533 
 534 /*
 535  * Move hat to the end of the kas list
 536  */
 537 static void
 538 move_victim(hat_t *hat)
 539 {
 540         ASSERT(MUTEX_HELD(&hat_list_lock));
 541 
 542         /* unlink victim hat */
 543         if (hat->hat_prev)
 544                 hat->hat_prev->hat_next = hat->hat_next;
 545         else
 546                 kas.a_hat->hat_next = hat->hat_next;
 547 
 548         if (hat->hat_next)
 549                 hat->hat_next->hat_prev = hat->hat_prev;
 550         else
 551                 kas.a_hat->hat_prev = hat->hat_prev;
 552         /* relink at end of hat list */
 553         hat->hat_next = NULL;
 554         hat->hat_prev = kas.a_hat->hat_prev;
 555         if (hat->hat_prev)
 556                 hat->hat_prev->hat_next = hat;
 557         else
 558                 kas.a_hat->hat_next = hat;
 559 
 560         kas.a_hat->hat_prev = hat;
 561 }
 562 
 563 /*
 564  * This routine steals htables from user processes.  Called by htable_reap
 565  * (reap=TRUE) or htable_alloc (reap=FALSE).
 566  */
 567 static htable_t *
 568 htable_steal(uint_t cnt, boolean_t reap)
 569 {
 570         hat_t           *hat = kas.a_hat;       /* list starts with khat */
 571         htable_t        *list = NULL;
 572         htable_t        *ht;
 573         uint_t          stolen = 0;
 574         uint_t          pass, passes;
 575         uint_t          threshold;
 576 
 577         /*
 578          * Limit htable_steal_passes to something reasonable
 579          */
 580         if (htable_steal_passes == 0)
 581                 htable_steal_passes = 1;
 582         if (htable_steal_passes > mmu.ptes_per_table)
 583                 htable_steal_passes = mmu.ptes_per_table;
 584 
 585         /*
 586          * If we're stealing merely as part of kmem reaping (versus stealing
 587          * to assure forward progress), we don't want to actually steal any
 588          * active htables.  (Stealing active htables merely to give memory
 589          * back to the system can inadvertently kick off an htable crime wave
 590          * as active processes repeatedly steal htables from one another,
 591          * plummeting the system into a kind of HAT lawlessness that can
 592          * become so violent as to impede the one thing that can end it:  the
 593          * freeing of memory via ARC reclaim and other means.)  So if we're
 594          * reaping, we limit ourselves to the first pass that steals cached
 595          * htables that aren't in use -- which gives memory back, but averts
 596          * the entire breakdown of social order.
 597          */
 598         passes = reap ? 0 : htable_steal_passes;
 599 
 600         /*
 601          * Loop through all user hats. The 1st pass takes cached htables that
 602          * aren't in use. The later passes steal by removing mappings, too.
 603          */
 604         atomic_inc_32(&htable_dont_cache);
 605         for (pass = 0; pass <= passes && stolen < cnt; ++pass) {
 606                 threshold = pass * mmu.ptes_per_table / htable_steal_passes;
 607 
 608                 mutex_enter(&hat_list_lock);
 609 
 610                 /* skip the first hat (kernel) */
 611                 hat = kas.a_hat->hat_next;
 612                 for (;;) {
 613                         /*
 614                          * Skip any hat that is already being stolen from.
 615                          *
 616                          * We skip SHARED hats, as these are dummy
 617                          * hats that host ISM shared page tables.
 618                          *
 619                          * We also skip if HAT_FREEING because hat_pte_unmap()
 620                          * won't zero out the PTE's. That would lead to hitting
 621                          * stale PTEs either here or under hat_unload() when we
 622                          * steal and unload the same page table in competing
 623                          * threads.
 624                          */
 625                         while (hat != NULL &&
 626                             (hat->hat_flags &
 627                             (HAT_VICTIM | HAT_SHARED | HAT_FREEING)) != 0)
 628                                 hat = hat->hat_next;
 629 
 630                         if (hat == NULL)
 631                                 break;
 632 
 633                         /*
 634                          * Mark the HAT as a stealing victim so that it is
 635                          * not freed from under us, e.g. in as_free()
 636                          */
 637                         hat->hat_flags |= HAT_VICTIM;
 638                         mutex_exit(&hat_list_lock);
 639 
 640                         /*
 641                          * Take any htables from the hat's cached "free" list.
 642                          */
 643                         hat_enter(hat);
 644                         while ((ht = hat->hat_ht_cached) != NULL &&
 645                             stolen < cnt) {
 646                                 hat->hat_ht_cached = ht->ht_next;
 647                                 ht->ht_next = list;
 648                                 list = ht;
 649                                 ++stolen;
 650                         }
 651                         hat_exit(hat);
 652 
 653                         /*
 654                          * Don't steal active htables on first pass.
 655                          */
 656                         if (pass != 0 && (stolen < cnt))
 657                                 htable_steal_active(hat, cnt, threshold,
 658                                     &stolen, &list);
 659 
 660                         /*
 661                          * do synchronous teardown for the reap case so that
 662                          * we can forget hat; at this time, hat is
 663                          * guaranteed to be around because HAT_VICTIM is set
 664                          * (see htable_free() for similar code)
 665                          */
 666                         for (ht = list; (ht) && (reap); ht = ht->ht_next) {
 667                                 if (ht->ht_hat == NULL)
 668                                         continue;
 669                                 ASSERT(ht->ht_hat == hat);
 670 #if defined(__xpv) && defined(__amd64)
 671                                 if (!(ht->ht_flags & HTABLE_VLP) &&
 672                                     ht->ht_level == mmu.max_level) {
 673                                         ptable_free(hat->hat_user_ptable);
 674                                         hat->hat_user_ptable = PFN_INVALID;
 675                                 }
 676 #endif
 677                                 /*
 678                                  * forget the hat
 679                                  */
 680                                 ht->ht_hat = NULL;
 681                         }
 682 
 683                         mutex_enter(&hat_list_lock);
 684 
 685                         /*
 686                          * Are we finished?
 687                          */
 688                         if (stolen == cnt) {
 689                                 /*
 690                                  * Try to spread the pain of stealing,
 691                                  * move victim HAT to the end of the HAT list.
 692                                  */
 693                                 if (pass >= 1 && cnt == 1 &&
 694                                     kas.a_hat->hat_prev != hat)
 695                                         move_victim(hat);
 696                                 /*
 697                                  * We are finished
 698                                  */
 699                         }
 700 
 701                         /*
 702                          * Clear the victim flag, hat can go away now (once
 703                          * the lock is dropped)
 704                          */
 705                         if (hat->hat_flags & HAT_VICTIM) {
 706                                 ASSERT(hat != kas.a_hat);
 707                                 hat->hat_flags &= ~HAT_VICTIM;
 708                                 cv_broadcast(&hat_list_cv);
 709                         }
 710 
 711                         /* move on to the next hat */
 712                         hat = hat->hat_next;
 713                 }
 714 
 715                 mutex_exit(&hat_list_lock);
 716 
 717         }
 718         ASSERT(!MUTEX_HELD(&hat_list_lock));
 719 
 720         atomic_dec_32(&htable_dont_cache);
 721         return (list);
 722 }
 723 
 724 /*
 725  * This is invoked from kmem when the system is low on memory.  We try
 726  * to free hments, htables, and ptables to improve the memory situation.
 727  */
 728 /*ARGSUSED*/
 729 static void
 730 htable_reap(void *handle)
 731 {
 732         uint_t          reap_cnt;
 733         htable_t        *list;
 734         htable_t        *ht;
 735 
 736         HATSTAT_INC(hs_reap_attempts);
 737         if (!can_steal_post_boot)
 738                 return;
 739 
 740         /*
 741          * Try to reap 5% of the page tables bounded by a maximum of
 742          * 5% of physmem and a minimum of 10.
 743          */
 744         reap_cnt = MAX(MIN(physmem / 20, active_ptables / 20), 10);
 745 
 746         /*
 747          * Note: htable_dont_cache should be set at the time of
 748          * invoking htable_free()
 749          */
 750         atomic_inc_32(&htable_dont_cache);
 751         /*
 752          * Let htable_steal() do the work, we just call htable_free()
 753          */
 754         XPV_DISALLOW_MIGRATE();
 755         list = htable_steal(reap_cnt, B_TRUE);
 756         XPV_ALLOW_MIGRATE();
 757         while ((ht = list) != NULL) {
 758                 list = ht->ht_next;
 759                 HATSTAT_INC(hs_reaped);
 760                 htable_free(ht);
 761         }
 762         atomic_dec_32(&htable_dont_cache);
 763 
 764         /*
 765          * Free up excess reserves
 766          */
 767         htable_adjust_reserve();
 768         hment_adjust_reserve();
 769 }
 770 
 771 /*
 772  * Allocate an htable, stealing one or using the reserve if necessary
 773  */
 774 static htable_t *
 775 htable_alloc(
 776         hat_t           *hat,
 777         uintptr_t       vaddr,
 778         level_t         level,
 779         htable_t        *shared)
 780 {
 781         htable_t        *ht = NULL;
 782         uint_t          is_vlp;
 783         uint_t          is_bare = 0;
 784         uint_t          need_to_zero = 1;
 785         int             kmflags = (can_steal_post_boot ? KM_NOSLEEP : KM_SLEEP);
 786 
 787         if (level < 0 || level > TOP_LEVEL(hat))
 788                 panic("htable_alloc(): level %d out of range\n", level);
 789 
 790         is_vlp = (hat->hat_flags & HAT_VLP) && level == VLP_LEVEL;
 791         if (is_vlp || shared != NULL)
 792                 is_bare = 1;
 793 
 794         /*
 795  * First try to reuse a cached htable from the hat_ht_cached field; this
 796  * avoids unnecessary trips through the kmem/page allocators.
 797          */
 798         if (hat->hat_ht_cached != NULL && !is_bare) {
 799                 hat_enter(hat);
 800                 ht = hat->hat_ht_cached;
 801                 if (ht != NULL) {
 802                         hat->hat_ht_cached = ht->ht_next;
 803                         need_to_zero = 0;
 804                         /* XX64 ASSERT() they're all zero somehow */
 805                         ASSERT(ht->ht_pfn != PFN_INVALID);
 806                 }
 807                 hat_exit(hat);
 808         }
 809 
 810         if (ht == NULL) {
 811                 /*
 812                  * Allocate an htable, possibly refilling the reserves.
 813                  */
 814                 if (USE_HAT_RESERVES()) {
 815                         ht = htable_get_reserve();
 816                 } else {
 817                         /*
 818                          * Donate successful htable allocations to the reserve.
 819                          */
 820                         for (;;) {
 821                                 ht = kmem_cache_alloc(htable_cache, kmflags);
 822                                 if (ht == NULL)
 823                                         break;
 824                                 ht->ht_pfn = PFN_INVALID;
 825                                 if (USE_HAT_RESERVES() ||
 826                                     htable_reserve_cnt >= htable_reserve_amount)
 827                                         break;
 828                                 htable_put_reserve(ht);
 829                         }
 830                 }
 831 
 832                 /*
 833                  * allocate a page for the hardware page table if needed
 834                  */
 835                 if (ht != NULL && !is_bare) {
 836                         ht->ht_hat = hat;
 837                         ht->ht_pfn = ptable_alloc((uintptr_t)ht);
 838                         if (ht->ht_pfn == PFN_INVALID) {
 839                                 if (USE_HAT_RESERVES())
 840                                         htable_put_reserve(ht);
 841                                 else
 842                                         kmem_cache_free(htable_cache, ht);
 843                                 ht = NULL;
 844                         }
 845                 }
 846         }
 847 
 848         /*
 849          * If allocations failed, kick off a kmem_reap() and resort to
 850  * htable_steal(). We may spin here if the system is very low on
 851          * memory. If the kernel itself has consumed all memory and kmem_reap()
 852          * can't free up anything, then we'll really get stuck here.
 853          * That should only happen in a system where the administrator has
 854          * misconfigured VM parameters via /etc/system.
 855          */
 856         while (ht == NULL && can_steal_post_boot) {
 857                 kmem_reap();
 858                 ht = htable_steal(1, B_FALSE);
 859                 HATSTAT_INC(hs_steals);
 860 
 861                 /*
 862                  * If we stole for a bare htable, release the pagetable page.
 863                  */
 864                 if (ht != NULL) {
 865                         if (is_bare) {
 866                                 ptable_free(ht->ht_pfn);
 867                                 ht->ht_pfn = PFN_INVALID;
 868 #if defined(__xpv) && defined(__amd64)
 869                         /*
 870                          * make stolen page table writable again in kpm
 871                          */
 872                         } else if (kpm_vbase && xen_kpm_page(ht->ht_pfn,
 873                             PT_VALID | PT_WRITABLE) < 0) {
 874                                 panic("failure making kpm r/w pfn=0x%lx",
 875                                     ht->ht_pfn);
 876 #endif
 877                         }
 878                 }
 879         }
 880 
 881         /*
 882          * All attempts to allocate or steal failed. This should only happen
 883          * if we run out of memory during boot, due perhaps to a huge
 884          * boot_archive. At this point there's no way to continue.
 885          */
 886         if (ht == NULL)
 887                 panic("htable_alloc(): couldn't steal\n");
 888 
 889 #if defined(__amd64) && defined(__xpv)
 890         /*
 891          * Under the 64-bit hypervisor, we have 2 top level page tables.
 892          * If this allocation fails, we'll resort to stealing.
 893          * We use the stolen page indirectly, by freeing the
 894          * stolen htable first.
 895          */
 896         if (level == mmu.max_level) {
 897                 for (;;) {
 898                         htable_t *stolen;
 899 
 900                         hat->hat_user_ptable = ptable_alloc((uintptr_t)ht + 1);
 901                         if (hat->hat_user_ptable != PFN_INVALID)
 902                                 break;
 903                         stolen = htable_steal(1, B_FALSE);
 904                         if (stolen == NULL)
 905                                 panic("2nd steal ptable failed\n");
 906                         htable_free(stolen);
 907                 }
 908                 block_zero_no_xmm(kpm_vbase + pfn_to_pa(hat->hat_user_ptable),
 909                     MMU_PAGESIZE);
 910         }
 911 #endif
 912 
 913         /*
 914          * Shared page tables have all entries locked and entries may not
 915          * be added or deleted.
 916          */
 917         ht->ht_flags = 0;
 918         if (shared != NULL) {
 919                 ASSERT(shared->ht_valid_cnt > 0);
 920                 ht->ht_flags |= HTABLE_SHARED_PFN;
 921                 ht->ht_pfn = shared->ht_pfn;
 922                 ht->ht_lock_cnt = 0;
 923                 ht->ht_valid_cnt = 0;                /* updated in hat_share() */
 924                 ht->ht_shares = shared;
 925                 need_to_zero = 0;
 926         } else {
 927                 ht->ht_shares = NULL;
 928                 ht->ht_lock_cnt = 0;
 929                 ht->ht_valid_cnt = 0;
 930         }
 931 
 932         /*
 933          * setup flags, etc. for VLP htables
 934          */
 935         if (is_vlp) {
 936                 ht->ht_flags |= HTABLE_VLP;
 937                 ASSERT(ht->ht_pfn == PFN_INVALID);
 938                 need_to_zero = 0;
 939         }
 940 
 941         /*
 942          * fill in the htable
 943          */
 944         ht->ht_hat = hat;
 945         ht->ht_parent = NULL;
 946         ht->ht_vaddr = vaddr;
 947         ht->ht_level = level;
 948         ht->ht_busy = 1;
 949         ht->ht_next = NULL;
 950         ht->ht_prev = NULL;
 951 
 952         /*
 953          * Zero out any freshly allocated page table
 954          */
 955         if (need_to_zero)
 956                 x86pte_zero(ht, 0, mmu.ptes_per_table);
 957 
 958 #if defined(__amd64) && defined(__xpv)
 959         if (!is_bare && kpm_vbase) {
 960                 (void) xen_kpm_page(ht->ht_pfn, PT_VALID);
 961                 if (level == mmu.max_level)
 962                         (void) xen_kpm_page(hat->hat_user_ptable, PT_VALID);
 963         }
 964 #endif
 965 
 966         return (ht);
 967 }
 968 
 969 /*
 970  * Free up an htable, either to a hat's cached list, the reserves or
 971  * back to kmem.
 972  */
 973 static void
 974 htable_free(htable_t *ht)
 975 {
 976         hat_t *hat = ht->ht_hat;
 977 
 978         /*
 979          * If the process isn't exiting, cache the free htable in the hat
 980          * structure. We always do this for the boot time reserve. We don't
 981          * do this if the hat is exiting or we are stealing/reaping htables.
 982          */
 983         if (hat != NULL &&
 984             !(ht->ht_flags & HTABLE_SHARED_PFN) &&
 985             (use_boot_reserve ||
 986             (!(hat->hat_flags & HAT_FREEING) && !htable_dont_cache))) {
 987                 ASSERT((ht->ht_flags & HTABLE_VLP) == 0);
 988                 ASSERT(ht->ht_pfn != PFN_INVALID);
 989                 hat_enter(hat);
 990                 ht->ht_next = hat->hat_ht_cached;
 991                 hat->hat_ht_cached = ht;
 992                 hat_exit(hat);
 993                 return;
 994         }
 995 
 996         /*
 997          * If we have a hardware page table, free it.
 998          * We don't free page tables that are accessed by sharing.
 999          */
1000         if (ht->ht_flags & HTABLE_SHARED_PFN) {
1001                 ASSERT(ht->ht_pfn != PFN_INVALID);
1002         } else if (!(ht->ht_flags & HTABLE_VLP)) {
1003                 ptable_free(ht->ht_pfn);
1004 #if defined(__amd64) && defined(__xpv)
1005                 if (ht->ht_level == mmu.max_level && hat != NULL) {
1006                         ptable_free(hat->hat_user_ptable);
1007                         hat->hat_user_ptable = PFN_INVALID;
1008                 }
1009 #endif
1010         }
1011         ht->ht_pfn = PFN_INVALID;
1012 
1013         /*
1014          * Free it or put into reserves.
1015          */
1016         if (USE_HAT_RESERVES() || htable_reserve_cnt < htable_reserve_amount) {
1017                 htable_put_reserve(ht);
1018         } else {
1019                 kmem_cache_free(htable_cache, ht);
1020                 htable_adjust_reserve();
1021         }
1022 }
1023 
1024 
1025 /*
1026  * This is called when a hat is being destroyed or swapped out. We reap all
1027  * the remaining htables in the hat cache. If destroying, all leftover
1028  * htables are also destroyed.
1029  *
1030  * We also don't need to invalidate any of the PTPs nor do any demapping.
1031  */
1032 void
1033 htable_purge_hat(hat_t *hat)
1034 {
1035         htable_t *ht;
1036         int h;
1037 
1038         /*
1039          * Purge the htable cache if just reaping.
1040          */
1041         if (!(hat->hat_flags & HAT_FREEING)) {
1042                 atomic_inc_32(&htable_dont_cache);
1043                 for (;;) {
1044                         hat_enter(hat);
1045                         ht = hat->hat_ht_cached;
1046                         if (ht == NULL) {
1047                                 hat_exit(hat);
1048                                 break;
1049                         }
1050                         hat->hat_ht_cached = ht->ht_next;
1051                         hat_exit(hat);
1052                         htable_free(ht);
1053                 }
1054                 atomic_dec_32(&htable_dont_cache);
1055                 return;
1056         }
1057 
1058         /*
1059          * if freeing, no locking is needed
1060          */
1061         while ((ht = hat->hat_ht_cached) != NULL) {
1062                 hat->hat_ht_cached = ht->ht_next;
1063                 htable_free(ht);
1064         }
1065 
1066         /*
1067          * walk thru the htable hash table and free all the htables in it.
1068          */
1069         for (h = 0; h < hat->hat_num_hash; ++h) {
1070                 while ((ht = hat->hat_ht_hash[h]) != NULL) {
1071                         if (ht->ht_next)
1072                                 ht->ht_next->ht_prev = ht->ht_prev;
1073 
1074                         if (ht->ht_prev) {
1075                                 ht->ht_prev->ht_next = ht->ht_next;
1076                         } else {
1077                                 ASSERT(hat->hat_ht_hash[h] == ht);
1078                                 hat->hat_ht_hash[h] = ht->ht_next;
1079                         }
1080                         htable_free(ht);
1081                 }
1082         }
1083 }
1084 
1085 /*
1086  * Unlink an entry for a table at vaddr and level out of the existing table
1087  * one level higher. We always hold the HTABLE_ENTER() mutex when doing this.
1088  */
1089 static void
1090 unlink_ptp(htable_t *higher, htable_t *old, uintptr_t vaddr)
1091 {
1092         uint_t          entry = htable_va2entry(vaddr, higher);
1093         x86pte_t        expect = MAKEPTP(old->ht_pfn, old->ht_level);
1094         x86pte_t        found;
1095         hat_t           *hat = old->ht_hat;
1096 
1097         ASSERT(higher->ht_busy > 0);
1098         ASSERT(higher->ht_valid_cnt > 0);
1099         ASSERT(old->ht_valid_cnt == 0);
1100         found = x86pte_cas(higher, entry, expect, 0);
1101 #ifdef __xpv
1102         /*
1103          * This is weird, but Xen apparently automatically unlinks empty
1104          * pagetables from the upper page table. So allow PTP to be 0 already.
1105          */
1106         if (found != expect && found != 0)
1107 #else
1108         if (found != expect)
1109 #endif
1110                 panic("Bad PTP found=" FMT_PTE ", expected=" FMT_PTE,
1111                     found, expect);
1112 
1113         /*
1114          * When a top level VLP page table entry changes, we must issue
1115          * a reload of cr3 on all processors.
1116          *
1117  * If we don't need to do that, then we still have to INVLPG against
1118          * an address covered by the inner page table, as the latest processors
1119          * have TLB-like caches for non-leaf page table entries.
1120          */
1121         if (!(hat->hat_flags & HAT_FREEING)) {
1122                 hat_tlb_inval(hat, (higher->ht_flags & HTABLE_VLP) ?
1123                     DEMAP_ALL_ADDR : old->ht_vaddr);
1124         }
1125 
1126         HTABLE_DEC(higher->ht_valid_cnt);
1127 }
1128 
1129 /*
1130  * Link an entry for a new table at vaddr and level into the existing table
1131  * one level higher. We always hold the HTABLE_ENTER() mutex when doing this.
1132  */
1133 static void
1134 link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr)
1135 {
1136         uint_t          entry = htable_va2entry(vaddr, higher);
1137         x86pte_t        newptp = MAKEPTP(new->ht_pfn, new->ht_level);
1138         x86pte_t        found;
1139 
1140         ASSERT(higher->ht_busy > 0);
1141 
1142         ASSERT(new->ht_level != mmu.max_level);
1143 
1144         HTABLE_INC(higher->ht_valid_cnt);
1145 
1146         found = x86pte_cas(higher, entry, 0, newptp);
1147         if ((found & ~PT_REF) != 0)
1148                 panic("HAT: ptp not 0, found=" FMT_PTE, found);
1149 
1150         /*
1151          * When any top level VLP page table entry changes, we must issue
1152          * a reload of cr3 on all processors using it.
1153  * We also need to do this for the kernel hat on a 32-bit PAE kernel.
1154          */
1155         if (
1156 #ifdef __i386
1157             (higher->ht_hat == kas.a_hat && higher->ht_level == VLP_LEVEL) ||
1158 #endif
1159             (higher->ht_flags & HTABLE_VLP))
1160                 hat_tlb_inval(higher->ht_hat, DEMAP_ALL_ADDR);
1161 }
1162 
1163 /*
1164  * Release a hold on an htable. If this is the last use and the pagetable
1165  * is empty we may want to free it, then recursively look at the pagetable
1166  * above it. The recursion is handled by the outer while() loop.
1167  *
1168  * On the metal, during process exit, we don't bother unlinking the tables from
1169  * upper level pagetables. They are instead handled in bulk by hat_free_end().
1170  * We can't do this on the hypervisor as we need the page table to be
1171  * implicitly unpinned before it goes to the free page lists. This can't
1172  * happen unless we fully unlink it from the page table hierarchy.
1173  */
1174 void
1175 htable_release(htable_t *ht)
1176 {
1177         uint_t          hashval;
1178         htable_t        *shared;
1179         htable_t        *higher;
1180         hat_t           *hat;
1181         uintptr_t       va;
1182         level_t         level;
1183 
1184         while (ht != NULL) {
1185                 shared = NULL;
1186                 for (;;) {
1187                         hat = ht->ht_hat;
1188                         va = ht->ht_vaddr;
1189                         level = ht->ht_level;
1190                         hashval = HTABLE_HASH(hat, va, level);
1191 
1192                         /*
1193                          * The common case is that this isn't the last use of
1194                          * an htable so we don't want to free the htable.
1195                          */
1196                         HTABLE_ENTER(hashval);
1197                         ASSERT(ht->ht_valid_cnt >= 0);
1198                         ASSERT(ht->ht_busy > 0);
1199                         if (ht->ht_valid_cnt > 0)
1200                                 break;
1201                         if (ht->ht_busy > 1)
1202                                 break;
1203                         ASSERT(ht->ht_lock_cnt == 0);
1204 
1205 #if !defined(__xpv)
1206                         /*
1207                          * we always release empty shared htables
1208                          */
1209                         if (!(ht->ht_flags & HTABLE_SHARED_PFN)) {
1210 
1211                                 /*
1212                                  * don't release if in address space tear down
1213                                  */
1214                                 if (hat->hat_flags & HAT_FREEING)
1215                                         break;
1216 
1217                                 /*
1218                                  * At and above max_page_level, free if it's for
1219                                  * a boot-time kernel mapping below kernelbase.
1220                                  */
1221                                 if (level >= mmu.max_page_level &&
1222                                     (hat != kas.a_hat || va >= kernelbase))
1223                                         break;
1224                         }
1225 #endif /* __xpv */
1226 
1227                         /*
1228                          * Remember if we destroy an htable that shares its PFN
1229                          * from elsewhere.
1230                          */
1231                         if (ht->ht_flags & HTABLE_SHARED_PFN) {
1232                                 ASSERT(shared == NULL);
1233                                 shared = ht->ht_shares;
1234                                 HATSTAT_INC(hs_htable_unshared);
1235                         }
1236 
1237                         /*
1238                          * Handle release of a table and freeing the htable_t.
1239                          * Unlink it from the table higher (ie. ht_parent).
1240                          */
1241                         higher = ht->ht_parent;
1242                         ASSERT(higher != NULL);
1243 
1244                         /*
1245                          * Unlink the pagetable.
1246                          */
1247                         unlink_ptp(higher, ht, va);
1248 
1249                         /*
1250                          * remove this htable from its hash list
1251                          */
1252                         if (ht->ht_next)
1253                                 ht->ht_next->ht_prev = ht->ht_prev;
1254 
1255                         if (ht->ht_prev) {
1256                                 ht->ht_prev->ht_next = ht->ht_next;
1257                         } else {
1258                                 ASSERT(hat->hat_ht_hash[hashval] == ht);
1259                                 hat->hat_ht_hash[hashval] = ht->ht_next;
1260                         }
1261                         HTABLE_EXIT(hashval);
1262                         htable_free(ht);
1263                         ht = higher;
1264                 }
1265 
1266                 ASSERT(ht->ht_busy >= 1);
1267                 --ht->ht_busy;
1268                 HTABLE_EXIT(hashval);
1269 
1270                 /*
1271                  * If we released a shared htable, do a release on the htable
1272                  * from which it shared
1273                  */
1274                 ht = shared;
1275         }
1276 }
1277 
1278 /*
1279  * Find the htable for the pagetable at the given level for the given address.
1280  * If found, acquires a hold that eventually needs to be htable_release()d.
1281  */
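     /*
      * A hypothetical usage sketch, using names from elsewhere in this file:
      *
      *    ht = htable_lookup(hat, vaddr, 0);
      *    if (ht != NULL) {
      *            pte = x86pte_get(ht, htable_va2entry(vaddr, ht));
      *            ...
      *            htable_release(ht);
      *    }
      */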
1282 htable_t *
1283 htable_lookup(hat_t *hat, uintptr_t vaddr, level_t level)
1284 {
1285         uintptr_t       base;
1286         uint_t          hashval;
1287         htable_t        *ht = NULL;
1288 
1289         ASSERT(level >= 0);
1290         ASSERT(level <= TOP_LEVEL(hat));
1291 
1292         if (level == TOP_LEVEL(hat)) {
1293 #if defined(__amd64)
1294                 /*
1295                  * 32 bit address spaces on 64 bit kernels need to check
1296                  * for overflow of the 32 bit address space
1297                  */
1298                 if ((hat->hat_flags & HAT_VLP) && vaddr >= ((uint64_t)1 << 32))
1299                         return (NULL);
1300 #endif
1301                 base = 0;
1302         } else {
1303                 base = vaddr & LEVEL_MASK(level + 1);
1304         }
1305 
1306         hashval = HTABLE_HASH(hat, base, level);
1307         HTABLE_ENTER(hashval);
1308         for (ht = hat->hat_ht_hash[hashval]; ht; ht = ht->ht_next) {
1309                 if (ht->ht_hat == hat &&
1310                     ht->ht_vaddr == base &&
1311                     ht->ht_level == level)
1312                         break;
1313         }
1314         if (ht)
1315                 ++ht->ht_busy;
1316 
1317         HTABLE_EXIT(hashval);
1318         return (ht);
1319 }
1320 
1321 /*
1322  * Acquires a hold on a known htable (from a locked hment entry).
1323  */
1324 void
1325 htable_acquire(htable_t *ht)
1326 {
1327         hat_t           *hat = ht->ht_hat;
1328         level_t         level = ht->ht_level;
1329         uintptr_t       base = ht->ht_vaddr;
1330         uint_t          hashval = HTABLE_HASH(hat, base, level);
1331 
1332         HTABLE_ENTER(hashval);
1333 #ifdef DEBUG
1334         /*
1335          * make sure the htable is there
1336          */
1337         {
1338                 htable_t        *h;
1339 
1340                 for (h = hat->hat_ht_hash[hashval];
1341                     h && h != ht;
1342                     h = h->ht_next)
1343                         ;
1344                 ASSERT(h == ht);
1345         }
1346 #endif /* DEBUG */
1347         ++ht->ht_busy;
1348         HTABLE_EXIT(hashval);
1349 }
1350 
1351 /*
1352  * Find the htable for the pagetable at the given level for the given address.
1353  * If found, acquires a hold that eventually needs to be htable_release()d.
1354  * If not found, the table is created.
1355  *
1356  * Since we can't hold a hash table mutex during allocation, we have to
1357  * drop it and redo the search on a create. Then we may have to free the newly
1358  * allocated htable if another thread raced in and created it ahead of us.
1359  */
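     /*
      * A hypothetical usage sketch: find or create the level 0 pagetable
      * covering vaddr and drop the hold when done with it:
      *
      *    ht = htable_create(hat, vaddr, 0, NULL);
      *    ... install PTEs ...
      *    htable_release(ht);
      */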
1360 htable_t *
1361 htable_create(
1362         hat_t           *hat,
1363         uintptr_t       vaddr,
1364         level_t         level,
1365         htable_t        *shared)
1366 {
1367         uint_t          h;
1368         level_t         l;
1369         uintptr_t       base;
1370         htable_t        *ht;
1371         htable_t        *higher = NULL;
1372         htable_t        *new = NULL;
1373 
1374         if (level < 0 || level > TOP_LEVEL(hat))
1375                 panic("htable_create(): level %d out of range\n", level);
1376 
1377         /*
1378          * Create the page tables in top down order.
1379          */
1380         for (l = TOP_LEVEL(hat); l >= level; --l) {
1381                 new = NULL;
1382                 if (l == TOP_LEVEL(hat))
1383                         base = 0;
1384                 else
1385                         base = vaddr & LEVEL_MASK(l + 1);
1386 
1387                 h = HTABLE_HASH(hat, base, l);
1388 try_again:
1389                 /*
1390                  * look up the htable at this level
1391                  */
1392                 HTABLE_ENTER(h);
1393                 if (l == TOP_LEVEL(hat)) {
1394                         ht = hat->hat_htable;
1395                 } else {
1396                         for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
1397                                 ASSERT(ht->ht_hat == hat);
1398                                 if (ht->ht_vaddr == base &&
1399                                     ht->ht_level == l)
1400                                         break;
1401                         }
1402                 }
1403 
1404                 /*
1405                  * if we found the htable, increment its busy cnt
1406                  * and if we had allocated a new htable, free it.
1407                  */
1408                 if (ht != NULL) {
1409                         /*
1410                          * If we find a pre-existing shared table, it must
1411                          * share from the same place.
1412                          */
1413                         if (l == level && shared && ht->ht_shares &&
1414                             ht->ht_shares != shared) {
1415                                 panic("htable shared from wrong place "
1416                                     "found htable=%p shared=%p",
1417                                     (void *)ht, (void *)shared);
1418                         }
1419                         ++ht->ht_busy;
1420                         HTABLE_EXIT(h);
1421                         if (new)
1422                                 htable_free(new);
1423                         if (higher != NULL)
1424                                 htable_release(higher);
1425                         higher = ht;
1426 
1427                 /*
1428  * if we didn't find it on the first search,
1429                  * allocate a new one and search again
1430                  */
1431                 } else if (new == NULL) {
1432                         HTABLE_EXIT(h);
1433                         new = htable_alloc(hat, base, l,
1434                             l == level ? shared : NULL);
1435                         goto try_again;
1436 
1437                 /*
1438                  * 2nd search and still not there, use "new" table
1439                  * Link new table into higher, when not at top level.
1440                  */
1441                 } else {
1442                         ht = new;
1443                         if (higher != NULL) {
1444                                 link_ptp(higher, ht, base);
1445                                 ht->ht_parent = higher;
1446                         }
1447                         ht->ht_next = hat->hat_ht_hash[h];
1448                         ASSERT(ht->ht_prev == NULL);
1449                         if (hat->hat_ht_hash[h])
1450                                 hat->hat_ht_hash[h]->ht_prev = ht;
1451                         hat->hat_ht_hash[h] = ht;
1452                         HTABLE_EXIT(h);
1453 
1454                         /*
1455                          * Note we don't do htable_release(higher).
1456                          * That happens recursively when "new" is removed by
1457                          * htable_release() or htable_steal().
1458                          */
1459                         higher = ht;
1460 
1461                         /*
1462                          * If we just created a new shared page table we
1463                          * increment the shared htable's busy count, so that
1464                          * it can't be the victim of a steal even if it's empty.
1465                          */
1466                         if (l == level && shared) {
1467                                 (void) htable_lookup(shared->ht_hat,
1468                                     shared->ht_vaddr, shared->ht_level);
1469                                 HATSTAT_INC(hs_htable_shared);
1470                         }
1471                 }
1472         }
1473 
1474         return (ht);
1475 }
1476 
1477 /*
1478  * Inherit initial pagetables from the boot program. On the 64-bit
1479  * hypervisor we also temporarily mark the p_index field of page table
1480  * pages, so we know not to try making them writable in seg_kpm.
1481  */
1482 void
1483 htable_attach(
1484         hat_t *hat,
1485         uintptr_t base,
1486         level_t level,
1487         htable_t *parent,
1488         pfn_t pfn)
1489 {
1490         htable_t        *ht;
1491         uint_t          h;
1492         uint_t          i;
1493         x86pte_t        pte;
1494         x86pte_t        *ptep;
1495         page_t          *pp;
1496         extern page_t   *boot_claim_page(pfn_t);
1497 
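             /*
              * During boot, htables come from the reserve pool; it is too
              * early in startup to allocate them from the htable kmem cache.
              */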
1498         ht = htable_get_reserve();
1499         if (level == mmu.max_level)
1500                 kas.a_hat->hat_htable = ht;
1501         ht->ht_hat = hat;
1502         ht->ht_parent = parent;
1503         ht->ht_vaddr = base;
1504         ht->ht_level = level;
1505         ht->ht_busy = 1;
1506         ht->ht_next = NULL;
1507         ht->ht_prev = NULL;
1508         ht->ht_flags = 0;
1509         ht->ht_pfn = pfn;
1510         ht->ht_lock_cnt = 0;
1511         ht->ht_valid_cnt = 0;
1512         if (parent != NULL)
1513                 ++parent->ht_busy;
1514 
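             /*
              * Insert the htable into the hat's hash of htables.
              */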
1515         h = HTABLE_HASH(hat, base, level);
1516         HTABLE_ENTER(h);
1517         ht->ht_next = hat->hat_ht_hash[h];
1518         ASSERT(ht->ht_prev == NULL);
1519         if (hat->hat_ht_hash[h])
1520                 hat->hat_ht_hash[h]->ht_prev = ht;
1521         hat->hat_ht_hash[h] = ht;
1522         HTABLE_EXIT(h);
1523 
1524         /*
1525          * make sure the page table physical page is not FREE
1526          */
1527         if (page_resv(1, KM_NOSLEEP) == 0)
1528                 panic("page_resv() failed in ptable alloc");
1529 
1530         pp = boot_claim_page(pfn);
1531         ASSERT(pp != NULL);
1532 
1533         /*
1534          * Page table pages that were allocated by dboot or
1535          * in very early startup didn't go through boot_mapin()
1536          * and so won't have vnode/offsets. Fix that here.
1537          */
1538         if (pp->p_vnode == NULL) {
1539                 /* match offset calculation in page_get_physical() */
1540                 u_offset_t offset = (uintptr_t)ht;
1541                 if (offset > kernelbase)
1542                         offset -= kernelbase;
1543                 offset <<= MMU_PAGESHIFT;
1544 #if defined(__amd64)
1545                 offset += mmu.hole_start;       /* something in VA hole */
1546 #else
1547                 offset += 1ULL << 40;             /* something > 4 Gig */
1548 #endif
1549                 ASSERT(page_exists(&kvp, offset) == NULL);
1550                 (void) page_hashin(pp, &kvp, offset, NULL);
1551         }
1552         page_downgrade(pp);
1553 #if defined(__xpv) && defined(__amd64)
1554         /*
1555          * Record in the page_t that this is a pagetable, for segkpm setup.
1556          */
1557         if (kpm_vbase)
1558                 pp->p_index = 1;
1559 #endif
1560 
1561         /*
1562          * Count valid mappings and recursively attach lower level pagetables.
1563          */
1564         ptep = kbm_remap_window(pfn_to_pa(pfn), 0);
1565         for (i = 0; i < HTABLE_NUM_PTES(ht); ++i) {
1566                 if (mmu.pae_hat)
1567                         pte = ptep[i];
1568                 else
1569                         pte = ((x86pte32_t *)ptep)[i];
1570                 if (!IN_HYPERVISOR_VA(base) && PTE_ISVALID(pte)) {
1571                         ++ht->ht_valid_cnt;
1572                         if (!PTE_ISPAGE(pte, level)) {
1573                                 htable_attach(hat, base, level - 1,
1574                                     ht, PTE2PFN(pte, level));
1575                                 ptep = kbm_remap_window(pfn_to_pa(pfn), 0);
1576                         }
1577                 }
1578                 base += LEVEL_SIZE(level);
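                     /* skip over the VA hole if we've reached its start */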
1579                 if (base == mmu.hole_start)
1580                         base = (mmu.hole_end + MMU_PAGEOFFSET) & MMU_PAGEMASK;
1581         }
1582 
1583         /*
1584          * As long as all the mappings we had were below kernel base
1585          * we can release the htable.
1586          */
1587         if (base < kernelbase)
1588                 htable_release(ht);
1589 }
1590 
1591 /*
1592  * Walk through a given htable looking for the first valid entry.  This
1593  * routine takes both a starting and ending address.  The starting address
1594  * is required to be within the htable provided by the caller, but there is
1595  * no such restriction on the ending address.
1596  *
1597  * If the routine finds a valid entry in the htable (at or beyond the
1598  * starting address), the PTE (and its address) will be returned.
1599  * This PTE may correspond to either a page or a pagetable - it is the
1600  * caller's responsibility to determine which.  If no valid entry is
1601  * found, 0 (an invalid PTE) and the next unexamined address will be
1602  * returned.
1603  *
1604  * The loop has been carefully coded for optimization.
1605  */
1606 static x86pte_t
1607 htable_scan(htable_t *ht, uintptr_t *vap, uintptr_t eaddr)
1608 {
1609         uint_t e;
1610         x86pte_t found_pte = (x86pte_t)0;
1611         caddr_t pte_ptr;
1612         caddr_t end_pte_ptr;
1613         int l = ht->ht_level;
1614         uintptr_t va = *vap & LEVEL_MASK(l);
1615         size_t pgsize = LEVEL_SIZE(l);
1616 
1617         ASSERT(va >= ht->ht_vaddr);
1618         ASSERT(va <= HTABLE_LAST_PAGE(ht));
1619 
1620         /*
1621          * Compute the starting index into the pagetable
1622          */
1623         e = htable_va2entry(va, ht);
1624 
1625         /*
1626          * The following page table scan code knows that the valid
1627          * bit of a PTE is in the lowest byte AND that x86 is little endian!!
1628          */
1629         pte_ptr = (caddr_t)x86pte_access_pagetable(ht, 0);
1630         end_pte_ptr = (caddr_t)PT_INDEX_PTR(pte_ptr, HTABLE_NUM_PTES(ht));
1631         pte_ptr = (caddr_t)PT_INDEX_PTR((x86pte_t *)pte_ptr, e);
1632         while (!PTE_ISVALID(*pte_ptr)) {
1633                 va += pgsize;
1634                 if (va >= eaddr)
1635                         break;
1636                 pte_ptr += mmu.pte_size;
1637                 ASSERT(pte_ptr <= end_pte_ptr);
1638                 if (pte_ptr == end_pte_ptr)
1639                         break;
1640         }
1641 
1642         /*
1643          * if we found a valid PTE, load the entire PTE
1644          */
1645         if (va < eaddr && pte_ptr != end_pte_ptr)
1646                 found_pte = GET_PTE((x86pte_t *)pte_ptr);
1647         x86pte_release_pagetable(ht);
1648 
1649 #if defined(__amd64)
1650         /*
1651          * deal with VA hole on amd64
1652          */
1653         if (l == mmu.max_level && va >= mmu.hole_start && va <= mmu.hole_end)
1654                 va = mmu.hole_end + va - mmu.hole_start;
1655 #endif /* __amd64 */
1656 
1657         *vap = va;
1658         return (found_pte);
1659 }
1660 
1661 /*
1662  * Find the address and htable for the first populated translation at or
1663  * above the given virtual address.  The caller may also specify an upper
1664  * limit to the address range to search.  Uses level information to quickly
1665  * skip unpopulated sections of virtual address spaces.
1666  *
1667  * If no translation is found, returns 0 and sets *htp to NULL. When found,
1668  * returns the PTE, sets *vaddr and *htp, and has a hold on the htable.
1669  */
1670 x86pte_t
1671 htable_walk(
1672         struct hat *hat,
1673         htable_t **htp,
1674         uintptr_t *vaddr,
1675         uintptr_t eaddr)
1676 {
1677         uintptr_t va = *vaddr;
1678         htable_t *ht;
1679         htable_t *prev = *htp;
1680         level_t l;
1681         level_t max_mapped_level;
1682         x86pte_t pte;
1683 
1684         ASSERT(eaddr > va);
1685 
1686         /*
1687          * If this is a user address, then we know we need not look beyond
1688          * kernelbase.
1689          */
1690         ASSERT(hat == kas.a_hat || eaddr <= kernelbase ||
1691             eaddr == HTABLE_WALK_TO_END);
1692         if (hat != kas.a_hat && eaddr == HTABLE_WALK_TO_END)
1693                 eaddr = kernelbase;
1694 
1695         /*
1696          * If we're coming in with a previous page table, search it first
1697          * without doing an htable_lookup(); this should be the common case.
1698          */
1699         if (prev) {
1700                 ASSERT(prev->ht_busy > 0);
1701                 ASSERT(prev->ht_vaddr <= va);
1702                 l = prev->ht_level;
1703                 if (va <= HTABLE_LAST_PAGE(prev)) {
1704                         pte = htable_scan(prev, &va, eaddr);
1705 
1706                         if (PTE_ISPAGE(pte, l)) {
1707                                 *vaddr = va;
1708                                 *htp = prev;
1709                                 return (pte);
1710                         }
1711                 }
1712 
1713                 /*
1714                  * We found nothing in the htable provided by the caller,
1715                  * so fall through and do the full search
1716                  */
1717                 htable_release(prev);
1718         }
1719 
1720         /*
1721          * Find the level of the largest pagesize used by this HAT.
1722          */
1723         if (hat->hat_ism_pgcnt > 0) {
1724                 max_mapped_level = mmu.umax_page_level;
1725         } else {
1726                 max_mapped_level = 0;
1727                 for (l = 1; l <= mmu.max_page_level; ++l)
1728                         if (hat->hat_pages_mapped[l] != 0)
1729                                 max_mapped_level = l;
1730         }
1731 
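             /*
              * Keep searching until we find a mapping, reach eaddr, or va wraps
              * past the end of the address space.
              */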
1732         while (va < eaddr && va >= *vaddr) {
1733                 /*
1734                  *  Find lowest table with any entry for given address.
1735                  */
1736                 for (l = 0; l <= TOP_LEVEL(hat); ++l) {
1737                         ht = htable_lookup(hat, va, l);
1738                         if (ht != NULL) {
1739                                 pte = htable_scan(ht, &va, eaddr);
1740                                 if (PTE_ISPAGE(pte, l)) {
1741                                         VERIFY(!IN_VA_HOLE(va));
1742                                         *vaddr = va;
1743                                         *htp = ht;
1744                                         return (pte);
1745                                 }
1746                                 htable_release(ht);
1747                                 break;
1748                         }
1749 
1750                         /*
1751                          * No htable at this level for the address. If there
1752                          * is no larger page size that could cover it, we can
1753                          * skip right to the start of the next page table.
1754                          */
1755                         ASSERT(l < TOP_LEVEL(hat));
1756                         if (l >= max_mapped_level) {
1757                                 va = NEXT_ENTRY_VA(va, l + 1);
1758                                 if (va >= eaddr)
1759                                         break;
1760                         }
1761                 }
1762         }
1763 
1764         *vaddr = 0;
1765         *htp = NULL;
1766         return (0);
1767 }
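     
     /*
      * A typical caller pattern (sketch) for walking every mapping in a
      * range with htable_walk() looks something like:
      *
      *         ht = NULL;
      *         for (va = start; va < end; va += LEVEL_SIZE(ht->ht_level)) {
      *                 pte = htable_walk(hat, &ht, &va, end);
      *                 if (ht == NULL)
      *                         break;
      *                 ... examine pte for the page at va ...
      *         }
      *         if (ht != NULL)
      *                 htable_release(ht);
      */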
1768 
1769 /*
1770  * Find the htable and page table entry index of the given virtual address
1771  * with pagesize at or below given level.
1772  * If not found returns NULL. When found, returns the htable, sets
1773  * entry, and has a hold on the htable.
1774  */
1775 htable_t *
1776 htable_getpte(
1777         struct hat *hat,
1778         uintptr_t vaddr,
1779         uint_t *entry,
1780         x86pte_t *pte,
1781         level_t level)
1782 {
1783         htable_t        *ht;
1784         level_t         l;
1785         uint_t          e;
1786 
1787         ASSERT(level <= mmu.max_page_level);
1788 
1789         for (l = 0; l <= level; ++l) {
1790                 ht = htable_lookup(hat, vaddr, l);
1791                 if (ht == NULL)
1792                         continue;
1793                 e = htable_va2entry(vaddr, ht);
1794                 if (entry != NULL)
1795                         *entry = e;
1796                 if (pte != NULL)
1797                         *pte = x86pte_get(ht, e);
1798                 return (ht);
1799         }
1800         return (NULL);
1801 }
1802 
1803 /*
1804  * Find the htable and page table entry index of the given virtual address.
1805  * There must be a valid page mapped at the given address.
1806  * If not found returns NULL. When found, returns the htable, sets
1807  * entry, and has a hold on the htable.
1808  */
1809 htable_t *
1810 htable_getpage(struct hat *hat, uintptr_t vaddr, uint_t *entry)
1811 {
1812         htable_t        *ht;
1813         uint_t          e;
1814         x86pte_t        pte;
1815 
1816         ht = htable_getpte(hat, vaddr, &e, &pte, mmu.max_page_level);
1817         if (ht == NULL)
1818                 return (NULL);
1819 
1820         if (entry)
1821                 *entry = e;
1822 
1823         if (PTE_ISPAGE(pte, ht->ht_level))
1824                 return (ht);
1825         htable_release(ht);
1826         return (NULL);
1827 }
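     
     /*
      * A minimal usage sketch: find the mapping for a va, read its PTE and
      * then drop the hold on the htable, e.g.
      *
      *         if ((ht = htable_getpage(hat, va, &entry)) != NULL) {
      *                 pte = x86pte_get(ht, entry);
      *                 ... use pte ...
      *                 htable_release(ht);
      *         }
      */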
1828 
1829 
1830 void
1831 htable_init()
1832 {
1833         /*
1834          * To save on kernel VA usage, we avoid debug information in 32 bit
1835          * kernels.
1836          */
1837 #if defined(__amd64)
1838         int     kmem_flags = KMC_NOHASH;
1839 #elif defined(__i386)
1840         int     kmem_flags = KMC_NOHASH | KMC_NODEBUG;
1841 #endif
1842 
1843         /*
1844          * initialize kmem caches
1845          */
1846         htable_cache = kmem_cache_create("htable_t",
1847             sizeof (htable_t), 0, NULL, NULL,
1848             htable_reap, NULL, hat_memload_arena, kmem_flags);
1849 }
1850 
1851 /*
1852  * get the pte index for the virtual address in the given htable's pagetable
1853  */
1854 uint_t
1855 htable_va2entry(uintptr_t va, htable_t *ht)
1856 {
1857         level_t l = ht->ht_level;
1858 
1859         ASSERT(va >= ht->ht_vaddr);
1860         ASSERT(va <= HTABLE_LAST_PAGE(ht));
1861         return ((va >> LEVEL_SHIFT(l)) & (HTABLE_NUM_PTES(ht) - 1));
1862 }
1863 
1864 /*
1865  * Given an htable and the index of a pte in it, return the virtual address
1866  * of the page.
1867  */
1868 uintptr_t
1869 htable_e2va(htable_t *ht, uint_t entry)
1870 {
1871         level_t l = ht->ht_level;
1872         uintptr_t va;
1873 
1874         ASSERT(entry < HTABLE_NUM_PTES(ht));
1875         va = ht->ht_vaddr + ((uintptr_t)entry << LEVEL_SHIFT(l));
1876 
1877         /*
1878          * Need to skip over any VA hole in top level table
1879          */
1880 #if defined(__amd64)
1881         if (ht->ht_level == mmu.max_level && va >= mmu.hole_start)
1882                 va += ((mmu.hole_end - mmu.hole_start) + 1);
1883 #endif
1884 
1885         return (va);
1886 }
1887 
1888 /*
1889  * The code uses compare and swap instructions to read/write PTE's to
1890  * avoid atomicity problems, since PTEs can be 8 bytes on 32 bit systems;
1891  * on 64 bit and non-PAE kernels a PTE load or store is naturally atomic.
1892  *
1893  * The combination of kpreempt_disable()/_enable() and the hci_mutex
1894  * is used to ensure that an interrupt won't overwrite a temporary mapping
1895  * while it's in use. If an interrupt thread tries to access a PTE, it will
1896  * yield briefly back to the pinned thread which holds the cpu's hci_mutex.
1897  */
1898 void
1899 x86pte_cpu_init(cpu_t *cpu)
1900 {
1901         struct hat_cpu_info *hci;
1902 
1903         hci = kmem_zalloc(sizeof (*hci), KM_SLEEP);
1904         mutex_init(&hci->hci_mutex, NULL, MUTEX_DEFAULT, NULL);
1905         cpu->cpu_hat_info = hci;
1906 }
1907 
1908 void
1909 x86pte_cpu_fini(cpu_t *cpu)
1910 {
1911         struct hat_cpu_info *hci = cpu->cpu_hat_info;
1912 
1913         kmem_free(hci, sizeof (*hci));
1914         cpu->cpu_hat_info = NULL;
1915 }
1916 
1917 #ifdef __i386
1918 /*
1919  * On 32 bit kernels, loading a 64 bit PTE is a little tricky
1920  */
1921 x86pte_t
1922 get_pte64(x86pte_t *ptr)
1923 {
1924         volatile uint32_t *p = (uint32_t *)ptr;
1925         x86pte_t t;
1926 
1927         ASSERT(mmu.pae_hat != 0);
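             /*
              * Read the low word, then the high word, then re-check the low
              * word; if it changed underneath us, retry until we read a
              * consistent 64 bit value.
              */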
1928         for (;;) {
1929                 t = p[0];
1930                 t |= (uint64_t)p[1] << 32;
1931                 if ((t & 0xffffffff) == p[0])
1932                         return (t);
1933         }
1934 }
1935 #endif /* __i386 */
1936 
1937 /*
1938  * Disable preemption and establish a mapping to the pagetable with the
1939  * given pfn. This is optimized for the case where it's the same
1940  * pfn as the one we last referenced from this CPU.
1941  */
1942 static x86pte_t *
1943 x86pte_access_pagetable(htable_t *ht, uint_t index)
1944 {
1945         /*
1946          * VLP pagetables are contained in the hat_t
1947          */
1948         if (ht->ht_flags & HTABLE_VLP)
1949                 return (PT_INDEX_PTR(ht->ht_hat->hat_vlp_ptes, index));
1950         return (x86pte_mapin(ht->ht_pfn, index, ht));
1951 }
1952 
1953 /*
1954  * map the given pfn into the page table window.
1955  */
1956 /*ARGSUSED*/
1957 x86pte_t *
1958 x86pte_mapin(pfn_t pfn, uint_t index, htable_t *ht)
1959 {
1960         x86pte_t *pteptr;
1961         x86pte_t pte = 0;
1962         x86pte_t newpte;
1963         int x;
1964 
1965         ASSERT(pfn != PFN_INVALID);
1966 
1967         if (!khat_running) {
1968                 caddr_t va = kbm_remap_window(pfn_to_pa(pfn), 1);
1969                 return (PT_INDEX_PTR(va, index));
1970         }
1971 
1972         /*
1973          * If kpm is available, use it.
1974          */
1975         if (kpm_vbase)
1976                 return (PT_INDEX_PTR(hat_kpm_pfn2va(pfn), index));
1977 
1978         /*
1979          * Disable preemption and grab the CPU's hci_mutex
1980          */
1981         kpreempt_disable();
1982         ASSERT(CPU->cpu_hat_info != NULL);
1983         mutex_enter(&CPU->cpu_hat_info->hci_mutex);
1984         x = PWIN_TABLE(CPU->cpu_id);
1985         pteptr = (x86pte_t *)PWIN_PTE_VA(x);
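             /*
              * Read the current window PTE so the remap and TLB flush below
              * can be skipped if the window already maps this pfn.
              */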
1986 #ifndef __xpv
1987         if (mmu.pae_hat)
1988                 pte = *pteptr;
1989         else
1990                 pte = *(x86pte32_t *)pteptr;
1991 #endif
1992 
1993         newpte = MAKEPTE(pfn, 0) | mmu.pt_global | mmu.pt_nx;
1994 
1995         /*
1996          * On bare metal (or in the xpv panic path) use a writable mapping.
1997          */
1998 #ifdef __xpv
1999         if (IN_XPV_PANIC())
2000 #endif
2001                 newpte |= PT_WRITABLE;
2002 
2003         if (!PTE_EQUIV(newpte, pte)) {
2004 
2005 #ifdef __xpv
2006                 if (!IN_XPV_PANIC()) {
2007                         xen_map(newpte, PWIN_VA(x));
2008                 } else
2009 #endif
2010                 {
2011                         XPV_ALLOW_PAGETABLE_UPDATES();
2012                         if (mmu.pae_hat)
2013                                 *pteptr = newpte;
2014                         else
2015                                 *(x86pte32_t *)pteptr = newpte;
2016                         XPV_DISALLOW_PAGETABLE_UPDATES();
2017                         mmu_tlbflush_entry((caddr_t)(PWIN_VA(x)));
2018                 }
2019         }
2020         return (PT_INDEX_PTR(PWIN_VA(x), index));
2021 }
2022 
2023 /*
2024  * Release access to a page table.
2025  */
2026 static void
2027 x86pte_release_pagetable(htable_t *ht)
2028 {
2029         /*
2030          * nothing to do for VLP htables
2031          */
2032         if (ht->ht_flags & HTABLE_VLP)
2033                 return;
2034 
2035         x86pte_mapout();
2036 }
2037 
2038 void
2039 x86pte_mapout(void)
2040 {
2041         if (kpm_vbase != NULL || !khat_running)
2042                 return;
2043 
2044         /*
2045          * Drop the CPU's hci_mutex and restore preemption.
2046          */
2047 #ifdef __xpv
2048         if (!IN_XPV_PANIC()) {
2049                 uintptr_t va;
2050 
2051                 /*
2052                  * We need to always clear the mapping in case a page
2053                  * that was once a page table page is ballooned out.
2054                  */
2055                 va = (uintptr_t)PWIN_VA(PWIN_TABLE(CPU->cpu_id));
2056                 (void) HYPERVISOR_update_va_mapping(va, 0,
2057                     UVMF_INVLPG | UVMF_LOCAL);
2058         }
2059 #endif
2060         mutex_exit(&CPU->cpu_hat_info->hci_mutex);
2061         kpreempt_enable();
2062 }
2063 
2064 /*
2065  * Atomic retrieval of a pagetable entry
2066  */
2067 x86pte_t
2068 x86pte_get(htable_t *ht, uint_t entry)
2069 {
2070         x86pte_t        pte;
2071         x86pte_t        *ptep;
2072 
2073         /*
2074          * Be careful that loading PAE entries in a 32 bit kernel is atomic.
2075          */
2076         ASSERT(entry < mmu.ptes_per_table);
2077         ptep = x86pte_access_pagetable(ht, entry);
2078         pte = GET_PTE(ptep);
2079         x86pte_release_pagetable(ht);
2080         return (pte);
2081 }
2082 
2083 /*
2084  * Atomic unconditional set of a page table entry; it returns the previous
2085  * value. For pre-existing mappings, if the PFN changes, then we don't care
2086  * about the old pte's REF / MOD bits. If the PFN remains the same, we leave
2087  * the MOD/REF bits unchanged.
2088  *
2089  * If asked to overwrite a link to a lower page table with a large page
2090  * mapping, this routine returns the special value of LPAGE_ERROR. This
2091  * allows the upper HAT layers to retry with a smaller mapping size.
2092  */
2093 x86pte_t
2094 x86pte_set(htable_t *ht, uint_t entry, x86pte_t new, void *ptr)
2095 {
2096         x86pte_t        old;
2097         x86pte_t        prev;
2098         x86pte_t        *ptep;
2099         level_t         l = ht->ht_level;
2100         x86pte_t        pfn_mask = (l != 0) ? PT_PADDR_LGPG : PT_PADDR;
2101         x86pte_t        n;
2102         uintptr_t       addr = htable_e2va(ht, entry);
2103         hat_t           *hat = ht->ht_hat;
2104 
2105         ASSERT(new != 0); /* don't use to invalidate a PTE, see x86pte_update */
2106         ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN));
2107         if (ptr == NULL)
2108                 ptep = x86pte_access_pagetable(ht, entry);
2109         else
2110                 ptep = ptr;
2111 
2112         /*
2113          * Install the new PTE. If remapping the same PFN, then
2114          * copy existing REF/MOD bits to new mapping.
2115          */
2116         do {
2117                 prev = GET_PTE(ptep);
2118                 n = new;
2119                 if (PTE_ISVALID(n) && (prev & pfn_mask) == (new & pfn_mask))
2120                         n |= prev & (PT_REF | PT_MOD);
2121 
2122                 /*
2123                  * Another thread may have installed this mapping already;
2124                  * flush the local TLB and be done.
2125                  */
2126                 if (prev == n) {
2127                         old = new;
2128 #ifdef __xpv
2129                         if (!IN_XPV_PANIC())
2130                                 xen_flush_va((caddr_t)addr);
2131                         else
2132 #endif
2133                                 mmu_tlbflush_entry((caddr_t)addr);
2134                         goto done;
2135                 }
2136 
2137                 /*
2138                  * Detect a collision: installing a large page mapping
2139                  * where a lower level page table already exists.
2140                  */
2141                 if (l > 0 && (prev & PT_VALID) && !(prev & PT_PAGESIZE)) {
2142                         old = LPAGE_ERROR;
2143                         goto done;
2144                 }
2145 
2146                 XPV_ALLOW_PAGETABLE_UPDATES();
2147                 old = CAS_PTE(ptep, prev, n);
2148                 XPV_DISALLOW_PAGETABLE_UPDATES();
2149         } while (old != prev);
2150 
2151         /*
2152          * Do a TLB demap if needed, i.e. if the old pte was valid.
2153          *
2154          * Note that a stale TLB writeback to the PTE here either can't happen
2155          * or doesn't matter. The PFN can only change for NOSYNC|NOCONSIST
2156          * mappings, but they were created with REF and MOD already set, so
2157          * no stale writeback will happen.
2158          *
2159          * Segmap is the only place where remaps happen on the same pfn and for
2160          * that we want to preserve the stale REF/MOD bits.
2161          */
2162         if (old & PT_REF)
2163                 hat_tlb_inval(hat, addr);
2164 
2165 done:
2166         if (ptr == NULL)
2167                 x86pte_release_pagetable(ht);
2168         return (old);
2169 }
2170 
2171 /*
2172  * Atomic compare and swap of a page table entry. No TLB invalidates are done.
2173  * This is used for links between pagetables of different levels.
2174  * Note we always create these links with dirty/access set, so they should
2175  * never change.
2176  */
2177 x86pte_t
2178 x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old, x86pte_t new)
2179 {
2180         x86pte_t        pte;
2181         x86pte_t        *ptep;
2182 #ifdef __xpv
2183         /*
2184          * We can't use writable pagetables for upper level tables, so fake it.
2185          */
2186         mmu_update_t t[2];
2187         int cnt = 1;
2188         int count;
2189         maddr_t ma;
2190 
2191         if (!IN_XPV_PANIC()) {
2192                 ASSERT(!(ht->ht_flags & HTABLE_VLP));    /* no VLP yet */
2193                 ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry));
2194                 t[0].ptr = ma | MMU_NORMAL_PT_UPDATE;
2195                 t[0].val = new;
2196 
2197 #if defined(__amd64)
2198                 /*
2199                  * On the 64-bit hypervisor we need to maintain the user mode
2200                  * top page table too.
2201                  */
2202                 if (ht->ht_level == mmu.max_level && ht->ht_hat != kas.a_hat) {
2203                         ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(
2204                             ht->ht_hat->hat_user_ptable), entry));
2205                         t[1].ptr = ma | MMU_NORMAL_PT_UPDATE;
2206                         t[1].val = new;
2207                         ++cnt;
2208                 }
2209 #endif  /* __amd64 */
2210 
2211                 if (HYPERVISOR_mmu_update(t, cnt, &count, DOMID_SELF))
2212                         panic("HYPERVISOR_mmu_update() failed");
2213                 ASSERT(count == cnt);
2214                 return (old);
2215         }
2216 #endif
2217         ptep = x86pte_access_pagetable(ht, entry);
2218         XPV_ALLOW_PAGETABLE_UPDATES();
2219         pte = CAS_PTE(ptep, old, new);
2220         XPV_DISALLOW_PAGETABLE_UPDATES();
2221         x86pte_release_pagetable(ht);
2222         return (pte);
2223 }
2224 
2225 /*
2226  * Invalidate a page table entry as long as it currently maps something that
2227  * matches the value determined by expect.
2228  *
2229  * If tlb is set, also invalidates any TLB entries.
2230  *
2231  * Returns the previous value of the PTE.
2232  */
2233 x86pte_t
2234 x86pte_inval(
2235         htable_t *ht,
2236         uint_t entry,
2237         x86pte_t expect,
2238         x86pte_t *pte_ptr,
2239         boolean_t tlb)
2240 {
2241         x86pte_t        *ptep;
2242         x86pte_t        oldpte;
2243         x86pte_t        found;
2244 
2245         ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN));
2246         ASSERT(ht->ht_level <= mmu.max_page_level);
2247 
2248         if (pte_ptr != NULL)
2249                 ptep = pte_ptr;
2250         else
2251                 ptep = x86pte_access_pagetable(ht, entry);
2252 
2253 #if defined(__xpv)
2254         /*
2255          * If exit()ing just use HYPERVISOR_mmu_update(), as we can't be racing
2256          * with anything else.
2257          */
2258         if ((ht->ht_hat->hat_flags & HAT_FREEING) && !IN_XPV_PANIC()) {
2259                 int count;
2260                 mmu_update_t t[1];
2261                 maddr_t ma;
2262 
2263                 oldpte = GET_PTE(ptep);
2264                 if (expect != 0 && (oldpte & PT_PADDR) != (expect & PT_PADDR))
2265                         goto done;
2266                 ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry));
2267                 t[0].ptr = ma | MMU_NORMAL_PT_UPDATE;
2268                 t[0].val = 0;
2269                 if (HYPERVISOR_mmu_update(t, 1, &count, DOMID_SELF))
2270                         panic("HYPERVISOR_mmu_update() failed");
2271                 ASSERT(count == 1);
2272                 goto done;
2273         }
2274 #endif /* __xpv */
2275 
2276         /*
2277          * Note that the loop is needed to handle changes due to h/w updating
2278          * of PT_MOD/PT_REF.
2279          */
2280         do {
2281                 oldpte = GET_PTE(ptep);
2282                 if (expect != 0 && (oldpte & PT_PADDR) != (expect & PT_PADDR))
2283                         goto done;
2284                 XPV_ALLOW_PAGETABLE_UPDATES();
2285                 found = CAS_PTE(ptep, oldpte, 0);
2286                 XPV_DISALLOW_PAGETABLE_UPDATES();
2287         } while (found != oldpte);
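             /*
              * If the hardware never set REF/MOD, the translation was never
              * loaded into a TLB, so the shootdown can be skipped.
              */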
2288         if (tlb && (oldpte & (PT_REF | PT_MOD)))
2289                 hat_tlb_inval(ht->ht_hat, htable_e2va(ht, entry));
2290 
2291 done:
2292         if (pte_ptr == NULL)
2293                 x86pte_release_pagetable(ht);
2294         return (oldpte);
2295 }
2296 
2297 /*
2298  * Change a page table entry if it currently matches the value in expect.
2299  */
2300 x86pte_t
2301 x86pte_update(
2302         htable_t *ht,
2303         uint_t entry,
2304         x86pte_t expect,
2305         x86pte_t new)
2306 {
2307         x86pte_t        *ptep;
2308         x86pte_t        found;
2309 
2310         ASSERT(new != 0);
2311         ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN));
2312         ASSERT(ht->ht_level <= mmu.max_page_level);
2313 
2314         ptep = x86pte_access_pagetable(ht, entry);
2315         XPV_ALLOW_PAGETABLE_UPDATES();
2316         found = CAS_PTE(ptep, expect, new);
2317         XPV_DISALLOW_PAGETABLE_UPDATES();
2318         if (found == expect) {
2319                 hat_tlb_inval(ht->ht_hat, htable_e2va(ht, entry));
2320 
2321                 /*
2322                  * When removing write permission *and* clearing the
2323                  * MOD bit, check if a write happened via a stale
2324                  * TLB entry before the TLB shootdown finished.
2325                  *
2326                  * If it did happen, simply re-enable write permission and
2327                  * act like the original CAS failed.
2328                  */
2329                 if ((expect & (PT_WRITABLE | PT_MOD)) == PT_WRITABLE &&
2330                     (new & (PT_WRITABLE | PT_MOD)) == 0 &&
2331                     (GET_PTE(ptep) & PT_MOD) != 0) {
2332                         do {
2333                                 found = GET_PTE(ptep);
2334                                 XPV_ALLOW_PAGETABLE_UPDATES();
2335                                 found =
2336                                     CAS_PTE(ptep, found, found | PT_WRITABLE);
2337                                 XPV_DISALLOW_PAGETABLE_UPDATES();
2338                         } while ((found & PT_WRITABLE) == 0);
2339                 }
2340         }
2341         x86pte_release_pagetable(ht);
2342         return (found);
2343 }
2344 
2345 #ifndef __xpv
2346 /*
2347  * Copy page tables - this is just a little more complicated than the
2348  * previous routines. Note that it's also not atomic! It also is never
2349  * used for VLP pagetables.
2350  */
2351 void
2352 x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
2353 {
2354         caddr_t src_va;
2355         caddr_t dst_va;
2356         size_t size;
2357         x86pte_t *pteptr;
2358         x86pte_t pte;
2359 
2360         ASSERT(khat_running);
2361         ASSERT(!(dest->ht_flags & HTABLE_VLP));
2362         ASSERT(!(src->ht_flags & HTABLE_VLP));
2363         ASSERT(!(src->ht_flags & HTABLE_SHARED_PFN));
2364         ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
2365 
2366         /*
2367          * Acquire access to the CPU pagetable windows for the dest and source.
2368          */
2369         dst_va = (caddr_t)x86pte_access_pagetable(dest, entry);
2370         if (kpm_vbase) {
2371                 src_va = (caddr_t)
2372                     PT_INDEX_PTR(hat_kpm_pfn2va(src->ht_pfn), entry);
2373         } else {
2374                 uint_t x = PWIN_SRC(CPU->cpu_id);
2375 
2376                 /*
2377                  * Finish defining the src pagetable mapping
2378                  */
2379                 src_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry);
2380                 pte = MAKEPTE(src->ht_pfn, 0) | mmu.pt_global | mmu.pt_nx;
2381                 pteptr = (x86pte_t *)PWIN_PTE_VA(x);
2382                 if (mmu.pae_hat)
2383                         *pteptr = pte;
2384                 else
2385                         *(x86pte32_t *)pteptr = pte;
2386                 mmu_tlbflush_entry((caddr_t)(PWIN_VA(x)));
2387         }
2388 
2389         /*
2390          * now do the copy
2391          */
2392         size = count << mmu.pte_size_shift;
2393         bcopy(src_va, dst_va, size);
2394 
2395         x86pte_release_pagetable(dest);
2396 }
2397 
2398 #else /* __xpv */
2399 
2400 /*
2401  * The hypervisor only supports writable pagetables at level 0, so we have
2402  * to install these 1 by 1 the slow way.
2403  */
2404 void
2405 x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
2406 {
2407         caddr_t src_va;
2408         x86pte_t pte;
2409 
2410         ASSERT(!IN_XPV_PANIC());
2411         src_va = (caddr_t)x86pte_access_pagetable(src, entry);
2412         while (count) {
2413                 if (mmu.pae_hat)
2414                         pte = *(x86pte_t *)src_va;
2415                 else
2416                         pte = *(x86pte32_t *)src_va;
2417                 if (pte != 0) {
2418                         set_pteval(pfn_to_pa(dest->ht_pfn), entry,
2419                             dest->ht_level, pte);
2420 #ifdef __amd64
2421                         if (dest->ht_level == mmu.max_level &&
2422                             htable_e2va(dest, entry) < HYPERVISOR_VIRT_END)
2423                                 set_pteval(
2424                                     pfn_to_pa(dest->ht_hat->hat_user_ptable),
2425                                     entry, dest->ht_level, pte);
2426 #endif
2427                 }
2428                 --count;
2429                 ++entry;
2430                 src_va += mmu.pte_size;
2431         }
2432         x86pte_release_pagetable(src);
2433 }
2434 #endif /* __xpv */
2435 
2436 /*
2437  * Zero page table entries - Note this doesn't use atomic stores!
2438  */
2439 static void
2440 x86pte_zero(htable_t *dest, uint_t entry, uint_t count)
2441 {
2442         caddr_t dst_va;
2443         size_t size;
2444 #ifdef __xpv
2445         int x;
2446         x86pte_t newpte;
2447 #endif
2448 
2449         /*
2450          * Map in the page table to be zeroed.
2451          */
2452         ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
2453         ASSERT(!(dest->ht_flags & HTABLE_VLP));
2454 
2455         /*
2456          * On the hypervisor we don't use x86pte_access_pagetable() since
2457          * in this case the page is not pinned yet.
2458          */
2459 #ifdef __xpv
2460         if (kpm_vbase == NULL) {
2461                 kpreempt_disable();
2462                 ASSERT(CPU->cpu_hat_info != NULL);
2463                 mutex_enter(&CPU->cpu_hat_info->hci_mutex);
2464                 x = PWIN_TABLE(CPU->cpu_id);
2465                 newpte = MAKEPTE(dest->ht_pfn, 0) | PT_WRITABLE;
2466                 xen_map(newpte, PWIN_VA(x));
2467                 dst_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry);
2468         } else
2469 #endif
2470                 dst_va = (caddr_t)x86pte_access_pagetable(dest, entry);
2471 
2472         size = count << mmu.pte_size_shift;
2473         ASSERT(size > BLOCKZEROALIGN);
2474 #ifdef __i386
2475         if (!is_x86_feature(x86_featureset, X86FSET_SSE2))
2476                 bzero(dst_va, size);
2477         else
2478 #endif
2479                 block_zero_no_xmm(dst_va, size);
2480 
2481 #ifdef __xpv
2482         if (kpm_vbase == NULL) {
2483                 xen_map(0, PWIN_VA(x));
2484                 mutex_exit(&CPU->cpu_hat_info->hci_mutex);
2485                 kpreempt_enable();
2486         } else
2487 #endif
2488                 x86pte_release_pagetable(dest);
2489 }
2490 
2491 /*
2492  * Called to ensure that all pagetables are in the system dump
2493  */
2494 void
2495 hat_dump(void)
2496 {
2497         hat_t *hat;
2498         uint_t h;
2499         htable_t *ht;
2500 
2501         /*
2502          * Dump all page tables
2503          */
2504         for (hat = kas.a_hat; hat != NULL; hat = hat->hat_next) {
2505                 for (h = 0; h < hat->hat_num_hash; ++h) {
2506                         for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
2507                                 if ((ht->ht_flags & HTABLE_VLP) == 0)
2508                                         dump_page(ht->ht_pfn);
2509                         }
2510                 }
2511         }
2512 }