/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2014 by Delphix. All rights reserved.
 */

#ifndef _VM_HTABLE_H
#define _VM_HTABLE_H

#ifdef  __cplusplus
extern "C" {
#endif

#if defined(__GNUC__) && defined(_ASM_INLINES) && defined(_KERNEL)
#include <asm/htable.h>
#endif

extern void atomic_andb(uint8_t *addr, uint8_t value);
extern void atomic_orb(uint8_t *addr, uint8_t value);
extern void atomic_inc16(uint16_t *addr);
extern void atomic_dec16(uint16_t *addr);
extern void mmu_tlbflush_entry(caddr_t addr);

/*
 * Each hardware page table has an htable_t describing it.
 *
 * We use a reference counter mechanism to detect when we can free an htable.
 * In the implementation the reference count is split into 2 separate counters:
 *
 *      ht_busy is a traditional reference count of uses of the htable pointer
 *
 *      ht_valid_cnt is a count of how many references are implied by valid
 *              PTE/PTP entries in the pagetable
 *
 * ht_busy is only incremented by htable_lookup() or htable_create()
 * while holding the appropriate hash_table mutex. While installing a new
 * valid PTE or PTP, in order to increment ht_valid_cnt a thread must have
 * done an htable_lookup() or htable_create() but not yet called
 * htable_release().
 *
 * htable_release(), while holding the mutex, can know that if
 * busy == 1 and valid_cnt == 0, the htable can be freed.
 *
 * The fields have been ordered to make htable_lookup() fast. Hence,
 * ht_hat, ht_vaddr, ht_level and ht_next need to be clustered together.
 */
struct htable {
        struct htable   *ht_next;       /* forward link for hash table */
        struct hat      *ht_hat;        /* hat this mapping comes from */
        uintptr_t       ht_vaddr;       /* virt addr at start of this table */
        int8_t          ht_level;       /* page table level: 0=4K, 1=2M, ... */
        uint8_t         ht_flags;       /* see below */
        int16_t         ht_busy;        /* implements locking protocol */
        int16_t         ht_valid_cnt;   /* # of valid entries in this table */
        uint32_t        ht_lock_cnt;    /* # of locked entries in this table */
                                        /* never used for kernel hat */
        pfn_t           ht_pfn;         /* pfn of page of the pagetable */
        struct htable   *ht_prev;       /* backward link for hash table */
        struct htable   *ht_parent;     /* htable that points to this htable */
        struct htable   *ht_shares;     /* for HTABLE_SHARED_PFN only */
};
typedef struct htable htable_t;
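
/*
 * To illustrate the two counters described above, a simplified sketch of
 * the decision htable_release() makes (not the actual implementation,
 * which also deals with hash bucket unlinking and the reserve lists):
 *
 *      mutex_enter(...the bucket's hash_table mutex...);
 *      if (ht->ht_busy == 1 && ht->ht_valid_cnt == 0) {
 *              ... unlink ht from its hash bucket and free it ...
 *      } else {
 *              ht->ht_busy--;
 *      }
 *      mutex_exit(...);
 */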

/*
 * Flags values for htable ht_flags field:
 *
 * HTABLE_VLP - this is the top level htable of a VLP HAT.
 *
 * HTABLE_SHARED_PFN - this htable had its PFN assigned from sharing another
 *      htable. Used by hat_share() for ISM.
 */
#define HTABLE_VLP              (0x01)
#define HTABLE_SHARED_PFN       (0x02)

/*
 * The htable hash table hashing function.  The 28 is so that high
 * order bits are included in the hash index to skew the wrap
 * around of addresses. Even though the hash buckets are stored per
 * hat we include the value of the hat pointer in the hash function so
 * that the secondary hash for the htable mutex winds up being different in
 * every address space.
 */
#define HTABLE_HASH(hat, va, lvl)                                       \
        ((((va) >> LEVEL_SHIFT(1)) + ((va) >> 28) + (lvl) +             \
        ((uintptr_t)(hat) >> 4)) & ((hat)->hat_num_hash - 1))
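
/*
 * For illustration, a simplified sketch of how a lookup might walk a
 * hash bucket using HTABLE_HASH. This assumes hat->hat_ht_hash is the
 * per-hat bucket array; the real htable_lookup() also holds the bucket
 * mutex and marks the found htable busy:
 *
 *      h = HTABLE_HASH(hat, vaddr, level);
 *      for (ht = hat->hat_ht_hash[h]; ht != NULL; ht = ht->ht_next)
 *              if (ht->ht_hat == hat && ht->ht_vaddr == vaddr &&
 *                  ht->ht_level == level)
 *                      break;
 */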

/*
 * Each CPU gets a unique hat_cpu_info structure in cpu_hat_info.
 */
struct hat_cpu_info {
        kmutex_t hci_mutex;             /* mutex to ensure sequential usage */
#if defined(__amd64)
        pfn_t   hci_vlp_pfn;            /* pfn of hci_vlp_l3ptes */
        x86pte_t *hci_vlp_l3ptes;       /* VLP Level==3 pagetable (top) */
        x86pte_t *hci_vlp_l2ptes;       /* VLP Level==2 pagetable */
#endif  /* __amd64 */
};


/*
 * Compute the last page aligned VA mapped by an htable.
 *
 * Given a va and a level, compute the virtual address of the start of the
 * next page at that level.
 *
 * XX64 - The check for the VA hole needs to be better generalized.
 */
#if defined(__amd64)
#define HTABLE_NUM_PTES(ht)     (((ht)->ht_flags & HTABLE_VLP) ? 4 : 512)

#define HTABLE_LAST_PAGE(ht)                                            \
        ((ht)->ht_level == mmu.max_level ? ((uintptr_t)0UL - MMU_PAGESIZE) :\
        ((ht)->ht_vaddr - MMU_PAGESIZE +                                \
        ((uintptr_t)HTABLE_NUM_PTES(ht) << LEVEL_SHIFT((ht)->ht_level))))

#define NEXT_ENTRY_VA(va, l)                                            \
        ((va & LEVEL_MASK(l)) + LEVEL_SIZE(l) == mmu.hole_start ?       \
        mmu.hole_end : (va & LEVEL_MASK(l)) + LEVEL_SIZE(l))

#elif defined(__i386)

#define HTABLE_NUM_PTES(ht)     \
        (!mmu.pae_hat ? 1024 : ((ht)->ht_level == 2 ? 4 : 512))

#define HTABLE_LAST_PAGE(ht)    ((ht)->ht_vaddr - MMU_PAGESIZE +        \
        ((uintptr_t)HTABLE_NUM_PTES(ht) << LEVEL_SHIFT((ht)->ht_level)))

#define NEXT_ENTRY_VA(va, l) ((va & LEVEL_MASK(l)) + LEVEL_SIZE(l))

#endif
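
/*
 * Example (a sketch): stepping va through successive level "l" entries
 * up to eaddr; on amd64, NEXT_ENTRY_VA skips over the VA hole:
 *
 *      while (va < eaddr) {
 *              ... operate on the entry mapping va ...
 *              va = NEXT_ENTRY_VA(va, l);
 *      }
 */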

#if defined(_KERNEL)

/*
 * initialization function called from hat_init()
 */
extern void htable_init(void);

/*
 * Functions to lookup, or "lookup and create", the htable corresponding
 * to the virtual address "vaddr" in the "hat" at the given "level" of
 * page tables. htable_lookup() may return NULL if no such entry exists.
 *
 * On return the given htable is marked busy (a shared lock) - this prevents
 * the htable from being stolen or freed until htable_release() is called.
 *
 * If kalloc_flag is set on an htable_create() we can't call kmem allocation
 * routines for this htable, since it's for the kernel hat itself.
 *
 * htable_acquire() is used when an htable pointer has been extracted from
 * an hment and we need to get a reference to the htable.
 */
extern htable_t *htable_lookup(struct hat *hat, uintptr_t vaddr, level_t level);
extern htable_t *htable_create(struct hat *hat, uintptr_t vaddr, level_t level,
        htable_t *shared);
extern void htable_acquire(htable_t *);

extern void htable_release(htable_t *ht);
extern void htable_destroy(htable_t *ht);
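
/*
 * Typical usage (a sketch): find or build the level 0 pagetable that
 * covers vaddr, install a PTE through it, then drop the busy hold:
 *
 *      ht = htable_create(hat, vaddr, 0, NULL);
 *      ... install the new PTE, which increments ht_valid_cnt ...
 *      htable_release(ht);
 */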

/*
 * Code to free all remaining htables for a hat. Called after the hat is no
 * longer in use by any thread.
 */
extern void htable_purge_hat(struct hat *hat);

/*
 * Find the htable, page table entry index, and PTE of the given virtual
 * address.  If not found, returns NULL. When found, returns the htable_t *,
 * sets entry, and has a hold on the htable.
 */
extern htable_t *htable_getpte(struct hat *, uintptr_t, uint_t *, x86pte_t *,
        level_t);

/*
 * Similar to htable_getpte(), except that this only succeeds if a valid
 * page mapping is present.
 */
extern htable_t *htable_getpage(struct hat *hat, uintptr_t va, uint_t *entry);

/*
 * Called to allocate initial/additional htables for reserve.
 */
extern void htable_initial_reserve(uint_t);
extern void htable_reserve(uint_t);

/*
 * Used to readjust the htable reserve after the reserve list has been used.
 * Also called after boot to release leftover boot reserves.
 */
extern void htable_adjust_reserve(void);

/*
 * return the number of bytes mapped by all the htables in a given hat
 */
extern size_t htable_mapped(struct hat *);


/*
 * Attach initial pagetables as htables
 */
extern void htable_attach(struct hat *, uintptr_t, level_t, struct htable *,
    pfn_t);

/*
 * Routine to find the next populated htable at or above a given virtual
 * address. Can specify an upper limit, or HTABLE_WALK_TO_END to indicate
 * that it should search the entire address space.  Similar to
 * htable_getpte(), but used for walking through address ranges. It can be
 * used like this:
 *
 *      va = ...
 *      ht = NULL;
 *      while (va < end_va) {
 *              pte = htable_walk(hat, &ht, &va, end_va);
 *              if (!pte)
 *                      break;
 *
 *              ... code to operate on page at va ...
 *
 *              va += LEVEL_SIZE(ht->ht_level);
 *      }
 *      if (ht)
 *              htable_release(ht);
 *
 */
extern x86pte_t htable_walk(struct hat *hat, htable_t **ht, uintptr_t *va,
        uintptr_t eaddr);

#define HTABLE_WALK_TO_END ((uintptr_t)-1)

/*
 * Utilities to convert between virtual addresses and page table entry indices.
 */
extern uint_t htable_va2entry(uintptr_t va, htable_t *ht);
extern uintptr_t htable_e2va(htable_t *ht, uint_t entry);
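
/*
 * For an address that falls within the table, the two are effectively
 * inverses of each other, e.g. (a sketch):
 *
 *      entry = htable_va2entry(va, ht);
 *      ASSERT(htable_e2va(ht, entry) == (va & LEVEL_MASK(ht->ht_level)));
 */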

/*
 * Interfaces that provide access to page table entries via the htable.
 *
 * Note that all accesses except x86pte_copy() and x86pte_zero() are atomic.
 */
extern void     x86pte_cpu_init(cpu_t *);
extern void     x86pte_cpu_fini(cpu_t *);

extern x86pte_t x86pte_get(htable_t *, uint_t entry);

/*
 * x86pte_set returns LPAGE_ERROR if it's asked to overwrite a page table
 * link with a large page mapping.
 */
#define LPAGE_ERROR (-(x86pte_t)1)
extern x86pte_t x86pte_set(htable_t *, uint_t entry, x86pte_t new, void *);

extern x86pte_t x86pte_inval(htable_t *ht, uint_t entry,
        x86pte_t old, x86pte_t *ptr, boolean_t tlb);

extern x86pte_t x86pte_update(htable_t *ht, uint_t entry,
        x86pte_t old, x86pte_t new);

extern void     x86pte_copy(htable_t *src, htable_t *dest, uint_t entry,
        uint_t cnt);

/*
 * access to a pagetable knowing only the pfn
 */
extern x86pte_t *x86pte_mapin(pfn_t, uint_t, htable_t *);
extern void x86pte_mapout(void);
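
/*
 * For illustration, reading one PTE from a pagetable known only by its
 * pfn (a sketch; this assumes the returned pointer addresses the given
 * entry, and callers pass the owning htable when they have one):
 *
 *      ptep = x86pte_mapin(pfn, entry, ht);
 *      pte = *ptep;
 *      x86pte_mapout();
 */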

/*
 * these are actually inlines for "lock; incw", "lock; decw", etc. instructions.
 */
#define HTABLE_INC(x)   atomic_inc16((uint16_t *)&x)
#define HTABLE_DEC(x)   atomic_dec16((uint16_t *)&x)
#define HTABLE_LOCK_INC(ht)     atomic_inc_32(&(ht)->ht_lock_cnt)
#define HTABLE_LOCK_DEC(ht)     atomic_dec_32(&(ht)->ht_lock_cnt)

#ifdef __xpv
extern void xen_flush_va(caddr_t va);
extern void xen_gflush_va(caddr_t va, cpuset_t);
extern void xen_flush_tlb(void);
extern void xen_gflush_tlb(cpuset_t);
extern void xen_pin(pfn_t, level_t);
extern void xen_unpin(pfn_t);
extern int xen_kpm_page(pfn_t, uint_t);

/*
 * The hypervisor maps all page tables into our address space read-only.
 * Under normal circumstances, the hypervisor then handles all updates to
 * the page tables underneath the covers for us.  However, when we are
 * trying to dump core after a hypervisor panic, the hypervisor is no
 * longer available to do these updates.  To work around the protection
 * problem, we simply disable write-protect checking for the duration of a
 * pagetable update operation.
 */
#define XPV_ALLOW_PAGETABLE_UPDATES()                                   \
        {                                                               \
                if (IN_XPV_PANIC())                                     \
                        setcr0((getcr0() & ~CR0_WP) & 0xffffffff);      \
        }
#define XPV_DISALLOW_PAGETABLE_UPDATES()                                \
        {                                                               \
                if (IN_XPV_PANIC())                                     \
                        setcr0((getcr0() | CR0_WP) & 0xffffffff);       \
        }

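/*
 * For illustration, a pagetable store during a hypervisor panic dump
 * would be bracketed like this (a sketch):
 *
 *      XPV_ALLOW_PAGETABLE_UPDATES();
 *      ... write the new PTE value ...
 *      XPV_DISALLOW_PAGETABLE_UPDATES();
 */
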
#else /* __xpv */

#define XPV_ALLOW_PAGETABLE_UPDATES()
#define XPV_DISALLOW_PAGETABLE_UPDATES()

#endif

#endif  /* _KERNEL */


#ifdef  __cplusplus
}
#endif

#endif  /* _VM_HTABLE_H */