/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 */

/*
 * UNIX machine dependent virtual memory support.
 */

#ifndef _VM_DEP_H
#define _VM_DEP_H

#ifdef  __cplusplus
extern "C" {
#endif

#include <vm/hat_sfmmu.h>
#include <sys/archsystm.h>
#include <sys/memnode.h>

#define GETTICK()       gettick()

/* tick value that should be used for random values */
extern u_longlong_t randtick(void);

/*
 * Per page size free lists. Allocated dynamically.
 */
#define MAX_MEM_TYPES   2       /* 0 = reloc, 1 = noreloc */
#define MTYPE_RELOC     0
#define MTYPE_NORELOC   1

#define PP_2_MTYPE(pp)  (PP_ISNORELOC(pp) ? MTYPE_NORELOC : MTYPE_RELOC)

#define MTYPE_INIT(mtype, vp, vaddr, flags, pgsz)                       \
        mtype = (flags & PG_NORELOC) ? MTYPE_NORELOC : MTYPE_RELOC;

/* mtype init for page_get_replacement_page */
#define MTYPE_PGR_INIT(mtype, flags, pp, pgcnt)                         \
        mtype = (flags & PG_NORELOC) ? MTYPE_NORELOC : MTYPE_RELOC;

#define MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi)                     \
        pfnlo = mem_node_config[mnode].physbase;                        \
        pfnhi = mem_node_config[mnode].physmax;

/*
 * candidate counters in vm_pagelist.c are indexed by color and range
 */
#define MAX_MNODE_MRANGES               MAX_MEM_TYPES
#define MNODE_RANGE_CNT(mnode)          MAX_MNODE_MRANGES
#define MNODE_MAX_MRANGE(mnode)         (MAX_MEM_TYPES - 1)
#define MTYPE_2_MRANGE(mnode, mtype)    (mtype)

/*
 * Internal PG_ flags.
 */
#define PGI_RELOCONLY   0x10000 /* acts in the opposite sense to PG_NORELOC */
#define PGI_NOCAGE      0x20000 /* indicates Cage is disabled */
#define PGI_PGCPHIPRI   0x40000 /* page_get_contig_page priority allocation */
#define PGI_PGCPSZC0    0x80000 /* relocate base pagesize page */

/*
 * PGI mtype flags - should not overlap PGI flags
 */
#define PGI_MT_RANGE    0x1000000       /* mtype range */
#define PGI_MT_NEXT     0x2000000       /* get next mtype */

extern page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];
extern page_t ***page_cachelists[MAX_MEM_TYPES];

#define PAGE_FREELISTS(mnode, szc, color, mtype) \
        (*(page_freelists[szc][mtype][mnode] + (color)))

#define PAGE_CACHELISTS(mnode, color, mtype) \
        (*(page_cachelists[mtype][mnode] + (color)))
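
/*
 * Illustrative sketch (not part of the kernel source) of how the freelist
 * indexing above resolves: page_freelists[szc][mtype] is an array indexed
 * by mnode, each element pointing to a per-color array of page_t list
 * heads, so PAGE_FREELISTS(mnode, szc, color, mtype) names one
 * color-specific list head.  The local variable names below are
 * hypothetical.
 *
 *	page_t **colorlist = page_freelists[szc][mtype][mnode];
 *	page_t *pp = colorlist[color];
 *	    (same list head that PAGE_FREELISTS(mnode, szc, color, mtype)
 *	    expands to)
 */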

/*
 * There are 'page_colors' colors/bins.  Spread them out under a
 * couple of locks.  There are mutexes for both the page freelist
 * and the page cachelist.  We want enough locks to make contention
 * reasonable, but not too many -- otherwise page_freelist_lock() gets
 * so expensive that it becomes the bottleneck!
 */
#define NPC_MUTEX       16

extern kmutex_t *fpc_mutex[NPC_MUTEX];
extern kmutex_t *cpc_mutex[NPC_MUTEX];

/*
 * The iterator provides the info needed to convert RA to PA.
 * MEM_NODE_ITERATOR_INIT() should be called before
 * PAGE_NEXT_PFN_FOR_COLOR() if pfn was not obtained via a previous
 * PAGE_NEXT_PFN_FOR_COLOR() call. The iterator caches the color-to-hash
 * translations, so the initializer must be called again if color or
 * ceq_mask changes, even if pfn doesn't. MEM_NODE_ITERATOR_INIT() must
 * also be called before any PFN_2_COLOR() call that passes a valid
 * iterator argument.
 *
 * In the continuation case, plat_mem_node_iterator_init() starts from the
 * last mblock visited, which may have become invalid because of a memory
 * DR operation.  To detect this situation, mi_genid is checked against
 * mpo_genid, which is incremented after each memory DR operation.  See
 * also plat_slice_add()/plat_slice_del().
 */
#ifdef  sun4v

typedef struct mem_node_iterator {
        uint_t mi_mnode;                /* mnode in which to iterate */
        int mi_init;                    /* set to 1 when first init */
        int mi_genid;                   /* set/checked against mpo_genid */
        int mi_last_mblock;             /* last mblock visited */
        uint_t mi_hash_ceq_mask;        /* cached copy of ceq_mask */
        uint_t mi_hash_color;           /* cached copy of color */
        uint_t mi_mnode_mask;           /* number of mask bits */
        uint_t mi_mnode_pfn_shift;      /* mnode position in pfn */
        pfn_t mi_mblock_base;           /* first valid pfn in current mblock */
        pfn_t mi_mblock_end;            /* last valid pfn in current mblock */
        pfn_t mi_ra_to_pa;              /* ra adjustment for current mblock */
        pfn_t mi_mnode_pfn_mask;        /* mask to obtain mnode id bits */
} mem_node_iterator_t;

#define MEM_NODE_ITERATOR_DECL(it) \
        mem_node_iterator_t it
#define MEM_NODE_ITERATOR_INIT(pfn, mnode, szc, it) \
        (pfn) = plat_mem_node_iterator_init((pfn), (mnode), (szc), (it), 1)

extern pfn_t plat_mem_node_iterator_init(pfn_t, int, uchar_t,
    mem_node_iterator_t *, int);
extern pfn_t plat_rapfn_to_papfn(pfn_t);
extern int interleaved_mnodes;

#else   /* sun4v */

#define MEM_NODE_ITERATOR_DECL(it) \
        void *it = NULL
#define MEM_NODE_ITERATOR_INIT(pfn, mnode, szc, it)

#endif  /* sun4v */

/*
 * Return the mnode limits so that hpm_counters length and base
 * index can be determined. When interleaved_mnodes is set, we
 * create an array only for the first mnode that exists. All other
 * mnodes will share the array in this case.
 * If interleaved_mnodes is not set, simply return the limits for
 * the given mnode.
 */
#define HPM_COUNTERS_LIMITS(mnode, physbase, physmax, first)            \
        if (!interleaved_mnodes) {                                      \
                (physbase) = mem_node_config[(mnode)].physbase;         \
                (physmax) = mem_node_config[(mnode)].physmax;           \
                (first) = (mnode);                                      \
        } else if ((first) < 0) {                                       \
                mem_node_max_range(&(physbase), &(physmax));            \
                (first) = (mnode);                                      \
        }

#define PAGE_CTRS_WRITE_LOCK(mnode)                                     \
        if (!interleaved_mnodes) {                                      \
                rw_enter(&page_ctrs_rwlock[(mnode)], RW_WRITER);        \
                page_freelist_lock(mnode);                              \
        } else {                                                        \
                /* changing shared hpm_counters */                      \
                int _i;                                                 \
                for (_i = 0; _i < max_mem_nodes; _i++) {                \
                        rw_enter(&page_ctrs_rwlock[_i], RW_WRITER);     \
                        page_freelist_lock(_i);                         \
                }                                                       \
        }

#define PAGE_CTRS_WRITE_UNLOCK(mnode)                                   \
        if (!interleaved_mnodes) {                                      \
                page_freelist_unlock(mnode);                            \
                rw_exit(&page_ctrs_rwlock[(mnode)]);                    \
        } else {                                                        \
                int _i;                                                 \
                for (_i = 0; _i < max_mem_nodes; _i++) {                \
                        page_freelist_unlock(_i);                       \
                        rw_exit(&page_ctrs_rwlock[_i]);                 \
                }                                                       \
        }
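
/*
 * Usage sketch (illustrative only): callers that resize or re-index the
 * hpm_counters arrays bracket the update with these macros, so that either
 * the one affected mnode or, in the interleaved case, every mnode is
 * write-locked for the duration.
 *
 *	PAGE_CTRS_WRITE_LOCK(mnode);
 *	... re-index or resize the counters for mnode ...
 *	PAGE_CTRS_WRITE_UNLOCK(mnode);
 */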

/*
 * cpu specific color conversion functions
 */
extern uint_t page_get_nsz_color_mask_cpu(uchar_t, uint_t);
#pragma weak page_get_nsz_color_mask_cpu

extern uint_t page_get_nsz_color_cpu(uchar_t, uint_t);
#pragma weak page_get_nsz_color_cpu

extern uint_t page_get_color_shift_cpu(uchar_t, uchar_t);
#pragma weak page_get_color_shift_cpu

extern uint_t page_convert_color_cpu(uint_t, uchar_t, uchar_t);
#pragma weak page_convert_color_cpu

extern pfn_t page_next_pfn_for_color_cpu(pfn_t,
    uchar_t, uint_t, uint_t, uint_t, void *);
#pragma weak page_next_pfn_for_color_cpu

extern uint_t  page_pfn_2_color_cpu(pfn_t, uchar_t, void *);
#pragma weak page_pfn_2_color_cpu

#define PAGE_GET_COLOR_SHIFT(szc, nszc)                         \
        ((&page_get_color_shift_cpu != NULL) ?                  \
            page_get_color_shift_cpu(szc, nszc) :               \
            (hw_page_array[(nszc)].hp_shift -                   \
                hw_page_array[(szc)].hp_shift))

#define PAGE_CONVERT_COLOR(ncolor, szc, nszc)                   \
        ((&page_convert_color_cpu != NULL) ?                    \
            page_convert_color_cpu(ncolor, szc, nszc) :         \
            ((ncolor) << PAGE_GET_COLOR_SHIFT((szc), (nszc))))

#define PFN_2_COLOR(pfn, szc, it)                               \
        ((&page_pfn_2_color_cpu != NULL) ?                      \
            page_pfn_2_color_cpu(pfn, szc, it) :                \
            ((pfn & (hw_page_array[0].hp_colors - 1)) >>        \
                (hw_page_array[szc].hp_shift -                  \
                    hw_page_array[0].hp_shift)))

#define PNUM_SIZE(szc)                                                  \
        (hw_page_array[(szc)].hp_pgcnt)
#define PNUM_SHIFT(szc)                                                 \
        (hw_page_array[(szc)].hp_shift - hw_page_array[0].hp_shift)
#define PAGE_GET_SHIFT(szc)                                             \
        (hw_page_array[(szc)].hp_shift)
#define PAGE_GET_PAGECOLORS(szc)                                        \
        (hw_page_array[(szc)].hp_colors)
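
/*
 * Worked example (illustrative, assuming 8K base pages and a 64K szc 1
 * page size, i.e. hp_shift values of 13 and 16): PNUM_SHIFT(1) is
 * 16 - 13 = 3, so a 64K page spans 1 << 3 == 8 base pages, and the
 * generic PAGE_GET_COLOR_SHIFT(0, 1) above likewise evaluates to 3,
 * dropping the low 3 color bits when converting a base-page color to a
 * 64K color.
 */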

/*
 * This macro calculates the next sequential pfn with the specified
 * color using the color equivalency mask.
 */
#define PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask, color_mask, it)   \
        {                                                                    \
                ASSERT(((color) & ~(ceq_mask)) == 0);                        \
                if (&page_next_pfn_for_color_cpu == NULL) {                  \
                        uint_t  pfn_shift = PAGE_BSZS_SHIFT(szc);            \
                        pfn_t   spfn = pfn >> pfn_shift;                     \
                        pfn_t   stride = (ceq_mask) + 1;                     \
                        ASSERT((((ceq_mask) + 1) & (ceq_mask)) == 0);        \
                        if (((spfn ^ (color)) & (ceq_mask)) == 0) {          \
                                pfn += stride << pfn_shift;                  \
                        } else {                                             \
                                pfn = (spfn & ~(pfn_t)(ceq_mask)) | (color); \
                                pfn = (pfn > spfn ? pfn : pfn + stride) <<   \
                                    pfn_shift;                               \
                        }                                                    \
                } else {                                                     \
                        pfn = page_next_pfn_for_color_cpu(pfn, szc, color,   \
                            ceq_mask, color_mask, it);                       \
                }                                                            \
        }
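
/*
 * Usage sketch (illustrative only; not lifted from vm_pagelist.c): walking
 * the pfns of a given color within an mnode.  The iterator must be
 * initialized before the first PAGE_NEXT_PFN_FOR_COLOR() call when pfn was
 * not produced by a previous call.  The variable names are hypothetical.
 *
 *	MEM_NODE_ITERATOR_DECL(it);
 *	pfn_t pfn = mem_node_config[mnode].physbase;
 *
 *	MEM_NODE_ITERATOR_INIT(pfn, mnode, szc, &it);
 *	while (pfn <= mem_node_config[mnode].physmax) {
 *		... examine the page at pfn ...
 *		PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask,
 *		    color_mask, &it);
 *	}
 */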

/* get the color equivalency mask for the next szc */
#define PAGE_GET_NSZ_MASK(szc, mask)                                         \
        ((&page_get_nsz_color_mask_cpu == NULL) ?                            \
            ((mask) >> (PAGE_GET_SHIFT((szc) + 1) - PAGE_GET_SHIFT(szc))) :  \
            page_get_nsz_color_mask_cpu(szc, mask))

/* get the color of the next szc */
#define PAGE_GET_NSZ_COLOR(szc, color)                                       \
        ((&page_get_nsz_color_cpu == NULL) ?                                 \
            ((color) >> (PAGE_GET_SHIFT((szc) + 1) - PAGE_GET_SHIFT(szc))) : \
            page_get_nsz_color_cpu(szc, color))

/* Find the bin for the given page if it was of size szc */
#define PP_2_BIN_SZC(pp, szc)   (PFN_2_COLOR(pp->p_pagenum, szc, (void *)(-1)))

#define PP_2_BIN(pp)            (PP_2_BIN_SZC(pp, pp->p_szc))

#define PP_2_MEM_NODE(pp)       (PFN_2_MEM_NODE(pp->p_pagenum))

#define PC_BIN_MUTEX(mnode, bin, flags) ((flags & PG_FREE_LIST) ?       \
        &fpc_mutex[(bin) & (NPC_MUTEX - 1)][mnode] :                    \
        &cpc_mutex[(bin) & (NPC_MUTEX - 1)][mnode])

#define FPC_MUTEX(mnode, i)     (&fpc_mutex[i][mnode])
#define CPC_MUTEX(mnode, i)     (&cpc_mutex[i][mnode])
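
/*
 * Illustrative sketch: serializing an update to one color bin.  The bin's
 * low bits select one of the NPC_MUTEX per-mnode mutexes, so different
 * bins hash onto different locks and contention stays spread out.  The
 * names below are hypothetical.
 *
 *	kmutex_t *pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
 *
 *	mutex_enter(pcm);
 *	... add or remove a page on PAGE_FREELISTS(mnode, szc, bin, mtype) ...
 *	mutex_exit(pcm);
 */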

#define PFN_BASE(pfnum, szc)    (pfnum & ~((1 << PAGE_BSZS_SHIFT(szc)) - 1))

/*
 * This structure is used when walking the free page lists; it controls
 * when to split large pages into smaller pages and when to coalesce
 * smaller pages into larger ones.
 */
typedef struct page_list_walker {
        uint_t  plw_colors;             /* num of colors for szc */
        uint_t  plw_color_mask;         /* colors-1 */
        uint_t  plw_bin_step;           /* next bin: 1 or 2 */
        uint_t  plw_count;              /* loop count */
        uint_t  plw_bin0;               /* starting bin */
        uint_t  plw_bin_marker;         /* bin after initial jump */
        uint_t  plw_bin_split_prev;     /* last bin we tried to split */
        uint_t  plw_do_split;           /* set if OK to split */
        uint_t  plw_split_next;         /* next bin to split */
        uint_t  plw_ceq_dif;            /* number of different color groups */
                                        /* to check */
        uint_t  plw_ceq_mask[MMU_PAGE_SIZES + 1]; /* color equiv mask */
        uint_t  plw_bins[MMU_PAGE_SIZES + 1];   /* num of bins */
} page_list_walker_t;

void    page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin,
    int can_split, int use_ceq, page_list_walker_t *plw);
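
/*
 * Usage sketch (illustrative only): a free-list search typically
 * initializes a walker for the starting bin and then consults its fields
 * to decide which bin to visit next and whether splitting larger pages is
 * allowed.  The variable names are hypothetical.
 *
 *	page_list_walker_t plw;
 *
 *	page_list_walk_init(szc, flags, bin, 1, 1, &plw);
 *	... loop over up to plw.plw_count bins, stepping by
 *	    plw.plw_bin_step and honoring plw.plw_do_split ...
 */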

typedef char    hpmctr_t;

#ifdef DEBUG
#define CHK_LPG(pp, szc)        chk_lpg(pp, szc)
extern void     chk_lpg(page_t *, uchar_t);
#else
#define CHK_LPG(pp, szc)
#endif

/*
 * page list count per mnode and type.
 */
typedef struct {
        pgcnt_t plc_mt_pgmax;           /* max page cnt */
        pgcnt_t plc_mt_clpgcnt;         /* cache list cnt */
        pgcnt_t plc_mt_flpgcnt;         /* free list cnt - small pages */
        pgcnt_t plc_mt_lgpgcnt;         /* free list cnt - large pages */
#ifdef DEBUG
        struct {
                pgcnt_t plc_mts_pgcnt;  /* per page size count */
                int     plc_mts_colors;
                pgcnt_t *plc_mtsc_pgcnt; /* per color bin count */
        } plc_mts[MMU_PAGE_SIZES];
#endif
} plcnt_t[MAX_MEM_NODES][MAX_MEM_TYPES];

#ifdef DEBUG

#define PLCNT_SZ(ctrs_sz) {                                             \
        int     szc;                                                    \
        for (szc = 0; szc < mmu_page_sizes; szc++) {                    \
                int     colors = page_get_pagecolors(szc);              \
                ctrs_sz += (max_mem_nodes * MAX_MEM_TYPES *             \
                    colors * sizeof (pgcnt_t));                         \
        }                                                               \
}

#define PLCNT_INIT(base) {                                              \
        int     mn, mt, szc, colors;                                    \
        for (szc = 0; szc < mmu_page_sizes; szc++) {                    \
                colors = page_get_pagecolors(szc);                      \
                for (mn = 0; mn < max_mem_nodes; mn++) {                \
                        for (mt = 0; mt < MAX_MEM_TYPES; mt++) {        \
                                plcnt[mn][mt].plc_mts[szc].             \
                                    plc_mts_colors = colors;            \
                                plcnt[mn][mt].plc_mts[szc].             \
                                    plc_mtsc_pgcnt = (pgcnt_t *)base;   \
                                base += (colors * sizeof (pgcnt_t));    \
                        }                                               \
                }                                                       \
        }                                                               \
}

#define PLCNT_DO(pp, mn, mtype, szc, cnt, flags) {                      \
        int     bin = PP_2_BIN(pp);                                     \
        if (flags & PG_CACHE_LIST)                                      \
                atomic_add_long(&plcnt[mn][mtype].plc_mt_clpgcnt, cnt); \
        else if (szc)                                                   \
                atomic_add_long(&plcnt[mn][mtype].plc_mt_lgpgcnt, cnt); \
        else                                                            \
                atomic_add_long(&plcnt[mn][mtype].plc_mt_flpgcnt, cnt); \
        atomic_add_long(&plcnt[mn][mtype].plc_mts[szc].plc_mts_pgcnt,   \
            cnt);                                                       \
        atomic_add_long(&plcnt[mn][mtype].plc_mts[szc].                 \
            plc_mtsc_pgcnt[bin], cnt);                                  \
}

#else

#define PLCNT_SZ(ctrs_sz)

#define PLCNT_INIT(base)

/* PG_FREE_LIST may not be explicitly set in flags for large pages */

#define PLCNT_DO(pp, mn, mtype, szc, cnt, flags) {                      \
        if (flags & PG_CACHE_LIST)                                      \
                atomic_add_long(&plcnt[mn][mtype].plc_mt_clpgcnt, cnt); \
        else if (szc)                                                   \
                atomic_add_long(&plcnt[mn][mtype].plc_mt_lgpgcnt, cnt); \
        else                                                            \
                atomic_add_long(&plcnt[mn][mtype].plc_mt_flpgcnt, cnt); \
}

#endif

#define PLCNT_INCR(pp, mn, mtype, szc, flags) {                         \
        long    cnt = (1 << PAGE_BSZS_SHIFT(szc));                      \
        PLCNT_DO(pp, mn, mtype, szc, cnt, flags);                       \
}

#define PLCNT_DECR(pp, mn, mtype, szc, flags) {                         \
        long    cnt = ((-1) << PAGE_BSZS_SHIFT(szc));                   \
        PLCNT_DO(pp, mn, mtype, szc, cnt, flags);                       \
}
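
/*
 * Illustrative sketch (hypothetical caller, not actual kernel code): the
 * page_list_add()/page_list_sub() style paths keep the per-mnode/mtype
 * counts in step with the lists by invoking these macros under the bin
 * mutex.
 *
 *	int mnode = PP_2_MEM_NODE(pp);
 *	int mtype = PP_2_MTYPE(pp);
 *
 *	PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);	(when adding)
 *	PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);	(when removing)
 */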

/*
 * macros to update page list max counts - done when pages are transferred
 * from the RELOC to the NORELOC mtype (kcage_init or kcage_assimilate_page).
 */

#define PLCNT_XFER_NORELOC(pp) {                                        \
        long    cnt = (1 << PAGE_BSZS_SHIFT((pp)->p_szc));              \
        int     mn = PP_2_MEM_NODE(pp);                                 \
        atomic_add_long(&plcnt[mn][MTYPE_NORELOC].plc_mt_pgmax, cnt);   \
        atomic_add_long(&plcnt[mn][MTYPE_RELOC].plc_mt_pgmax, -cnt);    \
}

/*
 * macro to modify the page list max counts when memory is added to
 * the page lists during startup (add_physmem) or during a DR operation
 * when memory is added (kphysm_add_memory_dynamic) or deleted
 * (kphysm_del_cleanup).
 */
#define PLCNT_MODIFY_MAX(pfn, cnt) {                                           \
        spgcnt_t _cnt = (spgcnt_t)(cnt);                                       \
        pgcnt_t _acnt = ABS(_cnt);                                             \
        int _mn;                                                               \
        pgcnt_t _np;                                                           \
        if (&plat_mem_node_intersect_range != NULL) {                          \
                for (_mn = 0; _mn < max_mem_nodes; _mn++) {                    \
                        plat_mem_node_intersect_range((pfn), _acnt, _mn, &_np);\
                        if (_np == 0)                                          \
                                continue;                                      \
                        atomic_add_long(&plcnt[_mn][MTYPE_RELOC].plc_mt_pgmax, \
                            (_cnt < 0) ? -_np : _np);                          \
                }                                                              \
        } else {                                                               \
                pfn_t _pfn = (pfn);                                            \
                pfn_t _endpfn = _pfn + _acnt;                                  \
                while (_pfn < _endpfn) {                                       \
                        _mn = PFN_2_MEM_NODE(_pfn);                            \
                        _np = MIN(_endpfn, mem_node_config[_mn].physmax + 1) - \
                            _pfn;                                              \
                        _pfn += _np;                                           \
                        atomic_add_long(&plcnt[_mn][MTYPE_RELOC].plc_mt_pgmax, \
                            (_cnt < 0) ? -_np : _np);                          \
                }                                                              \
        }                                                                      \
}

/*
 * macro to call page_ctrs_adjust() when memory is added
 * during a DR operation.
 */
#define PAGE_CTRS_ADJUST(pfn, cnt, rv) {                                       \
        spgcnt_t _cnt = (spgcnt_t)(cnt);                                       \
        int _mn;                                                               \
        pgcnt_t _np;                                                           \
        if (&plat_mem_node_intersect_range != NULL) {                          \
                for (_mn = 0; _mn < max_mem_nodes; _mn++) {                    \
                        plat_mem_node_intersect_range((pfn), _cnt, _mn, &_np); \
                        if (_np == 0)                                          \
                                continue;                                      \
                        if ((rv = page_ctrs_adjust(_mn)) != 0)                 \
                                break;                                         \
                }                                                              \
        } else {                                                               \
                pfn_t _pfn = (pfn);                                            \
                pfn_t _endpfn = _pfn + _cnt;                                   \
                while (_pfn < _endpfn) {                                       \
                        _mn = PFN_2_MEM_NODE(_pfn);                            \
                        _np = MIN(_endpfn, mem_node_config[_mn].physmax + 1) - \
                            _pfn;                                              \
                        _pfn += _np;                                           \
                        if ((rv = page_ctrs_adjust(_mn)) != 0)                 \
                                break;                                         \
                }                                                              \
        }                                                                      \
}

extern plcnt_t  plcnt;

#define MNODE_PGCNT(mn)                                                 \
        (plcnt[mn][MTYPE_RELOC].plc_mt_clpgcnt +                        \
            plcnt[mn][MTYPE_NORELOC].plc_mt_clpgcnt +                   \
            plcnt[mn][MTYPE_RELOC].plc_mt_flpgcnt +                     \
            plcnt[mn][MTYPE_NORELOC].plc_mt_flpgcnt +                   \
            plcnt[mn][MTYPE_RELOC].plc_mt_lgpgcnt +                     \
            plcnt[mn][MTYPE_NORELOC].plc_mt_lgpgcnt)

#define MNODETYPE_PGCNT(mn, mtype)                                      \
        (plcnt[mn][mtype].plc_mt_clpgcnt +                              \
            plcnt[mn][mtype].plc_mt_flpgcnt +                           \
            plcnt[mn][mtype].plc_mt_lgpgcnt)

/*
 * macros to loop through the mtype range - if the mnode/mtype has no
 * pages, MTYPE_START either advances mtype to the next usable mtype (via
 * MTYPE_NEXT) or sets it to -1.
 */
#define MTYPE_START(mnode, mtype, flags) {                              \
        if (plcnt[mnode][mtype].plc_mt_pgmax == 0) {                    \
                ASSERT(mtype == MTYPE_RELOC ||                          \
                    MNODETYPE_PGCNT(mnode, mtype) == 0 ||               \
                    plcnt[mnode][mtype].plc_mt_pgmax != 0);             \
                MTYPE_NEXT(mnode, mtype, flags);                        \
        }                                                               \
}

/*
 * If allocation from the RELOC pool failed and there is sufficient cage
 * memory, attempt to allocate from the NORELOC pool.
 */
#define MTYPE_NEXT(mnode, mtype, flags) {                               \
        if (!(flags & (PG_NORELOC | PGI_NOCAGE | PGI_RELOCONLY)) &&     \
            (kcage_freemem >= kcage_lotsfree)) {                        \
                if (plcnt[mnode][MTYPE_NORELOC].plc_mt_pgmax == 0) {    \
                        ASSERT(MNODETYPE_PGCNT(mnode, MTYPE_NORELOC) == 0 || \
                            plcnt[mnode][MTYPE_NORELOC].plc_mt_pgmax != 0);  \
                        mtype = -1;                                     \
                } else {                                                \
                        mtype = MTYPE_NORELOC;                          \
                        flags |= PG_NORELOC;                            \
                }                                                       \
        } else {                                                        \
                mtype = -1;                                             \
        }                                                               \
}
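
/*
 * Usage sketch (illustrative only): after MTYPE_INIT() has chosen the
 * starting pool, an mnode free-list search loops over mtypes roughly as
 * below; when the current pool is empty, MTYPE_NEXT either falls back to
 * the cage (NORELOC) pool or terminates the loop by setting mtype to -1.
 *
 *	MTYPE_START(mnode, mtype, flags);
 *	while (mtype >= 0) {
 *		... try to allocate from mnode/mtype ...
 *		MTYPE_NEXT(mnode, mtype, flags);
 *	}
 */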

/*
 * get the ecache setsize for the current cpu.
 */
#define CPUSETSIZE()    (cpunodes[CPU->cpu_id].ecache_setsize)

extern struct cpu       cpu0;
#define CPU0            &cpu0

#define PAGE_BSZS_SHIFT(szc)    TTE_BSZS_SHIFT(szc)
/*
 * For sfmmu, each larger page size is 8 times the size of the previous
 * page size.
 */
#define FULL_REGION_CNT(rg_szc) (8)

/*
 * The counter base must be per page_counter element to prevent
 * races when re-indexing, and the base page size element should
 * be aligned on a boundary of the given region size.
 *
 * We also round up the number of pages spanned by the counters
 * for a given region to PC_BASE_ALIGN in certain situations to simplify
 * the coding for some non-performance critical routines.
 */
#define PC_BASE_ALIGN           ((pfn_t)1 << PAGE_BSZS_SHIFT(mmu_page_sizes-1))
#define PC_BASE_ALIGN_MASK      (PC_BASE_ALIGN - 1)

extern int ecache_alignsize;
#define L2CACHE_ALIGN           ecache_alignsize
#define L2CACHE_ALIGN_MAX       512

extern int update_proc_pgcolorbase_after_fork;
extern int consistent_coloring;
extern uint_t vac_colors_mask;
extern int vac_size;
extern int vac_shift;

/*
 * Kernel mem segment in 64-bit space
 */
extern caddr_t kmem64_base, kmem64_end, kmem64_aligned_end;
extern int kmem64_alignsize, kmem64_szc;
extern uint64_t kmem64_pabase;
extern int max_bootlp_tteszc;

/*
 * Maximum and default values for user heap, stack, private and shared
 * anonymous memory, and user text and initialized data.
 *
 * Initial values are defined in architecture specific mach_vm_dep.c file.
 * Used by map_pgsz*() routines.
 */
extern size_t max_uheap_lpsize;
extern size_t default_uheap_lpsize;
extern size_t max_ustack_lpsize;
extern size_t default_ustack_lpsize;
extern size_t max_privmap_lpsize;
extern size_t max_uidata_lpsize;
extern size_t max_utext_lpsize;
extern size_t max_shm_lpsize;

/*
 * For adjusting the default lpsize, for DTLB-limited page sizes.
 */
extern void adjust_data_maxlpsize(size_t ismpagesize);

/*
 * Sanity control. Don't use large pages, regardless of user
 * settings, if there is less than privm_lpg_min_physmem or
 * shm_lpg_min_physmem memory installed. The units for these
 * variables are 8K pages.
 */
extern pgcnt_t privm_lpg_min_physmem;
extern pgcnt_t shm_lpg_min_physmem;

/*
 * The AS_2_BIN macro controls the page coloring policy:
 *   0 (default) uses various vaddr bits
 *   1 virtual=paddr
 *   2 bin hopping
 */
#define AS_2_BIN(as, seg, vp, addr, bin, szc)                           \
switch (consistent_coloring) {                                          \
        default:                                                        \
                cmn_err(CE_WARN,                                        \
                        "AS_2_BIN: bad consistent coloring value");     \
                /* assume default algorithm -> continue */              \
        case 0: {                                                       \
                uint32_t ndx, new;                                      \
                int slew = 0;                                           \
                pfn_t pfn;                                              \
                                                                        \
                if (vp != NULL && IS_SWAPVP(vp) &&                      \
                    seg->s_ops == &segvn_ops)                           \
                        slew = as_color_bin(as);                        \
                                                                        \
                pfn = ((uintptr_t)addr >> MMU_PAGESHIFT) +              \
                        (((uintptr_t)addr >> page_coloring_shift) <<    \
                        (vac_shift - MMU_PAGESHIFT));                   \
                if ((szc) == 0 || &page_pfn_2_color_cpu == NULL) {      \
                        pfn += slew;                                    \
                        bin = PFN_2_COLOR(pfn, szc, NULL);              \
                } else {                                                \
                        bin = PFN_2_COLOR(pfn, szc, NULL);              \
                        bin += slew >> (vac_shift - MMU_PAGESHIFT);     \
                        bin &= hw_page_array[(szc)].hp_colors - 1;      \
                }                                                       \
                break;                                                  \
        }                                                               \
        case 1:                                                         \
                bin = PFN_2_COLOR(((uintptr_t)addr >> MMU_PAGESHIFT),   \
                    szc, NULL);                                         \
                break;                                                  \
        case 2: {                                                       \
                int cnt = as_color_bin(as);                             \
                uint_t color_mask = page_get_pagecolors(0) - 1;         \
                                                                        \
                /* make sure physical color aligns with vac color */    \
                while ((cnt & vac_colors_mask) !=                       \
                    addr_to_vcolor(addr)) {                             \
                        cnt++;                                          \
                }                                                       \
                bin = cnt = cnt & color_mask;                           \
                bin >>= PAGE_GET_COLOR_SHIFT(0, szc);                   \
                /* update per as page coloring fields */                \
                cnt = (cnt + 1) & color_mask;                           \
                if (cnt == (as_color_start(as) & color_mask)) {         \
                        cnt = as_color_start(as) = as_color_start(as) + \
                                PGCLR_LOOPFACTOR;                       \
                }                                                       \
                as_color_bin(as) = cnt & color_mask;                    \
                break;                                                  \
        }                                                               \
}                                                                       \
        ASSERT(bin < page_get_pagecolors(szc));

/*
 * cpu private vm data - accessed through CPU->cpu_vm_data
 *      vc_pnum_memseg: tracks last memseg visited in page_numtopp_nolock()
 *      vc_pnext_memseg: tracks last memseg visited in page_nextn()
 *      vc_kmptr: unaligned kmem pointer for this vm_cpu_data_t
 *      vc_kmsize: original kmem size for this vm_cpu_data_t
 */

typedef struct {
        struct memseg   *vc_pnum_memseg;
        struct memseg   *vc_pnext_memseg;
        void            *vc_kmptr;
        size_t          vc_kmsize;
} vm_cpu_data_t;

/* allocation size to ensure vm_cpu_data_t resides in its own cache line */
#define VM_CPU_DATA_PADSIZE                                             \
        (P2ROUNDUP(sizeof (vm_cpu_data_t), L2CACHE_ALIGN_MAX))
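
/*
 * Allocation sketch (illustrative only; the real work is done in the
 * platform startup code): the per-CPU data is over-allocated and aligned
 * so the structure never shares an L2 cache line, and the raw pointer and
 * size are remembered for the eventual kmem_free().  "cp" stands for the
 * struct cpu being set up; the other names are hypothetical.
 *
 *	void *kmptr = kmem_zalloc(VM_CPU_DATA_PADSIZE + L2CACHE_ALIGN_MAX,
 *	    KM_SLEEP);
 *	vm_cpu_data_t *vc = (vm_cpu_data_t *)P2ROUNDUP((uintptr_t)kmptr,
 *	    L2CACHE_ALIGN_MAX);
 *
 *	vc->vc_kmptr = kmptr;
 *	vc->vc_kmsize = VM_CPU_DATA_PADSIZE + L2CACHE_ALIGN_MAX;
 *	cp->cpu_vm_data = vc;
 */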

/*
 * Function to get an ecache color bin: F(as, cnt, vcolor).
 * The goals of this function are:
 * - to spread a process's physical pages across the entire ecache to
 *      maximize its use.
 * - to minimize vac flushes caused when we reuse a physical page on a
 *      different vac color than the one it was previously used with.
 * - to prevent all processes from using exactly the same colors and
 *      thrashing each other.
 *
 * cnt is a bin ptr kept on a per-as basis.  As we page_create we increment
 * the ptr so we spread out the physical pages to cover the entire ecache.
 * The virtual color is made a subset of the physical color in order to
 * minimize virtual cache flushing.
 * We add in the as to spread out different address spaces; this happens
 * when we initialize the start count value.
 * sizeof (struct as) is 60, so we shift by 3 to get into the bit range
 * that will tend to change.  For example, on Spitfire-based machines
 * (vcshft == 1) contiguous as structures are spread by ~6 bins.
 * vcshft provides for proper virtual color alignment.
 * In theory cnt should only be updated using cas, but being off by one
 * or two is no big deal.
 * We also keep a start value which is used to randomize which bin we
 * start counting at when it is time to start another loop.  This avoids
 * contiguous ecache-sized allocations always pointing to the same bin.
 * Why 3?  It seems to work OK; better than 7 or anything larger.
 */
#define PGCLR_LOOPFACTOR 3

/*
 * When a bin is empty, and we can't satisfy a color request correctly,
 * we scan.  If we assume that the programs have reasonable spatial
 * behavior, then it will not be a good idea to use the adjacent color.
 * Using the adjacent color would result in virtually adjacent addresses
 * mapping into the same spot in the cache.  So, if we stumble across
 * an empty bin, skip a bunch before looking.  After the first skip,
 * then just look one bin at a time so we don't miss our cache on
 * every look. Be sure to check every bin.  Page_create() will panic
 * if we miss a page.
 *
 * This also explains the `<=' in the for loops in both page_get_freelist()
 * and page_get_cachelist().  Since we checked the target bin, skipped
 * a bunch, then continued one at a time, we wind up checking the target
 * bin twice to make sure we get all of the bins.
 */
#define BIN_STEP        20
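
/*
 * Search sketch (illustrative only, mirroring the comment above rather
 * than the exact page_get_freelist() code): check the target bin, jump
 * ahead by BIN_STEP on the first miss, then continue one bin at a time;
 * the `<=' bound means the starting bin is examined again at the end of
 * the sweep.  "bin0" and "szc" are hypothetical locals.
 *
 *	uint_t i, bin = bin0;
 *
 *	for (i = 0; i <= page_get_pagecolors(szc); i++) {
 *		... try bin; return the page on success ...
 *		bin = (i == 0) ? (bin + BIN_STEP) : (bin + 1);
 *		bin &= page_get_pagecolors(szc) - 1;
 *	}
 */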

#ifdef VM_STATS
struct vmm_vmstats_str {
        ulong_t pgf_alloc[MMU_PAGE_SIZES];      /* page_get_freelist */
        ulong_t pgf_allocok[MMU_PAGE_SIZES];
        ulong_t pgf_allocokrem[MMU_PAGE_SIZES];
        ulong_t pgf_allocfailed[MMU_PAGE_SIZES];
        ulong_t pgf_allocdeferred;
        ulong_t pgf_allocretry[MMU_PAGE_SIZES];
        ulong_t pgc_alloc;                      /* page_get_cachelist */
        ulong_t pgc_allocok;
        ulong_t pgc_allocokrem;
        ulong_t pgc_allocokdeferred;
        ulong_t pgc_allocfailed;
        ulong_t pgcp_alloc[MMU_PAGE_SIZES];     /* page_get_contig_pages */
        ulong_t pgcp_allocfailed[MMU_PAGE_SIZES];
        ulong_t pgcp_allocempty[MMU_PAGE_SIZES];
        ulong_t pgcp_allocok[MMU_PAGE_SIZES];
        ulong_t ptcp[MMU_PAGE_SIZES];           /* page_trylock_contig_pages */
        ulong_t ptcpfreethresh[MMU_PAGE_SIZES];
        ulong_t ptcpfailexcl[MMU_PAGE_SIZES];
        ulong_t ptcpfailszc[MMU_PAGE_SIZES];
        ulong_t ptcpfailcage[MMU_PAGE_SIZES];
        ulong_t ptcpok[MMU_PAGE_SIZES];
        ulong_t pgmf_alloc[MMU_PAGE_SIZES];     /* page_get_mnode_freelist */
        ulong_t pgmf_allocfailed[MMU_PAGE_SIZES];
        ulong_t pgmf_allocempty[MMU_PAGE_SIZES];
        ulong_t pgmf_allocok[MMU_PAGE_SIZES];
        ulong_t pgmc_alloc;                     /* page_get_mnode_cachelist */
        ulong_t pgmc_allocfailed;
        ulong_t pgmc_allocempty;
        ulong_t pgmc_allocok;
        ulong_t pladd_free[MMU_PAGE_SIZES];     /* page_list_add/sub */
        ulong_t plsub_free[MMU_PAGE_SIZES];
        ulong_t pladd_cache;
        ulong_t plsub_cache;
        ulong_t plsubpages_szcbig;
        ulong_t plsubpages_szc0;
        ulong_t pfs_req[MMU_PAGE_SIZES];        /* page_freelist_split */
        ulong_t pfs_demote[MMU_PAGE_SIZES];
        ulong_t pfc_coalok[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
        ulong_t ppr_reloc[MMU_PAGE_SIZES];      /* page_relocate */
        ulong_t ppr_relocok[MMU_PAGE_SIZES];
        ulong_t ppr_relocnoroot[MMU_PAGE_SIZES];
        ulong_t ppr_reloc_replnoroot[MMU_PAGE_SIZES];
        ulong_t ppr_relocnolock[MMU_PAGE_SIZES];
        ulong_t ppr_relocnomem[MMU_PAGE_SIZES];
        ulong_t ppr_krelocfail[MMU_PAGE_SIZES];
        ulong_t ppr_copyfail;
        /* page coalesce counter */
        ulong_t page_ctrs_coalesce[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
        /* candidates useful */
        ulong_t page_ctrs_cands_skip[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
        /* ctrs changed after locking */
        ulong_t page_ctrs_changed[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
        /* page_freelist_coalesce failed */
        ulong_t page_ctrs_failed[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
        ulong_t page_ctrs_coalesce_all; /* page coalesce all counter */
        ulong_t page_ctrs_cands_skip_all; /* candidates useful for all func */
};
extern struct vmm_vmstats_str vmm_vmstats;
#endif  /* VM_STATS */

/*
 * Used to hold off page relocations into the cage until OBP has completed
 * its boot-time handoff of its resources to the kernel.
 */
extern int page_relocate_ready;

/*
 * cpu/mmu-dependent vm variables may be reset at bootup.
 */
extern uint_t mmu_page_sizes;
extern uint_t max_mmu_page_sizes;
extern uint_t mmu_hashcnt;
extern uint_t max_mmu_hashcnt;
extern size_t mmu_ism_pagesize;
extern int mmu_exported_pagesize_mask;
extern uint_t mmu_exported_page_sizes;
extern uint_t szc_2_userszc[];
extern uint_t userszc_2_szc[];

#define mmu_legacy_page_sizes   mmu_exported_page_sizes
#define USERSZC_2_SZC(userszc)  (userszc_2_szc[userszc])
#define SZC_2_USERSZC(szc)      (szc_2_userszc[szc])

/*
 * Platform specific page routines
 */
extern void mach_page_add(page_t **, page_t *);
extern void mach_page_sub(page_t **, page_t *);
extern uint_t page_get_pagecolors(uint_t);
extern void ppcopy_kernel__relocatable(page_t *, page_t *);
#define ppcopy_kernel(p1, p2)   ppcopy_kernel__relocatable(p1, p2)

/*
 * platform specific large pages for kernel heap support
 */
extern size_t get_segkmem_lpsize(size_t lpsize);
extern size_t mmu_get_kernel_lpsize(size_t lpsize);
extern void mmu_init_kernel_pgsz(struct hat *hat);
extern void mmu_init_kcontext();
extern uint64_t kcontextreg;

/*
 * Nucleus data page allocator routines
 */
extern void ndata_alloc_init(struct memlist *, uintptr_t, uintptr_t);
extern void *ndata_alloc(struct memlist *, size_t, size_t);
extern void *ndata_extra_base(struct memlist *, size_t, caddr_t);
extern size_t ndata_maxsize(struct memlist *);
extern size_t ndata_spare(struct memlist *, size_t, size_t);

#ifdef  __cplusplus
}
#endif

#endif  /* _VM_DEP_H */