/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * UNIX machine dependent virtual memory support.
 */

#ifndef _VM_DEP_H
#define _VM_DEP_H

#ifdef  __cplusplus
extern "C" {
#endif

#include <vm/hat_sfmmu.h>
#include <sys/archsystm.h>
#include <sys/memnode.h>

#define GETTICK()       gettick()

/* tick value that should be used for random values */
extern u_longlong_t randtick(void);

/*
 * Per page size free lists. Allocated dynamically.
 */
#define MAX_MEM_TYPES   2       /* 0 = reloc, 1 = noreloc */
#define MTYPE_RELOC     0
#define MTYPE_NORELOC   1

#define PP_2_MTYPE(pp)  (PP_ISNORELOC(pp) ? MTYPE_NORELOC : MTYPE_RELOC)

#define MTYPE_INIT(mtype, vp, vaddr, flags, pgsz)                       \
        mtype = (flags & PG_NORELOC) ? MTYPE_NORELOC : MTYPE_RELOC;

/* mtype init for page_get_replacement_page */
#define MTYPE_PGR_INIT(mtype, flags, pp, mnode, pgcnt)                  \
        mtype = (flags & PG_NORELOC) ? MTYPE_NORELOC : MTYPE_RELOC;

#define MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi)                     \
        pfnlo = mem_node_config[mnode].physbase;                        \
        pfnhi = mem_node_config[mnode].physmax;

/*
 * candidate counters in vm_pagelist.c are indexed by color and range
 */
#define MAX_MNODE_MRANGES               MAX_MEM_TYPES
#define MNODE_RANGE_CNT(mnode)          MAX_MNODE_MRANGES
#define MNODE_MAX_MRANGE(mnode)         (MAX_MEM_TYPES - 1)
#define MTYPE_2_MRANGE(mnode, mtype)    (mtype)

/*
 * Internal PG_ flags.
 */
#define PGI_RELOCONLY   0x10000 /* acts in the opposite sense to PG_NORELOC */
#define PGI_NOCAGE      0x20000 /* indicates Cage is disabled */
#define PGI_PGCPHIPRI   0x40000 /* page_get_contig_page priority allocation */
#define PGI_PGCPSZC0    0x80000 /* relocate base pagesize page */

/*
 * PGI mtype flags - should not overlap PGI flags
 */
#define PGI_MT_RANGE    0x1000000       /* mtype range */
#define PGI_MT_NEXT     0x2000000       /* get next mtype */

extern page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];
extern page_t ***page_cachelists[MAX_MEM_TYPES];

#define PAGE_FREELISTS(mnode, szc, color, mtype) \
        (*(page_freelists[szc][mtype][mnode] + (color)))

#define PAGE_CACHELISTS(mnode, color, mtype) \
        (*(page_cachelists[mtype][mnode] + (color)))
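
/*
 * For illustration (not in the original source): the freelist array is
 * effectively indexed as page_freelists[szc][mtype][mnode][color], so a
 * request for an 8K (szc 0) relocatable page of color 5 on mnode 0 is
 * served from *(page_freelists[0][MTYPE_RELOC][0] + 5), which is what
 * PAGE_FREELISTS(0, 0, 5, MTYPE_RELOC) expands to.
 */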

/*
 * There are 'page_colors' colors/bins.  Spread them out under a
 * couple of locks.  There are mutexes for both the page freelist
 * and the page cachelist.  We want enough locks to make contention
 * reasonable, but not too many -- otherwise page_freelist_lock() gets
 * so expensive that it becomes the bottleneck!
 */
#define NPC_MUTEX       16

extern kmutex_t *fpc_mutex[NPC_MUTEX];
extern kmutex_t *cpc_mutex[NPC_MUTEX];
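
/*
 * Illustration (not in the original source): with NPC_MUTEX == 16, a
 * freelist operation on color bin 37 of a given mnode takes
 * fpc_mutex[37 & (NPC_MUTEX - 1)][mnode], i.e. fpc_mutex[5][mnode], so
 * the bins of each mnode are spread across 16 locks rather than one.
 * See PC_BIN_MUTEX() below for the actual selection macro.
 */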

/*
 * The iterator provides the info needed to convert RA to PA.
 * MEM_NODE_ITERATOR_INIT() should be called before
 * PAGE_NEXT_PFN_FOR_COLOR() if pfn was not obtained via a previous
 * PAGE_NEXT_PFN_FOR_COLOR() call. The iterator caches color-to-hash
 * translations, so the initializer must be called again if color or
 * ceq_mask changes, even if pfn does not. MEM_NODE_ITERATOR_INIT() must
 * also be called before any PFN_2_COLOR() call that uses a valid
 * iterator argument. (A usage sketch follows the declarations below.)
 *
 * In the continuation case plat_mem_node_iterator_init() starts from the
 * last mblock visited, which may have become invalid because of memory
 * DR.  To detect this situation mi_genid is checked against mpo_genid,
 * which is incremented after a memory DR operation.  See also
 * plat_slice_add()/plat_slice_del().
 */
#ifdef  sun4v

typedef struct mem_node_iterator {
        uint_t mi_mnode;                /* mnode in which to iterate */
        int mi_init;                    /* set to 1 when first init */
        int mi_genid;                   /* set/checked against mpo_genid */
        int mi_last_mblock;             /* last mblock visited */
        uint_t mi_hash_ceq_mask;        /* cached copy of ceq_mask */
        uint_t mi_hash_color;           /* cached copy of color */
        uint_t mi_mnode_mask;           /* number of mask bits */
        uint_t mi_mnode_pfn_shift;      /* mnode position in pfn */
        pfn_t mi_mblock_base;           /* first valid pfn in current mblock */
        pfn_t mi_mblock_end;            /* last valid pfn in current mblock */
        pfn_t mi_ra_to_pa;              /* ra adjustment for current mblock */
        pfn_t mi_mnode_pfn_mask;        /* mask to obtain mnode id bits */
} mem_node_iterator_t;

#define MEM_NODE_ITERATOR_DECL(it) \
        mem_node_iterator_t it
#define MEM_NODE_ITERATOR_INIT(pfn, mnode, szc, it) \
        (pfn) = plat_mem_node_iterator_init((pfn), (mnode), (szc), (it), 1)

extern pfn_t plat_mem_node_iterator_init(pfn_t, int, uchar_t,
    mem_node_iterator_t *, int);
extern pfn_t plat_rapfn_to_papfn(pfn_t);
extern int interleaved_mnodes;

#else   /* sun4v */

#define MEM_NODE_ITERATOR_DECL(it) \
        void *it = NULL
#define MEM_NODE_ITERATOR_INIT(pfn, mnode, szc, it)

#endif  /* sun4v */
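
/*
 * Usage sketch (an illustration, not lifted from the source): a caller
 * that walks pfns of a given color within an mnode would look roughly
 * like this; `last', `color', `ceq_mask' and `color_mask' are assumed
 * to have been computed by the caller, and the iterator must be
 * reinitialized whenever color or ceq_mask changes.
 *
 *      MEM_NODE_ITERATOR_DECL(it);
 *
 *      MEM_NODE_ITERATOR_INIT(pfn, mnode, szc, &it);
 *      while (pfn <= last) {
 *              ... examine the page at pfn ...
 *              PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask,
 *                  color_mask, &it);
 *      }
 */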

/*
 * Return the mnode limits so that hpm_counters length and base
 * index can be determined. When interleaved_mnodes is set, we
 * create an array only for the first mnode that exists. All other
 * mnodes will share the array in this case.
 * If interleaved_mnodes is not set, simply return the limits for
 * the given mnode.
 */
#define HPM_COUNTERS_LIMITS(mnode, physbase, physmax, first)            \
        if (!interleaved_mnodes) {                                      \
                (physbase) = mem_node_config[(mnode)].physbase;         \
                (physmax) = mem_node_config[(mnode)].physmax;           \
                (first) = (mnode);                                      \
        } else if ((first) < 0) {                                       \
                mem_node_max_range(&(physbase), &(physmax));            \
                (first) = (mnode);                                      \
        }

#define PAGE_CTRS_WRITE_LOCK(mnode)                                     \
        if (!interleaved_mnodes) {                                      \
                rw_enter(&page_ctrs_rwlock[(mnode)], RW_WRITER);        \
                page_freelist_lock(mnode);                              \
        } else {                                                        \
                /* changing shared hpm_counters */                      \
                int _i;                                                 \
                for (_i = 0; _i < max_mem_nodes; _i++) {                \
                        rw_enter(&page_ctrs_rwlock[_i], RW_WRITER);     \
                        page_freelist_lock(_i);                         \
                }                                                       \
        }

#define PAGE_CTRS_WRITE_UNLOCK(mnode)                                   \
        if (!interleaved_mnodes) {                                      \
                page_freelist_unlock(mnode);                            \
                rw_exit(&page_ctrs_rwlock[(mnode)]);                    \
        } else {                                                        \
                int _i;                                                 \
                for (_i = 0; _i < max_mem_nodes; _i++) {                \
                        page_freelist_unlock(_i);                       \
                        rw_exit(&page_ctrs_rwlock[_i]);                 \
                }                                                       \
        }

/*
 * cpu specific color conversion functions
 */
extern uint_t page_get_nsz_color_mask_cpu(uchar_t, uint_t);
#pragma weak page_get_nsz_color_mask_cpu

extern uint_t page_get_nsz_color_cpu(uchar_t, uint_t);
#pragma weak page_get_nsz_color_cpu

extern uint_t page_get_color_shift_cpu(uchar_t, uchar_t);
#pragma weak page_get_color_shift_cpu

extern uint_t page_convert_color_cpu(uint_t, uchar_t, uchar_t);
#pragma weak page_convert_color_cpu

extern pfn_t page_next_pfn_for_color_cpu(pfn_t,
    uchar_t, uint_t, uint_t, uint_t, void *);
#pragma weak page_next_pfn_for_color_cpu

extern uint_t  page_pfn_2_color_cpu(pfn_t, uchar_t, void *);
#pragma weak page_pfn_2_color_cpu

#define PAGE_GET_COLOR_SHIFT(szc, nszc)                         \
        ((&page_get_color_shift_cpu != NULL) ?                  \
            page_get_color_shift_cpu(szc, nszc) :               \
            (hw_page_array[(nszc)].hp_shift -                   \
                hw_page_array[(szc)].hp_shift))

#define PAGE_CONVERT_COLOR(ncolor, szc, nszc)                   \
        ((&page_convert_color_cpu != NULL) ?                    \
            page_convert_color_cpu(ncolor, szc, nszc) :         \
            ((ncolor) << PAGE_GET_COLOR_SHIFT((szc), (nszc))))

#define PFN_2_COLOR(pfn, szc, it)                               \
        ((&page_pfn_2_color_cpu != NULL) ?                      \
            page_pfn_2_color_cpu(pfn, szc, it) :                \
            ((pfn & (hw_page_array[0].hp_colors - 1)) >>        \
                (hw_page_array[szc].hp_shift -                  \
                    hw_page_array[0].hp_shift)))

#define PNUM_SIZE(szc)                                                  \
        (hw_page_array[(szc)].hp_pgcnt)
#define PNUM_SHIFT(szc)                                                 \
        (hw_page_array[(szc)].hp_shift - hw_page_array[0].hp_shift)
#define PAGE_GET_SHIFT(szc)                                             \
        (hw_page_array[(szc)].hp_shift)
#define PAGE_GET_PAGECOLORS(szc)                                        \
        (hw_page_array[(szc)].hp_colors)

/*
 * This macro calculates the next sequential pfn with the specified
 * color using the color equivalency mask.
 */
#define PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask, color_mask, it)   \
        {                                                                    \
                ASSERT(((color) & ~(ceq_mask)) == 0);                        \
                if (&page_next_pfn_for_color_cpu == NULL) {                  \
                        uint_t  pfn_shift = PAGE_BSZS_SHIFT(szc);            \
                        pfn_t   spfn = pfn >> pfn_shift;                     \
                        pfn_t   stride = (ceq_mask) + 1;                     \
                        ASSERT((((ceq_mask) + 1) & (ceq_mask)) == 0);        \
                        if (((spfn ^ (color)) & (ceq_mask)) == 0) {          \
                                pfn += stride << pfn_shift;                  \
                        } else {                                             \
                                pfn = (spfn & ~(pfn_t)(ceq_mask)) | (color); \
                                pfn = (pfn > spfn ? pfn : pfn + stride) <<   \
                                    pfn_shift;                               \
                        }                                                    \
                } else {                                                     \
                    pfn = page_next_pfn_for_color_cpu(pfn, szc, color,       \
                        ceq_mask, color_mask, it);                           \
                }                                                            \
        }
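
/*
 * Worked example of the generic (non-CPU-specific) arithmetic above, for
 * illustration only: with szc 0 (pfn_shift 0), color 2 and ceq_mask 0x3,
 * the stride is 4.  From pfn 6, which already has the requested color,
 * the macro simply adds the stride, yielding 10.  From pfn 7 it first
 * masks down to the requested color, (7 & ~3) | 2 = 6, and since 6 is
 * not greater than 7 it adds the stride before shifting back, again
 * yielding 10.
 */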

/* get the color equivalency mask for the next szc */
#define PAGE_GET_NSZ_MASK(szc, mask)                                         \
        ((&page_get_nsz_color_mask_cpu == NULL) ?                            \
            ((mask) >> (PAGE_GET_SHIFT((szc) + 1) - PAGE_GET_SHIFT(szc))) :  \
            page_get_nsz_color_mask_cpu(szc, mask))

/* get the color of the next szc */
#define PAGE_GET_NSZ_COLOR(szc, color)                                       \
        ((&page_get_nsz_color_cpu == NULL) ?                                 \
            ((color) >> (PAGE_GET_SHIFT((szc) + 1) - PAGE_GET_SHIFT(szc))) : \
            page_get_nsz_color_cpu(szc, color))

/* Find the bin for the given page if it was of size szc */
#define PP_2_BIN_SZC(pp, szc)   (PFN_2_COLOR(pp->p_pagenum, szc, (void *)(-1)))

#define PP_2_BIN(pp)            (PP_2_BIN_SZC(pp, pp->p_szc))

#define PP_2_MEM_NODE(pp)       (PFN_2_MEM_NODE(pp->p_pagenum))

#define PC_BIN_MUTEX(mnode, bin, flags) ((flags & PG_FREE_LIST) ?       \
        &fpc_mutex[(bin) & (NPC_MUTEX - 1)][mnode] :                    \
        &cpc_mutex[(bin) & (NPC_MUTEX - 1)][mnode])

#define FPC_MUTEX(mnode, i)     (&fpc_mutex[i][mnode])
#define CPC_MUTEX(mnode, i)     (&cpc_mutex[i][mnode])

#define PFN_BASE(pfnum, szc)    (pfnum & ~((1 << PAGE_BSZS_SHIFT(szc)) - 1))

/*
 * This structure is used for walking free page lists.  It controls
 * when to split large pages into smaller pages, and when to coalesce
 * smaller pages into larger pages.
 */
typedef struct page_list_walker {
        uint_t  plw_colors;             /* num of colors for szc */
        uint_t  plw_color_mask;         /* colors-1 */
        uint_t  plw_bin_step;           /* next bin: 1 or 2 */
        uint_t  plw_count;              /* loop count */
        uint_t  plw_bin0;               /* starting bin */
        uint_t  plw_bin_marker;         /* bin after initial jump */
        uint_t  plw_bin_split_prev;     /* last bin we tried to split */
        uint_t  plw_do_split;           /* set if OK to split */
        uint_t  plw_split_next;         /* next bin to split */
        uint_t  plw_ceq_dif;            /* number of different color groups */
                                        /* to check */
        uint_t  plw_ceq_mask[MMU_PAGE_SIZES + 1]; /* color equiv mask */
        uint_t  plw_bins[MMU_PAGE_SIZES + 1];   /* num of bins */
} page_list_walker_t;

void    page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin,
    int can_split, int use_ceq, page_list_walker_t *plw);
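
/*
 * Rough usage sketch (an assumption about the callers in vm_pagelist.c,
 * not copied from them): a freelist search initializes a walker for its
 * starting bin and then visits plw_count bins, letting the walker state
 * decide the stepping and whether splitting a larger page is worthwhile.
 *
 *      page_list_walker_t plw;
 *      uint_t i;
 *
 *      page_list_walk_init(szc, flags, bin, 1, 1, &plw);
 *      for (i = 0; i < plw.plw_count; i++) {
 *              ... search the lists for the current bin ...
 *              ... advance bin using the walker (plw_bin_step etc.) ...
 *      }
 */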

typedef char    hpmctr_t;

#ifdef DEBUG
#define CHK_LPG(pp, szc)        chk_lpg(pp, szc)
extern void     chk_lpg(page_t *, uchar_t);
#else
#define CHK_LPG(pp, szc)
#endif

/*
 * page list count per mnode and type.
 */
typedef struct {
        pgcnt_t plc_mt_pgmax;           /* max page cnt */
        pgcnt_t plc_mt_clpgcnt;         /* cache list cnt */
        pgcnt_t plc_mt_flpgcnt;         /* free list cnt - small pages */
        pgcnt_t plc_mt_lgpgcnt;         /* free list cnt - large pages */
#ifdef DEBUG
        struct {
                pgcnt_t plc_mts_pgcnt;  /* per page size count */
                int     plc_mts_colors;
                pgcnt_t *plc_mtsc_pgcnt; /* per color bin count */
        } plc_mts[MMU_PAGE_SIZES];
#endif
} plcnt_t[MAX_MEM_NODES][MAX_MEM_TYPES];

#ifdef DEBUG

#define PLCNT_SZ(ctrs_sz) {                                             \
        int     szc;                                                    \
        for (szc = 0; szc < mmu_page_sizes; szc++) {                    \
                int     colors = page_get_pagecolors(szc);              \
                ctrs_sz += (max_mem_nodes * MAX_MEM_TYPES *             \
                    colors * sizeof (pgcnt_t));                         \
        }                                                               \
}

#define PLCNT_INIT(base) {                                              \
        int     mn, mt, szc, colors;                                    \
        for (szc = 0; szc < mmu_page_sizes; szc++) {                    \
                colors = page_get_pagecolors(szc);                      \
                for (mn = 0; mn < max_mem_nodes; mn++) {                \
                        for (mt = 0; mt < MAX_MEM_TYPES; mt++) {        \
                                plcnt[mn][mt].plc_mts[szc].             \
                                    plc_mts_colors = colors;            \
                                plcnt[mn][mt].plc_mts[szc].             \
                                    plc_mtsc_pgcnt = (pgcnt_t *)base;   \
                                base += (colors * sizeof (pgcnt_t));    \
                        }                                               \
                }                                                       \
        }                                                               \
}

#define PLCNT_DO(pp, mn, mtype, szc, cnt, flags) {                      \
        int     bin = PP_2_BIN(pp);                                     \
        if (flags & PG_CACHE_LIST)                                      \
                atomic_add_long(&plcnt[mn][mtype].plc_mt_clpgcnt, cnt); \
        else if (szc)                                                   \
                atomic_add_long(&plcnt[mn][mtype].plc_mt_lgpgcnt, cnt); \
        else                                                            \
                atomic_add_long(&plcnt[mn][mtype].plc_mt_flpgcnt, cnt); \
        atomic_add_long(&plcnt[mn][mtype].plc_mts[szc].plc_mts_pgcnt,   \
            cnt);                                                       \
        atomic_add_long(&plcnt[mn][mtype].plc_mts[szc].                 \
            plc_mtsc_pgcnt[bin], cnt);                                  \
}

#else

#define PLCNT_SZ(ctrs_sz)

#define PLCNT_INIT(base)

/* PG_FREE_LIST may not be explicitly set in flags for large pages */

#define PLCNT_DO(pp, mn, mtype, szc, cnt, flags) {                      \
        if (flags & PG_CACHE_LIST)                                      \
                atomic_add_long(&plcnt[mn][mtype].plc_mt_clpgcnt, cnt); \
        else if (szc)                                                   \
                atomic_add_long(&plcnt[mn][mtype].plc_mt_lgpgcnt, cnt); \
        else                                                            \
                atomic_add_long(&plcnt[mn][mtype].plc_mt_flpgcnt, cnt); \
}

#endif

#define PLCNT_INCR(pp, mn, mtype, szc, flags) {                         \
        long    cnt = (1 << PAGE_BSZS_SHIFT(szc));                      \
        PLCNT_DO(pp, mn, mtype, szc, cnt, flags);                       \
}

#define PLCNT_DECR(pp, mn, mtype, szc, flags) {                         \
        long    cnt = ((-1) << PAGE_BSZS_SHIFT(szc));                   \
        PLCNT_DO(pp, mn, mtype, szc, cnt, flags);                       \
}

/*
 * macro to update the page list max counts - done when pages are
 * transferred from the RELOC to the NORELOC mtype (kcage_init or
 * kcage_assimilate_page).
 */

#define PLCNT_XFER_NORELOC(pp) {                                        \
        long    cnt = (1 << PAGE_BSZS_SHIFT((pp)->p_szc));              \
        int     mn = PP_2_MEM_NODE(pp);                                 \
        atomic_add_long(&plcnt[mn][MTYPE_NORELOC].plc_mt_pgmax, cnt);   \
        atomic_add_long(&plcnt[mn][MTYPE_RELOC].plc_mt_pgmax, -cnt);    \
}

/*
 * macro to modify the page list max counts when memory is added to
 * the page lists during startup (add_physmem) or during a DR operation
 * when memory is added (kphysm_add_memory_dynamic) or deleted
 * (kphysm_del_cleanup).
 */
#define PLCNT_MODIFY_MAX(pfn, cnt) {                                           \
        spgcnt_t _cnt = (spgcnt_t)(cnt);                                       \
        pgcnt_t _acnt = ABS(_cnt);                                             \
        int _mn;                                                               \
        pgcnt_t _np;                                                           \
        if (&plat_mem_node_intersect_range != NULL) {                          \
                for (_mn = 0; _mn < max_mem_nodes; _mn++) {                    \
                        plat_mem_node_intersect_range((pfn), _acnt, _mn, &_np);\
                        if (_np == 0)                                          \
                                continue;                                      \
                        atomic_add_long(&plcnt[_mn][MTYPE_RELOC].plc_mt_pgmax, \
                            (_cnt < 0) ? -_np : _np);                          \
                }                                                              \
        } else {                                                               \
                pfn_t _pfn = (pfn);                                            \
                pfn_t _endpfn = _pfn + _acnt;                                  \
                while (_pfn < _endpfn) {                                       \
                        _mn = PFN_2_MEM_NODE(_pfn);                            \
                        _np = MIN(_endpfn, mem_node_config[_mn].physmax + 1) - \
                            _pfn;                                              \
                        _pfn += _np;                                           \
                        atomic_add_long(&plcnt[_mn][MTYPE_RELOC].plc_mt_pgmax, \
                            (_cnt < 0) ? -_np : _np);                          \
                }                                                              \
        }                                                                      \
}

/*
 * macro to call page_ctrs_adjust() when memory is added
 * during a DR operation.
 */
#define PAGE_CTRS_ADJUST(pfn, cnt, rv) {                                       \
        spgcnt_t _cnt = (spgcnt_t)(cnt);                                       \
        int _mn;                                                               \
        pgcnt_t _np;                                                           \
        if (&plat_mem_node_intersect_range != NULL) {                          \
                for (_mn = 0; _mn < max_mem_nodes; _mn++) {                    \
                        plat_mem_node_intersect_range((pfn), _cnt, _mn, &_np); \
                        if (_np == 0)                                          \
                                continue;                                      \
                        if ((rv = page_ctrs_adjust(_mn)) != 0)                 \
                                break;                                         \
                }                                                              \
        } else {                                                               \
                pfn_t _pfn = (pfn);                                            \
                pfn_t _endpfn = _pfn + _cnt;                                   \
                while (_pfn < _endpfn) {                                       \
                        _mn = PFN_2_MEM_NODE(_pfn);                            \
                        _np = MIN(_endpfn, mem_node_config[_mn].physmax + 1) - \
                            _pfn;                                              \
                        _pfn += _np;                                           \
                        if ((rv = page_ctrs_adjust(_mn)) != 0)                 \
                                break;                                         \
                }                                                              \
        }                                                                      \
}

extern plcnt_t  plcnt;

#define MNODE_PGCNT(mn)                                                 \
        (plcnt[mn][MTYPE_RELOC].plc_mt_clpgcnt +                        \
            plcnt[mn][MTYPE_NORELOC].plc_mt_clpgcnt +                   \
            plcnt[mn][MTYPE_RELOC].plc_mt_flpgcnt +                     \
            plcnt[mn][MTYPE_NORELOC].plc_mt_flpgcnt +                   \
            plcnt[mn][MTYPE_RELOC].plc_mt_lgpgcnt +                     \
            plcnt[mn][MTYPE_NORELOC].plc_mt_lgpgcnt)

#define MNODETYPE_PGCNT(mn, mtype)                                      \
        (plcnt[mn][mtype].plc_mt_clpgcnt +                              \
            plcnt[mn][mtype].plc_mt_flpgcnt +                           \
            plcnt[mn][mtype].plc_mt_lgpgcnt)

/*
 * macros to loop through the mtype range.  MTYPE_START returns -1 in
 * mtype if there are no pages in the mnode/mtype, possibly after first
 * advancing to the NEXT mtype.
 */
#define MTYPE_START(mnode, mtype, flags) {                              \
        if (plcnt[mnode][mtype].plc_mt_pgmax == 0) {                    \
                ASSERT(mtype == MTYPE_RELOC ||                          \
                    MNODETYPE_PGCNT(mnode, mtype) == 0 ||               \
                    plcnt[mnode][mtype].plc_mt_pgmax != 0);             \
                MTYPE_NEXT(mnode, mtype, flags);                        \
        }                                                               \
}

/*
 * if allocation from the RELOC pool failed and there is sufficient cage
 * memory, attempt to allocate from the NORELOC pool.
 */
#define MTYPE_NEXT(mnode, mtype, flags) {                               \
        if (!(flags & (PG_NORELOC | PGI_NOCAGE | PGI_RELOCONLY)) &&     \
            (kcage_freemem >= kcage_lotsfree)) {                        \
                if (plcnt[mnode][MTYPE_NORELOC].plc_mt_pgmax == 0) {    \
                        ASSERT(MNODETYPE_PGCNT(mnode, MTYPE_NORELOC) == 0 || \
                            plcnt[mnode][MTYPE_NORELOC].plc_mt_pgmax != 0);  \
                        mtype = -1;                                     \
                } else {                                                \
                        mtype = MTYPE_NORELOC;                          \
                        flags |= PG_NORELOC;                            \
                }                                                       \
        } else {                                                        \
                mtype = -1;                                             \
        }                                                               \
}
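
/*
 * Typical consumer pattern (a sketch, not copied from vm_pagelist.c):
 * MTYPE_START may immediately advance to the next mtype or end the
 * search, and MTYPE_NEXT falls back from the RELOC to the NORELOC pool
 * when the cage has memory to spare.
 *
 *      MTYPE_INIT(mtype, vp, vaddr, flags, pgsz);
 *      MTYPE_START(mnode, mtype, flags);
 *      while (mtype != -1) {
 *              ... try to allocate a page of this mnode/mtype ...
 *              MTYPE_NEXT(mnode, mtype, flags);
 *      }
 */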

/*
 * get the ecache setsize for the current cpu.
 */
#define CPUSETSIZE()    (cpunodes[CPU->cpu_id].ecache_setsize)

extern struct cpu       cpu0;
#define CPU0            &cpu0

#define PAGE_BSZS_SHIFT(szc)    TTE_BSZS_SHIFT(szc)
/*
 * For sfmmu each larger page is 8 times the size of the previous
 * size page.
 */
#define FULL_REGION_CNT(rg_szc) (8)

/*
 * The counter base must be per page_counter element to prevent
 * races when re-indexing, and the base page size element should
 * be aligned on a boundary of the given region size.
 *
 * We also round up the number of pages spanned by the counters
 * for a given region to PC_BASE_ALIGN in certain situations to simplify
 * the coding for some non-performance critical routines.
 */
#define PC_BASE_ALIGN           ((pfn_t)1 << PAGE_BSZS_SHIFT(mmu_page_sizes-1))
#define PC_BASE_ALIGN_MASK      (PC_BASE_ALIGN - 1)

extern int ecache_alignsize;
#define L2CACHE_ALIGN           ecache_alignsize
#define L2CACHE_ALIGN_MAX       512

extern int update_proc_pgcolorbase_after_fork;
extern int consistent_coloring;
extern uint_t vac_colors_mask;
extern int vac_size;
extern int vac_shift;

/*
 * Kernel mem segment in 64-bit space
 */
extern caddr_t kmem64_base, kmem64_end, kmem64_aligned_end;
extern int kmem64_alignsize, kmem64_szc;
extern uint64_t kmem64_pabase;
extern int max_bootlp_tteszc;

/*
 * Maximum and default values for user heap, stack, private and shared
 * anonymous memory, and user text and initialized data.
 *
 * Initial values are defined in architecture specific mach_vm_dep.c file.
 * Used by map_pgsz*() routines.
 */
extern size_t max_uheap_lpsize;
extern size_t default_uheap_lpsize;
extern size_t max_ustack_lpsize;
extern size_t default_ustack_lpsize;
extern size_t max_privmap_lpsize;
extern size_t max_uidata_lpsize;
extern size_t max_utext_lpsize;
extern size_t max_shm_lpsize;

/*
 * For adjusting the default lpsize, for DTLB-limited page sizes.
 */
extern void adjust_data_maxlpsize(size_t ismpagesize);

/*
 * Sanity control. Don't use large pages, regardless of user settings,
 * if there's less than privm_lpg_min_physmem or shm_lpg_min_physmem
 * memory installed. The units for both variables are 8K pages.
 */
extern pgcnt_t privm_lpg_min_physmem;
extern pgcnt_t shm_lpg_min_physmem;

/*
 * AS_2_BIN macro controls the page coloring policy.
 * 0 (default) uses various vaddr bits
 * 1 virtual=paddr
 * 2 bin hopping
 */
#define AS_2_BIN(as, seg, vp, addr, bin, szc)                           \
switch (consistent_coloring) {                                          \
        default:                                                        \
                cmn_err(CE_WARN,                                        \
                        "AS_2_BIN: bad consistent coloring value");     \
                /* assume default algorithm -> continue */              \
        case 0: {                                                       \
                uint32_t ndx, new;                                      \
                int slew = 0;                                           \
                pfn_t pfn;                                              \
                                                                        \
                if (vp != NULL && IS_SWAPVP(vp) &&                      \
                    seg->s_ops == &segvn_ops)                           \
                        slew = as_color_bin(as);                        \
                                                                        \
                pfn = ((uintptr_t)addr >> MMU_PAGESHIFT) +              \
                        (((uintptr_t)addr >> page_coloring_shift) <<    \
                        (vac_shift - MMU_PAGESHIFT));                   \
                if ((szc) == 0 || &page_pfn_2_color_cpu == NULL) {      \
                        pfn += slew;                                    \
                        bin = PFN_2_COLOR(pfn, szc, NULL);              \
                } else {                                                \
                        bin = PFN_2_COLOR(pfn, szc, NULL);              \
                        bin += slew >> (vac_shift - MMU_PAGESHIFT);     \
                        bin &= hw_page_array[(szc)].hp_colors - 1;      \
                }                                                       \
                break;                                                  \
        }                                                               \
        case 1:                                                         \
                bin = PFN_2_COLOR(((uintptr_t)addr >> MMU_PAGESHIFT),   \
                    szc, NULL);                                         \
                break;                                                  \
        case 2: {                                                       \
                int cnt = as_color_bin(as);                             \
                uint_t color_mask = page_get_pagecolors(0) - 1;         \
                                                                        \
                /* make sure physical color aligns with vac color */    \
                while ((cnt & vac_colors_mask) !=                       \
                    addr_to_vcolor(addr)) {                             \
                        cnt++;                                          \
                }                                                       \
                bin = cnt = cnt & color_mask;                           \
                bin >>= PAGE_GET_COLOR_SHIFT(0, szc);                   \
                /* update per as page coloring fields */                \
                cnt = (cnt + 1) & color_mask;                           \
                if (cnt == (as_color_start(as) & color_mask)) {         \
                        cnt = as_color_start(as) = as_color_start(as) + \
                                PGCLR_LOOPFACTOR;                       \
                }                                                       \
                as_color_bin(as) = cnt & color_mask;                    \
                break;                                                  \
        }                                                               \
}                                                                       \
        ASSERT(bin < page_get_pagecolors(szc));

/*
 * cpu private vm data - accessed through CPU->cpu_vm_data
 *      vc_pnum_memseg: tracks last memseg visited in page_numtopp_nolock()
 *      vc_pnext_memseg: tracks last memseg visited in page_nextn()
 *      vc_kmptr: unaligned kmem pointer for this vm_cpu_data_t
 *      vc_kmsize: original kmem size for this vm_cpu_data_t
 */

typedef struct {
        struct memseg   *vc_pnum_memseg;
        struct memseg   *vc_pnext_memseg;
        void            *vc_kmptr;
        size_t          vc_kmsize;
} vm_cpu_data_t;

/* allocation size to ensure vm_cpu_data_t resides in its own cache line */
#define VM_CPU_DATA_PADSIZE                                             \
        (P2ROUNDUP(sizeof (vm_cpu_data_t), L2CACHE_ALIGN_MAX))
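
/*
 * Minimal allocation sketch (illustrative; kmptr, kmsize, vc and cp are
 * hypothetical locals): each CPU's vm_cpu_data_t is carved out of a
 * padded, L2-cache-aligned buffer so it does not share a cache line with
 * unrelated data, and the unaligned pointer and size are remembered in
 * vc_kmptr/vc_kmsize so the buffer can be freed later.
 *
 *      kmsize = VM_CPU_DATA_PADSIZE + L2CACHE_ALIGN_MAX;
 *      kmptr = kmem_zalloc(kmsize, KM_SLEEP);
 *      vc = (vm_cpu_data_t *)P2ROUNDUP((uintptr_t)kmptr, L2CACHE_ALIGN_MAX);
 *      vc->vc_kmptr = kmptr;
 *      vc->vc_kmsize = kmsize;
 *      cp->cpu_vm_data = vc;
 */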

/*
 * Function to get an ecache color bin: F(as, cnt, vcolor).
 * The goals of this function are:
 * - to spread a process's physical pages across the entire ecache to
 *      maximize its use.
 * - to minimize vac flushes caused when we reuse a physical page on a
 *      different vac color than it was previously used on.
 * - to prevent all processes from using the exact same colors and
 *      trashing each other.
 *
 * cnt is a bin ptr kept on a per-as basis.  As we page_create we increment
 * the ptr so we spread out the physical pages to cover the entire ecache.
 * The virtual color is made a subset of the physical color in order to
 * minimize virtual cache flushing.
 * We add in the as to spread out different as's.  This happens when we
 * initialize the start count value.
 * sizeof (struct as) is 60 so we shift by 3 to get into the bit range
 * that will tend to change.  For example, on spitfire based machines
 * (vcshft == 1) contiguous as's are spread by ~6 bins.
 * vcshft provides for proper virtual color alignment.
 * In theory cnt should be updated using cas only, but if we are off by
 * one or 2 it is no big deal.
 * We also keep a start value which is used to randomize the bin we
 * start counting at when it is time to start another loop.  This avoids
 * contiguous allocations of ecache size pointing to the same bin.
 * Why 3? It seems to work ok.  Better than 7 or anything larger.
 */
#define PGCLR_LOOPFACTOR 3
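
/*
 * Illustrative arithmetic for the comment above (an assumption, not from
 * the original source): with as structures allocated roughly
 * sizeof (struct as) == 60 bytes apart, ((uintptr_t)as >> 3) changes by
 * about 7 between neighbors, so different processes begin their bin
 * counters several bins apart instead of all starting at bin 0.
 */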

/*
 * When a bin is empty, and we can't satisfy a color request correctly,
 * we scan.  If we assume that the programs have reasonable spatial
 * behavior, then it will not be a good idea to use the adjacent color.
 * Using the adjacent color would result in virtually adjacent addresses
 * mapping into the same spot in the cache.  So, if we stumble across
 * an empty bin, skip a bunch before looking.  After the first skip,
 * then just look one bin at a time so we don't miss our cache on
 * every look. Be sure to check every bin.  Page_create() will panic
 * if we miss a page.
 *
 * This also explains the `<=' in the for loops in both page_get_freelist()
 * and page_get_cachelist().  Since we checked the target bin, skipped
 * a bunch, then continued one at a time, we wind up checking the target
 * bin twice to make sure we get all of the bins.
 */
#define BIN_STEP        20
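
/*
 * Shape of the scan described above (a sketch of the idea, not a copy of
 * the callers): check the target bin, make one jump of BIN_STEP bins,
 * then walk one bin at a time; the `<=' bound means nbins + 1 tries, so
 * the target bin is revisited and no bin is skipped.
 *
 *      nbins = page_get_pagecolors(szc);
 *      step = BIN_STEP;
 *      for (i = 0; i <= nbins; i++) {
 *              ... try the pages in `bin' ...
 *              bin = (bin + step) % nbins;
 *              step = 1;
 *      }
 */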

#ifdef VM_STATS
struct vmm_vmstats_str {
        ulong_t pgf_alloc[MMU_PAGE_SIZES];      /* page_get_freelist */
        ulong_t pgf_allocok[MMU_PAGE_SIZES];
        ulong_t pgf_allocokrem[MMU_PAGE_SIZES];
        ulong_t pgf_allocfailed[MMU_PAGE_SIZES];
        ulong_t pgf_allocdeferred;
        ulong_t pgf_allocretry[MMU_PAGE_SIZES];
        ulong_t pgc_alloc;                      /* page_get_cachelist */
        ulong_t pgc_allocok;
        ulong_t pgc_allocokrem;
        ulong_t pgc_allocokdeferred;
        ulong_t pgc_allocfailed;
        ulong_t pgcp_alloc[MMU_PAGE_SIZES];     /* page_get_contig_pages */
        ulong_t pgcp_allocfailed[MMU_PAGE_SIZES];
        ulong_t pgcp_allocempty[MMU_PAGE_SIZES];
        ulong_t pgcp_allocok[MMU_PAGE_SIZES];
        ulong_t ptcp[MMU_PAGE_SIZES];           /* page_trylock_contig_pages */
        ulong_t ptcpfreethresh[MMU_PAGE_SIZES];
        ulong_t ptcpfailexcl[MMU_PAGE_SIZES];
        ulong_t ptcpfailszc[MMU_PAGE_SIZES];
        ulong_t ptcpfailcage[MMU_PAGE_SIZES];
        ulong_t ptcpok[MMU_PAGE_SIZES];
        ulong_t pgmf_alloc[MMU_PAGE_SIZES];     /* page_get_mnode_freelist */
        ulong_t pgmf_allocfailed[MMU_PAGE_SIZES];
        ulong_t pgmf_allocempty[MMU_PAGE_SIZES];
        ulong_t pgmf_allocok[MMU_PAGE_SIZES];
        ulong_t pgmc_alloc;                     /* page_get_mnode_cachelist */
        ulong_t pgmc_allocfailed;
        ulong_t pgmc_allocempty;
        ulong_t pgmc_allocok;
        ulong_t pladd_free[MMU_PAGE_SIZES];     /* page_list_add/sub */
        ulong_t plsub_free[MMU_PAGE_SIZES];
        ulong_t pladd_cache;
        ulong_t plsub_cache;
        ulong_t plsubpages_szcbig;
        ulong_t plsubpages_szc0;
        ulong_t pfs_req[MMU_PAGE_SIZES];        /* page_freelist_split */
        ulong_t pfs_demote[MMU_PAGE_SIZES];
        ulong_t pfc_coalok[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
        ulong_t ppr_reloc[MMU_PAGE_SIZES];      /* page_relocate */
        ulong_t ppr_relocok[MMU_PAGE_SIZES];
        ulong_t ppr_relocnoroot[MMU_PAGE_SIZES];
        ulong_t ppr_reloc_replnoroot[MMU_PAGE_SIZES];
        ulong_t ppr_relocnolock[MMU_PAGE_SIZES];
        ulong_t ppr_relocnomem[MMU_PAGE_SIZES];
        ulong_t ppr_krelocfail[MMU_PAGE_SIZES];
        ulong_t ppr_copyfail;
        /* page coalesce counter */
        ulong_t page_ctrs_coalesce[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
        /* candidates useful */
        ulong_t page_ctrs_cands_skip[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
        /* ctrs changed after locking */
        ulong_t page_ctrs_changed[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
        /* page_freelist_coalesce failed */
        ulong_t page_ctrs_failed[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
        ulong_t page_ctrs_coalesce_all; /* page coalesce all counter */
        ulong_t page_ctrs_cands_skip_all; /* candidates useful for all func */
};
extern struct vmm_vmstats_str vmm_vmstats;
#endif  /* VM_STATS */

/*
 * Used to hold off page relocations into the cage until OBP has completed
 * its boot-time handoff of its resources to the kernel.
 */
extern int page_relocate_ready;

/*
 * cpu/mmu-dependent vm variables may be reset at bootup.
 */
extern uint_t mmu_page_sizes;
extern uint_t max_mmu_page_sizes;
extern uint_t mmu_hashcnt;
extern uint_t max_mmu_hashcnt;
extern size_t mmu_ism_pagesize;
extern int mmu_exported_pagesize_mask;
extern uint_t mmu_exported_page_sizes;
extern uint_t szc_2_userszc[];
extern uint_t userszc_2_szc[];

#define mmu_legacy_page_sizes   mmu_exported_page_sizes
#define USERSZC_2_SZC(userszc)  (userszc_2_szc[userszc])
#define SZC_2_USERSZC(szc)      (szc_2_userszc[szc])

/*
 * Platform specific page routines
 */
extern void mach_page_add(page_t **, page_t *);
extern void mach_page_sub(page_t **, page_t *);
extern uint_t page_get_pagecolors(uint_t);
extern void ppcopy_kernel__relocatable(page_t *, page_t *);
#define ppcopy_kernel(p1, p2)   ppcopy_kernel__relocatable(p1, p2)

/*
 * platform specific large pages for kernel heap support
 */
extern size_t get_segkmem_lpsize(size_t lpsize);
extern size_t mmu_get_kernel_lpsize(size_t lpsize);
extern void mmu_init_kernel_pgsz(struct hat *hat);
extern void mmu_init_kcontext();
extern uint64_t kcontextreg;

/*
 * Nucleus data page allocator routines
 */
extern void ndata_alloc_init(struct memlist *, uintptr_t, uintptr_t);
extern void *ndata_alloc(struct memlist *, size_t, size_t);
extern void *ndata_extra_base(struct memlist *, size_t, caddr_t);
extern size_t ndata_maxsize(struct memlist *);
extern size_t ndata_spare(struct memlist *, size_t, size_t);

#ifdef  __cplusplus
}
#endif

#endif  /* _VM_DEP_H */