1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2019 Joyent, Inc.
24 */
25
26 /*
27 * UNIX machine dependent virtual memory support.
28 */
29
30 #ifndef _VM_DEP_H
31 #define _VM_DEP_H
32
33 #ifdef __cplusplus
34 extern "C" {
35 #endif
36
37 #include <vm/hat_sfmmu.h>
38 #include <sys/archsystm.h>
39 #include <sys/memnode.h>
40
/* Read the current tick value; expands to a call to gettick(). */
#define	GETTICK()	gettick()

/* tick value that should be used for random values */
extern u_longlong_t	randtick(void);
45
/*
 * Per page size free lists. Allocated dynamically.
 *
 * Pages are split into MAX_MEM_TYPES memory types; the type is one of the
 * indices of the page_freelists and page_cachelists arrays declared below.
 */
#define	MAX_MEM_TYPES	2	/* 0 = reloc, 1 = noreloc */
#define	MTYPE_RELOC	0
#define	MTYPE_NORELOC	1

/* Memory type of a page, selected by PP_ISNORELOC(pp). */
#define	PP_2_MTYPE(pp)	(PP_ISNORELOC(pp) ? MTYPE_NORELOC : MTYPE_RELOC)
54
/*
 * Initialize mtype for a page allocation: MTYPE_NORELOC when PG_NORELOC is
 * set in flags, MTYPE_RELOC otherwise.  The vp, vaddr and pgsz arguments are
 * accepted but not used by this implementation.
 *
 * The arguments are parenthesized so that a compound flags expression such
 * as "a | b" is masked as a whole.  The trailing semicolon is part of the
 * historical expansion and is retained so that call sites see the same
 * statement shape as before.
 */
#define	MTYPE_INIT(mtype, vp, vaddr, flags, pgsz)			\
	(mtype) = ((flags) & PG_NORELOC) ? MTYPE_NORELOC : MTYPE_RELOC;
57
/*
 * mtype init for page_get_replacement_page: MTYPE_NORELOC when PG_NORELOC
 * is set in flags, MTYPE_RELOC otherwise.  The pp and pgcnt arguments are
 * accepted but not used by this implementation.
 *
 * The arguments are parenthesized so that a compound flags expression is
 * masked as a whole.  The trailing semicolon is part of the historical
 * expansion and is retained so that call sites see the same statement shape.
 */
#define	MTYPE_PGR_INIT(mtype, flags, pp, pgcnt)				\
	(mtype) = ((flags) & PG_NORELOC) ? MTYPE_NORELOC : MTYPE_RELOC;
61
/*
 * Store the pfn limits of a memory node into pfnlo/pfnhi, taken from
 * mem_node_config[mnode].physbase and .physmax.  The mtype argument is
 * accepted but not used by this implementation.
 *
 * Both assignments are joined into a single comma expression so that the
 * macro is one statement: used as the body of an unbraced "if" or loop it
 * is now guarded as a whole, instead of only its first assignment.  The
 * trailing semicolon is retained so that existing call sites see the same
 * statement shape.
 */
#define	MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi)			\
	((pfnlo) = mem_node_config[(mnode)].physbase,			\
	    (pfnhi) = mem_node_config[(mnode)].physmax);
65
/*
 * candidate counters in vm_pagelist.c are indexed by color and range
 *
 * In this file the ranges map one-to-one onto the memory types: every mnode
 * reports MAX_MEM_TYPES ranges and an mtype is its own range index.  The
 * mnode argument is ignored by all three function-like macros.
 */
#define	MAX_MNODE_MRANGES		MAX_MEM_TYPES
#define	MNODE_RANGE_CNT(mnode)		MAX_MNODE_MRANGES
#define	MNODE_MAX_MRANGE(mnode)		(MAX_MEM_TYPES - 1)
#define	MTYPE_2_MRANGE(mnode, mtype)	(mtype)
73
/*
 * Internal PG_ flags.
 *
 * Each flag is a single bit (bits 16 through 19).
 */
#define	PGI_RELOCONLY	0x10000	/* acts in the opposite sense to PG_NORELOC */
#define	PGI_NOCAGE	0x20000	/* indicates Cage is disabled */
#define	PGI_PGCPHIPRI	0x40000	/* page_get_contig_page priority allocation */
#define	PGI_PGCPSZC0	0x80000	/* relocate base pagesize page */

/*
 * PGI mtype flags - should not overlap PGI flags
 *
 * These occupy bits 24 and 25, clear of the PGI_ bits defined above.
 */
#define	PGI_MT_RANGE	0x1000000	/* mtype range */
#define	PGI_MT_NEXT	0x2000000	/* get next mtype */
87
88 extern page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];
89 extern page_t ***page_cachelists[MAX_MEM_TYPES];
90
91 #define PAGE_FREELISTS(mnode, szc, color, mtype) \
92 (*(page_freelists[szc][mtype][mnode] + (color)))
93
94 #define PAGE_CACHELISTS(mnode, color, mtype) \
95 (*(page_cachelists[mtype][mnode] + (color)))
96
/*
 * There are 'page_colors' colors/bins.  Spread them out under a
 * couple of locks.  There are mutexes for both the page freelist
 * and the page cachelist.  We want enough locks to make contention
 * reasonable, but not too many -- otherwise page_freelist_lock() gets
 * so expensive that it becomes the bottleneck!
 *
 * NPC_MUTEX is used below as a mask (NPC_MUTEX - 1) when mapping a bin to
 * a mutex, so it must remain a power of two.
 */
#define	NPC_MUTEX	16

/* Free list and cache list mutex arrays; indexed [lock index][mnode]. */
extern kmutex_t	*fpc_mutex[NPC_MUTEX];
extern kmutex_t	*cpc_mutex[NPC_MUTEX];
108
/*
 * Iterator provides the info needed to convert RA to PA.
 * MEM_NODE_ITERATOR_INIT() should be called before
 * PAGE_NEXT_PFN_FOR_COLOR() if pfn was not obtained via a previous
 * PAGE_NEXT_PFN_FOR_COLOR() call.  The iterator caches color-to-hash
 * translations, so the initializer must be called again if color or
 * ceq_mask changes, even if pfn doesn't.  MEM_NODE_ITERATOR_INIT() must
 * also be called before a PFN_2_COLOR() that uses a valid iterator argument.
 *
 * plat_mem_node_iterator_init() starts from the last mblock in the
 * continuation case, which may be invalid because of memory DR.  To detect
 * this situation mi_genid is checked against mpo_genid, which is incremented
 * after a memory DR operation.  See also plat_slice_add()/plat_slice_del().
 *
 * On sun4v the iterator is a real structure and MEM_NODE_ITERATOR_INIT()
 * rewrites pfn with the value returned by plat_mem_node_iterator_init().
 * Otherwise the iterator is a NULL void pointer and the init macro expands
 * to nothing.
 */
#ifdef	sun4v

typedef struct mem_node_iterator {
	uint_t mi_mnode;		/* mnode in which to iterate */
	int mi_init;			/* set to 1 when first init */
	int mi_genid;			/* set/checked against mpo_genid */
	int mi_last_mblock;		/* last mblock visited */
	uint_t mi_hash_ceq_mask;	/* cached copy of ceq_mask */
	uint_t mi_hash_color;		/* cached copy of color */
	uint_t mi_mnode_mask;		/* number of mask bits */
	uint_t mi_mnode_pfn_shift;	/* mnode position in pfn */
	pfn_t mi_mblock_base;		/* first valid pfn in current mblock */
	pfn_t mi_mblock_end;		/* last valid pfn in current mblock */
	pfn_t mi_ra_to_pa;		/* ra adjustment for current mblock */
	pfn_t mi_mnode_pfn_mask;	/* mask to obtain mnode id bits */
} mem_node_iterator_t;

/* Declare an iterator object named 'it'. */
#define	MEM_NODE_ITERATOR_DECL(it)					\
	mem_node_iterator_t it
/* (Re)initialize the iterator; the last argument passed is always 1. */
#define	MEM_NODE_ITERATOR_INIT(pfn, mnode, szc, it)			\
	(pfn) = plat_mem_node_iterator_init((pfn), (mnode), (szc), (it), 1)

extern pfn_t plat_mem_node_iterator_init(pfn_t, int, uchar_t,
    mem_node_iterator_t *, int);
extern pfn_t plat_rapfn_to_papfn(pfn_t);
extern int interleaved_mnodes;

#else	/* sun4v */

/* No iterator state is needed: declare a NULL placeholder. */
#define	MEM_NODE_ITERATOR_DECL(it)					\
	void *it = NULL
/* Expands to nothing; pfn is left untouched. */
#define	MEM_NODE_ITERATOR_INIT(pfn, mnode, szc, it)

#endif	/* sun4v */
157
/*
 * Return the mnode limits so that hpc_counters length and base
 * index can be determined.  When interleaved_mnodes is set, we
 * create an array only for the first mnode that exists.  All other
 * mnodes will share the array in this case.
 * If interleaved_mnodes is not set, simply return the limits for
 * the given mnode.
 *
 * 'first' is both read and written: in the interleaved case the limits
 * (obtained from mem_node_max_range()) and 'first' are only filled in while
 * 'first' is still negative, so the caller must initialize it before the
 * first use.
 *
 * NOTE: this expands to a bare if/else-if statement, not an expression;
 * use it only as a complete statement.
 */
#define	HPM_COUNTERS_LIMITS(mnode, physbase, physmax, first)		\
	if (!interleaved_mnodes) {					\
		(physbase) = mem_node_config[(mnode)].physbase;		\
		(physmax) = mem_node_config[(mnode)].physmax;		\
		(first) = (mnode);					\
	} else if ((first) < 0) {					\
		mem_node_max_range(&(physbase), &(physmax));		\
		(first) = (mnode);					\
	}
175
/*
 * Take the page counters lock as writer and then the page freelist lock.
 * With interleaved mnodes this is done for every mnode below max_mem_nodes,
 * in ascending order; otherwise only for the given mnode.
 *
 * Expands to a bare if/else statement; use only as a complete statement.
 */
#define	PAGE_CTRS_WRITE_LOCK(mnode)					\
	if (interleaved_mnodes) {					\
		/* changing shared hpm_counters */			\
		int _node = 0;						\
		while (_node < max_mem_nodes) {				\
			rw_enter(&page_ctrs_rwlock[_node], RW_WRITER);	\
			page_freelist_lock(_node);			\
			_node++;					\
		}							\
	} else {							\
		rw_enter(&page_ctrs_rwlock[(mnode)], RW_WRITER);	\
		page_freelist_lock(mnode);				\
	}
188
/*
 * Release the page freelist lock and then the page counters lock.
 * With interleaved mnodes this is done for every mnode below max_mem_nodes,
 * in ascending order; otherwise only for the given mnode.
 *
 * Expands to a bare if/else statement; use only as a complete statement.
 */
#define	PAGE_CTRS_WRITE_UNLOCK(mnode)					\
	if (interleaved_mnodes) {					\
		int _node = 0;						\
		while (_node < max_mem_nodes) {				\
			page_freelist_unlock(_node);			\
			rw_exit(&page_ctrs_rwlock[_node]);		\
			_node++;					\
		}							\
	} else {							\
		page_freelist_unlock(mnode);				\
		rw_exit(&page_ctrs_rwlock[(mnode)]);			\
	}
200
/*
 * cpu specific color conversion functions
 *
 * Every function below is declared weak.  The macros in this file compare
 * the function's address against NULL and use a generic computation when
 * no definition has been linked in.
 */
extern uint_t page_get_nsz_color_mask_cpu(uchar_t, uint_t);
#pragma weak page_get_nsz_color_mask_cpu

extern uint_t page_get_nsz_color_cpu(uchar_t, uint_t);
#pragma weak page_get_nsz_color_cpu

extern uint_t page_get_color_shift_cpu(uchar_t, uchar_t);
#pragma weak page_get_color_shift_cpu

extern uint_t page_convert_color_cpu(uint_t, uchar_t, uchar_t);
#pragma weak page_convert_color_cpu

extern pfn_t page_next_pfn_for_color_cpu(pfn_t,
    uchar_t, uint_t, uint_t, uint_t, void *);
#pragma weak page_next_pfn_for_color_cpu

extern uint_t  page_pfn_2_color_cpu(pfn_t, uchar_t, void *);
#pragma weak page_pfn_2_color_cpu
222
/*
 * Shift that converts a color of size code szc into a color of size code
 * nszc.  Without a cpu specific routine this is the difference between the
 * two page shifts in hw_page_array.
 */
#define	PAGE_GET_COLOR_SHIFT(szc, nszc)					\
	((&page_get_color_shift_cpu == NULL) ?				\
	    (hw_page_array[(nszc)].hp_shift -				\
		hw_page_array[(szc)].hp_shift) :			\
	    page_get_color_shift_cpu(szc, nszc))
228
/*
 * Convert ncolor between size codes szc and nszc.  Without a cpu specific
 * routine the color is shifted left by PAGE_GET_COLOR_SHIFT(szc, nszc).
 */
#define	PAGE_CONVERT_COLOR(ncolor, szc, nszc)				\
	((&page_convert_color_cpu == NULL) ?				\
	    ((ncolor) << PAGE_GET_COLOR_SHIFT((szc), (nszc))) :		\
	    page_convert_color_cpu(ncolor, szc, nszc))
233
/*
 * Color of pfn for page size code szc.  The cpu specific routine is used
 * when one is linked in (it also receives the iterator 'it'); otherwise the
 * low color bits of the pfn, taken from the base page size entry of
 * hw_page_array, are shifted down by the difference between the szc and
 * base page shifts.
 *
 * pfn and szc are parenthesized in the fallback so that a compound pfn
 * expression (for example "a | b") is masked as a whole.
 */
#define	PFN_2_COLOR(pfn, szc, it)					\
	((&page_pfn_2_color_cpu != NULL) ?				\
	    page_pfn_2_color_cpu(pfn, szc, it) :			\
	    (((pfn) & (hw_page_array[0].hp_colors - 1)) >>		\
		(hw_page_array[(szc)].hp_shift -			\
		    hw_page_array[0].hp_shift)))
240
/*
 * Accessors for the hw_page_array entry of a page size code:
 *   PNUM_SIZE           - hp_pgcnt field
 *   PNUM_SHIFT          - hp_shift relative to the base page size (entry 0)
 *   PAGE_GET_SHIFT      - hp_shift field
 *   PAGE_GET_PAGECOLORS - hp_colors field
 */
#define	PNUM_SIZE(szc)							\
	(hw_page_array[(szc)].hp_pgcnt)
#define	PNUM_SHIFT(szc)							\
	(hw_page_array[(szc)].hp_shift - hw_page_array[0].hp_shift)
#define	PAGE_GET_SHIFT(szc)						\
	(hw_page_array[(szc)].hp_shift)
#define	PAGE_GET_PAGECOLORS(szc)					\
	(hw_page_array[(szc)].hp_colors)
249
/*
 * This macro calculates the next sequential pfn with the specified
 * color using color equivalency mask
 *
 * pfn is updated in place and must be an lvalue.  When a cpu specific
 * routine is linked in, the work is delegated to it.  Otherwise the
 * computation is done in units of szc-sized pages (spfn):
 *   - if spfn already matches color under ceq_mask, step forward by one
 *     stride (ceq_mask + 1);
 *   - otherwise substitute color into the masked bits of spfn, and add one
 *     stride if the result is not greater than spfn.
 * The result is shifted back up into base page units.
 *
 * The ASSERTs require that color has no bits outside ceq_mask and that
 * ceq_mask + 1 is a power of two.
 */
#define	PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask, color_mask, it)   \
{									     \
	ASSERT(((color) & ~(ceq_mask)) == 0);                                \
	if (&page_next_pfn_for_color_cpu == NULL) {			     \
		uint_t	pfn_shift = PAGE_BSZS_SHIFT(szc);                    \
		pfn_t	spfn = pfn >> pfn_shift;                             \
		pfn_t	stride = (ceq_mask) + 1;                             \
		ASSERT((((ceq_mask) + 1) & (ceq_mask)) == 0);                \
		if (((spfn ^ (color)) & (ceq_mask)) == 0) {                  \
			pfn += stride << pfn_shift;                          \
		} else {                                                     \
			pfn = (spfn & ~(pfn_t)(ceq_mask)) | (color);         \
			pfn = (pfn > spfn ? pfn : pfn + stride) <<           \
			    pfn_shift;					     \
		}                                                            \
	} else {							     \
		pfn = page_next_pfn_for_color_cpu(pfn, szc, color,	     \
		    ceq_mask, color_mask, it);				     \
	}								     \
}
274
/*
 * Color equivalency mask for the next larger size code (szc + 1).  Without
 * a cpu specific routine the mask is shifted right by the difference in
 * page shift between szc + 1 and szc.
 */
#define	PAGE_GET_NSZ_MASK(szc, mask)					\
	((&page_get_nsz_color_mask_cpu != NULL) ?			\
	    page_get_nsz_color_mask_cpu(szc, mask) :			\
	    ((mask) >> (PAGE_GET_SHIFT((szc) + 1) - PAGE_GET_SHIFT(szc))))
280
/*
 * Color in the next larger size code (szc + 1).  Without a cpu specific
 * routine the color is shifted right by the difference in page shift
 * between szc + 1 and szc.
 */
#define	PAGE_GET_NSZ_COLOR(szc, color)					\
	((&page_get_nsz_color_cpu != NULL) ?				\
	    page_get_nsz_color_cpu(szc, color) :			\
	    ((color) >> (PAGE_GET_SHIFT((szc) + 1) - PAGE_GET_SHIFT(szc))))
286
/*
 * Find the bin for the given page if it was of size szc.  The iterator
 * argument passed to PFN_2_COLOR() is (void *)(-1).
 *
 * In all three macros pp is parenthesized before "->" so that a pointer
 * expression such as "pp + 1" can be passed safely.
 */
#define	PP_2_BIN_SZC(pp, szc)	(PFN_2_COLOR((pp)->p_pagenum, szc, (void *)(-1)))

/* Bin of the page at its current size code (p_szc). */
#define	PP_2_BIN(pp)		(PP_2_BIN_SZC(pp, (pp)->p_szc))

/* Memory node that holds the page, derived from its page frame number. */
#define	PP_2_MEM_NODE(pp)	(PFN_2_MEM_NODE((pp)->p_pagenum))
293
/*
 * Address of the mutex that covers a bin on the given mnode: a free list
 * mutex when PG_FREE_LIST is set in flags, a cache list mutex otherwise.
 * Bins are folded onto the NPC_MUTEX locks with a mask.
 *
 * flags is parenthesized so that a compound expression such as "a | b" is
 * tested as a whole.
 */
#define	PC_BIN_MUTEX(mnode, bin, flags) (((flags) & PG_FREE_LIST) ?	\
	&fpc_mutex[(bin) & (NPC_MUTEX - 1)][(mnode)] :			\
	&cpc_mutex[(bin) & (NPC_MUTEX - 1)][(mnode)])

/* Address of free list / cache list mutex number i on the given mnode. */
#define	FPC_MUTEX(mnode, i)	(&fpc_mutex[(i)][(mnode)])
#define	CPC_MUTEX(mnode, i)	(&cpc_mutex[(i)][(mnode)])
300
/*
 * Round pfnum down to the first pfn of the szc-sized page containing it.
 * pfnum is parenthesized so that a compound expression is masked as a whole.
 */
#define	PFN_BASE(pfnum, szc)	((pfnum) & ~((1 << PAGE_BSZS_SHIFT(szc)) - 1))
302
/*
 * this structure is used for walking free page lists
 * controls when to split large pages into smaller pages,
 * and when to coalesce smaller pages into larger pages
 *
 * The two arrays have one slot more than there are page sizes.
 */
typedef struct page_list_walker {
	uint_t	plw_colors;		/* num of colors for szc */
	uint_t  plw_color_mask;		/* colors-1 */
	uint_t	plw_bin_step;		/* next bin: 1 or 2 */
	uint_t  plw_count;		/* loop count */
	uint_t	plw_bin0;		/* starting bin */
	uint_t  plw_bin_marker;		/* bin after initial jump */
	uint_t  plw_bin_split_prev;	/* last bin we tried to split */
	uint_t  plw_do_split;		/* set if OK to split */
	uint_t  plw_split_next;		/* next bin to split */
	uint_t	plw_ceq_dif;		/* number of different color groups */
					/* to check */
	uint_t	plw_ceq_mask[MMU_PAGE_SIZES + 1]; /* color equiv mask */
	uint_t	plw_bins[MMU_PAGE_SIZES + 1];	/* num of bins */
} page_list_walker_t;

/* Fill in *plw for a walk; defined outside this header. */
void	page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin,
    int can_split, int use_ceq, page_list_walker_t *plw);
326
/*
 * NOTE(review): presumably the element type of the hpm_counters arrays
 * referred to elsewhere in this file -- confirm against vm_pagelist.c.
 */
typedef	char	hpmctr_t;

/* CHK_LPG() calls chk_lpg() on DEBUG kernels and expands to nothing else. */
#ifdef DEBUG
#define	CHK_LPG(pp, szc)	chk_lpg(pp, szc)
extern void	chk_lpg(page_t *, uchar_t);
#else
#define	CHK_LPG(pp, szc)
#endif
335
/*
 * page list count per mnode and type.
 *
 * plcnt_t is a two dimensional array type indexed [mnode][mtype].  DEBUG
 * kernels additionally keep a count per page size and, through the
 * plc_mtsc_pgcnt pointer, a count per color bin.
 */
typedef	struct {
	pgcnt_t	plc_mt_pgmax;		/* max page cnt */
	pgcnt_t plc_mt_clpgcnt;		/* cache list cnt */
	pgcnt_t plc_mt_flpgcnt;		/* free list cnt - small pages */
	pgcnt_t plc_mt_lgpgcnt;		/* free list cnt - large pages */
#ifdef DEBUG
	struct {
		pgcnt_t plc_mts_pgcnt;	/* per page size count */
		int	plc_mts_colors;
		pgcnt_t	*plc_mtsc_pgcnt; /* per color bin count */
	} plc_mts[MMU_PAGE_SIZES];
#endif
} plcnt_t[MAX_MEM_NODES][MAX_MEM_TYPES];
352
#ifdef DEBUG

/*
 * Add to ctrs_sz the number of bytes needed for the per color bin counters:
 * one pgcnt_t per color, for every page size, mnode and mtype.
 */
#define	PLCNT_SZ(ctrs_sz) {						\
	int	szc;							\
	for (szc = 0; szc < mmu_page_sizes; szc++) {			\
		int	colors = page_get_pagecolors(szc);		\
		ctrs_sz += (max_mem_nodes * MAX_MEM_TYPES *		\
		    colors * sizeof (pgcnt_t));				\
	}								\
}

/*
 * Carve the per color bin counter arrays out of the buffer at 'base',
 * recording the color count and array pointer in plcnt.  'base' is advanced
 * past each array, so it is modified by the macro.
 */
#define	PLCNT_INIT(base) {						\
	int	mn, mt, szc, colors;					\
	for (szc = 0; szc < mmu_page_sizes; szc++) {			\
		colors = page_get_pagecolors(szc);			\
		for (mn = 0; mn < max_mem_nodes; mn++) {		\
			for (mt = 0; mt < MAX_MEM_TYPES; mt++) {	\
				plcnt[mn][mt].plc_mts[szc].		\
				    plc_mts_colors = colors;		\
				plcnt[mn][mt].plc_mts[szc].		\
				    plc_mtsc_pgcnt = (pgcnt_t *)base;	\
				base += (colors * sizeof (pgcnt_t));	\
			}						\
		}							\
	}								\
}

/*
 * Add cnt to the counter selected by flags and szc: the cache list count
 * when PG_CACHE_LIST is set, else the large page count when szc is non-zero,
 * else the small page free list count.  The DEBUG variant also updates the
 * per page size and per color bin counts.
 */
#define	PLCNT_DO(pp, mn, mtype, szc, cnt, flags) {			\
	int	bin = PP_2_BIN(pp);					\
	if (flags & PG_CACHE_LIST)					\
		atomic_add_long(&plcnt[mn][mtype].plc_mt_clpgcnt, cnt);	\
	else if (szc)							\
		atomic_add_long(&plcnt[mn][mtype].plc_mt_lgpgcnt, cnt);	\
	else								\
		atomic_add_long(&plcnt[mn][mtype].plc_mt_flpgcnt, cnt);	\
	atomic_add_long(&plcnt[mn][mtype].plc_mts[szc].plc_mts_pgcnt,	\
	    cnt);							\
	atomic_add_long(&plcnt[mn][mtype].plc_mts[szc].			\
	    plc_mtsc_pgcnt[bin], cnt);					\
}

#else

/* Without DEBUG there are no per color counters to size or initialize. */
#define	PLCNT_SZ(ctrs_sz)

#define	PLCNT_INIT(base)

/* PG_FREE_LIST may not be explicitly set in flags for large pages */

/* Non-DEBUG variant: updates only the per mnode/mtype counters. */
#define	PLCNT_DO(pp, mn, mtype, szc, cnt, flags) {			\
	if (flags & PG_CACHE_LIST)					\
		atomic_add_long(&plcnt[mn][mtype].plc_mt_clpgcnt, cnt);	\
	else if (szc)							\
		atomic_add_long(&plcnt[mn][mtype].plc_mt_lgpgcnt, cnt);	\
	else								\
		atomic_add_long(&plcnt[mn][mtype].plc_mt_flpgcnt, cnt);	\
}

#endif
412
/*
 * Add the number of base pages in a page of size code szc to the page list
 * counters via PLCNT_DO().
 *
 * The local is named _cnt so that it cannot capture an identifier used in
 * the caller's argument expressions.
 */
#define	PLCNT_INCR(pp, mn, mtype, szc, flags) {				\
	long	_cnt = (1L << PAGE_BSZS_SHIFT(szc));			\
	PLCNT_DO(pp, mn, mtype, szc, _cnt, flags);			\
}

/*
 * Subtract the number of base pages in a page of size code szc from the
 * page list counters via PLCNT_DO().
 *
 * The count is computed by negating a positive shift result; left-shifting
 * a negative value has undefined behaviour in C.
 */
#define	PLCNT_DECR(pp, mn, mtype, szc, flags) {				\
	long	_cnt = -(1L << PAGE_BSZS_SHIFT(szc));			\
	PLCNT_DO(pp, mn, mtype, szc, _cnt, flags);			\
}
422
/*
 * macros to update page list max counts - done when pages transferred
 * from RELOC to NORELOC mtype (kcage_init or kcage_assimilate_page).
 *
 * The base page count of pp (from its p_szc) is added to the NORELOC max
 * count of the page's mnode and then subtracted from the RELOC max count.
 */

#define	PLCNT_XFER_NORELOC(pp) {					\
	int	_node = PP_2_MEM_NODE(pp);				\
	long	_pages = (1 << PAGE_BSZS_SHIFT((pp)->p_szc));		\
	atomic_add_long(&plcnt[_node][MTYPE_NORELOC].plc_mt_pgmax,	\
	    _pages);							\
	atomic_add_long(&plcnt[_node][MTYPE_RELOC].plc_mt_pgmax,	\
	    -_pages);							\
}
434
/*
 * macro to modify the page list max counts when memory is added to
 * the page lists during startup (add_physmem) or during a DR operation
 * when memory is added (kphysm_add_memory_dynamic) or deleted
 * (kphysm_del_cleanup).
 *
 * cnt is treated as signed: its magnitude gives the length of the pfn range
 * starting at pfn, and its sign decides whether the MTYPE_RELOC max count of
 * each affected mnode is increased or decreased.
 *
 * When plat_mem_node_intersect_range() is available, every mnode is asked
 * how many pages of the range it holds.  Otherwise the range is walked
 * mnode by mnode using PFN_2_MEM_NODE() and each mnode's physmax.
 */
#define	PLCNT_MODIFY_MAX(pfn, cnt) {					       \
	spgcnt_t _cnt = (spgcnt_t)(cnt);				       \
	pgcnt_t _acnt = ABS(_cnt);					       \
	int _mn;							       \
	pgcnt_t _np;							       \
	if (&plat_mem_node_intersect_range != NULL) {			       \
		for (_mn = 0; _mn < max_mem_nodes; _mn++) {		       \
			plat_mem_node_intersect_range((pfn), _acnt, _mn, &_np);\
			if (_np == 0)					       \
				continue;				       \
			atomic_add_long(&plcnt[_mn][MTYPE_RELOC].plc_mt_pgmax, \
			    (_cnt < 0) ? -_np : _np);			       \
		}							       \
	} else {							       \
		pfn_t _pfn = (pfn);					       \
		pfn_t _endpfn = _pfn + _acnt;				       \
		while (_pfn < _endpfn) {				       \
			_mn = PFN_2_MEM_NODE(_pfn);			       \
			_np = MIN(_endpfn, mem_node_config[_mn].physmax + 1) - \
			    _pfn;					       \
			_pfn += _np;					       \
			atomic_add_long(&plcnt[_mn][MTYPE_RELOC].plc_mt_pgmax, \
			    (_cnt < 0) ? -_np : _np);			       \
		}							       \
	}								       \
}
467
/*
 * macro to call page_ctrs_adjust() when memory is added
 * during a DR operation.
 *
 * page_ctrs_adjust() is called for each mnode that holds part of the pfn
 * range [pfn, pfn + cnt); the walk stops at the first non-zero return value.
 *
 * NOTE: rv is only assigned when page_ctrs_adjust() is actually called, so
 * the caller should initialize it before using this macro.
 */
#define	PAGE_CTRS_ADJUST(pfn, cnt, rv) {				       \
	spgcnt_t _cnt = (spgcnt_t)(cnt);				       \
	int _mn;							       \
	pgcnt_t _np;							       \
	if (&plat_mem_node_intersect_range != NULL) {			       \
		for (_mn = 0; _mn < max_mem_nodes; _mn++) {		       \
			plat_mem_node_intersect_range((pfn), _cnt, _mn, &_np); \
			if (_np == 0)					       \
				continue;				       \
			if ((rv = page_ctrs_adjust(_mn)) != 0)		       \
				break;					       \
		}							       \
	} else {							       \
		pfn_t _pfn = (pfn);					       \
		pfn_t _endpfn = _pfn + _cnt;				       \
		while (_pfn < _endpfn) {				       \
			_mn = PFN_2_MEM_NODE(_pfn);			       \
			_np = MIN(_endpfn, mem_node_config[_mn].physmax + 1) - \
			    _pfn;					       \
			_pfn += _np;					       \
			if ((rv = page_ctrs_adjust(_mn)) != 0)		       \
				break;					       \
		}							       \
	}								       \
}
497
/* The page list counters, indexed [mnode][mtype]. */
extern plcnt_t	plcnt;

/*
 * Total pages on the lists of an mnode: cache list, small page free list
 * and large page free list counts summed over both memory types.
 */
#define	MNODE_PGCNT(mn)							\
	(plcnt[mn][MTYPE_RELOC].plc_mt_clpgcnt +			\
	    plcnt[mn][MTYPE_NORELOC].plc_mt_clpgcnt +			\
	    plcnt[mn][MTYPE_RELOC].plc_mt_flpgcnt +			\
	    plcnt[mn][MTYPE_NORELOC].plc_mt_flpgcnt +			\
	    plcnt[mn][MTYPE_RELOC].plc_mt_lgpgcnt +			\
	    plcnt[mn][MTYPE_NORELOC].plc_mt_lgpgcnt)

/* The same sum restricted to a single memory type. */
#define	MNODETYPE_PGCNT(mn, mtype)					\
	(plcnt[mn][mtype].plc_mt_clpgcnt +				\
	    plcnt[mn][mtype].plc_mt_flpgcnt +				\
	    plcnt[mn][mtype].plc_mt_lgpgcnt)
512
/*
 * macros to loop through the mtype range - MTYPE_START returns -1 in
 * mtype if no pages in mnode/mtype and possibly NEXT mtype.
 *
 * When the max page count for mnode/mtype is zero, MTYPE_NEXT() is used to
 * pick the next mtype; both mtype and flags may be modified and must be
 * lvalues.
 */
#define	MTYPE_START(mnode, mtype, flags) {				\
	if (plcnt[mnode][mtype].plc_mt_pgmax == 0) {			\
		ASSERT(mtype == MTYPE_RELOC ||				\
		    MNODETYPE_PGCNT(mnode, mtype) == 0 ||		\
		    plcnt[mnode][mtype].plc_mt_pgmax != 0);		\
		MTYPE_NEXT(mnode, mtype, flags);			\
	}								\
}
525
/*
 * if allocation from the RELOC pool failed and there is sufficient cage
 * memory, attempt to allocate from the NORELOC pool.
 *
 * mtype becomes MTYPE_NORELOC (and PG_NORELOC is set in flags) only when
 * none of PG_NORELOC, PGI_NOCAGE or PGI_RELOCONLY is set in flags,
 * kcage_freemem is at least kcage_lotsfree, and the mnode has a non-zero
 * NORELOC max page count.  In every other case mtype is set to -1.
 * Both mtype and flags must be lvalues.
 */
#define	MTYPE_NEXT(mnode, mtype, flags) {				\
	if (!(flags & (PG_NORELOC | PGI_NOCAGE | PGI_RELOCONLY)) &&	\
	    (kcage_freemem >= kcage_lotsfree)) {			\
		if (plcnt[mnode][MTYPE_NORELOC].plc_mt_pgmax == 0) {	\
			ASSERT(MNODETYPE_PGCNT(mnode, MTYPE_NORELOC) == 0 || \
			    plcnt[mnode][MTYPE_NORELOC].plc_mt_pgmax != 0);  \
			mtype = -1;					\
		} else {						\
			mtype = MTYPE_NORELOC;				\
			flags |= PG_NORELOC;				\
		}							\
	} else {							\
		mtype = -1;						\
	}								\
}
545
/*
 * get the ecache setsize for the current cpu.
 *
 * Reads the cpunodes entry indexed by the current CPU's cpu_id.
 */
#define	CPUSETSIZE()	(cpunodes[CPU->cpu_id].ecache_setsize)
550
/*
 * Pointer to the cpu0 structure.  The expansion is parenthesized so that
 * postfix uses such as CPU0->field or CPU0[0] bind to the pointer rather
 * than to cpu0 itself.
 */
extern struct cpu	cpu0;
#define	CPU0		(&cpu0)
553
/* Shift from base pages to pages of size code szc; same as TTE_BSZS_SHIFT. */
#define	PAGE_BSZS_SHIFT(szc)	TTE_BSZS_SHIFT(szc)
/*
 * For sfmmu each larger page is 8 times the size of the previous
 * size page.
 *
 * The rg_szc argument is ignored.
 */
#define	FULL_REGION_CNT(rg_szc)	(8)
560
/*
 * The counter base must be per page_counter element to prevent
 * races when re-indexing, and the base page size element should
 * be aligned on a boundary of the given region size.
 *
 * We also round up the number of pages spanned by the counters
 * for a given region to PC_BASE_ALIGN in certain situations to simplify
 * the coding for some non-performance critical routines.
 *
 * PC_BASE_ALIGN is the number of base pages in a page of the largest size
 * code (mmu_page_sizes - 1); it is evaluated at run time because
 * mmu_page_sizes is a variable.
 */
#define	PC_BASE_ALIGN		((pfn_t)1 << PAGE_BSZS_SHIFT(mmu_page_sizes-1))
#define	PC_BASE_ALIGN_MASK	(PC_BASE_ALIGN - 1)
572
/* L2CACHE_ALIGN is the run time value of ecache_alignsize. */
extern int ecache_alignsize;
#define	L2CACHE_ALIGN		ecache_alignsize
#define	L2CACHE_ALIGN_MAX	512

/* Page coloring and virtual address cache (vac) tunables. */
extern int update_proc_pgcolorbase_after_fork;
extern int consistent_coloring;
extern uint_t vac_colors_mask;
extern int vac_size;
extern int vac_shift;

/*
 * Kernel mem segment in 64-bit space
 */
extern caddr_t kmem64_base, kmem64_end, kmem64_aligned_end;
extern int kmem64_alignsize, kmem64_szc;
extern uint64_t kmem64_pabase;
extern int max_bootlp_tteszc;
590
/*
 * Maximum and default values for user heap, stack, private and shared
 * anonymous memory, and user text and initialized data.
 *
 * Initial values are defined in the architecture specific mach_vm_dep.c file.
 * Used by map_pgsz*() routines.
 */
extern size_t max_uheap_lpsize;
extern size_t default_uheap_lpsize;
extern size_t max_ustack_lpsize;
extern size_t default_ustack_lpsize;
extern size_t max_privmap_lpsize;
extern size_t max_uidata_lpsize;
extern size_t max_utext_lpsize;
extern size_t max_shm_lpsize;

/*
 * For adjusting the default lpsize, for DTLB-limited page sizes.
 */
extern void adjust_data_maxlpsize(size_t ismpagesize);

/*
 * Sanity control.  Don't use large pages regardless of user
 * settings if there's less than priv or shm_lpg_min_physmem memory installed.
 * The units for these variables are 8K pages.
 */
extern pgcnt_t privm_lpg_min_physmem;
extern pgcnt_t shm_lpg_min_physmem;
619
/*
 * AS_2_BIN macro controls the page coloring policy.
 * 0 (default) uses various vaddr bits
 * 1 virtual=paddr
 * 2 bin hopping
 *
 * The selected bin is stored into 'bin', which must be an lvalue.  The
 * policy is chosen by the consistent_coloring variable; any other value
 * logs a warning and is handled like policy 0.  Policy 2 also updates the
 * per address space coloring fields (as_color_bin / as_color_start).
 *
 * The macro expands to a switch statement followed by an ASSERT and is
 * therefore usable only as a complete statement.  The vp, seg and addr
 * arguments are parenthesized so that compound expressions are safe.
 */
#define	AS_2_BIN(as, seg, vp, addr, bin, szc)				\
switch (consistent_coloring) {						\
default:								\
	cmn_err(CE_WARN,						\
	    "AS_2_BIN: bad consistent coloring value");			\
	/* assume default algorithm -> continue */			\
	/* FALLTHROUGH */						\
case 0: {								\
	int slew = 0;							\
	pfn_t pfn;							\
									\
	if ((vp) != NULL && IS_SWAPVP(vp) &&				\
	    (seg)->s_ops == &segvn_ops)					\
		slew = as_color_bin(as);				\
									\
	pfn = ((uintptr_t)(addr) >> MMU_PAGESHIFT) +			\
	    (((uintptr_t)(addr) >> page_coloring_shift) <<		\
	    (vac_shift - MMU_PAGESHIFT));				\
	if ((szc) == 0 || &page_pfn_2_color_cpu == NULL) {		\
		pfn += slew;						\
		bin = PFN_2_COLOR(pfn, szc, NULL);			\
	} else {							\
		bin = PFN_2_COLOR(pfn, szc, NULL);			\
		bin += slew >> (vac_shift - MMU_PAGESHIFT);		\
		bin &= hw_page_array[(szc)].hp_colors - 1;		\
	}								\
	break;								\
}									\
case 1:									\
	bin = PFN_2_COLOR(((uintptr_t)(addr) >> MMU_PAGESHIFT),		\
	    szc, NULL);							\
	break;								\
case 2: {								\
	int cnt = as_color_bin(as);					\
	uint_t color_mask = page_get_pagecolors(0) - 1;			\
									\
	/* make sure physical color aligns with vac color */		\
	while ((cnt & vac_colors_mask) !=				\
	    addr_to_vcolor(addr)) {					\
		cnt++;							\
	}								\
	bin = cnt = cnt & color_mask;					\
	bin >>= PAGE_GET_COLOR_SHIFT(0, szc);				\
	/* update per as page coloring fields */			\
	cnt = (cnt + 1) & color_mask;					\
	if (cnt == (as_color_start(as) & color_mask)) {			\
		cnt = as_color_start(as) = as_color_start(as) +		\
		    PGCLR_LOOPFACTOR;					\
	}								\
	as_color_bin(as) = cnt & color_mask;				\
	break;								\
}									\
}									\
	ASSERT(bin < page_get_pagecolors(szc));
680
/*
 * cpu private vm data - accessed thru CPU->cpu_vm_data
 *	vc_pnum_memseg: tracks last memseg visited in page_numtopp_nolock()
 *	vc_pnext_memseg: tracks last memseg visited in page_nextn()
 *	vc_kmptr: unaligned kmem pointer for this vm_cpu_data_t
 *	vc_kmsize: original kmem size for this vm_cpu_data_t
 */

typedef struct {
	struct memseg	*vc_pnum_memseg;
	struct memseg	*vc_pnext_memseg;
	void		*vc_kmptr;
	size_t		vc_kmsize;
} vm_cpu_data_t;

/*
 * allocation size to ensure vm_cpu_data_t resides in its own cache line:
 * the structure size rounded up to a multiple of L2CACHE_ALIGN_MAX.
 */
#define	VM_CPU_DATA_PADSIZE						\
	(P2ROUNDUP(sizeof (vm_cpu_data_t), L2CACHE_ALIGN_MAX))
699
/*
 * Function to get an ecache color bin: F(as, cnt, vcolor).
 * The goals of this function are:
 * - to spread a process's physical pages across the entire ecache to
 *	maximize its use.
 * - to minimize vac flushes caused when we reuse a physical page on a
 *	different vac color than it was previously used.
 * - to prevent all processes from using the exact same colors and thrashing
 *	each other.
 *
 * cnt is a bin ptr kept on a per as basis.  As we page_create we increment
 * the ptr so we spread out the physical pages to cover the entire ecache.
 * The virtual color is made a subset of the physical color in order to
 * minimize virtual cache flushing.
 * We add in the as to spread out different as.  This happens when we
 * initialize the start count value.
 * sizeof(struct as) is 60 so we shift by 3 to get into the bit range
 * that will tend to change.  For example, on spitfire based machines
 * (vcshft == 1) contiguous as are spread by ~6 bins.
 * vcshft provides for proper virtual color alignment.
 * In theory cnt should be updated using cas only but if we are off by one
 * or 2 it is no big deal.
 * We also keep a start value which is used to randomize on what bin we
 * start counting when it is time to start another loop.  This avoids
 * contiguous allocations of ecache size pointing to the same bin.
 * Why 3?  Seems to work ok.  Better than 7 or anything larger.
 */
#define	PGCLR_LOOPFACTOR 3
728
/*
 * When a bin is empty, and we can't satisfy a color request correctly,
 * we scan.  If we assume that the programs have reasonable spatial
 * behavior, then it will not be a good idea to use the adjacent color.
 * Using the adjacent color would result in virtually adjacent addresses
 * mapping into the same spot in the cache.  So, if we stumble across
 * an empty bin, skip a bunch before looking.  After the first skip,
 * then just look one bin at a time so we don't miss our cache on
 * every look.  Be sure to check every bin.  Page_create() will panic
 * if we miss a page.
 *
 * This also explains the `<=' in the for loops in both page_get_freelist()
 * and page_get_cachelist().  Since we checked the target bin, skipped
 * a bunch, then continued one at a time, we wind up checking the target bin
 * twice to make sure we get all of the bins.
 */
#define	BIN_STEP	20
746
/*
 * Event counters, compiled in only when VM_STATS is defined.  The comment
 * on the first member of each group names the routine that group belongs to.
 * Array members are indexed by page size code and, where two dimensional,
 * also by mnode range.
 */
#ifdef VM_STATS
struct vmm_vmstats_str {
	ulong_t pgf_alloc[MMU_PAGE_SIZES];	/* page_get_freelist */
	ulong_t pgf_allocok[MMU_PAGE_SIZES];
	ulong_t pgf_allocokrem[MMU_PAGE_SIZES];
	ulong_t pgf_allocfailed[MMU_PAGE_SIZES];
	ulong_t pgf_allocdeferred;
	ulong_t	pgf_allocretry[MMU_PAGE_SIZES];
	ulong_t pgc_alloc;			/* page_get_cachelist */
	ulong_t pgc_allocok;
	ulong_t pgc_allocokrem;
	ulong_t	pgc_allocokdeferred;
	ulong_t pgc_allocfailed;
	ulong_t	pgcp_alloc[MMU_PAGE_SIZES];	/* page_get_contig_pages */
	ulong_t	pgcp_allocfailed[MMU_PAGE_SIZES];
	ulong_t	pgcp_allocempty[MMU_PAGE_SIZES];
	ulong_t	pgcp_allocok[MMU_PAGE_SIZES];
	ulong_t	ptcp[MMU_PAGE_SIZES];		/* page_trylock_contig_pages */
	ulong_t	ptcpfreethresh[MMU_PAGE_SIZES];
	ulong_t	ptcpfailexcl[MMU_PAGE_SIZES];
	ulong_t	ptcpfailszc[MMU_PAGE_SIZES];
	ulong_t	ptcpfailcage[MMU_PAGE_SIZES];
	ulong_t	ptcpok[MMU_PAGE_SIZES];
	ulong_t	pgmf_alloc[MMU_PAGE_SIZES];	/* page_get_mnode_freelist */
	ulong_t	pgmf_allocfailed[MMU_PAGE_SIZES];
	ulong_t	pgmf_allocempty[MMU_PAGE_SIZES];
	ulong_t	pgmf_allocok[MMU_PAGE_SIZES];
	ulong_t	pgmc_alloc;			/* page_get_mnode_cachelist */
	ulong_t	pgmc_allocfailed;
	ulong_t	pgmc_allocempty;
	ulong_t	pgmc_allocok;
	ulong_t	pladd_free[MMU_PAGE_SIZES];	/* page_list_add/sub */
	ulong_t	plsub_free[MMU_PAGE_SIZES];
	ulong_t	pladd_cache;
	ulong_t	plsub_cache;
	ulong_t	plsubpages_szcbig;
	ulong_t	plsubpages_szc0;
	ulong_t	pfs_req[MMU_PAGE_SIZES];	/* page_freelist_split */
	ulong_t	pfs_demote[MMU_PAGE_SIZES];
	ulong_t	pfc_coalok[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
	ulong_t ppr_reloc[MMU_PAGE_SIZES];	/* page_relocate */
	ulong_t ppr_relocok[MMU_PAGE_SIZES];
	ulong_t ppr_relocnoroot[MMU_PAGE_SIZES];
	ulong_t ppr_reloc_replnoroot[MMU_PAGE_SIZES];
	ulong_t ppr_relocnolock[MMU_PAGE_SIZES];
	ulong_t ppr_relocnomem[MMU_PAGE_SIZES];
	ulong_t ppr_krelocfail[MMU_PAGE_SIZES];
	ulong_t ppr_copyfail;
	/* page coalesce counter */
	ulong_t	page_ctrs_coalesce[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
	/* candidates useful */
	ulong_t	page_ctrs_cands_skip[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
	/* ctrs changed after locking */
	ulong_t	page_ctrs_changed[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
	/* page_freelist_coalesce failed */
	ulong_t	page_ctrs_failed[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
	ulong_t	page_ctrs_coalesce_all;	/* page coalesce all counter */
	ulong_t	page_ctrs_cands_skip_all; /* candidates useful for all func */
};
extern struct vmm_vmstats_str vmm_vmstats;
#endif	/* VM_STATS */
808
/*
 * Used to hold off page relocations into the cage until OBP has completed
 * its boot-time handoff of its resources to the kernel.
 */
extern int page_relocate_ready;

/*
 * cpu/mmu-dependent vm variables may be reset at bootup.
 */
extern uint_t mmu_page_sizes;
extern uint_t max_mmu_page_sizes;
extern uint_t mmu_hashcnt;
extern uint_t max_mmu_hashcnt;
extern size_t mmu_ism_pagesize;
extern int mmu_exported_pagesize_mask;
extern uint_t mmu_exported_page_sizes;
extern uint_t szc_2_userszc[];
extern uint_t userszc_2_szc[];

/* mmu_legacy_page_sizes is an alias for mmu_exported_page_sizes. */
#define	mmu_legacy_page_sizes	mmu_exported_page_sizes
/* Table lookups between user visible and internal page size codes. */
#define	USERSZC_2_SZC(userszc)	(userszc_2_szc[userszc])
#define	SZC_2_USERSZC(szc)	(szc_2_userszc[szc])
831
/*
 * Platform specific page routines
 *
 * ppcopy_kernel() is a macro alias for ppcopy_kernel__relocatable().
 */
extern void mach_page_add(page_t **, page_t *);
extern void mach_page_sub(page_t **, page_t *);
extern uint_t page_get_pagecolors(uint_t);
extern void ppcopy_kernel__relocatable(page_t *, page_t *);
#define	ppcopy_kernel(p1, p2)	ppcopy_kernel__relocatable(p1, p2)
840
/*
 * platform specific large pages for kernel heap support
 *
 * mmu_init_kcontext() is declared with an explicit (void) parameter list:
 * an empty list is not a prototype in C, so calls with stray arguments
 * would otherwise go unchecked by the compiler.
 */
extern size_t get_segkmem_lpsize(size_t lpsize);
extern size_t mmu_get_kernel_lpsize(size_t lpsize);
extern void mmu_init_kernel_pgsz(struct hat *hat);
extern void mmu_init_kcontext(void);
extern uint64_t kcontextreg;
849
/*
 * Nucleus data page allocator routines
 *
 * All of them operate on a struct memlist supplied by the caller.
 */
extern void ndata_alloc_init(struct memlist *, uintptr_t, uintptr_t);
extern void *ndata_alloc(struct memlist *, size_t, size_t);
extern void *ndata_extra_base(struct memlist *, size_t, caddr_t);
extern size_t ndata_maxsize(struct memlist *);
extern size_t ndata_spare(struct memlist *, size_t, size_t);
858
859 #ifdef __cplusplus
860 }
861 #endif
862
863 #endif /* _VM_DEP_H */