11528 Makefile.noget can get gone
11529 Use -Wno-maybe-uninitialized
--- old/usr/src/uts/common/vm/vm_pagelist.c
+++ new/usr/src/uts/common/vm/vm_pagelist.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 */
24 24
25 25 /*
26 26 * Copyright 2012 Joyent, Inc. All rights reserved.
27 27 */
28 28
29 29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
30 30 /* All Rights Reserved */
31 31
32 32 /*
33 33 * Portions of this source code were derived from Berkeley 4.3 BSD
34 34 * under license from the Regents of the University of California.
35 35 */
36 36
37 37
38 38 /*
39 39 * This file contains common functions to access and manage the page lists.
40 40 * Many of these routines originated from platform dependent modules
41 41 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in
42 42 * a platform independent manner.
43 43 *
44 44 * vm/vm_dep.h provides for platform specific support.
45 45 */
46 46
47 47 #include <sys/types.h>
48 48 #include <sys/debug.h>
49 49 #include <sys/cmn_err.h>
50 50 #include <sys/systm.h>
51 51 #include <sys/atomic.h>
52 52 #include <sys/sysmacros.h>
53 53 #include <vm/as.h>
54 54 #include <vm/page.h>
55 55 #include <vm/seg_kmem.h>
56 56 #include <vm/seg_vn.h>
57 57 #include <sys/vmsystm.h>
58 58 #include <sys/memnode.h>
59 59 #include <vm/vm_dep.h>
60 60 #include <sys/lgrp.h>
61 61 #include <sys/mem_config.h>
62 62 #include <sys/callb.h>
63 63 #include <sys/mem_cage.h>
64 64 #include <sys/sdt.h>
65 65 #include <sys/dumphdr.h>
66 66 #include <sys/swap.h>
67 67
68 68 extern uint_t vac_colors;
69 69
70 70 #define MAX_PRAGMA_ALIGN 128
71 71
72 72 /* vm_cpu_data0 for the boot cpu before kmem is initialized */
73 73
74 74 #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
75 75 #pragma align L2CACHE_ALIGN_MAX(vm_cpu_data0)
76 76 #else
77 77 #pragma align MAX_PRAGMA_ALIGN(vm_cpu_data0)
78 78 #endif
79 79 char vm_cpu_data0[VM_CPU_DATA_PADSIZE];
80 80
81 81 /*
82 82  * number of page colors equivalent to requested color in page_get routines.
83 83 * If set, keeps large pages intact longer and keeps MPO allocation
84 84 * from the local mnode in favor of acquiring the 'correct' page color from
85 85 * a demoted large page or from a remote mnode.
86 86 */
87 87 uint_t colorequiv;
88 88
89 89 /*
90 90 * color equivalency mask for each page size.
91 91 * Mask is computed based on cpu L2$ way sizes and colorequiv global.
92 92 * High 4 bits determine the number of high order bits of the color to ignore.
93 93 * Low 4 bits determines number of low order bits of color to ignore (it's only
94 94 * relevant for hashed index based page coloring).
95 95 */
96 96 uchar_t colorequivszc[MMU_PAGE_SIZES];
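/*
 * Illustrative decode of the encoding described above; the helper below
 * is hypothetical and compiled out.  It only shows how the two nibbles of
 * a colorequivszc[] entry would be separated before being folded into a
 * color equivalency mask.
 */
#if 0
static void
colorequivszc_decode(uchar_t szc, uint_t *high_ignore, uint_t *low_ignore)
{
	uchar_t ceq = colorequivszc[szc];

	*high_ignore = ceq >> 4;	/* high order color bits to ignore */
	*low_ignore = ceq & 0xf;	/* low order bits, hashed coloring only */
}
#endif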
97 97
98 98 /*
99 99 * if set, specifies the percentage of large pages that are free from within
100 100 * a large page region before attempting to lock those pages for
101 101 * page_get_contig_pages processing.
102 102 *
103 103  * Should be turned on when kpr is available, since page_trylock_contig_pages
104 104  * can then be more selective.
105 105 */
106 106
107 107 int ptcpthreshold;
108 108
109 109 /*
110 110 * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
111 111 * Enabled by default via pgcplimitsearch.
112 112 *
113 113 * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
114 114 * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
115 115 * bound. This upper bound range guarantees:
116 116 * - all large page 'slots' will be searched over time
117 117 * - the minimum (1) large page candidates considered on each pgcp call
118 118 * - count doesn't wrap around to 0
119 119 */
120 120 pgcnt_t pgcpfailcnt[MMU_PAGE_SIZES];
121 121 int pgcplimitsearch = 1;
122 122
123 123 #define PGCPFAILMAX (1 << (highbit(physinstalled) - 1))
124 124 #define SETPGCPFAILCNT(szc) \
125 125 if (++pgcpfailcnt[szc] >= PGCPFAILMAX) \
126 126 pgcpfailcnt[szc] = PGCPFAILMAX / 2;
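/*
 * Worked example of the bound above (made up configuration): with
 * physinstalled == 0x300000 base pages (24GB of 8K pages),
 * highbit(0x300000) == 22 and PGCPFAILMAX == 1 << 21 == 0x200000,
 * which is at least half of physinstalled.  Once pgcpfailcnt[szc]
 * climbs to that value, SETPGCPFAILCNT resets it to 0x100000, so the
 * count stays nonzero and never wraps to 0.
 */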
127 127
128 128 #ifdef VM_STATS
129 129 struct vmm_vmstats_str vmm_vmstats;
130 130
131 131 #endif /* VM_STATS */
132 132
133 133 #if defined(__sparc)
134 134 #define LPGCREATE 0
135 135 #else
136 136 /* enable page_get_contig_pages */
137 137 #define LPGCREATE 1
138 138 #endif
139 139
140 140 int pg_contig_disable;
141 141 int pg_lpgcreate_nocage = LPGCREATE;
142 142
143 143 /*
144 144 * page_freelist_split pfn flag to signify no lo or hi pfn requirement.
145 145 */
146 146 #define PFNNULL 0
147 147
148 148 /* Flags involved in promotion and demotion routines */
149 149 #define PC_FREE 0x1 /* put page on freelist */
150 150 #define PC_ALLOC 0x2 /* return page for allocation */
151 151
152 152 /*
153 153 * Flag for page_demote to be used with PC_FREE to denote that we don't care
154 154 * what the color is as the color parameter to the function is ignored.
155 155 */
156 156 #define PC_NO_COLOR (-1)
157 157
158 158 /* mtype value for page_promote to use when mtype does not matter */
159 159 #define PC_MTYPE_ANY (-1)
160 160
161 161 /*
162 162 * page counters candidates info
163 163 * See page_ctrs_cands comment below for more details.
164 164 * fields are as follows:
165 165 * pcc_pages_free: # pages which freelist coalesce can create
166 166 * pcc_color_free: pointer to page free counts per color
167 167 */
168 168 typedef struct pcc_info {
169 169 pgcnt_t pcc_pages_free;
170 170 pgcnt_t *pcc_color_free;
171 171 uint_t pad[12];
172 172 } pcc_info_t;
173 173
174 174 /*
175 175 * On big machines it can take a long time to check page_counters
176 176 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
177 177 * updated sum of all elements of the corresponding page_counters arrays.
178 178 * page_freelist_coalesce() searches page_counters only if an appropriate
179 179 * element of page_ctrs_cands array is greater than 0.
180 180 *
181 181 * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
182 182 */
183 183 pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
184 184
185 185 /*
186 186 * Return in val the total number of free pages which can be created
187 187 * for the given mnode (m), mrange (g), and region size (r)
188 188 */
189 189 #define PGCTRS_CANDS_GETVALUE(m, g, r, val) { \
190 190 int i; \
191 191 val = 0; \
192 192 for (i = 0; i < NPC_MUTEX; i++) { \
193 193 val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free; \
194 194 } \
195 195 }
196 196
197 197 /*
198 198 * Return in val the total number of free pages which can be created
199 199 * for the given mnode (m), mrange (g), region size (r), and color (c)
200 200 */
201 201 #define PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) { \
202 202 int i; \
203 203 val = 0; \
204 204 ASSERT((c) < PAGE_GET_PAGECOLORS(r)); \
205 205 for (i = 0; i < NPC_MUTEX; i++) { \
206 206 val += \
207 207 page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)]; \
208 208 } \
209 209 }
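/*
 * Hypothetical usage sketch (compiled out): a caller such as
 * page_freelist_coalesce() only needs to know whether any candidates
 * exist before paying for a page_counters walk.
 */
#if 0
static int
pgctrs_have_cands(int mnode, int mrange, int r)
{
	pgcnt_t cands;

	PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
	return (cands != 0);
}
#endif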
210 210
211 211 /*
212 212 * We can only allow a single thread to update a counter within the physical
213 213 * range of the largest supported page size. That is the finest granularity
214 214 * possible since the counter values are dependent on each other
215 215  * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
216 216 * ctr_mutex lock index for a particular physical range.
217 217 */
218 218 static kmutex_t *ctr_mutex[NPC_MUTEX];
219 219
220 220 #define PP_CTR_LOCK_INDX(pp) \
221 221 (((pp)->p_pagenum >> \
222 222 (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
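/*
 * For example (sizes are illustrative, the real values are platform
 * dependent): with 8K base pages, a 4M largest page size
 * (PAGE_BSZS_SHIFT(mmu_page_sizes - 1) == 9) and NPC_MUTEX == 4,
 * pfns 0..511 hash to lock 0, 512..1023 to lock 1, 1024..1535 to
 * lock 2, 1536..2047 to lock 3, and 2048..2559 wrap back to lock 0.
 */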
223 223
224 224 #define INVALID_COLOR 0xffffffff
225 225 #define INVALID_MASK 0xffffffff
226 226
227 227 /*
228 228 * Local functions prototypes.
229 229 */
230 230
231 231 void page_ctr_add(int, int, page_t *, int);
232 232 void page_ctr_add_internal(int, int, page_t *, int);
233 233 void page_ctr_sub(int, int, page_t *, int);
234 234 void page_ctr_sub_internal(int, int, page_t *, int);
235 235 void page_freelist_lock(int);
236 236 void page_freelist_unlock(int);
237 237 page_t *page_promote(int, pfn_t, uchar_t, int, int);
238 238 page_t *page_demote(int, pfn_t, pfn_t, uchar_t, uchar_t, int, int);
239 239 page_t *page_freelist_split(uchar_t,
240 240 uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *);
241 241 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
242 242 static int page_trylock_cons(page_t *pp, se_t se);
243 243
244 244 /*
245 245 * The page_counters array below is used to keep track of free contiguous
246 246 * physical memory. A hw_page_map_t will be allocated per mnode per szc.
247 247 * This contains an array of counters, the size of the array, a shift value
248 248 * used to convert a pagenum into a counter array index or vice versa, as
249 249 * well as a cache of the last successful index to be promoted to a larger
250 250 * page size. As an optimization, we keep track of the last successful index
251 251 * to be promoted per page color for the given size region, and this is
252 252 * allocated dynamically based upon the number of colors for a given
253 253 * region size.
254 254 *
255 255 * Conceptually, the page counters are represented as:
256 256 *
257 257 * page_counters[region_size][mnode]
258 258 *
259 259 * region_size: size code of a candidate larger page made up
260 260 * of contiguous free smaller pages.
261 261 *
262 262 * page_counters[region_size][mnode].hpm_counters[index]:
263 263 * represents how many (region_size - 1) pages either
264 264 * exist or can be created within the given index range.
265 265 *
266 266 * Let's look at a sparc example:
267 267 * If we want to create a free 512k page, we look at region_size 2
268 268 * for the mnode we want. We calculate the index and look at a specific
269 269 * hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at
270 270 * this location, it means that 8 64k pages either exist or can be created
271 271 * from 8K pages in order to make a single free 512k page at the given
272 272 * index. Note that when a region is full, it will contribute to the
273 273 * counts in the region above it. Thus we will not know what page
274 274 * size the free pages will be which can be promoted to this new free
275 275 * page unless we look at all regions below the current region.
276 276 */
277 277
278 278 /*
279 279 * Note: hpmctr_t is defined in platform vm_dep.h
280 280 * hw_page_map_t contains all the information needed for the page_counters
281 281 * logic. The fields are as follows:
282 282 *
283 283 * hpm_counters: dynamically allocated array to hold counter data
284 284 * hpm_entries: entries in hpm_counters
285 285 * hpm_shift: shift for pnum/array index conv
286 286 * hpm_base: PFN mapped to counter index 0
287 287 * hpm_color_current: last index in counter array for this color at
288 288 * which we successfully created a large page
289 289 */
290 290 typedef struct hw_page_map {
291 291 hpmctr_t *hpm_counters;
292 292 size_t hpm_entries;
293 293 int hpm_shift;
294 294 pfn_t hpm_base;
295 295 size_t *hpm_color_current[MAX_MNODE_MRANGES];
296 296 #if defined(__sparc)
297 297 uint_t pad[4];
298 298 #endif
299 299 } hw_page_map_t;
300 300
301 301 /*
302 302 * Element zero is not used, but is allocated for convenience.
303 303 */
304 304 static hw_page_map_t *page_counters[MMU_PAGE_SIZES];
305 305
306 306 /*
307 307 * Cached value of MNODE_RANGE_CNT(mnode).
308 308  * This is a function call on x86.
309 309 */
310 310 static int mnode_nranges[MAX_MEM_NODES];
311 311 static int mnode_maxmrange[MAX_MEM_NODES];
312 312
313 313 /*
314 314 * The following macros are convenient ways to get access to the individual
315 315 * elements of the page_counters arrays. They can be used on both
316 316 * the left side and right side of equations.
317 317 */
318 318 #define PAGE_COUNTERS(mnode, rg_szc, idx) \
319 319 (page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])
320 320
321 321 #define PAGE_COUNTERS_COUNTERS(mnode, rg_szc) \
322 322 (page_counters[(rg_szc)][(mnode)].hpm_counters)
323 323
324 324 #define PAGE_COUNTERS_SHIFT(mnode, rg_szc) \
325 325 (page_counters[(rg_szc)][(mnode)].hpm_shift)
326 326
327 327 #define PAGE_COUNTERS_ENTRIES(mnode, rg_szc) \
328 328 (page_counters[(rg_szc)][(mnode)].hpm_entries)
329 329
330 330 #define PAGE_COUNTERS_BASE(mnode, rg_szc) \
331 331 (page_counters[(rg_szc)][(mnode)].hpm_base)
332 332
333 333 #define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g) \
334 334 (page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])
335 335
336 336 #define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange) \
337 337 (page_counters[(rg_szc)][(mnode)]. \
338 338 hpm_color_current[(mrange)][(color)])
339 339
340 340 #define PNUM_TO_IDX(mnode, rg_szc, pnum) \
341 341 (((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >> \
342 342 PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))
343 343
344 344 #define IDX_TO_PNUM(mnode, rg_szc, index) \
345 345 (PAGE_COUNTERS_BASE((mnode), (rg_szc)) + \
346 346 ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
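/*
 * Quick sanity example (made up values): with
 * PAGE_COUNTERS_BASE(mnode, r) == 0x80000 and
 * PAGE_COUNTERS_SHIFT(mnode, r) == 9 (512 page regions), pfn 0x80a37
 * maps to index (0x80a37 - 0x80000) >> 9 == 5, and IDX_TO_PNUM maps
 * index 5 back to pfn 0x80a00, the first pfn of that region.  The two
 * macros are inverses only at region granularity, which is what the
 * ASSERTs in page_ctrs_alloc() verify.
 */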
347 347
348 348 /*
349 349 * Protects the hpm_counters and hpm_color_current memory from changing while
350 350 * looking at page counters information.
351 351 * Grab the write lock to modify what these fields point at.
352 352 * Grab the read lock to prevent any pointers from changing.
353 353 * The write lock can not be held during memory allocation due to a possible
354 354 * recursion deadlock with trying to grab the read lock while the
355 355 * write lock is already held.
356 356 */
357 357 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
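/*
 * Hypothetical reader sketch (compiled out) of the locking protocol
 * described above: a walk of the counters outside the setup path
 * would hold the read lock so page_ctrs_adjust() cannot swap the
 * arrays out from underneath it.
 */
#if 0
static hpmctr_t
pgctrs_peek(int mnode, int r, size_t idx)
{
	hpmctr_t val;

	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
	val = PAGE_COUNTERS(mnode, r, idx);
	rw_exit(&page_ctrs_rwlock[mnode]);
	return (val);
}
#endif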
358 358
359 359
360 360 /*
361 361 * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
362 362 */
363 363 void
364 364 cpu_vm_data_init(struct cpu *cp)
365 365 {
366 366 if (cp == CPU0) {
367 367 cp->cpu_vm_data = (void *)&vm_cpu_data0;
368 368 } else {
369 369 void *kmptr;
370 370 int align;
371 371 size_t sz;
372 372
373 373 align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
374 374 sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
375 375 kmptr = kmem_zalloc(sz, KM_SLEEP);
376 376 cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
377 377 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
378 378 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
379 379 }
380 380 }
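/*
 * Sketch of the alignment arithmetic above (sizes assumed for the
 * example): with sizeof (vm_cpu_data_t) == 0x48 and an L2 line size
 * of 0x40, sz == P2ROUNDUP(0x48, 0x40) + 0x40 == 0xc0.  Wherever
 * kmem_zalloc() happens to place the buffer, there is always room to
 * round the pointer up to the next 0x40 boundary and still fit the
 * structure; vc_kmptr and vc_kmsize remember the raw allocation so
 * cpu_vm_data_destroy() can free it.
 */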
381 381
382 382 /*
383 383 * free cpu_vm_data
384 384 */
385 385 void
386 386 cpu_vm_data_destroy(struct cpu *cp)
387 387 {
388 388 if (cp->cpu_seqid && cp->cpu_vm_data) {
389 389 ASSERT(cp != CPU0);
390 390 kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
391 391 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
392 392 }
393 393 cp->cpu_vm_data = NULL;
394 394 }
395 395
396 396
397 397 /*
398 398 * page size to page size code
399 399 */
400 400 int
401 401 page_szc(size_t pagesize)
402 402 {
403 403 int i = 0;
404 404
405 405 while (hw_page_array[i].hp_size) {
406 406 if (pagesize == hw_page_array[i].hp_size)
407 407 return (i);
408 408 i++;
409 409 }
410 410 return (-1);
411 411 }
412 412
413 413 /*
414 414 * page size to page size code with the restriction that it be a supported
415 415 * user page size. If it's not a supported user page size, -1 will be returned.
416 416 */
417 417 int
418 418 page_szc_user_filtered(size_t pagesize)
419 419 {
420 420 int szc = page_szc(pagesize);
421 421 if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
422 422 return (szc);
423 423 }
424 424 return (-1);
425 425 }
426 426
427 427 /*
428 428 * Return how many page sizes are available for the user to use. This is
429 429 * what the hardware supports and not based upon how the OS implements the
430 430 * support of different page sizes.
431 431 *
432 432 * If legacy is non-zero, return the number of pagesizes available to legacy
433 433 * applications. The number of legacy page sizes might be less than the
434 434 * exported user page sizes. This is to prevent legacy applications that
435 435  * use the largest page size returned from getpagesizes(3c) from inadvertently
436 436 * using the 'new' large pagesizes.
437 437 */
438 438 uint_t
439 439 page_num_user_pagesizes(int legacy)
440 440 {
441 441 if (legacy)
442 442 return (mmu_legacy_page_sizes);
443 443 return (mmu_exported_page_sizes);
444 444 }
445 445
446 446 uint_t
447 447 page_num_pagesizes(void)
448 448 {
449 449 return (mmu_page_sizes);
450 450 }
451 451
452 452 /*
453 453 * returns the count of the number of base pagesize pages associated with szc
454 454 */
455 455 pgcnt_t
456 456 page_get_pagecnt(uint_t szc)
457 457 {
458 458 if (szc >= mmu_page_sizes)
459 459 panic("page_get_pagecnt: out of range %d", szc);
460 460 return (hw_page_array[szc].hp_pgcnt);
461 461 }
462 462
463 463 size_t
464 464 page_get_pagesize(uint_t szc)
465 465 {
466 466 if (szc >= mmu_page_sizes)
467 467 panic("page_get_pagesize: out of range %d", szc);
468 468 return (hw_page_array[szc].hp_size);
469 469 }
470 470
471 471 /*
472 472 * Return the size of a page based upon the index passed in. An index of
473 473 * zero refers to the smallest page size in the system, and as index increases
474 474 * it refers to the next larger supported page size in the system.
475 475 * Note that szc and userszc may not be the same due to unsupported szc's on
476 476 * some systems.
477 477 */
478 478 size_t
479 479 page_get_user_pagesize(uint_t userszc)
480 480 {
481 481 uint_t szc = USERSZC_2_SZC(userszc);
482 482
483 483 if (szc >= mmu_page_sizes)
484 484 panic("page_get_user_pagesize: out of range %d", szc);
485 485 return (hw_page_array[szc].hp_size);
486 486 }
487 487
488 488 uint_t
489 489 page_get_shift(uint_t szc)
490 490 {
491 491 if (szc >= mmu_page_sizes)
492 492 panic("page_get_shift: out of range %d", szc);
493 493 return (PAGE_GET_SHIFT(szc));
494 494 }
495 495
496 496 uint_t
497 497 page_get_pagecolors(uint_t szc)
498 498 {
499 499 if (szc >= mmu_page_sizes)
500 500 panic("page_get_pagecolors: out of range %d", szc);
501 501 return (PAGE_GET_PAGECOLORS(szc));
502 502 }
503 503
504 504 /*
505 505 * this assigns the desired equivalent color after a split
506 506 */
507 507 uint_t
508 508 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
509 509 uint_t ncolor, uint_t ceq_mask)
510 510 {
511 511 ASSERT(nszc > szc);
512 512 ASSERT(szc < mmu_page_sizes);
513 513 ASSERT(color < PAGE_GET_PAGECOLORS(szc));
514 514 ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));
515 515
516 516 color &= ceq_mask;
517 517 ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
518 518 return (color | (ncolor & ~ceq_mask));
519 519 }
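/*
 * For instance (made up values): with ceq_mask == 0x3, a requested
 * color of 0x5 and a converted ncolor of 0xa, the result is
 * (0x5 & 0x3) | (0xa & ~0x3) == 0x1 | 0x8 == 0x9.  The bits covered
 * by the equivalency mask come from the requested color; everything
 * else is inherited from the larger page being split.
 */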
520 520
521 521 /*
522 522 * The interleaved_mnodes flag is set when mnodes overlap in
523 523 * the physbase..physmax range, but have disjoint slices.
524 524 * In this case hpm_counters is shared by all mnodes.
525 525 * This flag is set dynamically by the platform.
526 526 */
527 527 int interleaved_mnodes = 0;
528 528
529 529 /*
530 530 * Called by startup().
531 531 * Size up the per page size free list counters based on physmax
532 532 * of each node and max_mem_nodes.
533 533 *
534 534 * If interleaved_mnodes is set we need to find the first mnode that
535 535 * exists. hpm_counters for the first mnode will then be shared by
536 536 * all other mnodes. If interleaved_mnodes is not set, just set
537 537 * first=mnode each time. That means there will be no sharing.
538 538 */
539 539 size_t
540 540 page_ctrs_sz(void)
541 541 {
542 542 int r; /* region size */
543 543 int mnode;
544 544 int firstmn; /* first mnode that exists */
545 545 int nranges;
546 546 pfn_t physbase;
547 547 pfn_t physmax;
548 548 uint_t ctrs_sz = 0;
549 549 int i;
550 550 pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
551 551
552 552 /*
553 553 * We need to determine how many page colors there are for each
554 554 * page size in order to allocate memory for any color specific
555 555 * arrays.
556 556 */
557 557 for (i = 0; i < mmu_page_sizes; i++) {
558 558 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
559 559 }
560 560
561 561 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
562 562
563 563 pgcnt_t r_pgcnt;
564 564 pfn_t r_base;
565 565 pgcnt_t r_align;
566 566
567 567 if (mem_node_config[mnode].exists == 0)
568 568 continue;
569 569
570 570 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
571 571 nranges = MNODE_RANGE_CNT(mnode);
572 572 mnode_nranges[mnode] = nranges;
573 573 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
574 574
575 575 /*
576 576 * determine size needed for page counter arrays with
577 577 * base aligned to large page size.
578 578 */
579 579 for (r = 1; r < mmu_page_sizes; r++) {
580 580 /* add in space for hpm_color_current */
581 581 ctrs_sz += sizeof (size_t) *
582 582 colors_per_szc[r] * nranges;
583 583
584 584 if (firstmn != mnode)
585 585 continue;
586 586
587 587 /* add in space for hpm_counters */
588 588 r_align = page_get_pagecnt(r);
589 589 r_base = physbase;
590 590 r_base &= ~(r_align - 1);
591 591 r_pgcnt = howmany(physmax - r_base + 1, r_align);
592 592
593 593 /*
594 594 * Round up to always allocate on pointer sized
595 595 * boundaries.
596 596 */
597 597 ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
598 598 sizeof (hpmctr_t *));
599 599 }
600 600 }
601 601
602 602 for (r = 1; r < mmu_page_sizes; r++) {
603 603 ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
604 604 }
605 605
606 606 /* add in space for page_ctrs_cands and pcc_color_free */
607 607 ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
608 608 mmu_page_sizes * NPC_MUTEX;
609 609
610 610 for (mnode = 0; mnode < max_mem_nodes; mnode++) {
611 611
612 612 if (mem_node_config[mnode].exists == 0)
613 613 continue;
614 614
615 615 nranges = mnode_nranges[mnode];
616 616 ctrs_sz += sizeof (pcc_info_t) * nranges *
617 617 mmu_page_sizes * NPC_MUTEX;
618 618 for (r = 1; r < mmu_page_sizes; r++) {
619 619 ctrs_sz += sizeof (pgcnt_t) * nranges *
620 620 colors_per_szc[r] * NPC_MUTEX;
621 621 }
622 622 }
623 623
624 624 /* ctr_mutex */
625 625 ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));
626 626
627 627 /* size for page list counts */
628 628 PLCNT_SZ(ctrs_sz);
629 629
630 630 /*
631 631 * add some slop for roundups. page_ctrs_alloc will roundup the start
632 632 * address of the counters to ecache_alignsize boundary for every
633 633 * memory node.
634 634 */
635 635 return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
636 636 }
637 637
638 638 caddr_t
639 639 page_ctrs_alloc(caddr_t alloc_base)
640 640 {
641 641 int mnode;
642 642 int mrange, nranges;
643 643 int r; /* region size */
644 644 int i;
645 645 int firstmn; /* first mnode that exists */
646 646 pfn_t physbase;
647 647 pfn_t physmax;
648 648 pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
649 649
650 650 /*
651 651 * We need to determine how many page colors there are for each
652 652 * page size in order to allocate memory for any color specific
653 653 * arrays.
654 654 */
655 655 for (i = 0; i < mmu_page_sizes; i++) {
656 656 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
657 657 }
658 658
659 659 for (r = 1; r < mmu_page_sizes; r++) {
660 660 page_counters[r] = (hw_page_map_t *)alloc_base;
661 661 alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
662 662 }
663 663
664 664 /* page_ctrs_cands and pcc_color_free array */
665 665 for (i = 0; i < NPC_MUTEX; i++) {
666 666 for (r = 1; r < mmu_page_sizes; r++) {
667 667
668 668 page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
669 669 alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;
670 670
671 671 for (mnode = 0; mnode < max_mem_nodes; mnode++) {
672 672 pcc_info_t *pi;
673 673
674 674 if (mem_node_config[mnode].exists == 0)
675 675 continue;
676 676
677 677 nranges = mnode_nranges[mnode];
678 678
679 679 pi = (pcc_info_t *)alloc_base;
680 680 alloc_base += sizeof (pcc_info_t) * nranges;
681 681 page_ctrs_cands[i][r][mnode] = pi;
682 682
683 683 for (mrange = 0; mrange < nranges; mrange++) {
684 684 pi->pcc_color_free =
685 685 (pgcnt_t *)alloc_base;
686 686 alloc_base += sizeof (pgcnt_t) *
687 687 colors_per_szc[r];
688 688 pi++;
689 689 }
690 690 }
691 691 }
692 692 }
693 693
694 694 /* ctr_mutex */
695 695 for (i = 0; i < NPC_MUTEX; i++) {
696 696 ctr_mutex[i] = (kmutex_t *)alloc_base;
697 697 alloc_base += (max_mem_nodes * sizeof (kmutex_t));
698 698 }
699 699
700 700 /* initialize page list counts */
701 701 PLCNT_INIT(alloc_base);
702 702
703 703 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
704 704
705 705 pgcnt_t r_pgcnt;
706 706 pfn_t r_base;
707 707 pgcnt_t r_align;
708 708 int r_shift;
709 709 int nranges = mnode_nranges[mnode];
710 710
711 711 if (mem_node_config[mnode].exists == 0)
712 712 continue;
713 713
714 714 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
715 715
716 716 for (r = 1; r < mmu_page_sizes; r++) {
717 717 /*
718 718 * the page_counters base has to be aligned to the
719 719 * page count of page size code r otherwise the counts
720 720 * will cross large page boundaries.
721 721 */
722 722 r_align = page_get_pagecnt(r);
723 723 r_base = physbase;
724 724 /* base needs to be aligned - lower to aligned value */
725 725 r_base &= ~(r_align - 1);
726 726 r_pgcnt = howmany(physmax - r_base + 1, r_align);
727 727 r_shift = PAGE_BSZS_SHIFT(r);
728 728
729 729 PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
730 730 PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
731 731 PAGE_COUNTERS_BASE(mnode, r) = r_base;
732 732 for (mrange = 0; mrange < nranges; mrange++) {
733 733 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
734 734 r, mrange) = (size_t *)alloc_base;
735 735 alloc_base += sizeof (size_t) *
736 736 colors_per_szc[r];
737 737 }
738 738 for (i = 0; i < colors_per_szc[r]; i++) {
739 739 uint_t color_mask = colors_per_szc[r] - 1;
740 740 pfn_t pfnum = r_base;
741 741 size_t idx;
742 742 int mrange;
743 743 MEM_NODE_ITERATOR_DECL(it);
744 744
745 745 MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it);
746 746 if (pfnum == (pfn_t)-1) {
747 747 idx = 0;
748 748 } else {
749 749 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
750 750 color_mask, color_mask, &it);
751 751 idx = PNUM_TO_IDX(mnode, r, pfnum);
752 752 idx = (idx >= r_pgcnt) ? 0 : idx;
753 753 }
754 754 for (mrange = 0; mrange < nranges; mrange++) {
755 755 PAGE_COUNTERS_CURRENT_COLOR(mnode,
756 756 r, i, mrange) = idx;
757 757 }
758 758 }
759 759
760 760 /* hpm_counters may be shared by all mnodes */
761 761 if (firstmn == mnode) {
762 762 PAGE_COUNTERS_COUNTERS(mnode, r) =
763 763 (hpmctr_t *)alloc_base;
764 764 alloc_base +=
765 765 P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
766 766 sizeof (hpmctr_t *));
767 767 } else {
768 768 PAGE_COUNTERS_COUNTERS(mnode, r) =
769 769 PAGE_COUNTERS_COUNTERS(firstmn, r);
770 770 }
771 771
772 772 /*
773 773 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
774 774 * satisfy the identity requirement.
775 775 * We should be able to go from one to the other
776 776 * and get consistent values.
777 777 */
778 778 ASSERT(PNUM_TO_IDX(mnode, r,
779 779 (IDX_TO_PNUM(mnode, r, 0))) == 0);
780 780 ASSERT(IDX_TO_PNUM(mnode, r,
781 781 (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
782 782 }
783 783 /*
784 784 * Roundup the start address of the page_counters to
785 785 * cache aligned boundary for every memory node.
786 786 * page_ctrs_sz() has added some slop for these roundups.
787 787 */
788 788 alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
789 789 L2CACHE_ALIGN);
790 790 }
791 791
792 792 /* Initialize other page counter specific data structures. */
793 793 for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
794 794 rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
795 795 }
796 796
797 797 return (alloc_base);
798 798 }
799 799
800 800 /*
801 801 * Functions to adjust region counters for each size free list.
802 802 * Caller is responsible to acquire the ctr_mutex lock if necessary and
803 803 * thus can be called during startup without locks.
804 804 */
805 805 /* ARGSUSED */
806 806 void
807 807 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
808 808 {
809 809 ssize_t r; /* region size */
810 810 ssize_t idx;
811 811 pfn_t pfnum;
812 812 int lckidx;
813 813
814 814 ASSERT(mnode == PP_2_MEM_NODE(pp));
815 815 ASSERT(mtype == PP_2_MTYPE(pp));
816 816
817 817 ASSERT(pp->p_szc < mmu_page_sizes);
818 818
819 819 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
820 820
821 821 /* no counter update needed for largest page size */
822 822 if (pp->p_szc >= mmu_page_sizes - 1) {
823 823 return;
824 824 }
825 825
826 826 r = pp->p_szc + 1;
827 827 pfnum = pp->p_pagenum;
828 828 lckidx = PP_CTR_LOCK_INDX(pp);
829 829
830 830 /*
831 831 * Increment the count of free pages for the current
832 832 * region. Continue looping up in region size incrementing
833 833  * count if the preceding region is full.
834 834 */
835 835 while (r < mmu_page_sizes) {
836 836 idx = PNUM_TO_IDX(mnode, r, pfnum);
837 837
838 838 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
839 839 ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
840 840
841 841 if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
842 842 break;
843 843 } else {
844 844 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
845 845 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
846 846 [MTYPE_2_MRANGE(mnode, root_mtype)];
847 847
848 848 cand->pcc_pages_free++;
849 849 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
850 850 }
851 851 r++;
852 852 }
853 853 }
854 854
855 855 void
856 856 page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
857 857 {
858 858 int lckidx = PP_CTR_LOCK_INDX(pp);
859 859 kmutex_t *lock = &ctr_mutex[lckidx][mnode];
860 860
861 861 mutex_enter(lock);
862 862 page_ctr_add_internal(mnode, mtype, pp, flags);
863 863 mutex_exit(lock);
864 864 }
865 865
866 866 void
867 867 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
868 868 {
869 869 int lckidx;
870 870 ssize_t r; /* region size */
871 871 ssize_t idx;
872 872 pfn_t pfnum;
873 873
874 874 ASSERT(mnode == PP_2_MEM_NODE(pp));
875 875 ASSERT(mtype == PP_2_MTYPE(pp));
876 876
877 877 ASSERT(pp->p_szc < mmu_page_sizes);
878 878
879 879 PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);
880 880
881 881 /* no counter update needed for largest page size */
882 882 if (pp->p_szc >= mmu_page_sizes - 1) {
883 883 return;
884 884 }
885 885
886 886 r = pp->p_szc + 1;
887 887 pfnum = pp->p_pagenum;
888 888 lckidx = PP_CTR_LOCK_INDX(pp);
889 889
890 890 /*
891 891 * Decrement the count of free pages for the current
892 892 * region. Continue looping up in region size decrementing
893 893  * count if the preceding region was full.
894 894 */
895 895 while (r < mmu_page_sizes) {
896 896 idx = PNUM_TO_IDX(mnode, r, pfnum);
897 897
898 898 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
899 899 ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);
900 900
901 901 if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
902 902 break;
903 903 } else {
904 904 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
905 905 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
906 906 [MTYPE_2_MRANGE(mnode, root_mtype)];
907 907
908 908 ASSERT(cand->pcc_pages_free != 0);
909 909 ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
910 910
911 911 cand->pcc_pages_free--;
912 912 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
913 913 }
914 914 r++;
915 915 }
916 916 }
917 917
918 918 void
919 919 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
920 920 {
921 921 int lckidx = PP_CTR_LOCK_INDX(pp);
922 922 kmutex_t *lock = &ctr_mutex[lckidx][mnode];
923 923
924 924 mutex_enter(lock);
925 925 page_ctr_sub_internal(mnode, mtype, pp, flags);
926 926 mutex_exit(lock);
927 927 }
928 928
929 929 /*
930 930 * Adjust page counters following a memory attach, since typically the
931 931 * size of the array needs to change, and the PFN to counter index
932 932 * mapping needs to change.
933 933 *
934 934 * It is possible this mnode did not exist at startup. In that case
935 935 * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
936 936 * to change (a theoretical possibility on x86), which means pcc_color_free
937 937 * arrays must be extended.
938 938 */
939 939 uint_t
940 940 page_ctrs_adjust(int mnode)
941 941 {
942 942 pgcnt_t npgs;
943 943 int r; /* region size */
944 944 int i;
945 945 size_t pcsz, old_csz;
946 946 hpmctr_t *new_ctr, *old_ctr;
947 947 pfn_t oldbase, newbase;
948 948 pfn_t physbase, physmax;
949 949 size_t old_npgs;
950 950 hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
951 951 size_t size_cache[MMU_PAGE_SIZES];
952 952 size_t *color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
953 953 size_t *old_color_array[MAX_MNODE_MRANGES];
954 954 pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
955 955 pcc_info_t **cands_cache;
956 956 pcc_info_t *old_pi, *pi;
957 957 pgcnt_t *pgcntp;
958 958 int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
959 959 int cands_cache_nranges;
960 960 int old_maxmrange, new_maxmrange;
961 961 int rc = 0;
962 962 int oldmnode;
963 963
964 964 cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
965 965 MMU_PAGE_SIZES, KM_NOSLEEP);
966 966 if (cands_cache == NULL)
967 967 return (ENOMEM);
968 968
969 969 i = -1;
970 970 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);
971 971
972 972 newbase = physbase & ~PC_BASE_ALIGN_MASK;
973 973 npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;
974 974
975 975 /* prepare to free non-null pointers on the way out */
976 976 cands_cache_nranges = nranges;
977 977 bzero(ctr_cache, sizeof (ctr_cache));
978 978 bzero(color_cache, sizeof (color_cache));
979 979
980 980 /*
981 981 * We need to determine how many page colors there are for each
982 982 * page size in order to allocate memory for any color specific
983 983 * arrays.
984 984 */
985 985 for (r = 0; r < mmu_page_sizes; r++) {
986 986 colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
987 987 }
988 988
989 989 /*
990 990 * Preallocate all of the new hpm_counters arrays as we can't
991 991 * hold the page_ctrs_rwlock as a writer and allocate memory.
992 992 * If we can't allocate all of the arrays, undo our work so far
993 993 * and return failure.
994 994 */
995 995 for (r = 1; r < mmu_page_sizes; r++) {
996 996 pcsz = npgs >> PAGE_BSZS_SHIFT(r);
997 997 size_cache[r] = pcsz;
998 998 ctr_cache[r] = kmem_zalloc(pcsz *
999 999 sizeof (hpmctr_t), KM_NOSLEEP);
1000 1000 if (ctr_cache[r] == NULL) {
1001 1001 rc = ENOMEM;
1002 1002 goto cleanup;
1003 1003 }
1004 1004 }
1005 1005
1006 1006 /*
1007 1007 * Preallocate all of the new color current arrays as we can't
1008 1008 * hold the page_ctrs_rwlock as a writer and allocate memory.
1009 1009 * If we can't allocate all of the arrays, undo our work so far
1010 1010 * and return failure.
1011 1011 */
1012 1012 for (r = 1; r < mmu_page_sizes; r++) {
1013 1013 for (mrange = 0; mrange < nranges; mrange++) {
1014 1014 color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
1015 1015 colors_per_szc[r], KM_NOSLEEP);
1016 1016 if (color_cache[r][mrange] == NULL) {
1017 1017 rc = ENOMEM;
1018 1018 goto cleanup;
1019 1019 }
1020 1020 }
1021 1021 }
1022 1022
1023 1023 /*
1024 1024 * Preallocate all of the new pcc_info_t arrays as we can't
1025 1025 * hold the page_ctrs_rwlock as a writer and allocate memory.
1026 1026 * If we can't allocate all of the arrays, undo our work so far
1027 1027 * and return failure.
1028 1028 */
1029 1029 for (r = 1; r < mmu_page_sizes; r++) {
1030 1030 for (i = 0; i < NPC_MUTEX; i++) {
1031 1031 pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
1032 1032 KM_NOSLEEP);
1033 1033 if (pi == NULL) {
1034 1034 rc = ENOMEM;
1035 1035 goto cleanup;
1036 1036 }
1037 1037 cands_cache[i * MMU_PAGE_SIZES + r] = pi;
1038 1038
1039 1039 for (mrange = 0; mrange < nranges; mrange++, pi++) {
1040 1040 pgcntp = kmem_zalloc(colors_per_szc[r] *
1041 1041 sizeof (pgcnt_t), KM_NOSLEEP);
1042 1042 if (pgcntp == NULL) {
1043 1043 rc = ENOMEM;
1044 1044 goto cleanup;
1045 1045 }
1046 1046 pi->pcc_color_free = pgcntp;
1047 1047 }
1048 1048 }
1049 1049 }
1050 1050
1051 1051 /*
1052 1052 * Grab the write lock to prevent others from walking these arrays
1053 1053 * while we are modifying them.
1054 1054 */
1055 1055 PAGE_CTRS_WRITE_LOCK(mnode);
1056 1056
1057 1057 /*
1058 1058 * For interleaved mnodes, find the first mnode
1059 1059 * with valid page counters since the current
1060 1060 * mnode may have just been added and not have
1061 1061 * valid page counters.
1062 1062 */
1063 1063 if (interleaved_mnodes) {
1064 1064 for (i = 0; i < max_mem_nodes; i++)
1065 1065 if (PAGE_COUNTERS_COUNTERS(i, 1) != NULL)
1066 1066 break;
1067 1067 ASSERT(i < max_mem_nodes);
1068 1068 oldmnode = i;
1069 1069 } else
1070 1070 oldmnode = mnode;
1071 1071
1072 1072 old_nranges = mnode_nranges[mnode];
1073 1073 cands_cache_nranges = old_nranges;
1074 1074 mnode_nranges[mnode] = nranges;
1075 1075 old_maxmrange = mnode_maxmrange[mnode];
1076 1076 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
1077 1077 new_maxmrange = mnode_maxmrange[mnode];
1078 1078
1079 1079 for (r = 1; r < mmu_page_sizes; r++) {
1080 1080 PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
1081 1081 old_ctr = PAGE_COUNTERS_COUNTERS(oldmnode, r);
1082 1082 old_csz = PAGE_COUNTERS_ENTRIES(oldmnode, r);
1083 1083 oldbase = PAGE_COUNTERS_BASE(oldmnode, r);
1084 1084 old_npgs = old_csz << PAGE_COUNTERS_SHIFT(oldmnode, r);
1085 1085 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1086 1086 old_color_array[mrange] =
1087 1087 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
1088 1088 r, mrange);
1089 1089 }
1090 1090
1091 1091 pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
1092 1092 new_ctr = ctr_cache[r];
1093 1093 ctr_cache[r] = NULL;
1094 1094 if (old_ctr != NULL &&
1095 1095 (oldbase + old_npgs > newbase) &&
1096 1096 (newbase + npgs > oldbase)) {
1097 1097 /*
1098 1098 * Map the intersection of the old and new
1099 1099 * counters into the new array.
1100 1100 */
1101 1101 size_t offset;
1102 1102 if (newbase > oldbase) {
1103 1103 offset = (newbase - oldbase) >>
1104 1104 PAGE_COUNTERS_SHIFT(mnode, r);
1105 1105 bcopy(old_ctr + offset, new_ctr,
1106 1106 MIN(pcsz, (old_csz - offset)) *
1107 1107 sizeof (hpmctr_t));
1108 1108 } else {
1109 1109 offset = (oldbase - newbase) >>
1110 1110 PAGE_COUNTERS_SHIFT(mnode, r);
1111 1111 bcopy(old_ctr, new_ctr + offset,
1112 1112 MIN(pcsz - offset, old_csz) *
1113 1113 sizeof (hpmctr_t));
1114 1114 }
1115 1115 }
1116 1116
1117 1117 PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
1118 1118 PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
1119 1119 PAGE_COUNTERS_BASE(mnode, r) = newbase;
1120 1120
1121 1121 /* update shared hpm_counters in other mnodes */
1122 1122 if (interleaved_mnodes) {
1123 1123 for (i = 0; i < max_mem_nodes; i++) {
1124 1124 if ((i == mnode) ||
1125 1125 (mem_node_config[i].exists == 0))
1126 1126 continue;
1127 1127 ASSERT(
1128 1128 PAGE_COUNTERS_COUNTERS(i, r) == old_ctr ||
1129 1129 PAGE_COUNTERS_COUNTERS(i, r) == NULL);
1130 1130 PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
1131 1131 PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
1132 1132 PAGE_COUNTERS_BASE(i, r) = newbase;
1133 1133 }
1134 1134 }
1135 1135
1136 1136 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1137 1137 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
1138 1138 color_cache[r][mrange];
1139 1139 color_cache[r][mrange] = NULL;
1140 1140 }
1141 1141 /*
1142 1142 * for now, just reset on these events as it's probably
1143 1143 * not worthwhile to try and optimize this.
1144 1144 */
1145 1145 for (i = 0; i < colors_per_szc[r]; i++) {
1146 1146 uint_t color_mask = colors_per_szc[r] - 1;
1147 1147 int mlo = interleaved_mnodes ? 0 : mnode;
1148 1148 int mhi = interleaved_mnodes ? max_mem_nodes :
1149 1149 (mnode + 1);
1150 1150 int m;
1151 1151 pfn_t pfnum;
1152 1152 size_t idx;
1153 1153 MEM_NODE_ITERATOR_DECL(it);
1154 1154
1155 1155 for (m = mlo; m < mhi; m++) {
1156 1156 if (mem_node_config[m].exists == 0)
1157 1157 continue;
1158 1158 pfnum = newbase;
1159 1159 MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it);
1160 1160 if (pfnum == (pfn_t)-1) {
1161 1161 idx = 0;
1162 1162 } else {
1163 1163 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
1164 1164 color_mask, color_mask, &it);
1165 1165 idx = PNUM_TO_IDX(m, r, pfnum);
1166 1166 idx = (idx < pcsz) ? idx : 0;
1167 1167 }
1168 1168 for (mrange = 0; mrange < nranges; mrange++) {
1169 1169 if (PAGE_COUNTERS_CURRENT_COLOR_ARRAY(m,
1170 1170 r, mrange) != NULL)
1171 1171 PAGE_COUNTERS_CURRENT_COLOR(m,
1172 1172 r, i, mrange) = idx;
1173 1173 }
1174 1174 }
1175 1175 }
1176 1176
1177 1177 /* cache info for freeing out of the critical path */
1178 1178 if ((caddr_t)old_ctr >= kernelheap &&
1179 1179 (caddr_t)old_ctr < ekernelheap) {
1180 1180 ctr_cache[r] = old_ctr;
1181 1181 size_cache[r] = old_csz;
1182 1182 }
1183 1183 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1184 1184 size_t *tmp = old_color_array[mrange];
1185 1185 if ((caddr_t)tmp >= kernelheap &&
1186 1186 (caddr_t)tmp < ekernelheap) {
1187 1187 color_cache[r][mrange] = tmp;
1188 1188 }
1189 1189 }
1190 1190 /*
1191 1191 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
1192 1192 * satisfy the identity requirement.
1193 1193 * We should be able to go from one to the other
1194 1194 * and get consistent values.
1195 1195 */
1196 1196 ASSERT(PNUM_TO_IDX(mnode, r,
1197 1197 (IDX_TO_PNUM(mnode, r, 0))) == 0);
1198 1198 ASSERT(IDX_TO_PNUM(mnode, r,
1199 1199 (PNUM_TO_IDX(mnode, r, newbase))) == newbase);
1200 1200
1201 1201 /* pcc_info_t and pcc_color_free */
1202 1202 for (i = 0; i < NPC_MUTEX; i++) {
1203 1203 pcc_info_t *epi;
1204 1204 pcc_info_t *eold_pi;
1205 1205
1206 1206 pi = cands_cache[i * MMU_PAGE_SIZES + r];
1207 1207 old_pi = page_ctrs_cands[i][r][mnode];
1208 1208 page_ctrs_cands[i][r][mnode] = pi;
1209 1209 cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;
1210 1210
1211 1211 /* preserve old pcc_color_free values, if any */
1212 1212 if (old_pi == NULL)
1213 1213 continue;
1214 1214
1215 1215 /*
1216 1216 * when/if x86 does DR, must account for
1217 1217 * possible change in range index when
1218 1218 * preserving pcc_info
1219 1219 */
1220 1220 epi = &pi[nranges];
1221 1221 eold_pi = &old_pi[old_nranges];
1222 1222 if (new_maxmrange > old_maxmrange) {
1223 1223 pi += new_maxmrange - old_maxmrange;
1224 1224 } else if (new_maxmrange < old_maxmrange) {
1225 1225 old_pi += old_maxmrange - new_maxmrange;
1226 1226 }
1227 1227 for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
1228 1228 pcc_info_t tmp = *pi;
1229 1229 *pi = *old_pi;
1230 1230 *old_pi = tmp;
1231 1231 }
1232 1232 }
1233 1233 }
1234 1234 PAGE_CTRS_WRITE_UNLOCK(mnode);
1235 1235
1236 1236 /*
1237 1237 * Now that we have dropped the write lock, it is safe to free all
1238 1238 * of the memory we have cached above.
1239 1239 * We come thru here to free memory when pre-alloc fails, and also to
1240 1240 * free old pointers which were recorded while locked.
1241 1241 */
1242 1242 cleanup:
1243 1243 for (r = 1; r < mmu_page_sizes; r++) {
1244 1244 if (ctr_cache[r] != NULL) {
1245 1245 kmem_free(ctr_cache[r],
1246 1246 size_cache[r] * sizeof (hpmctr_t));
1247 1247 }
1248 1248 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1249 1249 if (color_cache[r][mrange] != NULL) {
1250 1250 kmem_free(color_cache[r][mrange],
1251 1251 colors_per_szc[r] * sizeof (size_t));
1252 1252 }
1253 1253 }
1254 1254 for (i = 0; i < NPC_MUTEX; i++) {
1255 1255 pi = cands_cache[i * MMU_PAGE_SIZES + r];
1256 1256 if (pi == NULL)
1257 1257 continue;
1258 1258 nr = cands_cache_nranges;
1259 1259 for (mrange = 0; mrange < nr; mrange++, pi++) {
1260 1260 pgcntp = pi->pcc_color_free;
1261 1261 if (pgcntp == NULL)
1262 1262 continue;
1263 1263 if ((caddr_t)pgcntp >= kernelheap &&
1264 1264 (caddr_t)pgcntp < ekernelheap) {
1265 1265 kmem_free(pgcntp,
1266 1266 colors_per_szc[r] *
1267 1267 sizeof (pgcnt_t));
1268 1268 }
1269 1269 }
1270 1270 pi = cands_cache[i * MMU_PAGE_SIZES + r];
1271 1271 if ((caddr_t)pi >= kernelheap &&
1272 1272 (caddr_t)pi < ekernelheap) {
1273 1273 kmem_free(pi, nr * sizeof (pcc_info_t));
1274 1274 }
1275 1275 }
1276 1276 }
1277 1277
1278 1278 kmem_free(cands_cache,
1279 1279 sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
1280 1280 return (rc);
1281 1281 }
1282 1282
1283 1283 /*
1284 1284 * Cleanup the hpm_counters field in the page counters
1285 1285 * array.
1286 1286 */
1287 1287 void
1288 1288 page_ctrs_cleanup(void)
1289 1289 {
1290 1290 int r; /* region size */
1291 1291 int i; /* mnode index */
1292 1292
1293 1293 /*
1294 1294 * Get the page counters write lock while we are
1295 1295 * setting the page hpm_counters field to NULL
1296 1296 * for non-existent mnodes.
1297 1297 */
1298 1298 for (i = 0; i < max_mem_nodes; i++) {
1299 1299 PAGE_CTRS_WRITE_LOCK(i);
1300 1300 if (mem_node_config[i].exists) {
1301 1301 PAGE_CTRS_WRITE_UNLOCK(i);
1302 1302 continue;
1303 1303 }
1304 1304 for (r = 1; r < mmu_page_sizes; r++) {
1305 1305 PAGE_COUNTERS_COUNTERS(i, r) = NULL;
1306 1306 }
1307 1307 PAGE_CTRS_WRITE_UNLOCK(i);
1308 1308 }
1309 1309 }
1310 1310
1311 1311 #ifdef DEBUG
1312 1312
1313 1313 /*
1314 1314 * confirm pp is a large page corresponding to szc
1315 1315 */
1316 1316 void
1317 1317 chk_lpg(page_t *pp, uchar_t szc)
1318 1318 {
1319 1319 spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
1320 1320 uint_t noreloc;
1321 1321
1322 1322 if (npgs == 1) {
1323 1323 ASSERT(pp->p_szc == 0);
1324 1324 ASSERT(pp->p_next == pp);
1325 1325 ASSERT(pp->p_prev == pp);
1326 1326 return;
1327 1327 }
1328 1328
1329 1329 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1330 1330 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1331 1331
1332 1332 ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
1333 1333 ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
1334 1334 ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
1335 1335 ASSERT(pp->p_prev == (pp + (npgs - 1)));
1336 1336
1337 1337 /*
1338 1338 * Check list of pages.
1339 1339 */
1340 1340 noreloc = PP_ISNORELOC(pp);
1341 1341 while (npgs--) {
1342 1342 if (npgs != 0) {
1343 1343 ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
1344 1344 ASSERT(pp->p_next == (pp + 1));
1345 1345 }
1346 1346 ASSERT(pp->p_szc == szc);
1347 1347 ASSERT(PP_ISFREE(pp));
1348 1348 ASSERT(PP_ISAGED(pp));
1349 1349 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1350 1350 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1351 1351 ASSERT(pp->p_vnode == NULL);
1352 1352 ASSERT(PP_ISNORELOC(pp) == noreloc);
1353 1353
1354 1354 pp = pp->p_next;
1355 1355 }
1356 1356 }
1357 1357 #endif /* DEBUG */
1358 1358
1359 1359 void
1360 1360 page_freelist_lock(int mnode)
1361 1361 {
1362 1362 int i;
1363 1363 for (i = 0; i < NPC_MUTEX; i++) {
1364 1364 mutex_enter(FPC_MUTEX(mnode, i));
1365 1365 mutex_enter(CPC_MUTEX(mnode, i));
1366 1366 }
1367 1367 }
1368 1368
1369 1369 void
1370 1370 page_freelist_unlock(int mnode)
1371 1371 {
1372 1372 int i;
1373 1373 for (i = 0; i < NPC_MUTEX; i++) {
1374 1374 mutex_exit(FPC_MUTEX(mnode, i));
1375 1375 mutex_exit(CPC_MUTEX(mnode, i));
1376 1376 }
1377 1377 }
1378 1378
1379 1379 /*
1380 1380 * add pp to the specified page list. Defaults to head of the page list
1381 1381 * unless PG_LIST_TAIL is specified.
1382 1382 */
1383 1383 void
1384 1384 page_list_add(page_t *pp, int flags)
1385 1385 {
1386 1386 page_t **ppp;
1387 1387 kmutex_t *pcm;
1388 1388 uint_t bin, mtype;
1389 1389 int mnode;
1390 1390
1391 1391 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1392 1392 ASSERT(PP_ISFREE(pp));
1393 1393 ASSERT(!hat_page_is_mapped(pp));
1394 1394 ASSERT(hat_page_getshare(pp) == 0);
1395 1395
1396 1396 /*
1397 1397 * Large pages should be freed via page_list_add_pages().
1398 1398 */
1399 1399 ASSERT(pp->p_szc == 0);
1400 1400
1401 1401 /*
1402 1402 * Don't need to lock the freelist first here
1403 1403 * because the page isn't on the freelist yet.
1404 1404 * This means p_szc can't change on us.
1405 1405 */
1406 1406
1407 1407 bin = PP_2_BIN(pp);
1408 1408 mnode = PP_2_MEM_NODE(pp);
1409 1409 mtype = PP_2_MTYPE(pp);
1410 1410
1411 1411 if (flags & PG_LIST_ISINIT) {
1412 1412 /*
1413 1413  * PG_LIST_ISINIT is set during system startup (i.e. single
1414 1414  * threaded), add a page to the free list and add to
1415 1415  * the free region counters w/o any locking
1416 1416 */
1417 1417 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1418 1418
1419 1419 /* inline version of page_add() */
1420 1420 if (*ppp != NULL) {
1421 1421 pp->p_next = *ppp;
1422 1422 pp->p_prev = (*ppp)->p_prev;
1423 1423 (*ppp)->p_prev = pp;
1424 1424 pp->p_prev->p_next = pp;
1425 1425 } else
1426 1426 *ppp = pp;
1427 1427
1428 1428 page_ctr_add_internal(mnode, mtype, pp, flags);
1429 1429 VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1430 1430 } else {
1431 1431 pcm = PC_BIN_MUTEX(mnode, bin, flags);
1432 1432
1433 1433 if (flags & PG_FREE_LIST) {
1434 1434 VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1435 1435 ASSERT(PP_ISAGED(pp));
1436 1436 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1437 1437
1438 1438 } else {
1439 1439 VM_STAT_ADD(vmm_vmstats.pladd_cache);
1440 1440 ASSERT(pp->p_vnode);
1441 1441 ASSERT((pp->p_offset & PAGEOFFSET) == 0);
1442 1442 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1443 1443 }
1444 1444 mutex_enter(pcm);
1445 1445 page_add(ppp, pp);
1446 1446
1447 1447 if (flags & PG_LIST_TAIL)
1448 1448 *ppp = (*ppp)->p_next;
1449 1449 /*
1450 1450 * Add counters before releasing pcm mutex to avoid a race with
1451 1451 * page_freelist_coalesce and page_freelist_split.
1452 1452 */
1453 1453 page_ctr_add(mnode, mtype, pp, flags);
1454 1454 mutex_exit(pcm);
1455 1455 }
1456 1456
1457 1457
1458 1458 #if defined(__sparc)
1459 1459 if (PP_ISNORELOC(pp)) {
1460 1460 kcage_freemem_add(1);
1461 1461 }
1462 1462 #endif
1463 1463 /*
1464 1464 * It is up to the caller to unlock the page!
1465 1465 */
1466 1466 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1467 1467 }
1468 1468
1469 1469
1470 1470 #ifdef __sparc
1471 1471 /*
1472 1472 * This routine is only used by kcage_init during system startup.
1473 1473 * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
1474 1474 * without the overhead of taking locks and updating counters.
1475 1475 */
1476 1476 void
1477 1477 page_list_noreloc_startup(page_t *pp)
1478 1478 {
1479 1479 page_t **ppp;
1480 1480 uint_t bin;
1481 1481 int mnode;
1482 1482 int mtype;
1483 1483 int flags = 0;
1484 1484
1485 1485 /*
1486 1486 * If this is a large page on the freelist then
1487 1487 * break it up into smaller pages.
1488 1488 */
1489 1489 if (pp->p_szc != 0)
1490 1490 page_boot_demote(pp);
1491 1491
1492 1492 /*
1493 1493 * Get list page is currently on.
1494 1494 */
1495 1495 bin = PP_2_BIN(pp);
1496 1496 mnode = PP_2_MEM_NODE(pp);
1497 1497 mtype = PP_2_MTYPE(pp);
1498 1498 ASSERT(mtype == MTYPE_RELOC);
1499 1499 ASSERT(pp->p_szc == 0);
1500 1500
1501 1501 if (PP_ISAGED(pp)) {
1502 1502 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1503 1503 flags |= PG_FREE_LIST;
1504 1504 } else {
1505 1505 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1506 1506 flags |= PG_CACHE_LIST;
1507 1507 }
1508 1508
1509 1509 ASSERT(*ppp != NULL);
1510 1510
1511 1511 /*
1512 1512 * Delete page from current list.
1513 1513 */
1514 1514 if (*ppp == pp)
1515 1515 *ppp = pp->p_next; /* go to next page */
1516 1516 if (*ppp == pp) {
1517 1517 *ppp = NULL; /* page list is gone */
1518 1518 } else {
1519 1519 pp->p_prev->p_next = pp->p_next;
1520 1520 pp->p_next->p_prev = pp->p_prev;
1521 1521 }
1522 1522
1523 1523 /*
1524 1524 * Decrement page counters
1525 1525 */
1526 1526 page_ctr_sub_internal(mnode, mtype, pp, flags);
1527 1527
1528 1528 /*
1529 1529 * Set no reloc for cage initted pages.
1530 1530 */
1531 1531 PP_SETNORELOC(pp);
1532 1532
1533 1533 mtype = PP_2_MTYPE(pp);
1534 1534 ASSERT(mtype == MTYPE_NORELOC);
1535 1535
1536 1536 /*
1537 1537 * Get new list for page.
1538 1538 */
1539 1539 if (PP_ISAGED(pp)) {
1540 1540 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1541 1541 } else {
1542 1542 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1543 1543 }
1544 1544
1545 1545 /*
1546 1546 * Insert page on new list.
1547 1547 */
1548 1548 if (*ppp == NULL) {
1549 1549 *ppp = pp;
1550 1550 pp->p_next = pp->p_prev = pp;
1551 1551 } else {
1552 1552 pp->p_next = *ppp;
1553 1553 pp->p_prev = (*ppp)->p_prev;
1554 1554 (*ppp)->p_prev = pp;
1555 1555 pp->p_prev->p_next = pp;
1556 1556 }
1557 1557
1558 1558 /*
1559 1559 * Increment page counters
1560 1560 */
1561 1561 page_ctr_add_internal(mnode, mtype, pp, flags);
1562 1562
1563 1563 /*
1564 1564 * Update cage freemem counter
1565 1565 */
1566 1566 atomic_inc_ulong(&kcage_freemem);
1567 1567 }
1568 1568 #else /* __sparc */
1569 1569
1570 1570 /* ARGSUSED */
1571 1571 void
1572 1572 page_list_noreloc_startup(page_t *pp)
1573 1573 {
1574 1574 panic("page_list_noreloc_startup: should be here only for sparc");
1575 1575 }
1576 1576 #endif
1577 1577
1578 1578 void
1579 1579 page_list_add_pages(page_t *pp, int flags)
1580 1580 {
1581 1581 kmutex_t *pcm;
1582 1582 pgcnt_t pgcnt;
1583 1583 uint_t bin, mtype, i;
1584 1584 int mnode;
1585 1585
1586 1586 /* default to freelist/head */
1587 1587 ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);
1588 1588
1589 1589 CHK_LPG(pp, pp->p_szc);
1590 1590 VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);
1591 1591
1592 1592 bin = PP_2_BIN(pp);
1593 1593 mnode = PP_2_MEM_NODE(pp);
1594 1594 mtype = PP_2_MTYPE(pp);
1595 1595
1596 1596 if (flags & PG_LIST_ISINIT) {
1597 1597 ASSERT(pp->p_szc == mmu_page_sizes - 1);
1598 1598 page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1599 1599 ASSERT(!PP_ISNORELOC(pp));
1600 1600 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
1601 1601 } else {
1602 1602
1603 1603 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
1604 1604
1605 1605 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1606 1606
1607 1607 mutex_enter(pcm);
1608 1608 page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1609 1609 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
1610 1610 mutex_exit(pcm);
1611 1611
1612 1612 pgcnt = page_get_pagecnt(pp->p_szc);
1613 1613 #if defined(__sparc)
1614 1614 if (PP_ISNORELOC(pp))
1615 1615 kcage_freemem_add(pgcnt);
1616 1616 #endif
1617 1617 for (i = 0; i < pgcnt; i++, pp++)
1618 1618 page_unlock_nocapture(pp);
1619 1619 }
1620 1620 }
1621 1621
1622 1622 /*
1623 1623 * During boot, need to demote a large page to base
1624 1624 * pagesize pages for seg_kmem for use in boot_alloc()
1625 1625 */
1626 1626 void
1627 1627 page_boot_demote(page_t *pp)
1628 1628 {
1629 1629 ASSERT(pp->p_szc != 0);
1630 1630 ASSERT(PP_ISFREE(pp));
1631 1631 ASSERT(PP_ISAGED(pp));
1632 1632
1633 1633 (void) page_demote(PP_2_MEM_NODE(pp),
1634 1634 PFN_BASE(pp->p_pagenum, pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR,
1635 1635 PC_FREE);
1636 1636
1637 1637 ASSERT(PP_ISFREE(pp));
1638 1638 ASSERT(PP_ISAGED(pp));
1639 1639 ASSERT(pp->p_szc == 0);
1640 1640 }
1641 1641
1642 1642 /*
1643 1643 * Take a particular page off of whatever freelist the page
1644 1644 * is claimed to be on.
1645 1645 *
1646 1646 * NOTE: Only used for PAGESIZE pages.
1647 1647 */
1648 1648 void
1649 1649 page_list_sub(page_t *pp, int flags)
1650 1650 {
1651 1651 int bin;
1652 1652 uint_t mtype;
1653 1653 int mnode;
1654 1654 kmutex_t *pcm;
1655 1655 page_t **ppp;
1656 1656
1657 1657 ASSERT(PAGE_EXCL(pp));
1658 1658 ASSERT(PP_ISFREE(pp));
1659 1659
1660 1660 /*
1661 1661 * The p_szc field can only be changed by page_promote()
1662 1662 * and page_demote(). Only free pages can be promoted and
1663 1663 * demoted and the free list MUST be locked during these
1664 1664 * operations. So to prevent a race in page_list_sub()
1665 1665 * between computing which bin of the freelist lock to
1666 1666  * grab and actually grabbing the lock we check again that
1667 1667 * the bin we locked is still the correct one. Notice that
1668 1668 * the p_szc field could have actually changed on us but
1669 1669 * if the bin happens to still be the same we are safe.
1670 1670 */
1671 1671 try_again:
1672 1672 bin = PP_2_BIN(pp);
1673 1673 mnode = PP_2_MEM_NODE(pp);
1674 1674 pcm = PC_BIN_MUTEX(mnode, bin, flags);
1675 1675 mutex_enter(pcm);
1676 1676 if (PP_2_BIN(pp) != bin) {
1677 1677 mutex_exit(pcm);
1678 1678 goto try_again;
1679 1679 }
1680 1680 mtype = PP_2_MTYPE(pp);
1681 1681
1682 1682 if (flags & PG_FREE_LIST) {
1683 1683 VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
1684 1684 ASSERT(PP_ISAGED(pp));
1685 1685 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1686 1686 } else {
1687 1687 VM_STAT_ADD(vmm_vmstats.plsub_cache);
1688 1688 ASSERT(!PP_ISAGED(pp));
1689 1689 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1690 1690 }
1691 1691
1692 1692 /*
1693 1693 * Common PAGESIZE case.
1694 1694 *
1695 1695 * Note that we locked the freelist. This prevents
1696 1696 * any page promotion/demotion operations. Therefore
1697 1697 * the p_szc will not change until we drop pcm mutex.
1698 1698 */
1699 1699 if (pp->p_szc == 0) {
1700 1700 page_sub(ppp, pp);
1701 1701 /*
1702 1702 * Subtract counters before releasing pcm mutex
1703 1703 * to avoid race with page_freelist_coalesce.
1704 1704 */
1705 1705 page_ctr_sub(mnode, mtype, pp, flags);
1706 1706 mutex_exit(pcm);
1707 1707
1708 1708 #if defined(__sparc)
1709 1709 if (PP_ISNORELOC(pp)) {
1710 1710 kcage_freemem_sub(1);
1711 1711 }
1712 1712 #endif
1713 1713 return;
1714 1714 }
1715 1715
1716 1716 /*
1717 1717 * Large pages on the cache list are not supported.
1718 1718 */
1719 1719 if (flags & PG_CACHE_LIST)
1720 1720 panic("page_list_sub: large page on cachelist");
1721 1721
1722 1722 /*
1723 1723 * Slow but rare.
1724 1724 *
1725 1725 * Somebody wants this particular page which is part
1726 1726 * of a large page. In this case we just demote the page
1727 1727 * if it's on the freelist.
1728 1728 *
1729 1729 * We have to drop pcm before locking the entire freelist.
1730 1730 * Once we have re-locked the freelist check to make sure
1731 1731 * the page hasn't already been demoted or completely
1732 1732 * freed.
1733 1733 */
1734 1734 mutex_exit(pcm);
1735 1735 page_freelist_lock(mnode);
1736 1736 if (pp->p_szc != 0) {
1737 1737 /*
1738 1738 * Large page is on freelist.
1739 1739 */
1740 1740 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
1741 1741 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
1742 1742 }
1743 1743 ASSERT(PP_ISFREE(pp));
1744 1744 ASSERT(PP_ISAGED(pp));
1745 1745 ASSERT(pp->p_szc == 0);
1746 1746
1747 1747 /*
1748 1748 * Subtract counters before releasing pcm mutex
1749 1749 * to avoid race with page_freelist_coalesce.
1750 1750 */
1751 1751 bin = PP_2_BIN(pp);
1752 1752 mtype = PP_2_MTYPE(pp);
1753 1753 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1754 1754
1755 1755 page_sub(ppp, pp);
1756 1756 page_ctr_sub(mnode, mtype, pp, flags);
1757 1757 page_freelist_unlock(mnode);
1758 1758
1759 1759 #if defined(__sparc)
1760 1760 if (PP_ISNORELOC(pp)) {
1761 1761 kcage_freemem_sub(1);
1762 1762 }
1763 1763 #endif
1764 1764 }
1765 1765
1766 1766 void
1767 1767 page_list_sub_pages(page_t *pp, uint_t szc)
1768 1768 {
1769 1769 kmutex_t *pcm;
1770 1770 uint_t bin, mtype;
1771 1771 int mnode;
1772 1772
1773 1773 ASSERT(PAGE_EXCL(pp));
1774 1774 ASSERT(PP_ISFREE(pp));
1775 1775 ASSERT(PP_ISAGED(pp));
1776 1776
1777 1777 /*
1778 1778 * See comment in page_list_sub().
1779 1779 */
1780 1780 try_again:
1781 1781 bin = PP_2_BIN(pp);
1782 1782 mnode = PP_2_MEM_NODE(pp);
1783 1783 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1784 1784 mutex_enter(pcm);
1785 1785 if (PP_2_BIN(pp) != bin) {
1786 1786 mutex_exit(pcm);
1787 1787 goto try_again;
1788 1788 }
1789 1789
1790 1790 /*
1791 1791 * If we're called with a page larger than szc or it got
1792 1792 * promoted above szc before we locked the freelist then
1793 1793 * drop pcm and re-lock entire freelist. If page still larger
1794 1794 * than szc then demote it.
1795 1795 */
1796 1796 if (pp->p_szc > szc) {
1797 1797 mutex_exit(pcm);
1798 1798 pcm = NULL;
1799 1799 page_freelist_lock(mnode);
1800 1800 if (pp->p_szc > szc) {
1801 1801 VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
1802 1802 (void) page_demote(mnode,
1803 1803 PFN_BASE(pp->p_pagenum, pp->p_szc), 0,
1804 1804 pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
1805 1805 }
1806 1806 bin = PP_2_BIN(pp);
1807 1807 }
1808 1808 ASSERT(PP_ISFREE(pp));
1809 1809 ASSERT(PP_ISAGED(pp));
1810 1810 ASSERT(pp->p_szc <= szc);
1811 1811 ASSERT(pp == PP_PAGEROOT(pp));
1812 1812
1813 1813 VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);
1814 1814
1815 1815 mtype = PP_2_MTYPE(pp);
1816 1816 if (pp->p_szc != 0) {
1817 1817 page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1818 1818 CHK_LPG(pp, pp->p_szc);
1819 1819 } else {
1820 1820 VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
1821 1821 page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1822 1822 }
1823 1823 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
1824 1824
1825 1825 if (pcm != NULL) {
1826 1826 mutex_exit(pcm);
1827 1827 } else {
1828 1828 page_freelist_unlock(mnode);
1829 1829 }
1830 1830
1831 1831 #if defined(__sparc)
1832 1832 if (PP_ISNORELOC(pp)) {
1833 1833 pgcnt_t pgcnt;
1834 1834
1835 1835 pgcnt = page_get_pagecnt(pp->p_szc);
1836 1836 kcage_freemem_sub(pgcnt);
1837 1837 }
1838 1838 #endif
1839 1839 }
1840 1840
1841 1841 /*
1842 1842 * Add the page to the front of a linked list of pages
1843 1843 * using the p_next & p_prev pointers for the list.
1844 1844 * The caller is responsible for protecting the list pointers.
1845 1845 */
1846 1846 void
1847 1847 mach_page_add(page_t **ppp, page_t *pp)
1848 1848 {
1849 1849 if (*ppp == NULL) {
1850 1850 pp->p_next = pp->p_prev = pp;
1851 1851 } else {
1852 1852 pp->p_next = *ppp;
1853 1853 pp->p_prev = (*ppp)->p_prev;
1854 1854 (*ppp)->p_prev = pp;
1855 1855 pp->p_prev->p_next = pp;
1856 1856 }
1857 1857 *ppp = pp;
1858 1858 }
1859 1859
1860 1860 /*
1861 1861 * Remove this page from a linked list of pages
1862 1862 * using the p_next & p_prev pointers for the list.
1863 1863 *
1864 1864 * The caller is responsible for protecting the list pointers.
1865 1865 */
1866 1866 void
1867 1867 mach_page_sub(page_t **ppp, page_t *pp)
1868 1868 {
1869 1869 ASSERT(PP_ISFREE(pp));
1870 1870
1871 1871 if (*ppp == NULL || pp == NULL)
1872 1872 panic("mach_page_sub");
1873 1873
1874 1874 if (*ppp == pp)
1875 1875 *ppp = pp->p_next; /* go to next page */
1876 1876
1877 1877 if (*ppp == pp)
1878 1878 *ppp = NULL; /* page list is gone */
1879 1879 else {
1880 1880 pp->p_prev->p_next = pp->p_next;
1881 1881 pp->p_next->p_prev = pp->p_prev;
1882 1882 }
1883 1883 pp->p_prev = pp->p_next = pp; /* make pp a list of one */
1884 1884 }
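
/*
 * Illustrative sketch of the list convention used by mach_page_add() and
 * mach_page_sub() above: the list is circular and doubly linked through
 * p_next/p_prev, with *ppp pointing at the head and new pages added in
 * front of it. Assuming three hypothetical, already-free pages a, b and c:
 *
 *	page_t *list = NULL;
 *	mach_page_add(&list, a);	list = a:  a <-> a (self-linked)
 *	mach_page_add(&list, b);	list = b:  b <-> a <-> b
 *	mach_page_add(&list, c);	list = c:  c <-> b <-> a <-> c
 *	mach_page_sub(&list, b);	list = c:  c <-> a <-> c
 *	mach_page_sub(&list, c);	list = a:  a <-> a
 *	mach_page_sub(&list, a);	list = NULL
 *
 * A page removed by mach_page_sub() is left self-linked, so it is always
 * a valid list of one.
 */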
1885 1885
1886 1886 /*
1887 1887 * Routine fsflush uses to gradually coalesce the free list into larger pages.
1888 1888 */
1889 1889 void
1890 1890 page_promote_size(page_t *pp, uint_t cur_szc)
1891 1891 {
1892 1892 pfn_t pfn;
1893 1893 int mnode;
1894 1894 int idx;
1895 1895 int new_szc = cur_szc + 1;
1896 1896 int full = FULL_REGION_CNT(new_szc);
1897 1897
1898 1898 pfn = page_pptonum(pp);
1899 1899 mnode = PFN_2_MEM_NODE(pfn);
1900 1900
1901 1901 page_freelist_lock(mnode);
1902 1902
1903 1903 idx = PNUM_TO_IDX(mnode, new_szc, pfn);
1904 1904 if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
1905 1905 (void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);
1906 1906
1907 1907 page_freelist_unlock(mnode);
1908 1908 }
1909 1909
1910 1910 static uint_t page_promote_err;
1911 1911 static uint_t page_promote_noreloc_err;
1912 1912
1913 1913 /*
1914 1914 * Create a single larger page (of szc new_szc) from smaller contiguous pages
1915 1915 * for the given mnode starting at pfnum. Pages involved are on the freelist
1916 1916 * before the call and may be returned to the caller if requested, otherwise
1917 1917 * they will be placed back on the freelist.
1918 1918 * If flags is PC_ALLOC, then the large page will be returned to the user in
1919 1919 * a state which is consistent with a page being taken off the freelist. If
1920 1920 * we failed to lock the new large page, then we will return NULL to the
1921 1921 * caller and put the large page on the freelist instead.
1922 1922 * If flags is PC_FREE, then the large page will be placed on the freelist,
1923 1923 * and NULL will be returned.
1924 1924 * The caller is responsible for locking the freelist as well as any other
1925 1925 * accounting which needs to be done for a returned page.
1926 1926 *
1927 1927 * RFE: For performance pass in pp instead of pfnum so
1928 1928 * we can avoid excessive calls to page_numtopp_nolock().
1929 1929 * This would depend on an assumption that all contiguous
1930 1930 * pages are in the same memseg so we can just add/dec
1931 1931 * our pp.
1932 1932 *
1933 1933 * Lock ordering:
1934 1934 *
1935 1935 * There is a potential but rare deadlock situation
1936 1936 * for page promotion and demotion operations. The problem
1937 1937 * is there are two paths into the freelist manager and
1938 1938 * they have different lock orders:
1939 1939 *
1940 1940 * page_create()
1941 1941 * lock freelist
1942 1942 * page_lock(EXCL)
1943 1943 * unlock freelist
1944 1944 * return
1945 1945 * caller drops page_lock
1946 1946 *
1947 1947 * page_free() and page_reclaim()
1948 1948 * caller grabs page_lock(EXCL)
1949 1949 *
1950 1950 * lock freelist
1951 1951 * unlock freelist
1952 1952 * drop page_lock
1953 1953 *
1954 1954 * What prevents a thread in page_create() from deadlocking
1955 1955 * with a thread freeing or reclaiming the same page is the
1956 1956 * page_trylock() in page_get_freelist(). If the trylock fails
1957 1957 * it skips the page.
1958 1958 *
1959 1959 * The lock ordering for promotion and demotion is the same as
1960 1960 * for page_create(). Since the same deadlock could occur during
1961 1961 * page promotion and freeing or reclaiming of a page on the
1962 1962 * cache list we might have to fail the operation and undo what
1963 1963  * we have done so far. Again, this is rare.
1964 1964 */
1965 1965 page_t *
1966 1966 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
1967 1967 {
1968 1968 page_t *pp, *pplist, *tpp, *start_pp;
1969 1969 pgcnt_t new_npgs, npgs;
1970 1970 uint_t bin;
1971 1971 pgcnt_t tmpnpgs, pages_left;
1972 1972 uint_t noreloc;
1973 1973 int which_list;
1974 1974 ulong_t index;
1975 1975 kmutex_t *phm;
1976 1976
1977 1977 /*
1978 1978 * General algorithm:
1979 1979 * Find the starting page
1980 1980 * Walk each page struct removing it from the freelist,
1981 1981 * and linking it to all the other pages removed.
1982 1982 * Once all pages are off the freelist,
1983 1983 	 * walk the list, modifying p_szc to new_szc and doing
1984 1984 	 * whatever else is needed to create a large free page.
1985 1985 * According to the flags, either return the page or put it
1986 1986 * on the freelist.
1987 1987 */
1988 1988
1989 1989 start_pp = page_numtopp_nolock(pfnum);
1990 1990 ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
1991 1991 new_npgs = page_get_pagecnt(new_szc);
1992 1992 ASSERT(IS_P2ALIGNED(pfnum, new_npgs));
1993 1993
1994 1994 /* don't return page of the wrong mtype */
1995 1995 if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
1996 1996 return (NULL);
1997 1997
1998 1998 /*
1999 1999 * Loop through smaller pages to confirm that all pages
2000 2000 * give the same result for PP_ISNORELOC().
2001 2001 * We can check this reliably here as the protocol for setting
2002 2002 * P_NORELOC requires pages to be taken off the free list first.
2003 2003 */
2004 2004 noreloc = PP_ISNORELOC(start_pp);
2005 2005 for (pp = start_pp + new_npgs; --pp > start_pp; ) {
2006 2006 if (noreloc != PP_ISNORELOC(pp)) {
2007 2007 page_promote_noreloc_err++;
2008 2008 page_promote_err++;
2009 2009 return (NULL);
2010 2010 }
2011 2011 }
2012 2012
2013 2013 pages_left = new_npgs;
2014 2014 pplist = NULL;
2015 2015 pp = start_pp;
2016 2016
2017 2017 /* Loop around coalescing the smaller pages into a big page. */
2018 2018 while (pages_left) {
2019 2019 /*
2020 2020 * Remove from the freelist.
2021 2021 */
2022 2022 ASSERT(PP_ISFREE(pp));
2023 2023 bin = PP_2_BIN(pp);
2024 2024 ASSERT(mnode == PP_2_MEM_NODE(pp));
2025 2025 mtype = PP_2_MTYPE(pp);
2026 2026 if (PP_ISAGED(pp)) {
2027 2027
2028 2028 /*
2029 2029 * PG_FREE_LIST
2030 2030 */
2031 2031 if (pp->p_szc) {
2032 2032 page_vpsub(&PAGE_FREELISTS(mnode,
2033 2033 pp->p_szc, bin, mtype), pp);
2034 2034 } else {
2035 2035 mach_page_sub(&PAGE_FREELISTS(mnode, 0,
2036 2036 bin, mtype), pp);
2037 2037 }
2038 2038 which_list = PG_FREE_LIST;
2039 2039 } else {
2040 2040 ASSERT(pp->p_szc == 0);
2041 2041
2042 2042 /*
2043 2043 * PG_CACHE_LIST
2044 2044 *
2045 2045 * Since this page comes from the
2046 2046 * cachelist, we must destroy the
2047 2047 * vnode association.
2048 2048 */
2049 2049 if (!page_trylock(pp, SE_EXCL)) {
2050 2050 goto fail_promote;
2051 2051 }
2052 2052
2053 2053 /*
2054 2054 * We need to be careful not to deadlock
2055 2055 * with another thread in page_lookup().
2056 2056 * The page_lookup() thread could be holding
2057 2057 * the same phm that we need if the two
2058 2058 * pages happen to hash to the same phm lock.
2059 2059 * At this point we have locked the entire
2060 2060 * freelist and page_lookup() could be trying
2061 2061 * to grab a freelist lock.
2062 2062 */
2063 2063 index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
2064 2064 phm = PAGE_HASH_MUTEX(index);
2065 2065 if (!mutex_tryenter(phm)) {
2066 2066 page_unlock_nocapture(pp);
2067 2067 goto fail_promote;
2068 2068 }
2069 2069
2070 2070 mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
2071 2071 page_hashout(pp, phm);
2072 2072 mutex_exit(phm);
2073 2073 PP_SETAGED(pp);
2074 2074 page_unlock_nocapture(pp);
2075 2075 which_list = PG_CACHE_LIST;
2076 2076 }
2077 2077 page_ctr_sub(mnode, mtype, pp, which_list);
2078 2078
2079 2079 /*
2080 2080 * Concatenate the smaller page(s) onto
2081 2081 * the large page list.
2082 2082 */
2083 2083 tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
2084 2084 pages_left -= npgs;
2085 2085 tpp = pp;
2086 2086 while (npgs--) {
2087 2087 tpp->p_szc = new_szc;
2088 2088 tpp = tpp->p_next;
2089 2089 }
2090 2090 page_list_concat(&pplist, &pp);
2091 2091 pp += tmpnpgs;
2092 2092 }
2093 2093 CHK_LPG(pplist, new_szc);
2094 2094
2095 2095 /*
2096 2096 * return the page to the user if requested
2097 2097 * in the properly locked state.
2098 2098 */
2099 2099 if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
2100 2100 return (pplist);
2101 2101 }
2102 2102
2103 2103 /*
2104 2104 * Otherwise place the new large page on the freelist
2105 2105 */
2106 2106 bin = PP_2_BIN(pplist);
2107 2107 mnode = PP_2_MEM_NODE(pplist);
2108 2108 mtype = PP_2_MTYPE(pplist);
2109 2109 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);
2110 2110
2111 2111 page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
2112 2112 return (NULL);
2113 2113
2114 2114 fail_promote:
2115 2115 /*
2116 2116 * A thread must have still been freeing or
2117 2117 * reclaiming the page on the cachelist.
2118 2118 * To prevent a deadlock undo what we have
2119 2119 	 * done so far and return failure. This
2120 2120 * situation can only happen while promoting
2121 2121 * PAGESIZE pages.
2122 2122 */
2123 2123 page_promote_err++;
2124 2124 while (pplist) {
2125 2125 pp = pplist;
2126 2126 mach_page_sub(&pplist, pp);
2127 2127 pp->p_szc = 0;
2128 2128 bin = PP_2_BIN(pp);
2129 2129 mtype = PP_2_MTYPE(pp);
2130 2130 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
2131 2131 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2132 2132 }
2133 2133 return (NULL);
2134 2134
2135 2135 }
2136 2136
2137 2137 /*
2138 2138 * Break up a large page into smaller size pages.
2139 2139 * Pages involved are on the freelist before the call and may
2140 2140 * be returned to the caller if requested, otherwise they will
2141 2141 * be placed back on the freelist.
2142 2142 * The caller is responsible for locking the freelist as well as any other
2143 2143 * accounting which needs to be done for a returned page.
2144 2144 * If flags is not PC_ALLOC, the color argument is ignored, and thus
2145 2145 * technically, any value may be passed in but PC_NO_COLOR is the standard
2146 2146 * which should be followed for clarity's sake.
2147 2147  * Returns a page whose pfn is < pfnmax (when pfnmax is non-zero).
2148 2148 */
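/*
 * A worked example of the loop below, using hypothetical page counts (the
 * real values are platform dependent): if a cur_szc page spans 512 PAGESIZE
 * pages and a new_szc page spans 8, the outer loop runs 64 times. Each pass
 * peels one group of 8 constituent pages off pplist with page_list_break(),
 * stamps each constituent with p_szc = new_szc, and either returns that
 * group to the caller (flags == PC_ALLOC, matching color and pfnmax, and
 * page_trylock_cons() succeeded) or places it on the new_szc freelist via
 * page_vpadd()/page_ctr_add(). At most one group is returned; the rest
 * always go back on the freelist.
 */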
2149 2149 page_t *
2150 2150 page_demote(int mnode, pfn_t pfnum, pfn_t pfnmax, uchar_t cur_szc,
2151 2151 uchar_t new_szc, int color, int flags)
2152 2152 {
2153 2153 page_t *pp, *pplist, *npplist;
2154 2154 pgcnt_t npgs, n;
2155 2155 uint_t bin;
2156 2156 uint_t mtype;
2157 2157 page_t *ret_pp = NULL;
2158 2158
2159 2159 ASSERT(cur_szc != 0);
2160 2160 ASSERT(new_szc < cur_szc);
2161 2161
2162 2162 pplist = page_numtopp_nolock(pfnum);
2163 2163 ASSERT(pplist != NULL);
2164 2164
2165 2165 ASSERT(pplist->p_szc == cur_szc);
2166 2166
2167 2167 bin = PP_2_BIN(pplist);
2168 2168 ASSERT(mnode == PP_2_MEM_NODE(pplist));
2169 2169 mtype = PP_2_MTYPE(pplist);
2170 2170 page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);
2171 2171
2172 2172 CHK_LPG(pplist, cur_szc);
2173 2173 page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);
2174 2174
2175 2175 /*
2176 2176 * Number of PAGESIZE pages for smaller new_szc
2177 2177 * page.
2178 2178 */
2179 2179 npgs = page_get_pagecnt(new_szc);
2180 2180
2181 2181 while (pplist) {
2182 2182 pp = pplist;
2183 2183
2184 2184 ASSERT(pp->p_szc == cur_szc);
2185 2185
2186 2186 /*
2187 2187 * We either break it up into PAGESIZE pages or larger.
2188 2188 */
2189 2189 if (npgs == 1) { /* PAGESIZE case */
2190 2190 mach_page_sub(&pplist, pp);
2191 2191 ASSERT(pp->p_szc == cur_szc);
2192 2192 ASSERT(new_szc == 0);
2193 2193 ASSERT(mnode == PP_2_MEM_NODE(pp));
2194 2194 pp->p_szc = new_szc;
2195 2195 bin = PP_2_BIN(pp);
2196 2196 if ((bin == color) && (flags == PC_ALLOC) &&
2197 2197 (ret_pp == NULL) && (pfnmax == 0 ||
2198 2198 pp->p_pagenum < pfnmax) &&
2199 2199 page_trylock_cons(pp, SE_EXCL)) {
2200 2200 ret_pp = pp;
2201 2201 } else {
2202 2202 mtype = PP_2_MTYPE(pp);
2203 2203 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
2204 2204 mtype), pp);
2205 2205 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2206 2206 }
2207 2207 } else {
2208 2208 page_t *try_to_return_this_page = NULL;
2209 2209 int count = 0;
2210 2210
2211 2211 /*
2212 2212 * Break down into smaller lists of pages.
2213 2213 */
2214 2214 page_list_break(&pplist, &npplist, npgs);
2215 2215
2216 2216 pp = pplist;
2217 2217 n = npgs;
2218 2218 while (n--) {
2219 2219 ASSERT(pp->p_szc == cur_szc);
2220 2220 /*
2221 2221 * Check whether all the pages in this list
2222 2222 * fit the request criteria.
2223 2223 */
2224 2224 if (pfnmax == 0 || pp->p_pagenum < pfnmax) {
2225 2225 count++;
2226 2226 }
2227 2227 pp->p_szc = new_szc;
2228 2228 pp = pp->p_next;
2229 2229 }
2230 2230
2231 2231 if (count == npgs &&
2232 2232 (pfnmax == 0 || pp->p_pagenum < pfnmax)) {
2233 2233 try_to_return_this_page = pp;
2234 2234 }
2235 2235
2236 2236 CHK_LPG(pplist, new_szc);
2237 2237
2238 2238 bin = PP_2_BIN(pplist);
2239 2239 if (try_to_return_this_page)
2240 2240 ASSERT(mnode ==
2241 2241 PP_2_MEM_NODE(try_to_return_this_page));
2242 2242 if ((bin == color) && (flags == PC_ALLOC) &&
2243 2243 (ret_pp == NULL) && try_to_return_this_page &&
2244 2244 page_trylock_cons(try_to_return_this_page,
2245 2245 SE_EXCL)) {
2246 2246 ret_pp = try_to_return_this_page;
2247 2247 } else {
2248 2248 mtype = PP_2_MTYPE(pp);
2249 2249 page_vpadd(&PAGE_FREELISTS(mnode, new_szc,
2250 2250 bin, mtype), pplist);
2251 2251
2252 2252 page_ctr_add(mnode, mtype, pplist,
2253 2253 PG_FREE_LIST);
2254 2254 }
2255 2255 pplist = npplist;
2256 2256 }
2257 2257 }
2258 2258 return (ret_pp);
2259 2259 }
2260 2260
2261 2261 int mpss_coalesce_disable = 0;
2262 2262
2263 2263 /*
2264 2264 * Coalesce free pages into a page of the given szc and color if possible.
2265 2265 * Return the pointer to the page created, otherwise, return NULL.
2266 2266 *
2267 2267 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2268 2268 */
2269 2269 page_t *
2270 2270 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
2271 2271 int mtype, pfn_t pfnhi)
2272 2272 {
2273 2273 int r = szc; /* region size */
2274 2274 int mrange;
2275 2275 uint_t full, bin, color_mask, wrap = 0;
2276 2276 pfn_t pfnum, lo, hi;
2277 2277 size_t len, idx, idx0;
2278 2278 pgcnt_t cands = 0, szcpgcnt = page_get_pagecnt(szc);
2279 2279 page_t *ret_pp;
2280 2280 MEM_NODE_ITERATOR_DECL(it);
2281 2281 #if defined(__sparc)
2282 2282 pfn_t pfnum0, nlo, nhi;
2283 2283 #endif
2284 2284
2285 2285 if (mpss_coalesce_disable) {
2286 2286 ASSERT(szc < MMU_PAGE_SIZES);
2287 2287 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]);
2288 2288 return (NULL);
2289 2289 }
2290 2290
2291 2291 ASSERT(szc < mmu_page_sizes);
2292 2292 color_mask = PAGE_GET_PAGECOLORS(szc) - 1;
2293 2293 ASSERT(ceq_mask <= color_mask);
2294 2294 ASSERT(color <= color_mask);
2295 2295 color &= ceq_mask;
2296 2296
2297 2297 /* Prevent page_counters dynamic memory from being freed */
2298 2298 rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2299 2299
2300 2300 mrange = MTYPE_2_MRANGE(mnode, mtype);
2301 2301 ASSERT(mrange < mnode_nranges[mnode]);
2302 2302 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]);
2303 2303
2304 2304 /* get pfn range for mtype */
2305 2305 len = PAGE_COUNTERS_ENTRIES(mnode, r);
2306 2306 MNODETYPE_2_PFN(mnode, mtype, lo, hi);
2307 2307 hi++;
2308 2308
2309 2309 /* use lower limit if given */
2310 2310 if (pfnhi != PFNNULL && pfnhi < hi)
2311 2311 hi = pfnhi;
2312 2312
2313 2313 /* round to szcpgcnt boundaries */
2314 2314 lo = P2ROUNDUP(lo, szcpgcnt);
2315 2315 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
2316 2316 if (lo == (pfn_t)-1) {
2317 2317 rw_exit(&page_ctrs_rwlock[mnode]);
2318 2318 return (NULL);
2319 2319 }
2320 2320 hi = hi & ~(szcpgcnt - 1);
2321 2321
2322 2322 /* set lo to the closest pfn of the right color */
2323 2323 if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) ||
2324 2324 (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) {
2325 2325 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask,
2326 2326 &it);
2327 2327 }
2328 2328
2329 2329 if (hi <= lo) {
2330 2330 rw_exit(&page_ctrs_rwlock[mnode]);
2331 2331 return (NULL);
2332 2332 }
2333 2333
2334 2334 full = FULL_REGION_CNT(r);
2335 2335
2336 2336 /* calculate the number of page candidates and initial search index */
2337 2337 bin = color;
2338 2338 idx0 = (size_t)(-1);
2339 2339 do {
2340 2340 pgcnt_t acand;
2341 2341
2342 2342 PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand);
2343 2343 if (acand) {
2344 2344 idx = PAGE_COUNTERS_CURRENT_COLOR(mnode,
2345 2345 r, bin, mrange);
2346 2346 idx0 = MIN(idx0, idx);
2347 2347 cands += acand;
2348 2348 }
2349 2349 bin = ADD_MASKED(bin, 1, ceq_mask, color_mask);
2350 2350 } while (bin != color);
2351 2351
2352 2352 if (cands == 0) {
2353 2353 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]);
2354 2354 rw_exit(&page_ctrs_rwlock[mnode]);
2355 2355 return (NULL);
2356 2356 }
2357 2357
2358 2358 pfnum = IDX_TO_PNUM(mnode, r, idx0);
2359 2359 if (pfnum < lo || pfnum >= hi) {
2360 2360 pfnum = lo;
2361 2361 } else {
2362 2362 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2363 2363 if (pfnum == (pfn_t)-1) {
2364 2364 pfnum = lo;
2365 2365 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2366 2366 ASSERT(pfnum != (pfn_t)-1);
2367 2367 } else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask ||
2368 2368 (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) {
2369 2369 /* invalid color, get the closest correct pfn */
2370 2370 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2371 2371 color_mask, &it);
2372 2372 if (pfnum >= hi) {
2373 2373 pfnum = lo;
2374 2374 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2375 2375 }
2376 2376 }
2377 2377 }
2378 2378
2379 2379 /* set starting index */
2380 2380 idx0 = PNUM_TO_IDX(mnode, r, pfnum);
2381 2381 ASSERT(idx0 < len);
2382 2382
2383 2383 #if defined(__sparc)
2384 2384 pfnum0 = pfnum; /* page corresponding to idx0 */
2385 2385 nhi = 0; /* search kcage ranges */
2386 2386 #endif
2387 2387
2388 2388 for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) {
2389 2389
2390 2390 #if defined(__sparc)
2391 2391 /*
2392 2392 * Find lowest intersection of kcage ranges and mnode.
2393 2393 * MTYPE_NORELOC means look in the cage, otherwise outside.
2394 2394 */
2395 2395 if (nhi <= pfnum) {
2396 2396 if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum,
2397 2397 (wrap == 0 ? hi : pfnum0), &nlo, &nhi))
2398 2398 goto wrapit;
2399 2399
2400 2400 /* jump to the next page in the range */
2401 2401 if (pfnum < nlo) {
2402 2402 pfnum = P2ROUNDUP(nlo, szcpgcnt);
2403 2403 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2404 2404 idx = PNUM_TO_IDX(mnode, r, pfnum);
2405 2405 if (idx >= len || pfnum >= hi)
2406 2406 goto wrapit;
2407 2407 if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) &
2408 2408 ceq_mask)
2409 2409 goto next;
2410 2410 if (interleaved_mnodes &&
2411 2411 PFN_2_MEM_NODE(pfnum) != mnode)
2412 2412 goto next;
2413 2413 }
2414 2414 }
2415 2415 #endif
2416 2416
2417 2417 if (PAGE_COUNTERS(mnode, r, idx) != full)
2418 2418 goto next;
2419 2419
2420 2420 /*
2421 2421 * RFE: For performance maybe we can do something less
2422 2422 * brutal than locking the entire freelist. So far
2423 2423 * this doesn't seem to be a performance problem?
2424 2424 */
2425 2425 page_freelist_lock(mnode);
2426 2426 if (PAGE_COUNTERS(mnode, r, idx) == full) {
2427 2427 ret_pp =
2428 2428 page_promote(mnode, pfnum, r, PC_ALLOC, mtype);
2429 2429 if (ret_pp != NULL) {
2430 2430 VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
2431 2431 PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
2432 2432 PFN_2_COLOR(pfnum, szc, &it), mrange) = idx;
2433 2433 page_freelist_unlock(mnode);
2434 2434 rw_exit(&page_ctrs_rwlock[mnode]);
2435 2435 #if defined(__sparc)
2436 2436 if (PP_ISNORELOC(ret_pp)) {
2437 2437 pgcnt_t npgs;
2438 2438
2439 2439 npgs = page_get_pagecnt(ret_pp->p_szc);
2440 2440 kcage_freemem_sub(npgs);
2441 2441 }
2442 2442 #endif
2443 2443 return (ret_pp);
2444 2444 }
2445 2445 } else {
2446 2446 VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]);
2447 2447 }
2448 2448
2449 2449 page_freelist_unlock(mnode);
2450 2450 /*
2451 2451 * No point looking for another page if we've
2452 2452 * already tried all of the ones that
2453 2453 * page_ctr_cands indicated. Stash off where we left
2454 2454 * off.
2455 2455 * Note: this is not exact since we don't hold the
2456 2456 * page_freelist_locks before we initially get the
2457 2457 * value of cands for performance reasons, but should
2458 2458 * be a decent approximation.
2459 2459 */
2460 2460 if (--cands == 0) {
2461 2461 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) =
2462 2462 idx;
2463 2463 break;
2464 2464 }
2465 2465 next:
2466 2466 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2467 2467 color_mask, &it);
2468 2468 idx = PNUM_TO_IDX(mnode, r, pfnum);
2469 2469 if (idx >= len || pfnum >= hi) {
2470 2470 wrapit:
2471 2471 pfnum = lo;
2472 2472 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2473 2473 idx = PNUM_TO_IDX(mnode, r, pfnum);
2474 2474 wrap++;
2475 2475 #if defined(__sparc)
2476 2476 nhi = 0; /* search kcage ranges */
2477 2477 #endif
2478 2478 }
2479 2479 }
2480 2480
2481 2481 rw_exit(&page_ctrs_rwlock[mnode]);
2482 2482 VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]);
2483 2483 return (NULL);
2484 2484 }
2485 2485
2486 2486 /*
2487 2487 * For the given mnode, promote as many small pages to large pages as possible.
2488 2488 * mnode can be -1, which means do them all
2489 2489 */
2490 2490 void
2491 2491 page_freelist_coalesce_all(int mnode)
2492 2492 {
2493 2493 int r; /* region size */
2494 2494 int idx, full;
2495 2495 size_t len;
2496 2496 int doall = interleaved_mnodes || mnode < 0;
2497 2497 int mlo = doall ? 0 : mnode;
2498 2498 int mhi = doall ? max_mem_nodes : (mnode + 1);
2499 2499
2500 2500 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
2501 2501
2502 2502 if (mpss_coalesce_disable) {
2503 2503 return;
2504 2504 }
2505 2505
2506 2506 /*
2507 2507 * Lock the entire freelist and coalesce what we can.
2508 2508 *
2509 2509 * Always promote to the largest page possible
2510 2510 * first to reduce the number of page promotions.
2511 2511 */
2512 2512 for (mnode = mlo; mnode < mhi; mnode++) {
2513 2513 rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2514 2514 page_freelist_lock(mnode);
2515 2515 }
2516 2516 for (r = mmu_page_sizes - 1; r > 0; r--) {
2517 2517 for (mnode = mlo; mnode < mhi; mnode++) {
2518 2518 pgcnt_t cands = 0;
2519 2519 int mrange, nranges = mnode_nranges[mnode];
2520 2520
2521 2521 for (mrange = 0; mrange < nranges; mrange++) {
2522 2522 PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
2523 2523 if (cands != 0)
2524 2524 break;
2525 2525 }
2526 2526 if (cands == 0) {
2527 2527 VM_STAT_ADD(vmm_vmstats.
2528 2528 page_ctrs_cands_skip_all);
2529 2529 continue;
2530 2530 }
2531 2531
2532 2532 full = FULL_REGION_CNT(r);
2533 2533 len = PAGE_COUNTERS_ENTRIES(mnode, r);
2534 2534
2535 2535 for (idx = 0; idx < len; idx++) {
2536 2536 if (PAGE_COUNTERS(mnode, r, idx) == full) {
2537 2537 pfn_t pfnum =
2538 2538 IDX_TO_PNUM(mnode, r, idx);
2539 2539 int tmnode = interleaved_mnodes ?
2540 2540 PFN_2_MEM_NODE(pfnum) : mnode;
2541 2541
2542 2542 ASSERT(pfnum >=
2543 2543 mem_node_config[tmnode].physbase &&
2544 2544 pfnum <
2545 2545 mem_node_config[tmnode].physmax);
2546 2546
2547 2547 (void) page_promote(tmnode,
2548 2548 pfnum, r, PC_FREE, PC_MTYPE_ANY);
2549 2549 }
2550 2550 }
2551 2551 /* shared hpm_counters covers all mnodes, so we quit */
2552 2552 if (interleaved_mnodes)
2553 2553 break;
2554 2554 }
2555 2555 }
2556 2556 for (mnode = mlo; mnode < mhi; mnode++) {
2557 2557 page_freelist_unlock(mnode);
2558 2558 rw_exit(&page_ctrs_rwlock[mnode]);
2559 2559 }
2560 2560 }
2561 2561
2562 2562 /*
2563 2563  * This is where all policies for moving pages around
2564 2564  * to different page size free lists are implemented.
2565 2565 * Returns 1 on success, 0 on failure.
2566 2566 *
2567 2567 * So far these are the priorities for this algorithm in descending
2568 2568 * order:
2569 2569 *
2570 2570 * 1) When servicing a request try to do so with a free page
2571 2571 * from next size up. Helps defer fragmentation as long
2572 2572 * as possible.
2573 2573 *
2574 2574 * 2) Page coalesce on demand. Only when a freelist
2575 2575 * larger than PAGESIZE is empty and step 1
2576 2576 * will not work since all larger size lists are
2577 2577 * also empty.
2578 2578 *
2579 2579 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2580 2580 */
2581 2581
2582 2582 page_t *
2583 2583 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
2584 2584 pfn_t pfnlo, pfn_t pfnhi, page_list_walker_t *plw)
2585 2585 {
2586 2586 uchar_t nszc = szc + 1;
2587 2587 uint_t bin, sbin, bin_prev;
2588 2588 page_t *pp, *firstpp;
2589 2589 page_t *ret_pp = NULL;
2590 2590 uint_t color_mask;
2591 2591
2592 2592 if (nszc == mmu_page_sizes)
2593 2593 return (NULL);
2594 2594
2595 2595 ASSERT(nszc < mmu_page_sizes);
2596 2596 color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
2597 2597 bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
2598 2598 bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
2599 2599 PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);
2600 2600
2601 2601 VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
2602 2602 /*
2603 2603 * First try to break up a larger page to fill current size freelist.
2604 2604 */
2605 2605 while (plw->plw_bins[nszc] != 0) {
2606 2606
2607 2607 ASSERT(nszc < mmu_page_sizes);
2608 2608
2609 2609 /*
2610 2610 * If page found then demote it.
2611 2611 */
2612 2612 if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
2613 2613 page_freelist_lock(mnode);
2614 2614 firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);
2615 2615
2616 2616 /*
2617 2617 * If pfnhi is not PFNNULL, look for large page below
2618 2618 * pfnhi. PFNNULL signifies no pfn requirement.
2619 2619 */
2620 2620 if (pp &&
2621 2621 ((pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) ||
2622 2622 (pfnlo != PFNNULL && pp->p_pagenum < pfnlo))) {
2623 2623 do {
2624 2624 pp = pp->p_vpnext;
2625 2625 if (pp == firstpp) {
2626 2626 pp = NULL;
2627 2627 break;
2628 2628 }
2629 2629 } while ((pfnhi != PFNNULL &&
2630 2630 pp->p_pagenum >= pfnhi) ||
2631 2631 (pfnlo != PFNNULL &&
2632 2632 pp->p_pagenum < pfnlo));
2633 2633
2634 2634 if (pfnhi != PFNNULL && pp != NULL)
2635 2635 ASSERT(pp->p_pagenum < pfnhi);
2636 2636
2637 2637 if (pfnlo != PFNNULL && pp != NULL)
2638 2638 ASSERT(pp->p_pagenum >= pfnlo);
2639 2639 }
2640 2640 if (pp) {
2641 2641 uint_t ccolor = page_correct_color(szc, nszc,
2642 2642 color, bin, plw->plw_ceq_mask[szc]);
2643 2643
2644 2644 ASSERT(pp->p_szc == nszc);
2645 2645 VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
2646 2646 ret_pp = page_demote(mnode, pp->p_pagenum,
2647 2647 pfnhi, pp->p_szc, szc, ccolor, PC_ALLOC);
2648 2648 if (ret_pp) {
2649 2649 page_freelist_unlock(mnode);
2650 2650 #if defined(__sparc)
2651 2651 if (PP_ISNORELOC(ret_pp)) {
2652 2652 pgcnt_t npgs;
2653 2653
2654 2654 npgs = page_get_pagecnt(
2655 2655 ret_pp->p_szc);
2656 2656 kcage_freemem_sub(npgs);
2657 2657 }
2658 2658 #endif
2659 2659 return (ret_pp);
2660 2660 }
2661 2661 }
2662 2662 page_freelist_unlock(mnode);
2663 2663 }
2664 2664
2665 2665 /* loop through next size bins */
2666 2666 bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
2667 2667 plw->plw_bins[nszc]--;
2668 2668
2669 2669 if (bin == sbin) {
2670 2670 uchar_t nnszc = nszc + 1;
2671 2671
2672 2672 /* we are done with this page size - check next */
2673 2673 if (plw->plw_bins[nnszc] == 0)
2674 2674 /* we have already checked next size bins */
2675 2675 break;
2676 2676
2677 2677 bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
2678 2678 if (bin_prev != INVALID_COLOR) {
2679 2679 bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
2680 2680 if (!((bin ^ bin_prev) &
2681 2681 plw->plw_ceq_mask[nnszc]))
2682 2682 break;
2683 2683 }
2684 2684 ASSERT(nnszc < mmu_page_sizes);
2685 2685 color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
2686 2686 nszc = nnszc;
2687 2687 ASSERT(nszc < mmu_page_sizes);
2688 2688 }
2689 2689 }
2690 2690
2691 2691 return (ret_pp);
2692 2692 }
2693 2693
2694 2694 /*
2695 2695 * Helper routine used only by the freelist code to lock
2696 2696 * a page. If the page is a large page then it succeeds in
2697 2697 * locking all the constituent pages or none at all.
2698 2698  * Returns 1 on success, 0 on failure.
2699 2699 */
2700 2700 static int
2701 2701 page_trylock_cons(page_t *pp, se_t se)
2702 2702 {
2703 2703 page_t *tpp, *first_pp = pp;
2704 2704
2705 2705 /*
2706 2706 * Fail if can't lock first or only page.
2707 2707 */
2708 2708 if (!page_trylock(pp, se)) {
2709 2709 return (0);
2710 2710 }
2711 2711
2712 2712 /*
2713 2713 * PAGESIZE: common case.
2714 2714 */
2715 2715 if (pp->p_szc == 0) {
2716 2716 return (1);
2717 2717 }
2718 2718
2719 2719 /*
2720 2720 * Large page case.
2721 2721 */
2722 2722 tpp = pp->p_next;
2723 2723 while (tpp != pp) {
2724 2724 if (!page_trylock(tpp, se)) {
2725 2725 /*
2726 2726 * On failure unlock what we have locked so far.
2727 2727 * We want to avoid attempting to capture these
2728 2728 * pages as the pcm mutex may be held which could
2729 2729 * lead to a recursive mutex panic.
2730 2730 */
2731 2731 while (first_pp != tpp) {
2732 2732 page_unlock_nocapture(first_pp);
2733 2733 first_pp = first_pp->p_next;
2734 2734 }
2735 2735 return (0);
2736 2736 }
2737 2737 tpp = tpp->p_next;
2738 2738 }
2739 2739 return (1);
2740 2740 }
2741 2741
2742 2742 /*
2743 2743 * init context for walking page lists
2744 2744  * Called when a page of the given szc is unavailable. Sets markers
2745 2745 * for the beginning of the search to detect when search has
2746 2746 * completed a full cycle. Sets flags for splitting larger pages
2747 2747  * and coalescing smaller pages. Page walking proceeds until a page
2748 2748 * of the desired equivalent color is found.
2749 2749 */
2750 2750 void
2751 2751 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
2752 2752 int use_ceq, page_list_walker_t *plw)
2753 2753 {
2754 2754 uint_t nszc, ceq_mask, colors;
2755 2755 uchar_t ceq = use_ceq ? colorequivszc[szc] : 0;
2756 2756
2757 2757 ASSERT(szc < mmu_page_sizes);
2758 2758 colors = PAGE_GET_PAGECOLORS(szc);
2759 2759
2760 2760 plw->plw_colors = colors;
2761 2761 plw->plw_color_mask = colors - 1;
2762 2762 plw->plw_bin_marker = plw->plw_bin0 = bin;
2763 2763 plw->plw_bin_split_prev = bin;
2764 2764 plw->plw_bin_step = (szc == 0) ? vac_colors : 1;
2765 2765
2766 2766 /*
2767 2767 * if vac aliasing is possible make sure lower order color
2768 2768 * bits are never ignored
2769 2769 */
2770 2770 if (vac_colors > 1)
2771 2771 ceq &= 0xf0;
2772 2772
2773 2773 /*
2774 2774 * calculate the number of non-equivalent colors and
2775 2775 * color equivalency mask
2776 2776 */
2777 2777 plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
2778 2778 ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors);
2779 2779 ASSERT(plw->plw_ceq_dif > 0);
2780 2780 plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf);
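	/*
	 * Worked example with hypothetical values, assuming vac_colors == 1
	 * so the low nibble of ceq is honored: with colors == 64 and
	 * colorequivszc[szc] == 0x12 (ignore 1 high-order and 2 low-order
	 * color bits), plw_ceq_dif = 64 >> (1 + 2) = 8 equivalence classes
	 * and plw_ceq_mask[szc] = (8 - 1) << 2 = 0x1c, i.e. only color bits
	 * 2..4 are significant when comparing bins for equivalence.
	 */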
2781 2781
2782 2782 if (flags & PG_MATCH_COLOR) {
2783 2783 if (cpu_page_colors < 0) {
2784 2784 /*
2785 2785 * this is a heterogeneous machine with different CPUs
2786 2786 * having different size e$ (not supported for ni2/rock
2787 2787 			 * having different size e$ (not supported for ni2/rock)
2788 2788 uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc);
2789 2789 cpucolors = MAX(cpucolors, 1);
2790 2790 ceq_mask = plw->plw_color_mask & (cpucolors - 1);
2791 2791 plw->plw_ceq_mask[szc] =
2792 2792 MIN(ceq_mask, plw->plw_ceq_mask[szc]);
2793 2793 }
2794 2794 plw->plw_ceq_dif = 1;
2795 2795 }
2796 2796
2797 2797 /* we can split pages in the freelist, but not the cachelist */
2798 2798 if (can_split) {
2799 2799 plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;
2800 2800
2801 2801 /* set next szc color masks and number of free list bins */
2802 2802 for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
2803 2803 plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
2804 2804 plw->plw_ceq_mask[szc]);
2805 2805 plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
2806 2806 }
2807 2807 plw->plw_ceq_mask[nszc] = INVALID_MASK;
2808 2808 plw->plw_bins[nszc] = 0;
2809 2809
2810 2810 } else {
2811 2811 ASSERT(szc == 0);
2812 2812 plw->plw_do_split = 0;
2813 2813 plw->plw_bins[1] = 0;
2814 2814 plw->plw_ceq_mask[1] = INVALID_MASK;
2815 2815 }
2816 2816 }
2817 2817
2818 2818 /*
2819 2819 * set mark to flag where next split should occur
2820 2820 */
2821 2821 #define PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) { \
2822 2822 uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin); \
2823 2823 uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0); \
2824 2824 uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask; \
2825 2825 plw->plw_split_next = \
2826 2826 INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask); \
2827 2827 if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \
2828 2828 plw->plw_split_next = \
2829 2829 INC_MASKED(plw->plw_split_next, \
2830 2830 neq_mask, plw->plw_color_mask); \
2831 2831 } \
2832 2832 }
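/*
 * Descriptive note on the macro above: it records in plw_split_next the
 * next-size color at which page_list_walk_next_bin() should turn
 * plw_do_split back on. bin is mapped to its next-size color and stepped
 * forward by INC_MASKED(); if the result is color-equivalent to plw_bin0's
 * next-size color (the test against plw_ceq_mask[nszc]), it is stepped
 * forward once more so the marker lands in a different equivalence class
 * than the starting bin.
 */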
2833 2833
2834 2834 uint_t
2835 2835 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
2836 2836 {
2837 2837 uint_t neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask;
2838 2838 uint_t bin0_nsz, nbin_nsz, nbin0, nbin;
2839 2839 uchar_t nszc = szc + 1;
2840 2840
2841 2841 nbin = ADD_MASKED(bin,
2842 2842 plw->plw_bin_step, neq_mask, plw->plw_color_mask);
2843 2843
2844 2844 if (plw->plw_do_split) {
2845 2845 plw->plw_bin_split_prev = bin;
2846 2846 PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw);
2847 2847 plw->plw_do_split = 0;
2848 2848 }
2849 2849
2850 2850 if (szc == 0) {
2851 2851 if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) {
2852 2852 if (nbin == plw->plw_bin0 &&
2853 2853 (vac_colors == 1 || nbin != plw->plw_bin_marker)) {
2854 2854 nbin = ADD_MASKED(nbin, plw->plw_bin_step,
2855 2855 neq_mask, plw->plw_color_mask);
2856 2856 plw->plw_bin_split_prev = plw->plw_bin0;
2857 2857 }
2858 2858
2859 2859 if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
2860 2860 plw->plw_bin_marker =
2861 2861 nbin = INC_MASKED(nbin, neq_mask,
2862 2862 plw->plw_color_mask);
2863 2863 plw->plw_bin_split_prev = plw->plw_bin0;
2864 2864 /*
2865 2865 * large pages all have the same vac color
2866 2866 * so by now we should be done with next
2867 2867 				 * so by now we should be done with the next
2868 2868 				 * size page splitting process.
2869 2869 ASSERT(plw->plw_bins[1] == 0);
2870 2870 plw->plw_do_split = 0;
2871 2871 return (nbin);
2872 2872 }
2873 2873
2874 2874 } else {
2875 2875 uint_t bin_jump = (vac_colors == 1) ?
2876 2876 (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP;
2877 2877
2878 2878 bin_jump &= ~(vac_colors - 1);
2879 2879
2880 2880 nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask,
2881 2881 plw->plw_color_mask);
2882 2882
2883 2883 if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) {
2884 2884
2885 2885 plw->plw_bin_marker = nbin = nbin0;
2886 2886
2887 2887 if (plw->plw_bins[nszc] != 0) {
2888 2888 /*
2889 2889 * check if next page size bin is the
2890 2890 * same as the next page size bin for
2891 2891 * bin0
2892 2892 */
2893 2893 nbin_nsz = PAGE_GET_NSZ_COLOR(szc,
2894 2894 nbin);
2895 2895 bin0_nsz = PAGE_GET_NSZ_COLOR(szc,
2896 2896 plw->plw_bin0);
2897 2897
2898 2898 if ((bin0_nsz ^ nbin_nsz) &
2899 2899 plw->plw_ceq_mask[nszc])
2900 2900 plw->plw_do_split = 1;
2901 2901 }
2902 2902 return (nbin);
2903 2903 }
2904 2904 }
2905 2905 }
2906 2906
2907 2907 if (plw->plw_bins[nszc] != 0) {
2908 2908 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
2909 2909 if (!((plw->plw_split_next ^ nbin_nsz) &
2910 2910 plw->plw_ceq_mask[nszc]))
2911 2911 plw->plw_do_split = 1;
2912 2912 }
2913 2913
2914 2914 return (nbin);
2915 2915 }
2916 2916
2917 2917 page_t *
2918 2918 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
2919 2919 uint_t flags)
2920 2920 {
2921 2921 kmutex_t *pcm;
2922 2922 page_t *pp, *first_pp;
2923 2923 uint_t sbin;
2924 2924 int plw_initialized;
2925 2925 page_list_walker_t plw;
2926 2926
2927 2927 ASSERT(szc < mmu_page_sizes);
2928 2928
2929 2929 VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);
2930 2930
2931 2931 MTYPE_START(mnode, mtype, flags);
2932 2932 if (mtype < 0) { /* mnode does not have memory in mtype range */
2933 2933 VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
2934 2934 return (NULL);
2935 2935 }
2936 2936 try_again:
2937 2937
2938 2938 plw_initialized = 0;
2939 2939 plw.plw_ceq_dif = 1;
2940 2940
2941 2941 /*
2942 2942 * Only hold one freelist lock at a time, that way we
2943 2943 * can start anywhere and not have to worry about lock
2944 2944 * ordering.
2945 2945 */
2946 2946 for (plw.plw_count = 0;
2947 2947 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
2948 2948 sbin = bin;
2949 2949 do {
2950 2950 if (!PAGE_FREELISTS(mnode, szc, bin, mtype))
2951 2951 goto bin_empty_1;
2952 2952
2953 2953 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
2954 2954 mutex_enter(pcm);
2955 2955 pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
2956 2956 if (pp == NULL)
2957 2957 goto bin_empty_0;
2958 2958
2959 2959 /*
2960 2960 * These were set before the page
2961 2961 * was put on the free list,
2962 2962 * they must still be set.
2963 2963 */
2964 2964 ASSERT(PP_ISFREE(pp));
2965 2965 ASSERT(PP_ISAGED(pp));
2966 2966 ASSERT(pp->p_vnode == NULL);
2967 2967 ASSERT(pp->p_hash == NULL);
2968 2968 ASSERT(pp->p_offset == (u_offset_t)-1);
2969 2969 ASSERT(pp->p_szc == szc);
2970 2970 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2971 2971
2972 2972 /*
2973 2973 			 * Walk down the freelist bin.
2974 2974 			 * PAGESIZE pages are linked on their
2975 2975 			 * p_next and p_prev fields. Large pages
2976 2976 			 * are a contiguous group of
2977 2977 			 * constituent pages linked together
2978 2978 			 * on their p_next and p_prev fields.
2979 2979 			 * The large pages are linked together
2980 2980 			 * on the freelist bin using the p_vpnext
2981 2981 			 * and p_vpprev fields of the base
2982 2982 			 * constituent page of each large page.
2983 2983 */
2984 2984 first_pp = pp;
2985 2985 while (IS_DUMP_PAGE(pp) || !page_trylock_cons(pp,
2986 2986 SE_EXCL)) {
2987 2987 if (szc == 0) {
2988 2988 pp = pp->p_next;
2989 2989 } else {
2990 2990 pp = pp->p_vpnext;
2991 2991 }
2992 2992
2993 2993 ASSERT(PP_ISFREE(pp));
2994 2994 ASSERT(PP_ISAGED(pp));
2995 2995 ASSERT(pp->p_vnode == NULL);
2996 2996 ASSERT(pp->p_hash == NULL);
2997 2997 ASSERT(pp->p_offset == (u_offset_t)-1);
2998 2998 ASSERT(pp->p_szc == szc);
2999 2999 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3000 3000
3001 3001 if (pp == first_pp)
3002 3002 goto bin_empty_0;
3003 3003 }
3004 3004
3005 3005 ASSERT(pp != NULL);
3006 3006 ASSERT(mtype == PP_2_MTYPE(pp));
3007 3007 ASSERT(pp->p_szc == szc);
3008 3008 if (szc == 0) {
3009 3009 page_sub(&PAGE_FREELISTS(mnode,
3010 3010 szc, bin, mtype), pp);
3011 3011 } else {
3012 3012 page_vpsub(&PAGE_FREELISTS(mnode,
3013 3013 szc, bin, mtype), pp);
3014 3014 CHK_LPG(pp, szc);
3015 3015 }
3016 3016 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
3017 3017
3018 3018 if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0))
3019 3019 panic("free page is not. pp %p", (void *)pp);
3020 3020 mutex_exit(pcm);
3021 3021
3022 3022 #if defined(__sparc)
3023 3023 ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
3024 3024 (flags & PG_NORELOC) == 0);
3025 3025
3026 3026 if (PP_ISNORELOC(pp))
3027 3027 kcage_freemem_sub(page_get_pagecnt(szc));
3028 3028 #endif
3029 3029 VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]);
3030 3030 return (pp);
3031 3031
3032 3032 bin_empty_0:
3033 3033 mutex_exit(pcm);
3034 3034 bin_empty_1:
3035 3035 if (plw_initialized == 0) {
3036 3036 page_list_walk_init(szc, flags, bin, 1, 1,
3037 3037 &plw);
3038 3038 plw_initialized = 1;
3039 3039 ASSERT(plw.plw_colors <=
3040 3040 PAGE_GET_PAGECOLORS(szc));
3041 3041 ASSERT(plw.plw_colors > 0);
3042 3042 ASSERT((plw.plw_colors &
3043 3043 (plw.plw_colors - 1)) == 0);
3044 3044 ASSERT(bin < plw.plw_colors);
3045 3045 ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors);
3046 3046 }
3047 3047 /* calculate the next bin with equivalent color */
3048 3048 bin = ADD_MASKED(bin, plw.plw_bin_step,
3049 3049 plw.plw_ceq_mask[szc], plw.plw_color_mask);
3050 3050 } while (sbin != bin);
3051 3051
3052 3052 /*
3053 3053 		 * All bins of equivalent color are empty at this point. Try to
3054 3054 * satisfy the request by breaking up or coalescing
3055 3055 * pages from a different size freelist of the correct
3056 3056 * color that satisfies the ORIGINAL color requested.
3057 3057 * If that fails then try pages of the same size but
3058 3058 * different colors assuming we are not called with
3059 3059 * PG_MATCH_COLOR.
3060 3060 */
3061 3061 if (plw.plw_do_split &&
3062 3062 (pp = page_freelist_split(szc, bin, mnode,
3063 3063 mtype, PFNNULL, PFNNULL, &plw)) != NULL)
3064 3064 return (pp);
3065 3065
3066 3066 if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
3067 3067 bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) != NULL)
3068 3068 return (pp);
3069 3069
3070 3070 if (plw.plw_ceq_dif > 1)
3071 3071 bin = page_list_walk_next_bin(szc, bin, &plw);
3072 3072 }
3073 3073
3074 3074 /* if allowed, cycle through additional mtypes */
3075 3075 MTYPE_NEXT(mnode, mtype, flags);
3076 3076 if (mtype >= 0)
3077 3077 goto try_again;
3078 3078
3079 3079 VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);
3080 3080
3081 3081 return (NULL);
3082 3082 }
3083 3083
3084 3084 /*
3085 3085 * Returns the count of free pages for 'pp' with size code 'szc'.
3086 3086 * Note: This function does not return an exact value as the page freelist
3087 3087 * locks are not held and thus the values in the page_counters may be
3088 3088 * changing as we walk through the data.
3089 3089 */
3090 3090 static int
3091 3091 page_freecnt(int mnode, page_t *pp, uchar_t szc)
3092 3092 {
3093 3093 pgcnt_t pgfree;
3094 3094 pgcnt_t cnt;
3095 3095 ssize_t r = szc; /* region size */
3096 3096 ssize_t idx;
3097 3097 int i;
3098 3098 int full, range;
3099 3099
3100 3100 /* Make sure pagenum passed in is aligned properly */
3101 3101 ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
3102 3102 ASSERT(szc > 0);
3103 3103
3104 3104 /* Prevent page_counters dynamic memory from being freed */
3105 3105 rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
3106 3106 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3107 3107 cnt = PAGE_COUNTERS(mnode, r, idx);
3108 3108 pgfree = cnt << PNUM_SHIFT(r - 1);
3109 3109 range = FULL_REGION_CNT(szc);
3110 3110
3111 3111 /* Check for completely full region */
3112 3112 if (cnt == range) {
3113 3113 rw_exit(&page_ctrs_rwlock[mnode]);
3114 3114 return (pgfree);
3115 3115 }
3116 3116
3117 3117 while (--r > 0) {
3118 3118 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3119 3119 full = FULL_REGION_CNT(r);
3120 3120 for (i = 0; i < range; i++, idx++) {
3121 3121 cnt = PAGE_COUNTERS(mnode, r, idx);
3122 3122 /*
3123 3123 * If cnt here is full, that means we have already
3124 3124 * accounted for these pages earlier.
3125 3125 */
3126 3126 if (cnt != full) {
3127 3127 pgfree += (cnt << PNUM_SHIFT(r - 1));
3128 3128 }
3129 3129 }
3130 3130 range *= full;
3131 3131 }
3132 3132 rw_exit(&page_ctrs_rwlock[mnode]);
3133 3133 return (pgfree);
3134 3134 }
3135 3135
3136 3136 /*
3137 3137 * Called from page_geti_contig_pages to exclusively lock constituent pages
3138 3138 * starting from 'spp' for page size code 'szc'.
3139 3139 *
3140 3140 * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
3141 3141 * region needs to be greater than or equal to the threshold.
3142 3142 */
3143 3143 static int
3144 3144 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
3145 3145 {
3146 3146 pgcnt_t pgcnt = PNUM_SIZE(szc);
3147 3147 pgcnt_t pgfree, i;
3148 3148 page_t *pp;
3149 3149
3150 3150 VM_STAT_ADD(vmm_vmstats.ptcp[szc]);
3151 3151
3152 3152
3153 3153 if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
3154 3154 goto skipptcpcheck;
3155 3155 /*
3156 3156 * check if there are sufficient free pages available before attempting
3157 3157 * to trylock. Count is approximate as page counters can change.
3158 3158 */
3159 3159 pgfree = page_freecnt(mnode, spp, szc);
3160 3160
3161 3161 /* attempt to trylock if there are sufficient already free pages */
3162 3162 if (pgfree < pgcnt/ptcpthreshold) {
3163 3163 VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
3164 3164 return (0);
3165 3165 }
3166 3166
3167 3167 skipptcpcheck:
3168 3168
3169 3169 for (i = 0; i < pgcnt; i++) {
3170 3170 pp = &spp[i];
3171 3171 if (!page_trylock(pp, SE_EXCL)) {
3172 3172 VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
3173 3173 while (--i != (pgcnt_t)-1) {
3174 3174 pp = &spp[i];
3175 3175 ASSERT(PAGE_EXCL(pp));
3176 3176 page_unlock_nocapture(pp);
3177 3177 }
3178 3178 return (0);
3179 3179 }
3180 3180 ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
3181 3181 if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
3182 3182 !PP_ISFREE(pp)) {
3183 3183 VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
3184 3184 ASSERT(i == 0);
3185 3185 page_unlock_nocapture(pp);
3186 3186 return (0);
3187 3187 }
3188 3188
3189 3189 /*
3190 3190 * If a page has been marked non-relocatable or has been
3191 3191 * explicitly locked in memory, we don't want to relocate it;
3192 3192 * unlock the pages and fail the operation.
3193 3193 */
3194 3194 if (PP_ISNORELOC(pp) ||
3195 3195 pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
3196 3196 VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
3197 3197 while (i != (pgcnt_t)-1) {
3198 3198 pp = &spp[i];
3199 3199 ASSERT(PAGE_EXCL(pp));
3200 3200 page_unlock_nocapture(pp);
3201 3201 i--;
3202 3202 }
3203 3203 return (0);
3204 3204 }
3205 3205 }
3206 3206 VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
3207 3207 return (1);
3208 3208 }
3209 3209
3210 3210 /*
3211 3211 * Claim large page pointed to by 'pp'. 'pp' is the starting set
3212 3212 * of 'szc' constituent pages that had been locked exclusively previously.
3213 3213 * Will attempt to relocate constituent pages in use.
3214 3214 */
3215 3215 static page_t *
3216 3216 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
3217 3217 {
3218 3218 spgcnt_t pgcnt, npgs, i;
3219 3219 page_t *targpp, *rpp, *hpp;
3220 3220 page_t *replpp = NULL;
3221 3221 page_t *pplist = NULL;
3222 3222
3223 3223 ASSERT(pp != NULL);
3224 3224
3225 3225 pgcnt = page_get_pagecnt(szc);
3226 3226 while (pgcnt) {
3227 3227 ASSERT(PAGE_EXCL(pp));
3228 3228 ASSERT(!PP_ISNORELOC(pp));
3229 3229 if (PP_ISFREE(pp)) {
3230 3230 /*
3231 3231 * If this is a PG_FREE_LIST page then its
3232 3232 * size code can change underneath us due to
3233 3233 			 * page promotion or demotion. As an optimization
3234 3234 * use page_list_sub_pages() instead of
3235 3235 * page_list_sub().
3236 3236 */
3237 3237 if (PP_ISAGED(pp)) {
3238 3238 page_list_sub_pages(pp, szc);
3239 3239 if (pp->p_szc == szc) {
3240 3240 return (pp);
3241 3241 }
3242 3242 ASSERT(pp->p_szc < szc);
3243 3243 npgs = page_get_pagecnt(pp->p_szc);
3244 3244 hpp = pp;
3245 3245 for (i = 0; i < npgs; i++, pp++) {
3246 3246 pp->p_szc = szc;
3247 3247 }
3248 3248 page_list_concat(&pplist, &hpp);
3249 3249 pgcnt -= npgs;
3250 3250 continue;
3251 3251 }
3252 3252 ASSERT(!PP_ISAGED(pp));
3253 3253 ASSERT(pp->p_szc == 0);
3254 3254 page_list_sub(pp, PG_CACHE_LIST);
3255 3255 page_hashout(pp, NULL);
3256 3256 PP_SETAGED(pp);
3257 3257 pp->p_szc = szc;
3258 3258 page_list_concat(&pplist, &pp);
3259 3259 pp++;
3260 3260 pgcnt--;
3261 3261 continue;
3262 3262 }
3263 3263 npgs = page_get_pagecnt(pp->p_szc);
3264 3264
3265 3265 /*
3266 3266 		 * The page_create_wait() freemem accounting is done by the
3267 3267 		 * caller of page_get_freelist(), so there is no need to call
3268 3268 		 * it again before calling page_get_replacement_page.
3269 3269 *
3270 3270 * page_get_replacement_page can call page_get_contig_pages
3271 3271 * to acquire a large page (szc > 0); the replacement must be
3272 3272 * smaller than the contig page size to avoid looping or
3273 3273 * szc == 0 and PGI_PGCPSZC0 is set.
3274 3274 */
3275 3275 if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
3276 3276 replpp = page_get_replacement_page(pp, NULL, 0);
3277 3277 if (replpp) {
3278 3278 npgs = page_get_pagecnt(pp->p_szc);
3279 3279 ASSERT(npgs <= pgcnt);
3280 3280 targpp = pp;
3281 3281 }
3282 3282 }
3283 3283
3284 3284 /*
3285 3285 * If replacement is NULL or do_page_relocate fails, fail
3286 3286 * coalescing of pages.
3287 3287 */
3288 3288 if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
3289 3289 &npgs, NULL) != 0)) {
3290 3290 /*
3291 3291 * Unlock un-processed target list
3292 3292 */
3293 3293 while (pgcnt--) {
3294 3294 ASSERT(PAGE_EXCL(pp));
3295 3295 page_unlock_nocapture(pp);
3296 3296 pp++;
3297 3297 }
3298 3298 /*
3299 3299 * Free the processed target list.
3300 3300 */
3301 3301 while (pplist) {
3302 3302 pp = pplist;
3303 3303 page_sub(&pplist, pp);
3304 3304 ASSERT(PAGE_EXCL(pp));
3305 3305 ASSERT(pp->p_szc == szc);
3306 3306 ASSERT(PP_ISFREE(pp));
3307 3307 ASSERT(PP_ISAGED(pp));
3308 3308 pp->p_szc = 0;
3309 3309 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3310 3310 page_unlock_nocapture(pp);
3311 3311 }
3312 3312
3313 3313 if (replpp != NULL)
3314 3314 page_free_replacement_page(replpp);
3315 3315
3316 3316 return (NULL);
3317 3317 }
3318 3318 ASSERT(pp == targpp);
3319 3319
3320 3320 /* LINTED */
3321 3321 ASSERT(hpp = pp); /* That's right, it's an assignment */
3322 3322
3323 3323 pp += npgs;
3324 3324 pgcnt -= npgs;
3325 3325
3326 3326 while (npgs--) {
3327 3327 ASSERT(PAGE_EXCL(targpp));
3328 3328 ASSERT(!PP_ISFREE(targpp));
3329 3329 ASSERT(!PP_ISNORELOC(targpp));
3330 3330 PP_SETFREE(targpp);
3331 3331 ASSERT(PP_ISAGED(targpp));
3332 3332 ASSERT(targpp->p_szc < szc || (szc == 0 &&
3333 3333 (flags & PGI_PGCPSZC0)));
3334 3334 targpp->p_szc = szc;
3335 3335 targpp = targpp->p_next;
3336 3336
3337 3337 rpp = replpp;
3338 3338 ASSERT(rpp != NULL);
3339 3339 page_sub(&replpp, rpp);
3340 3340 ASSERT(PAGE_EXCL(rpp));
3341 3341 ASSERT(!PP_ISFREE(rpp));
3342 3342 page_unlock_nocapture(rpp);
3343 3343 }
3344 3344 ASSERT(targpp == hpp);
3345 3345 ASSERT(replpp == NULL);
3346 3346 page_list_concat(&pplist, &targpp);
3347 3347 }
3348 3348 CHK_LPG(pplist, szc);
3349 3349 return (pplist);
3350 3350 }
3351 3351
3352 3352 /*
3353 3353 * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
3354 3354 * of 0 means nothing left after trim.
3355 3355 */
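/*
 * Descriptive note: three cases are handled below, assuming the cage
 * boundary (kcage_current_pfn()) still lies within the memseg where that
 * matters. If only the low end of the memseg is caged, the usable range is
 * [MAX(kcagepfn, pfnlo), MIN(pfnhi, pages_end - 1)]; if only the high end
 * is caged, it is [MAX(pfnlo, pages_base), MIN(kcagepfn, pfnhi)]; if the
 * memseg lies entirely outside the cage, only the memseg bounds are
 * applied. A memseg entirely inside the cage (or one the cage boundary has
 * already moved past) yields 0.
 */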
3356 3356 int
3357 3357 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
3358 3358 {
3359 3359 pfn_t kcagepfn;
3360 3360 int decr;
3361 3361 int rc = 0;
3362 3362
3363 3363 if (PP_ISNORELOC(mseg->pages)) {
3364 3364 if (PP_ISNORELOC(mseg->epages - 1) == 0) {
3365 3365
3366 3366 /* lower part of this mseg inside kernel cage */
3367 3367 decr = kcage_current_pfn(&kcagepfn);
3368 3368
3369 3369 /* kernel cage may have transitioned past mseg */
3370 3370 if (kcagepfn >= mseg->pages_base &&
3371 3371 kcagepfn < mseg->pages_end) {
3372 3372 ASSERT(decr == 0);
3373 3373 *lo = MAX(kcagepfn, pfnlo);
3374 3374 *hi = MIN(pfnhi, (mseg->pages_end - 1));
3375 3375 rc = 1;
3376 3376 }
3377 3377 }
3378 3378 /* else entire mseg in the cage */
3379 3379 } else {
3380 3380 if (PP_ISNORELOC(mseg->epages - 1)) {
3381 3381
3382 3382 /* upper part of this mseg inside kernel cage */
3383 3383 decr = kcage_current_pfn(&kcagepfn);
3384 3384
3385 3385 /* kernel cage may have transitioned past mseg */
3386 3386 if (kcagepfn >= mseg->pages_base &&
3387 3387 kcagepfn < mseg->pages_end) {
3388 3388 ASSERT(decr);
3389 3389 *hi = MIN(kcagepfn, pfnhi);
3390 3390 *lo = MAX(pfnlo, mseg->pages_base);
3391 3391 rc = 1;
3392 3392 }
3393 3393 } else {
3394 3394 /* entire mseg outside of kernel cage */
3395 3395 *lo = MAX(pfnlo, mseg->pages_base);
3396 3396 *hi = MIN(pfnhi, (mseg->pages_end - 1));
3397 3397 rc = 1;
3398 3398 }
3399 3399 }
3400 3400 return (rc);
3401 3401 }
3402 3402
3403 3403 /*
3404 3404 * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a
3405 3405 * page with size code 'szc'. Claiming such a page requires acquiring
3406 3406 * exclusive locks on all constituent pages (page_trylock_contig_pages),
3407 3407 * relocating pages in use and concatenating these constituent pages into a
3408 3408 * large page.
3409 3409 *
3410 3410 * The page lists do not have such a large page and page_freelist_split has
3411 3411 * already failed to demote larger pages and/or coalesce smaller free pages.
3412 3412 *
3413 3413  * 'flags' may specify PG_MATCH_COLOR, which limits the search to large
3414 3414  * pages with the same color as 'bin'.
3415 3415 *
3416 3416 * 'pfnflag' specifies the subset of the pfn range to search.
3417 3417 */
3418 3418
3419 3419 static page_t *
3420 3420 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
3421 3421 pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
3422 3422 {
3423 3423 struct memseg *mseg;
3424 3424 pgcnt_t szcpgcnt = page_get_pagecnt(szc);
3425 3425 pgcnt_t szcpgmask = szcpgcnt - 1;
3426 3426 pfn_t randpfn;
3427 3427 page_t *pp, *randpp, *endpp;
3428 3428 uint_t colors, ceq_mask;
3429 3429 /* LINTED : set but not used in function */
3430 3430 uint_t color_mask __unused;
3431 3431 pfn_t hi, lo;
3432 3432 uint_t skip;
3433 3433 MEM_NODE_ITERATOR_DECL(it);
3434 3434
3435 3435 ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));
3436 3436
3437 3437 pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
3438 3438
3439 3439 if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi)
3440 3440 return (NULL);
3441 3441
3442 3442 ASSERT(szc < mmu_page_sizes);
3443 3443
3444 3444 colors = PAGE_GET_PAGECOLORS(szc);
3445 3445 color_mask = colors - 1;
3446 3446 if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
3447 3447 uchar_t ceq = colorequivszc[szc];
3448 3448 uint_t ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
3449 3449
3450 3450 ASSERT(ceq_dif > 0);
3451 3451 ceq_mask = (ceq_dif - 1) << (ceq & 0xf);
3452 3452 } else {
3453 3453 ceq_mask = 0;
3454 3454 }
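	/*
	 * Illustrative example (hypothetical values, a sketch of the
	 * computation above): with colors = 32 and colorequivszc[szc] = 0x11
	 * (ignore one high and one low color bit), ceq_dif = 32 >> 2 = 8 and
	 * ceq_mask = 7 << 1 = 0xe, so only color bits 1-3 are treated as
	 * significant below.
	 */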
3455 3455
3456 3456 ASSERT(bin < colors);
3457 3457
3458 3458 /* clear "non-significant" color bits */
3459 3459 bin &= ceq_mask;
3460 3460
3461 3461 /*
3462 3462 * trim the pfn range to search based on pfnflag. pfnflag is set
3463 3463	 * when there have been previous page_get_contig_pages failures to
3464 3464	 * limit the search.
3465 3465	 *
3466 3466	 * The high bit in pfnflag specifies the number of 'slots' in the
3467 3467	 * pfn range and the remainder of pfnflag specifies which slot.
3468 3468	 * For example, a value of 1010b selects slot 2 of a pfn range
3469 3469	 * that has been divided into 8 slots (slot indices 0-7).
3470 3470 */
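	/*
	 * Illustrative walk-through (hypothetical value): pfnflag = 0xa
	 * (1010b) yields slots = 1 << (highbit(0xa) - 1) = 8 and
	 * slotid = 0xa & 7 = 2, so only the third of the eight szc-aligned
	 * slices of the pfn range is searched on this attempt.
	 */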
3471 3471 if (pfnflag > 1) {
3472 3472 int slots = 1 << (highbit(pfnflag) - 1);
3473 3473 int slotid = pfnflag & (slots - 1);
3474 3474 pgcnt_t szcpages;
3475 3475 int slotlen;
3476 3476
3477 3477 pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1;
3478 3478 szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
3479 3479 slotlen = howmany(szcpages, slots);
3480 3480 /* skip if 'slotid' slot is empty */
3481 3481 if (slotid * slotlen >= szcpages)
3482 3482 return (NULL);
3483 3483 pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
3484 3484 ASSERT(pfnlo < pfnhi);
3485 3485 if (pfnhi > pfnlo + (slotlen * szcpgcnt))
3486 3486 pfnhi = pfnlo + (slotlen * szcpgcnt) - 1;
3487 3487 }
3488 3488
3489 3489 /*
3490 3490	 * This routine can be called recursively so we shouldn't
3491 3491 * acquire a reader lock if a write request is pending. This
3492 3492 * could lead to a deadlock with the DR thread.
3493 3493 *
3494 3494 * Returning NULL informs the caller that we could not get
3495 3495 * a contig page with the required characteristics.
3496 3496 */
3497 3497
3498 3498 if (!memsegs_trylock(0))
3499 3499 return (NULL);
3500 3500
3501 3501 /*
3502 3502 * loop through memsegs to look for contig page candidates
3503 3503 */
3504 3504
3505 3505 for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
3506 3506 if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
3507 3507 /* no overlap */
3508 3508 continue;
3509 3509 }
3510 3510
3511 3511 if (mseg->pages_end - mseg->pages_base < szcpgcnt)
3512 3512 /* mseg too small */
3513 3513 continue;
3514 3514
3515 3515 /*
3516 3516		 * trim kernel cage pages from the pfn range and skip this
3517 3517		 * mseg if the trimmed range returned cannot span the
3518 3518		 * desired large page size.
3519 3519 */
3520 3520 if (kcage_on) {
3521 3521 if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0 ||
3522 3522 lo >= hi || ((hi - lo) + 1) < szcpgcnt)
3523 3523 continue;
3524 3524 } else {
3525 3525 lo = MAX(pfnlo, mseg->pages_base);
3526 3526 hi = MIN(pfnhi, (mseg->pages_end - 1));
3527 3527 }
3528 3528
3529 3529 /* round to szcpgcnt boundaries */
3530 3530 lo = P2ROUNDUP(lo, szcpgcnt);
3531 3531
3532 3532 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3533 3533 hi = P2ALIGN((hi + 1), szcpgcnt) - 1;
3534 3534
3535 3535 if (hi <= lo)
3536 3536 continue;
3537 3537
3538 3538 /*
3539 3539 * set lo to point to the pfn for the desired bin. Large
3540 3540 * page sizes may only have a single page color
3541 3541 */
3542 3542 skip = szcpgcnt;
3543 3543 if (ceq_mask > 0 || interleaved_mnodes) {
3544 3544 /* set lo to point at appropriate color */
3545 3545 if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) ||
3546 3546 (interleaved_mnodes &&
3547 3547 PFN_2_MEM_NODE(lo) != mnode)) {
3548 3548 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
3549 3549 color_mask, &it);
3550 3550 }
3551 3551 if (hi <= lo)
3552 3552 /* mseg cannot satisfy color request */
3553 3553 continue;
3554 3554 }
3555 3555
3556 3556 /* randomly choose a point between lo and hi to begin search */
3557 3557
3558 3558 randpfn = (pfn_t)GETTICK();
3559 3559 randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
3560 3560 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it);
3561 3561 if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) {
3562 3562 if (randpfn != (pfn_t)-1) {
3563 3563 PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin,
3564 3564 ceq_mask, color_mask, &it);
3565 3565 }
3566 3566 if (randpfn >= hi) {
3567 3567 randpfn = lo;
3568 3568 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc,
3569 3569 &it);
3570 3570 }
3571 3571 }
3572 3572 randpp = mseg->pages + (randpfn - mseg->pages_base);
3573 3573
3574 3574 ASSERT(randpp->p_pagenum == randpfn);
3575 3575
3576 3576 pp = randpp;
3577 3577 endpp = mseg->pages + (hi - mseg->pages_base) + 1;
3578 3578
3579 3579 ASSERT(randpp + szcpgcnt <= endpp);
3580 3580
3581 3581 do {
3582 3582 ASSERT(!(pp->p_pagenum & szcpgmask));
3583 3583 ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0);
3584 3584
3585 3585 if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
3586 3586 /* pages unlocked by page_claim on failure */
3587 3587 if (page_claim_contig_pages(pp, szc, flags)) {
3588 3588 memsegs_unlock(0);
3589 3589 return (pp);
3590 3590 }
3591 3591 }
3592 3592
3593 3593 if (ceq_mask == 0 && !interleaved_mnodes) {
3594 3594 pp += skip;
3595 3595 } else {
3596 3596 pfn_t pfn = pp->p_pagenum;
3597 3597
3598 3598 PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
3599 3599 ceq_mask, color_mask, &it);
3600 3600 if (pfn == (pfn_t)-1) {
3601 3601 pp = endpp;
3602 3602 } else {
3603 3603 pp = mseg->pages +
3604 3604 (pfn - mseg->pages_base);
3605 3605 }
3606 3606 }
3607 3607 if (pp >= endpp) {
3608 3608 /* start from the beginning */
3609 3609 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3610 3610 pp = mseg->pages + (lo - mseg->pages_base);
3611 3611 ASSERT(pp->p_pagenum == lo);
3612 3612 ASSERT(pp + szcpgcnt <= endpp);
3613 3613 }
3614 3614 } while (pp != randpp);
3615 3615 }
3616 3616 memsegs_unlock(0);
3617 3617 return (NULL);
3618 3618 }
3619 3619
3620 3620
3621 3621 /*
3622 3622 * controlling routine that searches through physical memory in an attempt to
3623 3623  * claim a large page based on the input parameters when such a page could
3624 3624  * not be found on the page free lists.
3625 3625 *
3626 3626 * calls page_geti_contig_pages with an initial pfn range from the mnode
3627 3627 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
3628 3628 * that overlaps with the kernel cage or does not match the requested page
3629 3629 * color if PG_MATCH_COLOR is set. Since this search is very expensive,
3630 3630 * page_geti_contig_pages may further limit the search range based on
3631 3631 * previous failure counts (pgcpfailcnt[]).
3632 3632 *
3633 3633 * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
3634 3634 * pagesize page that satisfies mtype.
3635 3635 */
3636 3636 page_t *
3637 3637 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
3638 3638 uint_t flags)
3639 3639 {
3640 3640 pfn_t pfnlo, pfnhi; /* contig pages pfn range */
3641 3641 page_t *pp;
3642 3642 pgcnt_t pfnflag = 0; /* no limit on search if 0 */
3643 3643
3644 3644 VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);
3645 3645
3646 3646 /* no allocations from cage */
3647 3647 flags |= PGI_NOCAGE;
3648 3648
3649 3649 /* LINTED */
3650 3650 MTYPE_START(mnode, mtype, flags);
3651 3651 if (mtype < 0) { /* mnode does not have memory in mtype range */
3652 3652 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
3653 3653 return (NULL);
3654 3654 }
3655 3655
3656 3656 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3657 3657
3658 3658	/* for hi pri requests, the search is not limited and color is ignored */
3659 3659
3660 3660 if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
3661 3661 pfnflag = pgcpfailcnt[szc];
3662 3662
3663 3663 /* remove color match to improve chances */
3664 3664
3665 3665 if (flags & PGI_PGCPHIPRI || pfnflag)
3666 3666 flags &= ~PG_MATCH_COLOR;
3667 3667
3668 3668 do {
3669 3669 /* get pfn range based on mnode and mtype */
3670 3670 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);
3671 3671
3672 3672 ASSERT(pfnhi >= pfnlo);
3673 3673
3674 3674 pp = page_geti_contig_pages(mnode, bin, szc, flags,
3675 3675 pfnlo, pfnhi, pfnflag);
3676 3676
3677 3677 if (pp != NULL) {
3678 3678 pfnflag = pgcpfailcnt[szc];
3679 3679 if (pfnflag) {
3680 3680				/* halve the fail count: double the search size next time */
3681 3681 pgcpfailcnt[szc] = pfnflag >> 1;
3682 3682 }
3683 3683 VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
3684 3684 return (pp);
3685 3685 }
3686 3686 MTYPE_NEXT(mnode, mtype, flags);
3687 3687 } while (mtype >= 0);
3688 3688
3689 3689 VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
3690 3690 return (NULL);
3691 3691 }
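/*
 * Illustrative note (assumed values): pgcpfailcnt[szc] feeds back into
 * pfnflag above.  If repeated failures have pushed pgcpfailcnt[szc] to,
 * say, 8, only 1/8 of the pfn range is searched per attempt; a success
 * then halves it to 4, doubling the slice of the range searched next time.
 */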
3692 3692
3693 3693 #if defined(__i386) || defined(__amd64)
3694 3694 /*
3695 3695 * Determine the likelihood of finding/coalescing a szc page.
3696 3696  * Return 0 if the likelihood is small; otherwise return 1.
3697 3697  *
3698 3698  * For now, be conservative and check only 1g pages; return 0
3699 3699  * if there have been previous coalescing failures and the szc pages
3700 3700  * needed to satisfy the request would exhaust most of freemem.
3701 3701 */
3702 3702 int
3703 3703 page_chk_freelist(uint_t szc)
3704 3704 {
3705 3705 pgcnt_t pgcnt;
3706 3706
3707 3707 if (szc <= 1)
3708 3708 return (1);
3709 3709
3710 3710 pgcnt = page_get_pagecnt(szc);
3711 3711 if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) {
3712 3712 VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]);
3713 3713 return (0);
3714 3714 }
3715 3715 VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]);
3716 3716 return (1);
3717 3717 }
3718 3718 #endif
3719 3719
3720 3720 /*
3721 3721 * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair.
3722 3722 *
3723 3723 * Does its own locking and accounting.
3724 3724 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3725 3725 * pages of the proper color even if there are pages of a different color.
3726 3726 *
3727 3727 * Finds a page, removes it, THEN locks it.
3728 3728 */
3729 3729
3730 3730 /*ARGSUSED*/
3731 3731 page_t *
3732 3732 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3733 3733 caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
3734 3734 {
3735 3735 struct as *as = seg->s_as;
3736 3736 page_t *pp = NULL;
3737 3737 ulong_t bin;
3738 3738 uchar_t szc;
3739 3739 int mnode;
3740 3740 int mtype;
3741 3741 page_t *(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
3742 3742 lgrp_mnode_cookie_t lgrp_cookie;
3743 3743
3744 3744 page_get_func = page_get_mnode_freelist;
3745 3745
3746 3746 /*
3747 3747 * If we aren't passed a specific lgroup, or passed a freed lgrp
3748 3748 * assume we wish to allocate near to the current thread's home.
3749 3749 */
3750 3750 if (!LGRP_EXISTS(lgrp))
3751 3751 lgrp = lgrp_home_lgrp();
3752 3752
3753 3753 if (kcage_on) {
3754 3754 if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
3755 3755 kcage_freemem < kcage_throttlefree + btop(size) &&
3756 3756 curthread != kcage_cageout_thread) {
3757 3757 /*
3758 3758 * Set a "reserve" of kcage_throttlefree pages for
3759 3759 * PG_PANIC and cageout thread allocations.
3760 3760 *
3761 3761 * Everybody else has to serialize in
3762 3762 * page_create_get_something() to get a cage page, so
3763 3763 * that we don't deadlock cageout!
3764 3764 */
3765 3765 return (NULL);
3766 3766 }
3767 3767 } else {
3768 3768 flags &= ~PG_NORELOC;
3769 3769 flags |= PGI_NOCAGE;
3770 3770 }
3771 3771
3772 3772 /* LINTED */
3773 3773 MTYPE_INIT(mtype, vp, vaddr, flags, size);
3774 3774
3775 3775 /*
3776 3776 * Convert size to page size code.
3777 3777 */
3778 3778 if ((szc = page_szc(size)) == (uchar_t)-1)
3779 3779 panic("page_get_freelist: illegal page size request");
3780 3780 ASSERT(szc < mmu_page_sizes);
3781 3781
3782 3782 VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);
3783 3783
3784 3784 /* LINTED */
3785 3785 AS_2_BIN(as, seg, vp, vaddr, bin, szc);
3786 3786
3787 3787 ASSERT(bin < PAGE_GET_PAGECOLORS(szc));
3788 3788
3789 3789 /*
3790 3790 * Try to get a local page first, but try remote if we can't
3791 3791 * get a page of the right color.
3792 3792 */
3793 3793 pgretry:
3794 3794 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3795 3795 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3796 3796 pp = page_get_func(mnode, bin, mtype, szc, flags);
3797 3797 if (pp != NULL) {
3798 3798 VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
3799 3799 DTRACE_PROBE4(page__get,
3800 3800 lgrp_t *, lgrp,
3801 3801 int, mnode,
3802 3802 ulong_t, bin,
3803 3803 uint_t, flags);
3804 3804 return (pp);
3805 3805 }
3806 3806 }
3807 3807 ASSERT(pp == NULL);
3808 3808
3809 3809 /*
3810 3810	 * for PAGESIZE requests without PGI_PGCPSZC0 set, check the cachelist
3811 3811	 * before remote free lists.  The caller is expected to call
3812 3812	 * page_get_cachelist, which checks local cachelists and remote free lists.
3813 3813 */
3814 3814 if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
3815 3815 VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
3816 3816 return (NULL);
3817 3817 }
3818 3818
3819 3819 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3820 3820
3821 3821 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3822 3822
3823 3823 if (!(flags & PG_LOCAL)) {
3824 3824 /*
3825 3825 * Try to get a non-local freelist page.
3826 3826 */
3827 3827 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3828 3828 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3829 3829 pp = page_get_func(mnode, bin, mtype, szc, flags);
3830 3830 if (pp != NULL) {
3831 3831 DTRACE_PROBE4(page__get,
3832 3832 lgrp_t *, lgrp,
3833 3833 int, mnode,
3834 3834 ulong_t, bin,
3835 3835 uint_t, flags);
3836 3836 VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
3837 3837 return (pp);
3838 3838 }
3839 3839 }
3840 3840 ASSERT(pp == NULL);
3841 3841 }
3842 3842
3843 3843 /*
3844 3844	 * When the cage is off, chances are page_get_contig_pages() will fail
3845 3845	 * to lock a large page chunk; therefore it is not called by default
3846 3846	 * in that case.  This can be changed via /etc/system.
3847 3847	 *
3848 3848	 * page_get_contig_pages() is also called to acquire a base pagesize page
3849 3849 * for page_create_get_something().
3850 3850 */
3851 3851 if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
3852 3852 (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
3853 3853 (page_get_func != page_get_contig_pages)) {
3854 3854
3855 3855 VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
3856 3856 page_get_func = page_get_contig_pages;
3857 3857 goto pgretry;
3858 3858 }
3859 3859
3860 3860 if (!(flags & PG_LOCAL) && pgcplimitsearch &&
3861 3861 page_get_func == page_get_contig_pages)
3862 3862 SETPGCPFAILCNT(szc);
3863 3863
3864 3864 VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
3865 3865 return (NULL);
3866 3866 }
3867 3867
3868 3868 /*
3869 3869 * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
3870 3870 *
3871 3871 * Does its own locking.
3872 3872 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3873 3873 * pages of the proper color even if there are pages of a different color.
3874 3874 * Otherwise, scan the bins for ones with pages. For each bin with pages,
3875 3875 * try to lock one of them. If no page can be locked, try the
3876 3876  * next bin. Return NULL if a page cannot be found and locked.
3877 3877  *
3878 3878  * Finds a page, tries to lock it, then removes it.
3879 3879 */
3880 3880
3881 3881 /*ARGSUSED*/
3882 3882 page_t *
3883 3883 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3884 3884 caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
3885 3885 {
3886 3886 page_t *pp;
3887 3887 struct as *as = seg->s_as;
3888 3888 ulong_t bin;
3889 3889 /*LINTED*/
3890 3890 int mnode;
3891 3891 int mtype;
3892 3892 lgrp_mnode_cookie_t lgrp_cookie;
3893 3893
3894 3894 /*
3895 3895	 * If we aren't passed a specific lgroup, or passed a freed lgrp
3896 3896 * assume we wish to allocate near to the current thread's home.
3897 3897 */
3898 3898 if (!LGRP_EXISTS(lgrp))
3899 3899 lgrp = lgrp_home_lgrp();
3900 3900
3901 3901 if (!kcage_on) {
3902 3902 flags &= ~PG_NORELOC;
3903 3903 flags |= PGI_NOCAGE;
3904 3904 }
3905 3905
3906 3906 if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
3907 3907 kcage_freemem <= kcage_throttlefree) {
3908 3908 /*
3909 3909 * Reserve kcage_throttlefree pages for critical kernel
3910 3910 * threads.
3911 3911 *
3912 3912 * Everybody else has to go to page_create_get_something()
3913 3913 * to get a cage page, so we don't deadlock cageout.
3914 3914 */
3915 3915 return (NULL);
3916 3916 }
3917 3917
3918 3918 /* LINTED */
3919 3919 AS_2_BIN(as, seg, vp, vaddr, bin, 0);
3920 3920
3921 3921 ASSERT(bin < PAGE_GET_PAGECOLORS(0));
3922 3922
3923 3923 /* LINTED */
3924 3924 MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE);
3925 3925
3926 3926 VM_STAT_ADD(vmm_vmstats.pgc_alloc);
3927 3927
3928 3928 /*
3929 3929 * Try local cachelists first
3930 3930 */
3931 3931 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3932 3932 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3933 3933 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3934 3934 if (pp != NULL) {
3935 3935 VM_STAT_ADD(vmm_vmstats.pgc_allocok);
3936 3936 DTRACE_PROBE4(page__get,
3937 3937 lgrp_t *, lgrp,
3938 3938 int, mnode,
3939 3939 ulong_t, bin,
3940 3940 uint_t, flags);
3941 3941 return (pp);
3942 3942 }
3943 3943 }
3944 3944
3945 3945 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3946 3946
3947 3947 /*
3948 3948 * Try freelists/cachelists that are farther away
3949 3949 * This is our only chance to allocate remote pages for PAGESIZE
3950 3950 * requests.
3951 3951 */
3952 3952 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3953 3953 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3954 3954 pp = page_get_mnode_freelist(mnode, bin, mtype,
3955 3955 0, flags);
3956 3956 if (pp != NULL) {
3957 3957 VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
3958 3958 DTRACE_PROBE4(page__get,
3959 3959 lgrp_t *, lgrp,
3960 3960 int, mnode,
3961 3961 ulong_t, bin,
3962 3962 uint_t, flags);
3963 3963 return (pp);
3964 3964 }
3965 3965 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3966 3966 if (pp != NULL) {
3967 3967 VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
3968 3968 DTRACE_PROBE4(page__get,
3969 3969 lgrp_t *, lgrp,
3970 3970 int, mnode,
3971 3971 ulong_t, bin,
3972 3972 uint_t, flags);
3973 3973 return (pp);
3974 3974 }
3975 3975 }
3976 3976
3977 3977 VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
3978 3978 return (NULL);
3979 3979 }
3980 3980
3981 3981 page_t *
3982 3982 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
3983 3983 {
3984 3984 kmutex_t *pcm;
3985 3985 page_t *pp, *first_pp;
3986 3986 uint_t sbin;
3987 3987 int plw_initialized;
3988 3988 page_list_walker_t plw;
3989 3989
3990 3990 VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
3991 3991
3992 3992 /* LINTED */
3993 3993 MTYPE_START(mnode, mtype, flags);
3994 3994 if (mtype < 0) { /* mnode does not have memory in mtype range */
3995 3995 VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
3996 3996 return (NULL);
3997 3997 }
3998 3998
3999 3999 try_again:
4000 4000
4001 4001 plw_initialized = 0;
4002 4002 plw.plw_ceq_dif = 1;
4003 4003
4004 4004 /*
4005 4005 * Only hold one cachelist lock at a time, that way we
4006 4006 * can start anywhere and not have to worry about lock
4007 4007 * ordering.
4008 4008 */
4009 4009
4010 4010 for (plw.plw_count = 0;
4011 4011 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
4012 4012 sbin = bin;
4013 4013 do {
4014 4014
4015 4015 if (!PAGE_CACHELISTS(mnode, bin, mtype))
4016 4016 goto bin_empty_1;
4017 4017 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
4018 4018 mutex_enter(pcm);
4019 4019 pp = PAGE_CACHELISTS(mnode, bin, mtype);
4020 4020 if (pp == NULL)
4021 4021 goto bin_empty_0;
4022 4022
4023 4023 first_pp = pp;
4024 4024 ASSERT(pp->p_vnode);
4025 4025 ASSERT(PP_ISAGED(pp) == 0);
4026 4026 ASSERT(pp->p_szc == 0);
4027 4027 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
4028 4028 while (IS_DUMP_PAGE(pp) || !page_trylock(pp, SE_EXCL)) {
4029 4029 pp = pp->p_next;
4030 4030 ASSERT(pp->p_szc == 0);
4031 4031 if (pp == first_pp) {
4032 4032 /*
4033 4033 * We have searched the complete list!
4034 4034 * And all of them (might only be one)
4035 4035 * are locked. This can happen since
4036 4036 * these pages can also be found via
4037 4037 * the hash list. When found via the
4038 4038 * hash list, they are locked first,
4039 4039 * then removed. We give up to let the
4040 4040 * other thread run.
4041 4041 */
4042 4042 pp = NULL;
4043 4043 break;
4044 4044 }
4045 4045 ASSERT(pp->p_vnode);
4046 4046 ASSERT(PP_ISFREE(pp));
4047 4047 ASSERT(PP_ISAGED(pp) == 0);
4048 4048 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
4049 4049 mnode);
4050 4050 }
4051 4051
4052 4052 if (pp) {
4053 4053 page_t **ppp;
4054 4054 /*
4055 4055 * Found and locked a page.
4056 4056 * Pull it off the list.
4057 4057 */
4058 4058 ASSERT(mtype == PP_2_MTYPE(pp));
4059 4059 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
4060 4060 page_sub(ppp, pp);
4061 4061 /*
4062 4062 * Subtract counters before releasing pcm mutex
4063 4063 * to avoid a race with page_freelist_coalesce
4064 4064 * and page_freelist_split.
4065 4065 */
4066 4066 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
4067 4067 mutex_exit(pcm);
4068 4068 ASSERT(pp->p_vnode);
4069 4069 ASSERT(PP_ISAGED(pp) == 0);
4070 4070 #if defined(__sparc)
4071 4071 ASSERT(!kcage_on ||
4072 4072 (flags & PG_NORELOC) == 0 ||
4073 4073 PP_ISNORELOC(pp));
4074 4074 if (PP_ISNORELOC(pp)) {
4075 4075 kcage_freemem_sub(1);
4076 4076 }
4077 4077 #endif
4078 4078				VM_STAT_ADD(vmm_vmstats.pgmc_allocok);
4079 4079 return (pp);
4080 4080 }
4081 4081 bin_empty_0:
4082 4082 mutex_exit(pcm);
4083 4083 bin_empty_1:
4084 4084 if (plw_initialized == 0) {
4085 4085 page_list_walk_init(0, flags, bin, 0, 1, &plw);
4086 4086 plw_initialized = 1;
4087 4087 }
4088 4088 /* calculate the next bin with equivalent color */
4089 4089 bin = ADD_MASKED(bin, plw.plw_bin_step,
4090 4090 plw.plw_ceq_mask[0], plw.plw_color_mask);
4091 4091 } while (sbin != bin);
4092 4092
4093 4093 if (plw.plw_ceq_dif > 1)
4094 4094 bin = page_list_walk_next_bin(0, bin, &plw);
4095 4095 }
4096 4096
4097 4097 MTYPE_NEXT(mnode, mtype, flags);
4098 4098 if (mtype >= 0)
4099 4099 goto try_again;
4100 4100
4101 4101 VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
4102 4102 return (NULL);
4103 4103 }
4104 4104
4105 4105 #ifdef DEBUG
4106 4106 #define REPL_PAGE_STATS
4107 4107 #endif /* DEBUG */
4108 4108
4109 4109 #ifdef REPL_PAGE_STATS
4110 4110 struct repl_page_stats {
4111 4111 uint_t ngets;
4112 4112 uint_t ngets_noreloc;
4113 4113 uint_t npgr_noreloc;
4114 4114 uint_t nnopage_first;
4115 4115 uint_t nnopage;
4116 4116 uint_t nhashout;
4117 4117 uint_t nnofree;
4118 4118 uint_t nnext_pp;
4119 4119 } repl_page_stats;
4120 4120 #define REPL_STAT_INCR(v) atomic_inc_32(&repl_page_stats.v)
4121 4121 #else /* REPL_PAGE_STATS */
4122 4122 #define REPL_STAT_INCR(v)
4123 4123 #endif /* REPL_PAGE_STATS */
4124 4124
4125 4125 int pgrppgcp;
4126 4126
4127 4127 /*
4128 4128 * The freemem accounting must be done by the caller.
4129 4129  * First we try to get a replacement page of the same size as like_pp;
4130 4130  * if that is not possible, we just get a set of discontiguous
4131 4131  * PAGESIZE pages.
4132 4132 */
4133 4133 page_t *
4134 4134 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
4135 4135 uint_t pgrflags)
4136 4136 {
4137 4137 page_t *like_pp;
4138 4138 page_t *pp, *pplist;
4139 4139 page_t *pl = NULL;
4140 4140 ulong_t bin;
4141 4141 int mnode, page_mnode;
4142 4142 int szc;
4143 4143 spgcnt_t npgs, pg_cnt;
4144 4144 pfn_t pfnum;
4145 4145 int mtype;
4146 4146 int flags = 0;
4147 4147 lgrp_mnode_cookie_t lgrp_cookie;
4148 4148 lgrp_t *lgrp;
4149 4149
4150 4150 REPL_STAT_INCR(ngets);
4151 4151 like_pp = orig_like_pp;
4152 4152 ASSERT(PAGE_EXCL(like_pp));
4153 4153
4154 4154 szc = like_pp->p_szc;
4155 4155 npgs = page_get_pagecnt(szc);
4156 4156 /*
4157 4157 * Now we reset like_pp to the base page_t.
4158 4158 * That way, we won't walk past the end of this 'szc' page.
4159 4159 */
4160 4160 pfnum = PFN_BASE(like_pp->p_pagenum, szc);
4161 4161 like_pp = page_numtopp_nolock(pfnum);
4162 4162 ASSERT(like_pp->p_szc == szc);
4163 4163
4164 4164 if (PP_ISNORELOC(like_pp)) {
4165 4165 ASSERT(kcage_on);
4166 4166 REPL_STAT_INCR(ngets_noreloc);
4167 4167 flags = PGI_RELOCONLY;
4168 4168 } else if (pgrflags & PGR_NORELOC) {
4169 4169 ASSERT(kcage_on);
4170 4170 REPL_STAT_INCR(npgr_noreloc);
4171 4171 flags = PG_NORELOC;
4172 4172 }
4173 4173
4174 4174 /*
4175 4175 * Kernel pages must always be replaced with the same size
4176 4176 * pages, since we cannot properly handle demotion of kernel
4177 4177 * pages.
4178 4178 */
4179 4179 if (PP_ISKAS(like_pp))
4180 4180 pgrflags |= PGR_SAMESZC;
4181 4181
4182 - /* LINTED */
4183 - MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);
4182 + MTYPE_PGR_INIT(mtype, flags, like_pp, npgs);
4184 4183
4185 4184 while (npgs) {
4186 4185 pplist = NULL;
4187 4186 for (;;) {
4188 4187 pg_cnt = page_get_pagecnt(szc);
4189 4188 bin = PP_2_BIN(like_pp);
4190 4189 ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
4191 4190 ASSERT(pg_cnt <= npgs);
4192 4191
4193 4192 /*
4194 4193 * If an lgroup was specified, try to get the
4195 4194 * page from that lgroup.
4196 4195 * NOTE: Must be careful with code below because
4197 4196 * lgroup may disappear and reappear since there
4198 4197 * is no locking for lgroup here.
4199 4198 */
4200 4199 if (LGRP_EXISTS(lgrp_target)) {
4201 4200 /*
4202 4201 * Keep local variable for lgroup separate
4203 4202 * from lgroup argument since this code should
4204 4203 * only be exercised when lgroup argument
4205 4204 * exists....
4206 4205 */
4207 4206 lgrp = lgrp_target;
4208 4207
4209 4208 /* Try the lgroup's freelists first */
4210 4209 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4211 4210 LGRP_SRCH_LOCAL);
4212 4211 while ((pplist == NULL) &&
4213 4212 (mnode = lgrp_memnode_choose(&lgrp_cookie))
4214 4213 != -1) {
4215 4214 pplist =
4216 4215 page_get_mnode_freelist(mnode, bin,
4217 4216 mtype, szc, flags);
4218 4217 }
4219 4218
4220 4219 /*
4221 4220				 * Now try its cachelists if this is a
4222 4221 * small page. Don't need to do it for
4223 4222 * larger ones since page_freelist_coalesce()
4224 4223 * already failed.
4225 4224 */
4226 4225 if (pplist != NULL || szc != 0)
4227 4226 break;
4228 4227
4229 4228				/* Now try its cachelists */
4230 4229 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4231 4230 LGRP_SRCH_LOCAL);
4232 4231
4233 4232 while ((pplist == NULL) &&
4234 4233 (mnode = lgrp_memnode_choose(&lgrp_cookie))
4235 4234 != -1) {
4236 4235 pplist =
4237 4236 page_get_mnode_cachelist(bin, flags,
4238 4237 mnode, mtype);
4239 4238 }
4240 4239 if (pplist != NULL) {
4241 4240 page_hashout(pplist, NULL);
4242 4241 PP_SETAGED(pplist);
4243 4242 REPL_STAT_INCR(nhashout);
4244 4243 break;
4245 4244 }
4246 4245 /* Done looking in this lgroup. Bail out. */
4247 4246 break;
4248 4247 }
4249 4248
4250 4249 /*
4251 4250			 * No lgroup was specified (or the lgroup was removed by
4252 4251			 * DR), so just try to get the page as close to
4253 4252 * like_pp's mnode as possible.
4254 4253 * First try the local freelist...
4255 4254 */
4256 4255 mnode = PP_2_MEM_NODE(like_pp);
4257 4256 pplist = page_get_mnode_freelist(mnode, bin,
4258 4257 mtype, szc, flags);
4259 4258 if (pplist != NULL)
4260 4259 break;
4261 4260
4262 4261 REPL_STAT_INCR(nnofree);
4263 4262
4264 4263 /*
4265 4264 * ...then the local cachelist. Don't need to do it for
4266 4265			 * larger pages because page_freelist_coalesce() already
4267 4266 * failed there anyway.
4268 4267 */
4269 4268 if (szc == 0) {
4270 4269 pplist = page_get_mnode_cachelist(bin, flags,
4271 4270 mnode, mtype);
4272 4271 if (pplist != NULL) {
4273 4272 page_hashout(pplist, NULL);
4274 4273 PP_SETAGED(pplist);
4275 4274 REPL_STAT_INCR(nhashout);
4276 4275 break;
4277 4276 }
4278 4277 }
4279 4278
4280 4279 /* Now try remote freelists */
4281 4280 page_mnode = mnode;
4282 4281 lgrp =
4283 4282 lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
4284 4283 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4285 4284 LGRP_SRCH_HIER);
4286 4285 while (pplist == NULL &&
4287 4286 (mnode = lgrp_memnode_choose(&lgrp_cookie))
4288 4287 != -1) {
4289 4288 /*
4290 4289 * Skip local mnode.
4291 4290 */
4292 4291 if ((mnode == page_mnode) ||
4293 4292 (mem_node_config[mnode].exists == 0))
4294 4293 continue;
4295 4294
4296 4295 pplist = page_get_mnode_freelist(mnode,
4297 4296 bin, mtype, szc, flags);
4298 4297 }
4299 4298
4300 4299 if (pplist != NULL)
4301 4300 break;
4302 4301
4303 4302
4304 4303 /* Now try remote cachelists */
4305 4304 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4306 4305 LGRP_SRCH_HIER);
4307 4306 while (pplist == NULL && szc == 0) {
4308 4307 mnode = lgrp_memnode_choose(&lgrp_cookie);
4309 4308 if (mnode == -1)
4310 4309 break;
4311 4310 /*
4312 4311 * Skip local mnode.
4313 4312 */
4314 4313 if ((mnode == page_mnode) ||
4315 4314 (mem_node_config[mnode].exists == 0))
4316 4315 continue;
4317 4316
4318 4317 pplist = page_get_mnode_cachelist(bin,
4319 4318 flags, mnode, mtype);
4320 4319
4321 4320 if (pplist != NULL) {
4322 4321 page_hashout(pplist, NULL);
4323 4322 PP_SETAGED(pplist);
4324 4323 REPL_STAT_INCR(nhashout);
4325 4324 break;
4326 4325 }
4327 4326 }
4328 4327
4329 4328 /*
4330 4329 * Break out of while loop under the following cases:
4331 4330 * - If we successfully got a page.
4332 4331 * - If pgrflags specified only returning a specific
4333 4332 * page size and we could not find that page size.
4334 4333 * - If we could not satisfy the request with PAGESIZE
4335 4334 * or larger pages.
4336 4335 */
4337 4336 if (pplist != NULL || szc == 0)
4338 4337 break;
4339 4338
4340 4339 if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
4341 4340 /* try to find contig page */
4342 4341
4343 4342 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4344 4343 LGRP_SRCH_HIER);
4345 4344
4346 4345 while ((pplist == NULL) &&
4347 4346 (mnode =
4348 4347 lgrp_memnode_choose(&lgrp_cookie))
4349 4348 != -1) {
4350 4349 pplist = page_get_contig_pages(
4351 4350 mnode, bin, mtype, szc,
4352 4351 flags | PGI_PGCPHIPRI);
4353 4352 }
4354 4353 break;
4355 4354 }
4356 4355
4357 4356 /*
4358 4357 * The correct thing to do here is try the next
4359 4358 * page size down using szc--. Due to a bug
4360 4359 * with the processing of HAT_RELOAD_SHARE
4361 4360 * where the sfmmu_ttecnt arrays of all
4362 4361 * hats sharing an ISM segment don't get updated,
4363 4362 * using intermediate size pages for relocation
4364 4363 * can lead to continuous page faults.
4365 4364 */
4366 4365 szc = 0;
4367 4366 }
4368 4367
4369 4368 if (pplist != NULL) {
4370 4369 DTRACE_PROBE4(page__get,
4371 4370 lgrp_t *, lgrp,
4372 4371 int, mnode,
4373 4372 ulong_t, bin,
4374 4373 uint_t, flags);
4375 4374
4376 4375 while (pplist != NULL && pg_cnt--) {
4377 4376 ASSERT(pplist != NULL);
4378 4377 pp = pplist;
4379 4378 page_sub(&pplist, pp);
4380 4379 PP_CLRFREE(pp);
4381 4380 PP_CLRAGED(pp);
4382 4381 page_list_concat(&pl, &pp);
4383 4382 npgs--;
4384 4383 like_pp = like_pp + 1;
4385 4384 REPL_STAT_INCR(nnext_pp);
4386 4385 }
4387 4386 ASSERT(pg_cnt == 0);
4388 4387 } else {
4389 4388 break;
4390 4389 }
4391 4390 }
4392 4391
4393 4392 if (npgs) {
4394 4393 /*
4395 4394 * We were unable to allocate the necessary number
4396 4395 * of pages.
4397 4396 * We need to free up any pl.
4398 4397 */
4399 4398 REPL_STAT_INCR(nnopage);
4400 4399 page_free_replacement_page(pl);
4401 4400 return (NULL);
4402 4401 } else {
4403 4402 return (pl);
4404 4403 }
4405 4404 }
4406 4405
4407 4406 /*
4408 4407 * demote a free large page to its constituent pages
4409 4408 */
4410 4409 void
4411 4410 page_demote_free_pages(page_t *pp)
4412 4411 {
4413 4412
4414 4413 int mnode;
4415 4414
4416 4415 ASSERT(pp != NULL);
4417 4416 ASSERT(PAGE_LOCKED(pp));
4418 4417 ASSERT(PP_ISFREE(pp));
4419 4418 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
4420 4419
4421 4420 mnode = PP_2_MEM_NODE(pp);
4422 4421 page_freelist_lock(mnode);
4423 4422 if (pp->p_szc != 0) {
4424 4423 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
4425 4424 pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
4426 4425 }
4427 4426 page_freelist_unlock(mnode);
4428 4427 ASSERT(pp->p_szc == 0);
4429 4428 }
4430 4429
4431 4430 /*
4432 4431 * Factor in colorequiv to check additional 'equivalent' bins.
4433 4432 * colorequiv may be set in /etc/system
4434 4433 */
4435 4434 void
4436 4435 page_set_colorequiv_arr(void)
4437 4436 {
4438 4437 if (colorequiv > 1) {
4439 4438 int i;
4440 4439 uint_t sv_a = lowbit(colorequiv) - 1;
4441 4440
4442 4441 if (sv_a > 15)
4443 4442 sv_a = 15;
4444 4443
4445 4444 for (i = 0; i < MMU_PAGE_SIZES; i++) {
4446 4445 uint_t colors;
4447 4446 uint_t a = sv_a;
4448 4447
4449 4448 if ((colors = hw_page_array[i].hp_colors) <= 1) {
4450 4449 continue;
4451 4450 }
4452 4451 while ((colors >> a) == 0)
4453 4452 a--;
4454 4453 if ((a << 4) > colorequivszc[i]) {
4455 4454 colorequivszc[i] = (a << 4);
4456 4455 }
4457 4456 }
4458 4457 }
4459 4458 }
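/*
 * Illustrative example (hypothetical values, a sketch of the loop above):
 * with colorequiv = 4 set in /etc/system and a page size having 32 hardware
 * colors, sv_a = lowbit(4) - 1 = 2, (32 >> 2) != 0 so a stays 2, and
 * colorequivszc[i] becomes 0x20, i.e. the two high-order color bits are
 * ignored and groups of four colors are treated as equivalent.
 */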