Print this page
7029 want per-process exploit mitigation features (secflags)
7030 want basic address space layout randomization (aslr)
7031 noexec_user_stack should be a secflag
7032 want a means to forbid mappings around NULL.
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/sun4/vm/vm_dep.c
+++ new/usr/src/uts/sun4/vm/vm_dep.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25
26 26 /*
27 27 * UNIX machine dependent virtual memory support.
28 28 */
29 29
↓ open down ↓ |
29 lines elided |
↑ open up ↑ |
30 30 #include <sys/vm.h>
31 31 #include <sys/exec.h>
32 32
33 33 #include <sys/exechdr.h>
34 34 #include <vm/seg_kmem.h>
35 35 #include <sys/atomic.h>
36 36 #include <sys/archsystm.h>
37 37 #include <sys/machsystm.h>
38 38 #include <sys/kdi.h>
39 39 #include <sys/cpu_module.h>
40 +#include <sys/secflags.h>
40 41
41 42 #include <vm/hat_sfmmu.h>
42 43
43 44 #include <sys/memnode.h>
44 45
45 46 #include <sys/mem_config.h>
46 47 #include <sys/mem_cage.h>
47 48 #include <vm/vm_dep.h>
48 49 #include <vm/page.h>
49 50 #include <sys/platform_module.h>
50 51
51 52 /*
52 53 * These variables are set by module specific config routines.
53 54 * They are only set by modules which will use physical cache page coloring.
54 55 */
55 56 int do_pg_coloring = 0;
56 57
57 58 /*
58 59 * These variables can be conveniently patched at kernel load time to
59 60 * prevent do_pg_coloring from being enabled by
60 61 * module specific config routines.
61 62 */
62 63
63 64 int use_page_coloring = 1;
64 65
65 66 /*
66 67 * initialized by page_coloring_init()
67 68 */
68 69 extern uint_t page_colors;
69 70 extern uint_t page_colors_mask;
70 71 extern uint_t page_coloring_shift;
71 72 int cpu_page_colors;
72 73 uint_t vac_colors = 0;
73 74 uint_t vac_colors_mask = 0;
74 75
75 76 /* cpu specific coloring initialization */
76 77 extern void page_coloring_init_cpu();
77 78 #pragma weak page_coloring_init_cpu
78 79
79 80 /*
80 81 * get the ecache setsize for the current cpu.
81 82 */
82 83 #define CPUSETSIZE() (cpunodes[CPU->cpu_id].ecache_setsize)
83 84
84 85 plcnt_t plcnt; /* page list count */
85 86
86 87 /*
87 88 * This variable is set by the cpu module to contain the lowest
88 89 * address not affected by the SF_ERRATA_57 workaround. It should
89 90 * remain 0 if the workaround is not needed.
90 91 */
91 92 #if defined(SF_ERRATA_57)
92 93 caddr_t errata57_limit;
93 94 #endif
94 95
95 96 extern void page_relocate_hash(page_t *, page_t *);
96 97
97 98 /*
98 99 * these must be defined in platform specific areas
99 100 */
100 101 extern void map_addr_proc(caddr_t *, size_t, offset_t, int, caddr_t,
101 102 struct proc *, uint_t);
102 103 extern page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *,
103 104 caddr_t, size_t, uint_t, struct lgrp *);
104 105 /*
105 106 * Convert page frame number to an OBMEM page frame number
106 107 * (i.e. put in the type bits -- zero for this implementation)
107 108 */
108 109 pfn_t
109 110 impl_obmem_pfnum(pfn_t pf)
110 111 {
111 112 return (pf);
112 113 }
113 114
114 115 /*
115 116 * Use physmax to determine the highest physical page of DRAM memory
116 117 * It is assumed that any physical addresses above physmax is in IO space.
117 118 * We don't bother checking the low end because we assume that memory space
118 119 * begins at physical page frame 0.
119 120 *
120 121 * Return 1 if the page frame is onboard DRAM memory, else 0.
121 122 * Returns 0 for nvram so it won't be cached.
122 123 */
123 124 int
124 125 pf_is_memory(pfn_t pf)
125 126 {
126 127 /* We must be IO space */
127 128 if (pf > physmax)
128 129 return (0);
129 130
130 131 /* We must be memory space */
131 132 return (1);
132 133 }
133 134
134 135 /*
135 136 * Handle a pagefault.
136 137 */
137 138 faultcode_t
138 139 pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel)
139 140 {
140 141 struct as *as;
141 142 struct proc *p;
142 143 faultcode_t res;
143 144 caddr_t base;
144 145 size_t len;
145 146 int err;
146 147
147 148 if (INVALID_VADDR(addr))
148 149 return (FC_NOMAP);
149 150
150 151 if (iskernel) {
151 152 as = &kas;
152 153 } else {
153 154 p = curproc;
154 155 as = p->p_as;
155 156 #if defined(SF_ERRATA_57)
156 157 /*
157 158 * Prevent infinite loops due to a segment driver
158 159 * setting the execute permissions and the sfmmu hat
159 160 * silently ignoring them.
160 161 */
161 162 if (rw == S_EXEC && AS_TYPE_64BIT(as) &&
162 163 addr < errata57_limit) {
163 164 res = FC_NOMAP;
164 165 goto out;
165 166 }
166 167 #endif
167 168 }
168 169
169 170 /*
170 171 * Dispatch pagefault.
171 172 */
172 173 res = as_fault(as->a_hat, as, addr, 1, type, rw);
173 174
174 175 /*
175 176 * If this isn't a potential unmapped hole in the user's
176 177 * UNIX data or stack segments, just return status info.
177 178 */
178 179 if (!(res == FC_NOMAP && iskernel == 0))
179 180 goto out;
180 181
181 182 /*
182 183 * Check to see if we happened to faulted on a currently unmapped
183 184 * part of the UNIX data or stack segments. If so, create a zfod
184 185 * mapping there and then try calling the fault routine again.
185 186 */
186 187 base = p->p_brkbase;
187 188 len = p->p_brksize;
188 189
189 190 if (addr < base || addr >= base + len) { /* data seg? */
190 191 base = (caddr_t)(p->p_usrstack - p->p_stksize);
191 192 len = p->p_stksize;
192 193 if (addr < base || addr >= p->p_usrstack) { /* stack seg? */
193 194 /* not in either UNIX data or stack segments */
194 195 res = FC_NOMAP;
195 196 goto out;
196 197 }
197 198 }
198 199
199 200 /* the rest of this function implements a 3.X 4.X 5.X compatibility */
200 201 /* This code is probably not needed anymore */
201 202
202 203 /* expand the gap to the page boundaries on each side */
203 204 len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) -
204 205 ((uintptr_t)base & PAGEMASK);
205 206 base = (caddr_t)((uintptr_t)base & PAGEMASK);
206 207
207 208 as_rangelock(as);
208 209 as_purge(as);
209 210 if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) {
210 211 err = as_map(as, base, len, segvn_create, zfod_argsp);
211 212 as_rangeunlock(as);
212 213 if (err) {
213 214 res = FC_MAKE_ERR(err);
214 215 goto out;
215 216 }
216 217 } else {
217 218 /*
218 219 * This page is already mapped by another thread after we
219 220 * returned from as_fault() above. We just fallthrough
220 221 * as_fault() below.
221 222 */
222 223 as_rangeunlock(as);
223 224 }
224 225
225 226 res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw);
226 227
227 228 out:
228 229
229 230 return (res);
230 231 }
231 232
232 233 /*
233 234 * This is the routine which defines the address limit implied
234 235 * by the flag '_MAP_LOW32'. USERLIMIT32 matches the highest
235 236 * mappable address in a 32-bit process on this platform (though
236 237 * perhaps we should make it be UINT32_MAX here?)
237 238 */
238 239 void
239 240 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
240 241 {
241 242 struct proc *p = curproc;
242 243 caddr_t userlimit = flags & _MAP_LOW32 ?
243 244 (caddr_t)USERLIMIT32 : p->p_as->a_userlimit;
244 245 map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags);
245 246 }
246 247
247 248 /*
248 249 * Some V9 CPUs have holes in the middle of the 64-bit virtual address range.
249 250 */
250 251 caddr_t hole_start, hole_end;
251 252
252 253 /*
253 254 * kpm mapping window
254 255 */
255 256 caddr_t kpm_vbase;
256 257 size_t kpm_size;
257 258 uchar_t kpm_size_shift;
258 259
259 260 int valid_va_range_aligned_wraparound;
260 261 /*
261 262 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
262 263 * addresses at least "minlen" long, where the base of the range is at "off"
263 264 * phase from an "align" boundary and there is space for a "redzone"-sized
264 265 * redzone on either side of the range. On success, 1 is returned and *basep
265 266 * and *lenp are adjusted to describe the acceptable range (including
266 267 * the redzone). On failure, 0 is returned.
267 268 */
268 269 int
269 270 valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
270 271 size_t align, size_t redzone, size_t off)
271 272 {
272 273 caddr_t hi, lo;
273 274 size_t tot_len;
274 275
275 276 ASSERT(align == 0 ? off == 0 : off < align);
276 277 ASSERT(ISP2(align));
277 278 ASSERT(align == 0 || align >= PAGESIZE);
278 279
279 280 lo = *basep;
280 281 hi = lo + *lenp;
281 282 tot_len = minlen + 2 * redzone; /* need at least this much space */
282 283
283 284 /* If hi rolled over the top try cutting back. */
284 285 if (hi < lo) {
285 286 *lenp = 0UL - (uintptr_t)lo - 1UL;
286 287 /* Trying to see if this really happens, and then if so, why */
287 288 valid_va_range_aligned_wraparound++;
288 289 hi = lo + *lenp;
289 290 }
290 291 if (*lenp < tot_len) {
291 292 return (0);
292 293 }
293 294
294 295 /*
295 296 * Deal with a possible hole in the address range between
296 297 * hole_start and hole_end that should never be mapped by the MMU.
297 298 */
298 299
299 300 if (lo < hole_start) {
300 301 if (hi > hole_start)
301 302 if (hi < hole_end)
302 303 hi = hole_start;
303 304 else
304 305 /* lo < hole_start && hi >= hole_end */
305 306 if (dir == AH_LO) {
306 307 /*
307 308 * prefer lowest range
308 309 */
309 310 if (hole_start - lo >= tot_len)
310 311 hi = hole_start;
311 312 else if (hi - hole_end >= tot_len)
312 313 lo = hole_end;
313 314 else
314 315 return (0);
315 316 } else {
316 317 /*
317 318 * prefer highest range
318 319 */
319 320 if (hi - hole_end >= tot_len)
320 321 lo = hole_end;
321 322 else if (hole_start - lo >= tot_len)
322 323 hi = hole_start;
323 324 else
324 325 return (0);
325 326 }
326 327 } else {
327 328 /* lo >= hole_start */
328 329 if (hi < hole_end)
329 330 return (0);
330 331 if (lo < hole_end)
331 332 lo = hole_end;
332 333 }
333 334
334 335 /* Check if remaining length is too small */
335 336 if (hi - lo < tot_len) {
336 337 return (0);
337 338 }
338 339 if (align > 1) {
339 340 caddr_t tlo = lo + redzone;
340 341 caddr_t thi = hi - redzone;
341 342 tlo = (caddr_t)P2PHASEUP((uintptr_t)tlo, align, off);
342 343 if (tlo < lo + redzone) {
343 344 return (0);
344 345 }
345 346 if (thi < tlo || thi - tlo < minlen) {
346 347 return (0);
347 348 }
348 349 }
349 350 *basep = lo;
350 351 *lenp = hi - lo;
351 352 return (1);
352 353 }
353 354
354 355 /*
355 356 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
356 357 * addresses at least "minlen" long. On success, 1 is returned and *basep
↓ open down ↓ |
307 lines elided |
↑ open up ↑ |
357 358 * and *lenp are adjusted to describe the acceptable range. On failure, 0
358 359 * is returned.
359 360 */
360 361 int
361 362 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
362 363 {
363 364 return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
364 365 }
365 366
366 367 /*
368 + * Default to forbidding the first 64k of address space. This protects most
369 + * reasonably sized structures from dereferences through NULL:
370 + * ((foo_t *)0)->bar
371 + */
372 +uintptr_t forbidden_null_mapping_sz = 0x10000;
373 +
374 +/*
367 375 * Determine whether [addr, addr+len] with protections `prot' are valid
368 376 * for a user address space.
369 377 */
370 378 /*ARGSUSED*/
371 379 int
372 380 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
373 381 caddr_t userlimit)
374 382 {
375 383 caddr_t eaddr = addr + len;
376 384
377 385 if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
378 386 return (RANGE_BADADDR);
379 387
388 + if ((addr <= (caddr_t)forbidden_null_mapping_sz) &&
389 + secflag_enabled(as->a_proc, PROC_SEC_FORBIDNULLMAP))
390 + return (RANGE_BADADDR);
391 +
380 392 /*
381 393 * Determine if the address range falls within an illegal
382 394 * range of the MMU.
383 395 */
384 396 if (eaddr > hole_start && addr < hole_end)
385 397 return (RANGE_BADADDR);
386 398
387 399 #if defined(SF_ERRATA_57)
388 400 /*
389 401 * Make sure USERLIMIT isn't raised too high
390 402 */
391 403 ASSERT64(addr <= (caddr_t)0xffffffff80000000ul ||
392 404 errata57_limit == 0);
393 405
394 406 if (AS_TYPE_64BIT(as) &&
395 407 (addr < errata57_limit) &&
396 408 (prot & PROT_EXEC))
397 409 return (RANGE_BADPROT);
398 410 #endif /* SF_ERRATA57 */
399 411 return (RANGE_OKAY);
400 412 }
401 413
402 414 /*
403 415 * Routine used to check to see if an a.out can be executed
404 416 * by the current machine/architecture.
405 417 */
406 418 int
407 419 chkaout(struct exdata *exp)
408 420 {
409 421 if (exp->ux_mach == M_SPARC)
410 422 return (0);
411 423 else
412 424 return (ENOEXEC);
413 425 }
414 426
415 427 /*
416 428 * The following functions return information about an a.out
417 429 * which is used when a program is executed.
418 430 */
419 431
420 432 /*
421 433 * Return the load memory address for the data segment.
422 434 */
423 435 caddr_t
424 436 getdmem(struct exec *exp)
425 437 {
426 438 /*
427 439 * XXX - Sparc Reference Hack approaching
428 440 * Remember that we are loading
429 441 * 8k executables into a 4k machine
430 442 * DATA_ALIGN == 2 * PAGESIZE
431 443 */
432 444 if (exp->a_text)
433 445 return ((caddr_t)(roundup(USRTEXT + exp->a_text, DATA_ALIGN)));
434 446 else
435 447 return ((caddr_t)USRTEXT);
436 448 }
437 449
438 450 /*
439 451 * Return the starting disk address for the data segment.
440 452 */
441 453 ulong_t
442 454 getdfile(struct exec *exp)
443 455 {
444 456 if (exp->a_magic == ZMAGIC)
445 457 return (exp->a_text);
446 458 else
447 459 return (sizeof (struct exec) + exp->a_text);
448 460 }
449 461
450 462 /*
451 463 * Return the load memory address for the text segment.
452 464 */
453 465
454 466 /*ARGSUSED*/
455 467 caddr_t
456 468 gettmem(struct exec *exp)
457 469 {
458 470 return ((caddr_t)USRTEXT);
459 471 }
460 472
461 473 /*
462 474 * Return the file byte offset for the text segment.
463 475 */
464 476 uint_t
465 477 gettfile(struct exec *exp)
466 478 {
467 479 if (exp->a_magic == ZMAGIC)
468 480 return (0);
469 481 else
470 482 return (sizeof (struct exec));
471 483 }
472 484
473 485 void
474 486 getexinfo(
475 487 struct exdata *edp_in,
476 488 struct exdata *edp_out,
477 489 int *pagetext,
478 490 int *pagedata)
479 491 {
480 492 *edp_out = *edp_in; /* structure copy */
481 493
482 494 if ((edp_in->ux_mag == ZMAGIC) &&
483 495 ((edp_in->vp->v_flag & VNOMAP) == 0)) {
484 496 *pagetext = 1;
485 497 *pagedata = 1;
486 498 } else {
487 499 *pagetext = 0;
488 500 *pagedata = 0;
489 501 }
490 502 }
491 503
492 504 /*
493 505 * Return non 0 value if the address may cause a VAC alias with KPM mappings.
494 506 * KPM selects an address such that it's equal offset modulo shm_alignment and
495 507 * assumes it can't be in VAC conflict with any larger than PAGESIZE mapping.
496 508 */
497 509 int
498 510 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
499 511 {
500 512 if (vac) {
501 513 return (((uintptr_t)addr ^ off) & shm_alignment - 1);
502 514 } else {
503 515 return (0);
504 516 }
505 517 }
506 518
507 519 /*
508 520 * Sanity control. Don't use large pages regardless of user
509 521 * settings if there's less than privm_lpg_min_physmem or shm_lpg_min_physmem
510 522 * memory installed. The units for these variables are 8K pages.
511 523 */
512 524 pgcnt_t shm_lpg_min_physmem = 131072; /* 1GB */
513 525 pgcnt_t privm_lpg_min_physmem = 131072; /* 1GB */
514 526
515 527 static size_t
516 528 map_pgszheap(struct proc *p, caddr_t addr, size_t len)
517 529 {
518 530 size_t pgsz = MMU_PAGESIZE;
519 531 int szc;
520 532
521 533 /*
522 534 * If len is zero, retrieve from proc and don't demote the page size.
523 535 * Use atleast the default pagesize.
524 536 */
525 537 if (len == 0) {
526 538 len = p->p_brkbase + p->p_brksize - p->p_bssbase;
527 539 }
528 540 len = MAX(len, default_uheap_lpsize);
529 541
530 542 for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
531 543 pgsz = hw_page_array[szc].hp_size;
532 544 if ((disable_auto_data_large_pages & (1 << szc)) ||
533 545 pgsz > max_uheap_lpsize)
534 546 continue;
535 547 if (len >= pgsz) {
536 548 break;
537 549 }
538 550 }
539 551
540 552 /*
541 553 * If addr == 0 we were called by memcntl() when the
542 554 * size code is 0. Don't set pgsz less than current size.
543 555 */
544 556 if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) {
545 557 pgsz = hw_page_array[p->p_brkpageszc].hp_size;
546 558 }
547 559
548 560 return (pgsz);
549 561 }
550 562
551 563 static size_t
552 564 map_pgszstk(struct proc *p, caddr_t addr, size_t len)
553 565 {
554 566 size_t pgsz = MMU_PAGESIZE;
555 567 int szc;
556 568
557 569 /*
558 570 * If len is zero, retrieve from proc and don't demote the page size.
559 571 * Use atleast the default pagesize.
560 572 */
561 573 if (len == 0) {
562 574 len = p->p_stksize;
563 575 }
564 576 len = MAX(len, default_ustack_lpsize);
565 577
566 578 for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
567 579 pgsz = hw_page_array[szc].hp_size;
568 580 if ((disable_auto_data_large_pages & (1 << szc)) ||
569 581 pgsz > max_ustack_lpsize)
570 582 continue;
571 583 if (len >= pgsz) {
572 584 break;
573 585 }
574 586 }
575 587
576 588 /*
577 589 * If addr == 0 we were called by memcntl() or exec_args() when the
578 590 * size code is 0. Don't set pgsz less than current size.
579 591 */
580 592 if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) {
581 593 pgsz = hw_page_array[p->p_stkpageszc].hp_size;
582 594 }
583 595
584 596 return (pgsz);
585 597 }
586 598
587 599 static size_t
588 600 map_pgszism(caddr_t addr, size_t len)
589 601 {
590 602 uint_t szc;
591 603 size_t pgsz;
592 604
593 605 for (szc = mmu_page_sizes - 1; szc >= TTE4M; szc--) {
594 606 if (disable_ism_large_pages & (1 << szc))
595 607 continue;
596 608
597 609 pgsz = hw_page_array[szc].hp_size;
598 610 if ((len >= pgsz) && IS_P2ALIGNED(addr, pgsz))
599 611 return (pgsz);
600 612 }
601 613
602 614 return (DEFAULT_ISM_PAGESIZE);
603 615 }
604 616
605 617 /*
606 618 * Suggest a page size to be used to map a segment of type maptype and length
607 619 * len. Returns a page size (not a size code).
608 620 */
609 621 /* ARGSUSED */
610 622 size_t
611 623 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
612 624 {
613 625 size_t pgsz = MMU_PAGESIZE;
614 626
615 627 ASSERT(maptype != MAPPGSZ_VA);
616 628
617 629 if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
618 630 return (MMU_PAGESIZE);
619 631 }
620 632
621 633 switch (maptype) {
622 634 case MAPPGSZ_ISM:
623 635 pgsz = map_pgszism(addr, len);
624 636 break;
625 637
626 638 case MAPPGSZ_STK:
627 639 if (max_ustack_lpsize > MMU_PAGESIZE) {
628 640 pgsz = map_pgszstk(p, addr, len);
629 641 }
630 642 break;
631 643
632 644 case MAPPGSZ_HEAP:
633 645 if (max_uheap_lpsize > MMU_PAGESIZE) {
634 646 pgsz = map_pgszheap(p, addr, len);
635 647 }
636 648 break;
637 649 }
638 650 return (pgsz);
639 651 }
640 652
641 653
642 654 /* assumes TTE8K...TTE4M == szc */
643 655
644 656 static uint_t
645 657 map_szcvec(caddr_t addr, size_t size, uintptr_t off, int disable_lpgs,
646 658 size_t max_lpsize, size_t min_physmem)
647 659 {
648 660 caddr_t eaddr = addr + size;
649 661 uint_t szcvec = 0;
650 662 caddr_t raddr;
651 663 caddr_t readdr;
652 664 size_t pgsz;
653 665 int i;
654 666
655 667 if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
656 668 return (0);
657 669 }
658 670 for (i = mmu_page_sizes - 1; i > 0; i--) {
659 671 if (disable_lpgs & (1 << i)) {
660 672 continue;
661 673 }
662 674 pgsz = page_get_pagesize(i);
663 675 if (pgsz > max_lpsize) {
664 676 continue;
665 677 }
666 678 raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
667 679 readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
668 680 if (raddr < addr || raddr >= readdr) {
669 681 continue;
670 682 }
671 683 if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
672 684 continue;
673 685 }
674 686 szcvec |= (1 << i);
675 687 /*
676 688 * And or in the remaining enabled page sizes.
677 689 */
678 690 szcvec |= P2PHASE(~disable_lpgs, (1 << i));
679 691 szcvec &= ~1; /* no need to return 8K pagesize */
680 692 break;
681 693 }
682 694 return (szcvec);
683 695 }
684 696
685 697 /*
686 698 * Return a bit vector of large page size codes that
687 699 * can be used to map [addr, addr + len) region.
688 700 */
689 701 /* ARGSUSED */
690 702 uint_t
691 703 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
692 704 int memcntl)
693 705 {
694 706 if (flags & MAP_TEXT) {
695 707 return (map_szcvec(addr, size, off,
696 708 disable_auto_text_large_pages,
697 709 max_utext_lpsize, shm_lpg_min_physmem));
698 710
699 711 } else if (flags & MAP_INITDATA) {
700 712 return (map_szcvec(addr, size, off,
701 713 disable_auto_data_large_pages,
702 714 max_uidata_lpsize, privm_lpg_min_physmem));
703 715
704 716 } else if (type == MAPPGSZC_SHM) {
705 717 return (map_szcvec(addr, size, off,
706 718 disable_auto_data_large_pages,
707 719 max_shm_lpsize, shm_lpg_min_physmem));
708 720
709 721 } else if (type == MAPPGSZC_HEAP) {
710 722 return (map_szcvec(addr, size, off,
711 723 disable_auto_data_large_pages,
712 724 max_uheap_lpsize, privm_lpg_min_physmem));
713 725
714 726 } else if (type == MAPPGSZC_STACK) {
715 727 return (map_szcvec(addr, size, off,
716 728 disable_auto_data_large_pages,
717 729 max_ustack_lpsize, privm_lpg_min_physmem));
718 730
719 731 } else {
720 732 return (map_szcvec(addr, size, off,
721 733 disable_auto_data_large_pages,
722 734 max_privmap_lpsize, privm_lpg_min_physmem));
723 735 }
724 736 }
725 737
726 738 /*
727 739 * Anchored in the table below are counters used to keep track
728 740 * of free contiguous physical memory. Each element of the table contains
729 741 * the array of counters, the size of array which is allocated during
730 742 * startup based on physmax and a shift value used to convert a pagenum
731 743 * into a counter array index or vice versa. The table has page size
732 744 * for rows and region size for columns:
733 745 *
734 746 * page_counters[page_size][region_size]
735 747 *
736 748 * page_size: TTE size code of pages on page_size freelist.
737 749 *
738 750 * region_size: TTE size code of a candidate larger page made up
739 751 * of contiguous free page_size pages.
740 752 *
741 753 * As you go across a page_size row increasing region_size each
742 754 * element keeps track of how many (region_size - 1) size groups
743 755 * made up of page_size free pages can be coalesced into a
744 756 * region_size page. Yuck! Let's try an example:
745 757 *
746 758 * page_counters[1][3] is the table element used for identifying
747 759 * candidate 4M pages from contiguous pages off the 64K free list.
748 760 * Each index in the page_counters[1][3].array spans 4M. It's the
749 761 * number of free 512K size (region_size - 1) groups of contiguous
750 762 * 64K free pages. So when page_counters[1][3].counters[n] == 8
751 763 * we know we have a candidate 4M page made up of 512K size groups
752 764 * of 64K free pages.
753 765 */
754 766
755 767 /*
756 768 * Per page size free lists. 3rd (max_mem_nodes) and 4th (page coloring bins)
757 769 * dimensions are allocated dynamically.
758 770 */
759 771 page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];
760 772
761 773 /*
762 774 * For now there is only a single size cache list.
763 775 * Allocated dynamically.
764 776 */
765 777 page_t ***page_cachelists[MAX_MEM_TYPES];
766 778
767 779 kmutex_t *fpc_mutex[NPC_MUTEX];
768 780 kmutex_t *cpc_mutex[NPC_MUTEX];
769 781
770 782 /*
771 783 * Calculate space needed for page freelists and counters
772 784 */
773 785 size_t
774 786 calc_free_pagelist_sz(void)
775 787 {
776 788 int szc;
777 789 size_t alloc_sz, cache_sz, free_sz;
778 790
779 791 /*
780 792 * one cachelist per color, node, and type
781 793 */
782 794 cache_sz = (page_get_pagecolors(0) * sizeof (page_t *)) +
783 795 sizeof (page_t **);
784 796 cache_sz *= max_mem_nodes * MAX_MEM_TYPES;
785 797
786 798 /*
787 799 * one freelist per size, color, node, and type
788 800 */
789 801 free_sz = sizeof (page_t **);
790 802 for (szc = 0; szc < mmu_page_sizes; szc++)
791 803 free_sz += sizeof (page_t *) * page_get_pagecolors(szc);
792 804 free_sz *= max_mem_nodes * MAX_MEM_TYPES;
793 805
794 806 alloc_sz = cache_sz + free_sz + page_ctrs_sz();
795 807 return (alloc_sz);
796 808 }
797 809
798 810 caddr_t
799 811 alloc_page_freelists(caddr_t alloc_base)
800 812 {
801 813 int mnode, mtype;
802 814 int szc, clrs;
803 815
804 816 /*
805 817 * We only support small pages in the cachelist.
806 818 */
807 819 for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
808 820 page_cachelists[mtype] = (page_t ***)alloc_base;
809 821 alloc_base += (max_mem_nodes * sizeof (page_t **));
810 822 for (mnode = 0; mnode < max_mem_nodes; mnode++) {
811 823 page_cachelists[mtype][mnode] = (page_t **)alloc_base;
812 824 alloc_base +=
813 825 (page_get_pagecolors(0) * sizeof (page_t *));
814 826 }
815 827 }
816 828
817 829 /*
818 830 * Allocate freelists bins for all
819 831 * supported page sizes.
820 832 */
821 833 for (szc = 0; szc < mmu_page_sizes; szc++) {
822 834 clrs = page_get_pagecolors(szc);
823 835 for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
824 836 page_freelists[szc][mtype] = (page_t ***)alloc_base;
825 837 alloc_base += (max_mem_nodes * sizeof (page_t **));
826 838 for (mnode = 0; mnode < max_mem_nodes; mnode++) {
827 839 page_freelists[szc][mtype][mnode] =
828 840 (page_t **)alloc_base;
829 841 alloc_base += (clrs * (sizeof (page_t *)));
830 842 }
831 843 }
832 844 }
833 845
834 846 alloc_base = page_ctrs_alloc(alloc_base);
835 847 return (alloc_base);
836 848 }
837 849
838 850 /*
839 851 * Allocate page_freelists locks for a memnode from the nucleus data
840 852 * area. This is the first time that mmu_page_sizes is used during
841 853 * bootup, so check mmu_page_sizes initialization.
842 854 */
843 855 int
844 856 ndata_alloc_page_mutexs(struct memlist *ndata)
845 857 {
846 858 size_t alloc_sz;
847 859 caddr_t alloc_base;
848 860 int i;
849 861 void page_coloring_init();
850 862
851 863 page_coloring_init();
852 864 if (&mmu_init_mmu_page_sizes) {
853 865 if (!mmu_init_mmu_page_sizes(0)) {
854 866 cmn_err(CE_PANIC, "mmu_page_sizes %d not initialized",
855 867 mmu_page_sizes);
856 868 }
857 869 }
858 870 ASSERT(mmu_page_sizes >= DEFAULT_MMU_PAGE_SIZES);
859 871
860 872 /* fpc_mutex and cpc_mutex */
861 873 alloc_sz = 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t);
862 874
863 875 alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
864 876 if (alloc_base == NULL)
865 877 return (-1);
866 878
867 879 ASSERT(((uintptr_t)alloc_base & (ecache_alignsize - 1)) == 0);
868 880
869 881 for (i = 0; i < NPC_MUTEX; i++) {
870 882 fpc_mutex[i] = (kmutex_t *)alloc_base;
871 883 alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
872 884 cpc_mutex[i] = (kmutex_t *)alloc_base;
873 885 alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
874 886 }
875 887 return (0);
876 888 }
877 889
878 890 /*
879 891 * To select our starting bin, we stride through the bins with a stride
880 892 * of 337. Why 337? It's prime, it's largeish, and it performs well both
881 893 * in simulation and practice for different workloads on varying cache sizes.
882 894 */
883 895 uint32_t color_start_current = 0;
884 896 uint32_t color_start_stride = 337;
885 897 int color_start_random = 0;
886 898
887 899 /* ARGSUSED */
888 900 uint_t
889 901 get_color_start(struct as *as)
890 902 {
891 903 uint32_t old, new;
892 904
893 905 if (consistent_coloring == 2 || color_start_random) {
894 906 return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) &
895 907 (hw_page_array[0].hp_colors - 1)));
896 908 }
897 909
898 910 do {
899 911 old = color_start_current;
900 912 new = old + (color_start_stride << (vac_shift - MMU_PAGESHIFT));
901 913 } while (atomic_cas_32(&color_start_current, old, new) != old);
902 914
903 915 return ((uint_t)(new));
904 916 }
905 917
906 918 /*
907 919 * Called once at startup from kphysm_init() -- before memialloc()
908 920 * is invoked to do the 1st page_free()/page_freelist_add().
909 921 *
910 922 * initializes page_colors and page_colors_mask based on ecache_setsize.
911 923 *
912 924 * Also initializes the counter locks.
913 925 */
914 926 void
915 927 page_coloring_init()
916 928 {
917 929 int a, i;
918 930 uint_t colors;
919 931
920 932 if (do_pg_coloring == 0) {
921 933 page_colors = 1;
922 934 for (i = 0; i < mmu_page_sizes; i++) {
923 935 colorequivszc[i] = 0;
924 936 hw_page_array[i].hp_colors = 1;
925 937 }
926 938 return;
927 939 }
928 940
929 941 /*
930 942 * Calculate page_colors from ecache_setsize. ecache_setsize contains
931 943 * the max ecache setsize of all cpus configured in the system or, for
932 944 * cheetah+ systems, the max possible ecache setsize for all possible
933 945 * cheetah+ cpus.
934 946 */
935 947 page_colors = ecache_setsize / MMU_PAGESIZE;
936 948 page_colors_mask = page_colors - 1;
937 949
938 950 vac_colors = vac_size / MMU_PAGESIZE;
939 951 vac_colors_mask = vac_colors -1;
940 952
941 953 page_coloring_shift = 0;
942 954 a = ecache_setsize;
943 955 while (a >>= 1) {
944 956 page_coloring_shift++;
945 957 }
946 958
947 959 /* initialize number of colors per page size */
948 960 for (i = 0; i < mmu_page_sizes; i++) {
949 961 hw_page_array[i].hp_colors = (page_colors_mask >>
950 962 (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
951 963 + 1;
952 964 colorequivszc[i] = 0;
953 965 }
954 966
955 967 /*
956 968 * initialize cpu_page_colors if ecache setsizes are homogenous.
957 969 * cpu_page_colors set to -1 during DR operation or during startup
958 970 * if setsizes are heterogenous.
959 971 *
960 972 * The value of cpu_page_colors determines if additional color bins
961 973 * need to be checked for a particular color in the page_get routines.
962 974 */
963 975 if (cpu_setsize > 0 && cpu_page_colors == 0 &&
964 976 cpu_setsize < ecache_setsize) {
965 977 cpu_page_colors = cpu_setsize / MMU_PAGESIZE;
966 978 a = lowbit(page_colors) - lowbit(cpu_page_colors);
967 979 ASSERT(a > 0);
968 980 ASSERT(a < 16);
969 981
970 982 for (i = 0; i < mmu_page_sizes; i++) {
971 983 if ((colors = hw_page_array[i].hp_colors) <= 1) {
972 984 continue;
973 985 }
974 986 while ((colors >> a) == 0)
975 987 a--;
976 988 ASSERT(a >= 0);
977 989
978 990 /* higher 4 bits encodes color equiv mask */
979 991 colorequivszc[i] = (a << 4);
980 992 }
981 993 }
982 994
983 995 /* do cpu specific color initialization */
984 996 if (&page_coloring_init_cpu) {
985 997 page_coloring_init_cpu();
986 998 }
987 999 }
988 1000
989 1001 int
990 1002 bp_color(struct buf *bp)
991 1003 {
992 1004 int color = -1;
993 1005
994 1006 if (vac) {
995 1007 if ((bp->b_flags & B_PAGEIO) != 0) {
996 1008 color = sfmmu_get_ppvcolor(bp->b_pages);
997 1009 } else if (bp->b_un.b_addr != NULL) {
998 1010 color = sfmmu_get_addrvcolor(bp->b_un.b_addr);
999 1011 }
1000 1012 }
1001 1013 return (color < 0 ? 0 : ptob(color));
1002 1014 }
1003 1015
/*
 * Flush the D-cache when performing module relocations to an alternate
 * mapping.  Stubbed out on all platforms except sun4u, at least for now.
 * The work is delegated entirely to sfmmu_cache_flushall().
 */
void
dcache_flushall()
{
	sfmmu_cache_flushall();
}
1014 1026
/*
 * Report whether the ranges [va1, va1 + sz1) and [va2, va2 + sz2) overlap.
 * Returns 1 if they do and 0 if they do not.  Two ranges that start at the
 * same address are always reported as overlapping, whatever their sizes.
 *
 * The test compares the distance between the two base addresses against
 * the size of the lower range instead of computing va + sz.  The sum can
 * wrap around for a range that runs past the top of the address space,
 * which would make such a range look as if it ended below the other one.
 */
static int
kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2)
{
	/* Lower range overlaps iff it is longer than the gap to the upper. */
	if (va1 < va2)
		return (va2 - va1 < sz1);

	if (va2 < va1)
		return (va1 - va2 < sz2);

	return (1);
}
1026 1038
1027 1039 /*
1028 1040 * Return the number of bytes, relative to the beginning of a given range, that
1029 1041 * are non-toxic (can be read from and written to with relative impunity).
1030 1042 */
1031 1043 size_t
1032 1044 kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write)
1033 1045 {
1034 1046 /* OBP reads are harmless, but we don't want people writing there */
1035 1047 if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR -
1036 1048 OFW_START_ADDR + 1))
1037 1049 return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0);
1038 1050
1039 1051 if (kdi_range_overlap(va, sz, PIOMAPBASE, PIOMAPSIZE))
1040 1052 return (va < PIOMAPBASE ? PIOMAPBASE - va : 0);
1041 1053
1042 1054 return (sz); /* no overlap */
1043 1055 }
1044 1056
1045 1057 /*
1046 1058 * Minimum physmem required for enabling large pages for kernel heap
1047 1059 * Currently we do not enable lp for kmem on systems with less
1048 1060 * than 1GB of memory. This value can be changed via /etc/system
1049 1061 */
1050 1062 size_t segkmem_lpminphysmem = 0x40000000; /* 1GB */
1051 1063
1052 1064 /*
1053 1065 * this function chooses large page size for kernel heap
1054 1066 */
1055 1067 size_t
1056 1068 get_segkmem_lpsize(size_t lpsize)
1057 1069 {
1058 1070 size_t memtotal = physmem * PAGESIZE;
1059 1071 size_t mmusz;
1060 1072 uint_t szc;
1061 1073
1062 1074 if (memtotal < segkmem_lpminphysmem)
1063 1075 return (PAGESIZE);
1064 1076
1065 1077 if (plat_lpkmem_is_supported != NULL &&
1066 1078 plat_lpkmem_is_supported() == 0)
1067 1079 return (PAGESIZE);
1068 1080
1069 1081 mmusz = mmu_get_kernel_lpsize(lpsize);
1070 1082 szc = page_szc(mmusz);
1071 1083
1072 1084 while (szc) {
1073 1085 if (!(disable_large_pages & (1 << szc)))
1074 1086 return (page_get_pagesize(szc));
1075 1087 szc--;
1076 1088 }
1077 1089 return (PAGESIZE);
1078 1090 }
↓ open down ↓ |
689 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX