12701 segspt_minfree needs right-sizing
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Patrick Mooney <pmooney@pfmooney.com>
--- old/usr/src/uts/common/vm/seg_spt.c
+++ new/usr/src/uts/common/vm/seg_spt.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
23 - * Copyright 2018 Joyent, Inc.
23 + * Copyright 2019 Joyent, Inc.
24 24 * Copyright (c) 2016 by Delphix. All rights reserved.
25 25 */
26 26
27 27 #include <sys/param.h>
28 28 #include <sys/user.h>
29 29 #include <sys/mman.h>
30 30 #include <sys/kmem.h>
31 31 #include <sys/sysmacros.h>
32 32 #include <sys/cmn_err.h>
33 33 #include <sys/systm.h>
34 34 #include <sys/tuneable.h>
35 35 #include <vm/hat.h>
36 36 #include <vm/seg.h>
37 37 #include <vm/as.h>
38 38 #include <vm/anon.h>
39 39 #include <vm/page.h>
40 40 #include <sys/buf.h>
41 41 #include <sys/swap.h>
42 42 #include <sys/atomic.h>
43 43 #include <vm/seg_spt.h>
44 44 #include <sys/debug.h>
45 45 #include <sys/vtrace.h>
46 46 #include <sys/shm.h>
47 47 #include <sys/shm_impl.h>
48 48 #include <sys/lgrp.h>
49 49 #include <sys/vmsystm.h>
50 50 #include <sys/policy.h>
51 51 #include <sys/project.h>
52 52 #include <sys/tnf_probe.h>
53 53 #include <sys/zone.h>
54 54
55 55 #define SEGSPTADDR (caddr_t)0x0
56 56
57 57 /*
58 58 * # pages used for spt
59 59 */
60 60 size_t spt_used;
61 61
62 62 /*
63 - * segspt_minfree is the memory left for system after ISM
64 - * locked its pages; it is set up to 5% of availrmem in
65 - * sptcreate when ISM is created. ISM should not use more
66 - * than ~90% of availrmem; if it does, then the performance
67 - * of the system may decrease. Machines with large memories may
68 - * be able to use up more memory for ISM so we set the default
69 - * segspt_minfree to 5% (which gives ISM max 95% of availrmem.
70 - * If somebody wants even more memory for ISM (risking hanging
71 - * the system) they can patch the segspt_minfree to smaller number.
63 + * See spt_setminfree().
72 64 */
73 65 pgcnt_t segspt_minfree = 0;
66 +size_t segspt_minfree_clamp = (1UL << 30); /* 1GB in bytes */
74 67
75 68 static int segspt_create(struct seg **segpp, void *argsp);
76 69 static int segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize);
77 70 static void segspt_free(struct seg *seg);
78 71 static void segspt_free_pages(struct seg *seg, caddr_t addr, size_t len);
79 72 static lgrp_mem_policy_info_t *segspt_getpolicy(struct seg *seg, caddr_t addr);
80 73
81 74 /* ARGSUSED */
82 75 __NORETURN static int
83 76 segspt_badop_dup(struct seg *seg __unused, struct seg *newseg __unused)
84 77 {
85 78 panic("%s called", __func__);
86 79 }
87 80
88 81 /* ARGSUSED */
89 82 __NORETURN static faultcode_t
90 83 segspt_badop_fault(struct hat *hat, struct seg *seg, caddr_t addr,
91 84 size_t len, enum fault_type type, enum seg_rw rw)
92 85 {
93 86 panic("%s called", __func__);
94 87 }
95 88
96 89 /* ARGSUSED */
97 90 __NORETURN static faultcode_t
98 91 segspt_badop_faulta(struct seg *seg __unused, caddr_t addr __unused)
99 92 {
100 93 panic("%s called", __func__);
101 94 }
102 95
103 96 /* ARGSUSED */
104 97 __NORETURN static int
105 98 segspt_badop_prot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
106 99 {
107 100 panic("%s called", __func__);
108 101 }
109 102
110 103 /* ARGSUSED */
111 104 __NORETURN static int
112 105 segspt_badop_checkprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
113 106 {
114 107 panic("%s called", __func__);
115 108 }
116 109
117 110 /* ARGSUSED */
118 111 __NORETURN static int
119 112 segspt_badop_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
120 113 {
121 114 panic("%s called", __func__);
122 115 }
123 116
124 117 /* ARGSUSED */
125 118 __NORETURN static size_t
126 119 segspt_badop_swapout(struct seg *seg)
127 120 {
128 121 panic("%s called", __func__);
129 122 }
130 123
131 124 /* ARGSUSED */
132 125 __NORETURN static int
133 126 segspt_badop_sync(struct seg *seg, caddr_t addr, size_t len, int attr,
134 127 uint_t flags)
135 128 {
136 129 panic("%s called", __func__);
137 130 }
138 131
139 132 /* ARGSUSED */
140 133 __NORETURN
141 134 static size_t
142 135 segspt_badop_incore(struct seg *seg, caddr_t addr, size_t len, char *vec)
143 136 {
144 137 panic("%s called", __func__);
145 138 }
146 139
147 140 /* ARGSUSED */
148 141 __NORETURN static int
149 142 segspt_badop_lockop(struct seg *seg, caddr_t addr, size_t len, int attr,
150 143 int op, ulong_t *lockmap, size_t pos)
151 144 {
152 145 panic("%s called", __func__);
153 146 }
154 147
155 148 /* ARGSUSED */
156 149 __NORETURN static int
157 150 segspt_badop_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
158 151 {
159 152 panic("%s called", __func__);
160 153 }
161 154
162 155 /* ARGSUSED */
163 156 __NORETURN static u_offset_t
164 157 segspt_badop_getoffset(struct seg *seg, caddr_t addr)
165 158 {
166 159 panic("%s called", __func__);
167 160 }
168 161
169 162 /* ARGSUSED */
170 163 __NORETURN static int
171 164 segspt_badop_gettype(struct seg *seg, caddr_t addr)
172 165 {
173 166 panic("%s called", __func__);
174 167 }
175 168
176 169 /* ARGSUSED */
177 170 __NORETURN static int
178 171 segspt_badop_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
179 172 {
180 173 panic("%s called", __func__);
181 174 }
182 175
183 176 /* ARGSUSED */
184 177 __NORETURN static int
185 178 segspt_badop_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
186 179 {
187 180 panic("%s called", __func__);
188 181 }
189 182
190 183 /* ARGSUSED */
191 184 __NORETURN static void
192 185 segspt_badop_dump(struct seg *seg)
193 186 {
194 187 panic("%s called", __func__);
195 188 }
196 189
197 190 /* ARGSUSED */
198 191 __NORETURN static int
199 192 segspt_badop_pagelock(struct seg *seg, caddr_t addr, size_t len,
200 193 struct page ***ppp, enum lock_type type, enum seg_rw rw)
201 194 {
202 195 panic("%s called", __func__);
203 196 }
204 197
205 198 /* ARGSUSED */
206 199 __NORETURN static int
207 200 segspt_badop_setpgsz(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
208 201 {
209 202 panic("%s called", __func__);
210 203 }
211 204
212 205 /* ARGSUSED */
213 206 __NORETURN static int
214 207 segspt_badop_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
215 208 {
216 209 panic("%s called", __func__);
217 210 }
218 211
219 212 /* ARGSUSED */
220 213 __NORETURN static int
221 214 segspt_badop_capable(struct seg *seg, segcapability_t capability)
222 215 {
223 216 panic("%s called", __func__);
224 217 }
225 218
226 219 struct seg_ops segspt_ops = {
227 220 segspt_badop_dup, /* dup */
228 221 segspt_unmap,
229 222 segspt_free,
230 223 segspt_badop_fault, /* fault */
231 224 segspt_badop_faulta, /* faulta */
232 225 segspt_badop_prot, /* setprot */
233 226 segspt_badop_checkprot, /* checkprot */
234 227 segspt_badop_kluster, /* kluster */
235 228 segspt_badop_swapout, /* swapout */
236 229 segspt_badop_sync, /* sync */
237 230 segspt_badop_incore, /* incore */
238 231 segspt_badop_lockop, /* lockop */
239 232 segspt_badop_getprot, /* getprot */
240 233 segspt_badop_getoffset, /* getoffset */
241 234 segspt_badop_gettype, /* gettype */
242 235 segspt_badop_getvp, /* getvp */
243 236 segspt_badop_advise, /* advise */
244 237 segspt_badop_dump, /* dump */
245 238 segspt_badop_pagelock, /* pagelock */
246 239 segspt_badop_setpgsz, /* setpgsz */
247 240 segspt_badop_getmemid, /* getmemid */
248 241 segspt_getpolicy, /* getpolicy */
249 242 segspt_badop_capable, /* capable */
250 243 seg_inherit_notsup /* inherit */
251 244 };
252 245
253 246 static int segspt_shmdup(struct seg *seg, struct seg *newseg);
254 247 static int segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize);
255 248 static void segspt_shmfree(struct seg *seg);
256 249 static faultcode_t segspt_shmfault(struct hat *hat, struct seg *seg,
257 250 caddr_t addr, size_t len, enum fault_type type, enum seg_rw rw);
258 251 static faultcode_t segspt_shmfaulta(struct seg *seg, caddr_t addr);
259 252 static int segspt_shmsetprot(struct seg *seg, caddr_t addr, size_t len,
260 253 uint_t prot);
261 254 static int segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size,
262 255 uint_t prot);
263 256 static int segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta);
264 257 static size_t segspt_shmswapout(struct seg *seg);
265 258 static size_t segspt_shmincore(struct seg *seg, caddr_t addr, size_t len,
266 259 char *vec);
267 260 static int segspt_shmsync(struct seg *seg, caddr_t addr, size_t len,
268 261 int attr, uint_t flags);
269 262 static int segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
270 263 int attr, int op, ulong_t *lockmap, size_t pos);
271 264 static int segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len,
272 265 uint_t *protv);
273 266 static u_offset_t segspt_shmgetoffset(struct seg *seg, caddr_t addr);
274 267 static int segspt_shmgettype(struct seg *seg, caddr_t addr);
275 268 static int segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
276 269 static int segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len,
277 270 uint_t behav);
278 271 static void segspt_shmdump(struct seg *seg);
279 272 static int segspt_shmpagelock(struct seg *, caddr_t, size_t,
280 273 struct page ***, enum lock_type, enum seg_rw);
281 274 static int segspt_shmsetpgsz(struct seg *, caddr_t, size_t, uint_t);
282 275 static int segspt_shmgetmemid(struct seg *, caddr_t, memid_t *);
283 276 static lgrp_mem_policy_info_t *segspt_shmgetpolicy(struct seg *, caddr_t);
284 277 static int segspt_shmcapable(struct seg *, segcapability_t);
285 278
286 279 struct seg_ops segspt_shmops = {
287 280 segspt_shmdup,
288 281 segspt_shmunmap,
289 282 segspt_shmfree,
290 283 segspt_shmfault,
291 284 segspt_shmfaulta,
292 285 segspt_shmsetprot,
293 286 segspt_shmcheckprot,
294 287 segspt_shmkluster,
295 288 segspt_shmswapout,
296 289 segspt_shmsync,
297 290 segspt_shmincore,
298 291 segspt_shmlockop,
299 292 segspt_shmgetprot,
300 293 segspt_shmgetoffset,
301 294 segspt_shmgettype,
302 295 segspt_shmgetvp,
303 296 segspt_shmadvise, /* advise */
304 297 segspt_shmdump,
305 298 segspt_shmpagelock,
306 299 segspt_shmsetpgsz,
307 300 segspt_shmgetmemid,
308 301 segspt_shmgetpolicy,
309 302 segspt_shmcapable,
310 303 seg_inherit_notsup
311 304 };
312 305
313 306 static void segspt_purge(struct seg *seg);
314 307 static int segspt_reclaim(void *, caddr_t, size_t, struct page **,
315 308 enum seg_rw, int);
316 309 static int spt_anon_getpages(struct seg *seg, caddr_t addr, size_t len,
317 310 page_t **ppa);
318 311
312 +/*
313 + * This value corresponds to headroom in availrmem that ISM can never allocate
314 + * (but others can). The original intent here was to prevent ISM from locking
315 + * all of the remaining availrmem into memory, making forward progress
316 + * difficult. It's not clear how much this matters on modern systems.
317 + *
318 + * The traditional default value of 5% of total memory is used, except on
319 + * systems where that quickly gets ridiculous: in that case we clamp at a rather
320 + * arbitrary value of 1GB.
321 + *
322 + * Note that since this is called lazily on the first sptcreate(), in theory,
323 + * this could represent a very small value if the system is heavily loaded
324 + * already. In practice, the first ISM user is pretty likely to come along
325 + * earlier during the system's operation.
326 + *
327 + * This never gets re-figured.
328 + */
329 +static void
330 +spt_setminfree(void)
331 +{
332 + segspt_minfree = availrmem / 20;
319 333
334 + if (segspt_minfree_clamp != 0 &&
335 + segspt_minfree > (segspt_minfree_clamp / PAGESIZE))
336 + segspt_minfree = segspt_minfree_clamp / PAGESIZE;
337 +}
320 338
321 -/*ARGSUSED*/
322 339 int
323 340 sptcreate(size_t size, struct seg **sptseg, struct anon_map *amp,
324 341 uint_t prot, uint_t flags, uint_t share_szc)
325 342 {
326 343 int err;
327 344 struct as *newas;
328 345 struct segspt_crargs sptcargs;
329 346
330 347 #ifdef DEBUG
331 348 TNF_PROBE_1(sptcreate, "spt", /* CSTYLED */,
332 349 tnf_ulong, size, size );
333 350 #endif
334 - if (segspt_minfree == 0) /* leave min 5% of availrmem for */
335 - segspt_minfree = availrmem/20; /* for the system */
351 + if (segspt_minfree == 0)
352 + spt_setminfree();
336 353
337 354 if (!hat_supported(HAT_SHARED_PT, (void *)0))
338 355 return (EINVAL);
339 356
340 357 /*
341 358 * get a new as for this shared memory segment
342 359 */
343 360 newas = as_alloc();
344 361 newas->a_proc = NULL;
345 362 sptcargs.amp = amp;
346 363 sptcargs.prot = prot;
347 364 sptcargs.flags = flags;
348 365 sptcargs.szc = share_szc;
349 366 /*
350 367 * create a shared page table (spt) segment
351 368 */
352 369
353 370 if (err = as_map(newas, SEGSPTADDR, size, segspt_create, &sptcargs)) {
354 371 as_free(newas);
355 372 return (err);
356 373 }
357 374 *sptseg = sptcargs.seg_spt;
358 375 return (0);
359 376 }
360 377
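For context, this kernel path is normally reached from userland via shmat(2): attaching a System V segment with SHM_SHARE_MMU requests ISM (SHM_PAGEABLE would request DISM instead), which is what ultimately leads to sptcreate(). The sketch below is illustrative only and not part of the webrev; the 64MB size is an arbitrary choice.

	#include <sys/ipc.h>
	#include <sys/shm.h>
	#include <stdio.h>

	int
	main(void)
	{
		size_t sz = 64UL * 1024 * 1024;	/* arbitrary 64MB segment */
		int id = shmget(IPC_PRIVATE, sz, IPC_CREAT | 0600);
		void *p;

		if (id == -1) {
			perror("shmget");
			return (1);
		}
		/* SHM_SHARE_MMU requests ISM; the segment's pages end up locked. */
		p = shmat(id, NULL, SHM_SHARE_MMU);
		if (p == (void *)-1) {
			perror("shmat");
			return (1);
		}
		printf("ISM segment attached at %p\n", p);
		(void) shmdt(p);
		(void) shmctl(id, IPC_RMID, NULL);
		return (0);
	}
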
361 378 void
362 379 sptdestroy(struct as *as, struct anon_map *amp)
363 380 {
364 381
365 382 #ifdef DEBUG
366 383 TNF_PROBE_0(sptdestroy, "spt", /* CSTYLED */);
367 384 #endif
368 385 (void) as_unmap(as, SEGSPTADDR, amp->size);
369 386 as_free(as);
370 387 }
371 388
372 389 /*
373 390 * called from seg_free().
374 391 * free (i.e., unlock, unmap, return to free list)
375 392 * all the pages in the given seg.
376 393 */
377 394 void
378 395 segspt_free(struct seg *seg)
379 396 {
380 397 struct spt_data *sptd = (struct spt_data *)seg->s_data;
381 398
382 399 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
383 400
384 401 if (sptd != NULL) {
385 402 if (sptd->spt_realsize)
386 403 segspt_free_pages(seg, seg->s_base, sptd->spt_realsize);
387 404
388 405 if (sptd->spt_ppa_lckcnt) {
389 406 kmem_free(sptd->spt_ppa_lckcnt,
390 407 sizeof (*sptd->spt_ppa_lckcnt)
391 408 * btopr(sptd->spt_amp->size));
392 409 }
393 410 kmem_free(sptd->spt_vp, sizeof (*sptd->spt_vp));
394 411 cv_destroy(&sptd->spt_cv);
395 412 mutex_destroy(&sptd->spt_lock);
396 413 kmem_free(sptd, sizeof (*sptd));
397 414 }
398 415 }
399 416
400 417 /*ARGSUSED*/
401 418 static int
402 419 segspt_shmsync(struct seg *seg, caddr_t addr, size_t len, int attr,
403 420 uint_t flags)
404 421 {
405 422 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
406 423
407 424 return (0);
408 425 }
409 426
410 427 /*ARGSUSED*/
411 428 static size_t
412 429 segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, char *vec)
413 430 {
414 431 caddr_t eo_seg;
415 432 pgcnt_t npages;
416 433 struct shm_data *shmd = (struct shm_data *)seg->s_data;
417 434 struct seg *sptseg;
418 435 struct spt_data *sptd;
419 436
420 437 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
421 438 #ifdef lint
422 439 seg = seg;
423 440 #endif
424 441 sptseg = shmd->shm_sptseg;
425 442 sptd = sptseg->s_data;
426 443
427 444 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
428 445 eo_seg = addr + len;
429 446 while (addr < eo_seg) {
430 447 /* page exists, and it's locked. */
431 448 *vec++ = SEG_PAGE_INCORE | SEG_PAGE_LOCKED |
432 449 SEG_PAGE_ANON;
433 450 addr += PAGESIZE;
434 451 }
435 452 return (len);
436 453 } else {
437 454 struct anon_map *amp = shmd->shm_amp;
438 455 struct anon *ap;
439 456 page_t *pp;
440 457 pgcnt_t anon_index;
441 458 struct vnode *vp;
442 459 u_offset_t off;
443 460 ulong_t i;
444 461 int ret;
445 462 anon_sync_obj_t cookie;
446 463
447 464 addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
448 465 anon_index = seg_page(seg, addr);
449 466 npages = btopr(len);
450 467 if (anon_index + npages > btopr(shmd->shm_amp->size)) {
451 468 return (EINVAL);
452 469 }
453 470 ANON_LOCK_ENTER(&->a_rwlock, RW_READER);
454 471 for (i = 0; i < npages; i++, anon_index++) {
455 472 ret = 0;
456 473 anon_array_enter(amp, anon_index, &cookie);
457 474 ap = anon_get_ptr(amp->ahp, anon_index);
458 475 if (ap != NULL) {
459 476 swap_xlate(ap, &vp, &off);
460 477 anon_array_exit(&cookie);
461 478 pp = page_lookup_nowait(vp, off, SE_SHARED);
462 479 if (pp != NULL) {
463 480 ret |= SEG_PAGE_INCORE | SEG_PAGE_ANON;
464 481 page_unlock(pp);
465 482 }
466 483 } else {
467 484 anon_array_exit(&cookie);
468 485 }
469 486 if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
470 487 ret |= SEG_PAGE_LOCKED;
471 488 }
472 489 *vec++ = (char)ret;
473 490 }
474 491 ANON_LOCK_EXIT(&->a_rwlock);
475 492 return (len);
476 493 }
477 494 }
478 495
479 496 static int
480 497 segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize)
481 498 {
482 499 size_t share_size;
483 500
484 501 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
485 502
486 503 /*
487 504 * seg.s_size may have been rounded up to the largest page size
488 505 * in shmat().
489 506 * XXX This should be cleaned up. sptdestroy should take a length
490 507 * argument which should be the same as sptcreate. Then
491 508 * this rounding would not be needed (or is done in shm.c)
492 509 * Only the check for full segment will be needed.
493 510 *
494 511 * XXX -- shouldn't raddr == 0 always? These tests don't seem
495 512 * to be useful at all.
496 513 */
497 514 share_size = page_get_pagesize(seg->s_szc);
498 515 ssize = P2ROUNDUP(ssize, share_size);
499 516
500 517 if (raddr == seg->s_base && ssize == seg->s_size) {
501 518 seg_free(seg);
502 519 return (0);
503 520 } else
504 521 return (EINVAL);
505 522 }
506 523
507 524 int
508 525 segspt_create(struct seg **segpp, void *argsp)
509 526 {
510 527 struct seg *seg = *segpp;
511 528 int err;
512 529 caddr_t addr = seg->s_base;
513 530 struct spt_data *sptd;
514 531 struct segspt_crargs *sptcargs = (struct segspt_crargs *)argsp;
515 532 struct anon_map *amp = sptcargs->amp;
516 533 struct kshmid *sp = amp->a_sp;
517 534 struct cred *cred = CRED();
518 535 ulong_t i, j, anon_index = 0;
519 536 pgcnt_t npages = btopr(amp->size);
520 537 struct vnode *vp;
521 538 page_t **ppa;
522 539 uint_t hat_flags;
523 540 size_t pgsz;
524 541 pgcnt_t pgcnt;
525 542 caddr_t a;
526 543 pgcnt_t pidx;
527 544 size_t sz;
528 545 proc_t *procp = curproc;
529 546 rctl_qty_t lockedbytes = 0;
530 547 kproject_t *proj;
531 548
532 549 /*
533 550 * We are holding the a_lock on the underlying dummy as,
534 551 * so we can make calls to the HAT layer.
535 552 */
536 553 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
537 554 ASSERT(sp != NULL);
538 555
539 556 #ifdef DEBUG
540 557 TNF_PROBE_2(segspt_create, "spt", /* CSTYLED */,
541 558 tnf_opaque, addr, addr, tnf_ulong, len, seg->s_size);
542 559 #endif
543 560 if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
544 561 if (err = anon_swap_adjust(npages))
545 562 return (err);
546 563 }
547 564 err = ENOMEM;
548 565
549 566 if ((sptd = kmem_zalloc(sizeof (*sptd), KM_NOSLEEP)) == NULL)
550 567 goto out1;
551 568
552 569 ppa = NULL;
553 570 if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
554 571 if ((ppa = kmem_zalloc(((sizeof (page_t *)) * npages),
555 572 KM_NOSLEEP)) == NULL)
556 573 goto out2;
557 574 }
558 575
559 576 mutex_init(&sptd->spt_lock, NULL, MUTEX_DEFAULT, NULL);
560 577
561 578 if ((vp = kmem_zalloc(sizeof (*vp), KM_NOSLEEP)) == NULL)
562 579 goto out3;
563 580
564 581 seg->s_ops = &segspt_ops;
565 582 sptd->spt_vp = vp;
566 583 sptd->spt_amp = amp;
567 584 sptd->spt_prot = sptcargs->prot;
568 585 sptd->spt_flags = sptcargs->flags;
569 586 seg->s_data = (caddr_t)sptd;
570 587 sptd->spt_ppa = NULL;
571 588 sptd->spt_ppa_lckcnt = NULL;
572 589 seg->s_szc = sptcargs->szc;
573 590 cv_init(&sptd->spt_cv, NULL, CV_DEFAULT, NULL);
574 591 sptd->spt_gen = 0;
575 592
576 593 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER);
577 594 if (seg->s_szc > amp->a_szc) {
578 595 amp->a_szc = seg->s_szc;
579 596 }
580 597 ANON_LOCK_EXIT(&->a_rwlock);
581 598
582 599 /*
583 600 * Set policy to affect initial allocation of pages in
584 601 * anon_map_createpages()
585 602 */
586 603 (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, amp, anon_index,
587 604 NULL, 0, ptob(npages));
588 605
589 606 if (sptcargs->flags & SHM_PAGEABLE) {
590 607 size_t share_sz;
591 608 pgcnt_t new_npgs, more_pgs;
592 609 struct anon_hdr *nahp;
593 610 zone_t *zone;
594 611
595 612 share_sz = page_get_pagesize(seg->s_szc);
596 613 if (!IS_P2ALIGNED(amp->size, share_sz)) {
597 614 /*
598 615 * We are rounding up the size of the anon array
599 616 * on 4 M boundary because we always create 4 M
600 617 * of page(s) when locking, faulting pages and we
601 618 * don't have to check for all corner cases e.g.
602 619 * if there is enough space to allocate 4 M
603 620 * page.
604 621 */
605 622 new_npgs = btop(P2ROUNDUP(amp->size, share_sz));
606 623 more_pgs = new_npgs - npages;
607 624
608 625 /*
609 626 * The zone will never be NULL, as a fully created
610 627 * shm always has an owning zone.
611 628 */
612 629 zone = sp->shm_perm.ipc_zone_ref.zref_zone;
613 630 ASSERT(zone != NULL);
614 631 if (anon_resv_zone(ptob(more_pgs), zone) == 0) {
615 632 err = ENOMEM;
616 633 goto out4;
617 634 }
618 635
619 636 nahp = anon_create(new_npgs, ANON_SLEEP);
620 637 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER);
621 638 (void) anon_copy_ptr(amp->ahp, 0, nahp, 0, npages,
622 639 ANON_SLEEP);
623 640 anon_release(amp->ahp, npages);
624 641 amp->ahp = nahp;
625 642 ASSERT(amp->swresv == ptob(npages));
626 643 amp->swresv = amp->size = ptob(new_npgs);
627 644 ANON_LOCK_EXIT(&->a_rwlock);
628 645 npages = new_npgs;
629 646 }
630 647
631 648 sptd->spt_ppa_lckcnt = kmem_zalloc(npages *
632 649 sizeof (*sptd->spt_ppa_lckcnt), KM_SLEEP);
633 650 sptd->spt_pcachecnt = 0;
634 651 sptd->spt_realsize = ptob(npages);
635 652 sptcargs->seg_spt = seg;
636 653 return (0);
637 654 }
638 655
639 656 /*
640 657 * get array of pages for each anon slot in amp
641 658 */
642 659 if ((err = anon_map_createpages(amp, anon_index, ptob(npages), ppa,
643 660 seg, addr, S_CREATE, cred)) != 0)
644 661 goto out4;
645 662
646 663 mutex_enter(&sp->shm_mlock);
647 664
648 665 /* May be partially locked, so, count bytes to charge for locking */
649 666 for (i = 0; i < npages; i++)
650 667 if (ppa[i]->p_lckcnt == 0)
651 668 lockedbytes += PAGESIZE;
652 669
653 670 proj = sp->shm_perm.ipc_proj;
654 671
655 672 if (lockedbytes > 0) {
656 673 mutex_enter(&procp->p_lock);
657 674 if (rctl_incr_locked_mem(procp, proj, lockedbytes, 0)) {
658 675 mutex_exit(&procp->p_lock);
659 676 mutex_exit(&sp->shm_mlock);
660 677 for (i = 0; i < npages; i++)
661 678 page_unlock(ppa[i]);
662 679 err = ENOMEM;
663 680 goto out4;
664 681 }
665 682 mutex_exit(&procp->p_lock);
666 683 }
667 684
668 685 /*
669 686 * addr is initial address corresponding to the first page on ppa list
670 687 */
671 688 for (i = 0; i < npages; i++) {
672 689 /* attempt to lock all pages */
673 690 if (page_pp_lock(ppa[i], 0, 1) == 0) {
674 691 /*
675 692 * if unable to lock any page, unlock all
676 693 * of them and return error
677 694 */
678 695 for (j = 0; j < i; j++)
679 696 page_pp_unlock(ppa[j], 0, 1);
680 697 for (i = 0; i < npages; i++)
681 698 page_unlock(ppa[i]);
682 699 rctl_decr_locked_mem(NULL, proj, lockedbytes, 0);
683 700 mutex_exit(&sp->shm_mlock);
684 701 err = ENOMEM;
685 702 goto out4;
686 703 }
687 704 }
688 705 mutex_exit(&sp->shm_mlock);
689 706
690 707 /*
691 708 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
692 709 * for the entire life of the segment. For example platforms
693 710 * that do not support Dynamic Reconfiguration.
694 711 */
695 712 hat_flags = HAT_LOAD_SHARE;
696 713 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, NULL))
697 714 hat_flags |= HAT_LOAD_LOCK;
698 715
699 716 /*
700 717 * Load translations one large page at a time
701 718 * to make sure we don't create mappings bigger than
702 719 * segment's size code in case underlying pages
703 720 * are shared with segvn's segment that uses bigger
704 721 * size code than we do.
705 722 */
706 723 pgsz = page_get_pagesize(seg->s_szc);
707 724 pgcnt = page_get_pagecnt(seg->s_szc);
708 725 for (a = addr, pidx = 0; pidx < npages; a += pgsz, pidx += pgcnt) {
709 726 sz = MIN(pgsz, ptob(npages - pidx));
710 727 hat_memload_array(seg->s_as->a_hat, a, sz,
711 728 &ppa[pidx], sptd->spt_prot, hat_flags);
712 729 }
713 730
714 731 /*
715 732 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
716 733 * we will leave the pages locked SE_SHARED for the life
717 734 * of the ISM segment. This will prevent any calls to
718 735 * hat_pageunload() on this ISM segment for those platforms.
719 736 */
720 737 if (!(hat_flags & HAT_LOAD_LOCK)) {
721 738 /*
722 739 * On platforms that support HAT_DYNAMIC_ISM_UNMAP,
723 740 * we no longer need to hold the SE_SHARED lock on the pages,
724 741 * since L_PAGELOCK and F_SOFTLOCK calls will grab the
725 742 * SE_SHARED lock on the pages as necessary.
726 743 */
727 744 for (i = 0; i < npages; i++)
728 745 page_unlock(ppa[i]);
729 746 }
730 747 sptd->spt_pcachecnt = 0;
731 748 kmem_free(ppa, ((sizeof (page_t *)) * npages));
732 749 sptd->spt_realsize = ptob(npages);
733 750 atomic_add_long(&spt_used, npages);
734 751 sptcargs->seg_spt = seg;
735 752 return (0);
736 753
737 754 out4:
738 755 seg->s_data = NULL;
739 756 kmem_free(vp, sizeof (*vp));
740 757 cv_destroy(&sptd->spt_cv);
741 758 out3:
742 759 mutex_destroy(&sptd->spt_lock);
743 760 if ((sptcargs->flags & SHM_PAGEABLE) == 0)
744 761 kmem_free(ppa, (sizeof (*ppa) * npages));
745 762 out2:
746 763 kmem_free(sptd, sizeof (*sptd));
747 764 out1:
748 765 if ((sptcargs->flags & SHM_PAGEABLE) == 0)
749 766 anon_swap_restore(npages);
750 767 return (err);
751 768 }
752 769
753 770 /*ARGSUSED*/
754 771 void
755 772 segspt_free_pages(struct seg *seg, caddr_t addr, size_t len)
756 773 {
757 774 struct page *pp;
758 775 struct spt_data *sptd = (struct spt_data *)seg->s_data;
759 776 pgcnt_t npages;
760 777 ulong_t anon_idx;
761 778 struct anon_map *amp;
762 779 struct anon *ap;
763 780 struct vnode *vp;
764 781 u_offset_t off;
765 782 uint_t hat_flags;
766 783 int root = 0;
767 784 pgcnt_t pgs, curnpgs = 0;
768 785 page_t *rootpp;
769 786 rctl_qty_t unlocked_bytes = 0;
770 787 kproject_t *proj;
771 788 kshmid_t *sp;
772 789
773 790 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
774 791
775 792 len = P2ROUNDUP(len, PAGESIZE);
776 793
777 794 npages = btop(len);
778 795
779 796 hat_flags = HAT_UNLOAD_UNLOCK | HAT_UNLOAD_UNMAP;
780 797 if ((hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) ||
781 798 (sptd->spt_flags & SHM_PAGEABLE)) {
782 799 hat_flags = HAT_UNLOAD_UNMAP;
783 800 }
784 801
785 802 hat_unload(seg->s_as->a_hat, addr, len, hat_flags);
786 803
787 804 amp = sptd->spt_amp;
788 805 if (sptd->spt_flags & SHM_PAGEABLE)
789 806 npages = btop(amp->size);
790 807
791 808 ASSERT(amp != NULL);
792 809
793 810 proj = NULL;
794 811 rootpp = NULL;
795 812 sp = NULL;
796 813 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
797 814 sp = amp->a_sp;
798 815 proj = sp->shm_perm.ipc_proj;
799 816 mutex_enter(&sp->shm_mlock);
800 817 }
801 818 for (anon_idx = 0; anon_idx < npages; anon_idx++) {
802 819 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
803 820 if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) {
804 821 panic("segspt_free_pages: null app");
805 822 /*NOTREACHED*/
806 823 }
807 824 } else {
808 825 if ((ap = anon_get_next_ptr(amp->ahp, &anon_idx))
809 826 == NULL)
810 827 continue;
811 828 }
812 829 ASSERT(ANON_ISBUSY(anon_get_slot(amp->ahp, anon_idx)) == 0);
813 830 swap_xlate(ap, &vp, &off);
814 831
815 832 /*
816 833 * If this platform supports HAT_DYNAMIC_ISM_UNMAP,
817 834 * the pages won't be having SE_SHARED lock at this
818 835 * point.
819 836 *
820 837 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
821 838 * the pages are still held SE_SHARED locked from the
822 839 * original segspt_create()
823 840 *
824 841 * Our goal is to get SE_EXCL lock on each page, remove
825 842 * permanent lock on it and invalidate the page.
826 843 */
827 844 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
828 845 if (hat_flags == HAT_UNLOAD_UNMAP)
829 846 pp = page_lookup(vp, off, SE_EXCL);
830 847 else {
831 848 if ((pp = page_find(vp, off)) == NULL) {
832 849 panic("segspt_free_pages: "
833 850 "page not locked");
834 851 /*NOTREACHED*/
835 852 }
836 853 if (!page_tryupgrade(pp)) {
837 854 page_unlock(pp);
838 855 pp = page_lookup(vp, off, SE_EXCL);
839 856 }
840 857 }
841 858 if (pp == NULL) {
842 859 panic("segspt_free_pages: "
843 860 "page not in the system");
844 861 /*NOTREACHED*/
845 862 }
846 863 ASSERT(pp->p_lckcnt > 0);
847 864 page_pp_unlock(pp, 0, 1);
848 865 if (pp->p_lckcnt == 0)
849 866 unlocked_bytes += PAGESIZE;
850 867 } else {
851 868 if ((pp = page_lookup(vp, off, SE_EXCL)) == NULL)
852 869 continue;
853 870 }
854 871 /*
855 872 * It's logical to invalidate the pages here as in most cases
856 873 * these were created by segspt.
857 874 */
858 875 if (pp->p_szc != 0) {
859 876 if (root == 0) {
860 877 ASSERT(curnpgs == 0);
861 878 root = 1;
862 879 rootpp = pp;
863 880 pgs = curnpgs = page_get_pagecnt(pp->p_szc);
864 881 ASSERT(pgs > 1);
865 882 ASSERT(IS_P2ALIGNED(pgs, pgs));
866 883 ASSERT(!(page_pptonum(pp) & (pgs - 1)));
867 884 curnpgs--;
868 885 } else if ((page_pptonum(pp) & (pgs - 1)) == pgs - 1) {
869 886 ASSERT(curnpgs == 1);
870 887 ASSERT(page_pptonum(pp) ==
871 888 page_pptonum(rootpp) + (pgs - 1));
872 889 page_destroy_pages(rootpp);
873 890 root = 0;
874 891 curnpgs = 0;
875 892 } else {
876 893 ASSERT(curnpgs > 1);
877 894 ASSERT(page_pptonum(pp) ==
878 895 page_pptonum(rootpp) + (pgs - curnpgs));
879 896 curnpgs--;
880 897 }
881 898 } else {
882 899 if (root != 0 || curnpgs != 0) {
883 900 panic("segspt_free_pages: bad large page");
884 901 /*NOTREACHED*/
885 902 }
886 903 /*
887 904 * Before destroying the pages, we need to take care
888 905 * of the rctl locked memory accounting. For that
889 906 * we need to calculate the unlocked_bytes.
890 907 */
891 908 if (pp->p_lckcnt > 0)
892 909 unlocked_bytes += PAGESIZE;
893 910 /*LINTED: constant in conditional context */
894 911 VN_DISPOSE(pp, B_INVAL, 0, kcred);
895 912 }
896 913 }
897 914 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
898 915 if (unlocked_bytes > 0)
899 916 rctl_decr_locked_mem(NULL, proj, unlocked_bytes, 0);
900 917 mutex_exit(&sp->shm_mlock);
901 918 }
902 919 if (root != 0 || curnpgs != 0) {
903 920 panic("segspt_free_pages: bad large page");
904 921 /*NOTREACHED*/
905 922 }
906 923
907 924 /*
908 925 * mark that pages have been released
909 926 */
910 927 sptd->spt_realsize = 0;
911 928
912 929 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
913 930 atomic_add_long(&spt_used, -npages);
914 931 anon_swap_restore(npages);
915 932 }
916 933 }
917 934
918 935 /*
919 936 * Get memory allocation policy info for specified address in given segment
920 937 */
921 938 static lgrp_mem_policy_info_t *
922 939 segspt_getpolicy(struct seg *seg, caddr_t addr)
923 940 {
924 941 struct anon_map *amp;
925 942 ulong_t anon_index;
926 943 lgrp_mem_policy_info_t *policy_info;
927 944 struct spt_data *spt_data;
928 945
929 946 ASSERT(seg != NULL);
930 947
931 948 /*
932 949 * Get anon_map from segspt
933 950 *
934 951 * Assume that no lock needs to be held on anon_map, since
935 952 * it should be protected by its reference count which must be
936 953 * nonzero for an existing segment
937 954 * Need to grab readers lock on policy tree though
938 955 */
939 956 spt_data = (struct spt_data *)seg->s_data;
940 957 if (spt_data == NULL)
941 958 return (NULL);
942 959 amp = spt_data->spt_amp;
943 960 ASSERT(amp->refcnt != 0);
944 961
945 962 /*
946 963 * Get policy info
947 964 *
948 965 * Assume starting anon index of 0
949 966 */
950 967 anon_index = seg_page(seg, addr);
951 968 policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
952 969
953 970 return (policy_info);
954 971 }
955 972
956 973 /*
957 974 * DISM only.
958 975 * Return locked pages over a given range.
959 976 *
960 977 * We will cache all DISM locked pages and save the pplist for the
961 978 * entire segment in the ppa field of the underlying DISM segment structure.
962 979 * Later, during a call to segspt_reclaim() we will use this ppa array
963 980 * to page_unlock() all of the pages and then we will free this ppa list.
964 981 */
965 982 /*ARGSUSED*/
966 983 static int
967 984 segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
968 985 struct page ***ppp, enum lock_type type, enum seg_rw rw)
969 986 {
970 987 struct shm_data *shmd = (struct shm_data *)seg->s_data;
971 988 struct seg *sptseg = shmd->shm_sptseg;
972 989 struct spt_data *sptd = sptseg->s_data;
973 990 pgcnt_t pg_idx, npages, tot_npages, npgs;
974 991 struct page **pplist, **pl, **ppa, *pp;
975 992 struct anon_map *amp;
976 993 spgcnt_t an_idx;
977 994 int ret = ENOTSUP;
978 995 uint_t pl_built = 0;
979 996 struct anon *ap;
980 997 struct vnode *vp;
981 998 u_offset_t off;
982 999 pgcnt_t claim_availrmem = 0;
983 1000 uint_t szc;
984 1001
985 1002 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
986 1003 ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);
987 1004
988 1005 /*
989 1006 * We want to lock/unlock the entire ISM segment. Therefore,
990 1007 * we will be using the underlying sptseg and its base address
991 1008 * and length for the caching arguments.
992 1009 */
993 1010 ASSERT(sptseg);
994 1011 ASSERT(sptd);
995 1012
996 1013 pg_idx = seg_page(seg, addr);
997 1014 npages = btopr(len);
998 1015
999 1016 /*
1000 1017 * check if the request is larger than number of pages covered
1001 1018 * by amp
1002 1019 */
1003 1020 if (pg_idx + npages > btopr(sptd->spt_amp->size)) {
1004 1021 *ppp = NULL;
1005 1022 return (ENOTSUP);
1006 1023 }
1007 1024
1008 1025 if (type == L_PAGEUNLOCK) {
1009 1026 ASSERT(sptd->spt_ppa != NULL);
1010 1027
1011 1028 seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
1012 1029 sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1013 1030
1014 1031 /*
1015 1032 * If someone is blocked while unmapping, we purge
1016 1033 * segment page cache and thus reclaim pplist synchronously
1017 1034 * without waiting for seg_pasync_thread. This speeds up
1018 1035 * unmapping in cases where munmap(2) is called, while
1019 1036 * raw async i/o is still in progress or where a thread
1020 1037 * exits on data fault in a multithreaded application.
1021 1038 */
1022 1039 if ((sptd->spt_flags & DISM_PPA_CHANGED) ||
1023 1040 (AS_ISUNMAPWAIT(seg->s_as) &&
1024 1041 shmd->shm_softlockcnt > 0)) {
1025 1042 segspt_purge(seg);
1026 1043 }
1027 1044 return (0);
1028 1045 }
1029 1046
1030 1047 /* The L_PAGELOCK case ... */
1031 1048
1032 1049 if (sptd->spt_flags & DISM_PPA_CHANGED) {
1033 1050 segspt_purge(seg);
1034 1051 /*
1035 1052 * for DISM ppa needs to be rebuilt since
1036 1053 * number of locked pages could be changed
1037 1054 */
1038 1055 *ppp = NULL;
1039 1056 return (ENOTSUP);
1040 1057 }
1041 1058
1042 1059 /*
1043 1060 * First try to find pages in segment page cache, without
1044 1061 * holding the segment lock.
1045 1062 */
1046 1063 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1047 1064 S_WRITE, SEGP_FORCE_WIRED);
1048 1065 if (pplist != NULL) {
1049 1066 ASSERT(sptd->spt_ppa != NULL);
1050 1067 ASSERT(sptd->spt_ppa == pplist);
1051 1068 ppa = sptd->spt_ppa;
1052 1069 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
1053 1070 if (ppa[an_idx] == NULL) {
1054 1071 seg_pinactive(seg, NULL, seg->s_base,
1055 1072 sptd->spt_amp->size, ppa,
1056 1073 S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1057 1074 *ppp = NULL;
1058 1075 return (ENOTSUP);
1059 1076 }
1060 1077 if ((szc = ppa[an_idx]->p_szc) != 0) {
1061 1078 npgs = page_get_pagecnt(szc);
1062 1079 an_idx = P2ROUNDUP(an_idx + 1, npgs);
1063 1080 } else {
1064 1081 an_idx++;
1065 1082 }
1066 1083 }
1067 1084 /*
1068 1085 * Since we cache the entire DISM segment, we want to
1069 1086 * set ppp to point to the first slot that corresponds
1070 1087 * to the requested addr, i.e. pg_idx.
1071 1088 */
1072 1089 *ppp = &(sptd->spt_ppa[pg_idx]);
1073 1090 return (0);
1074 1091 }
1075 1092
1076 1093 mutex_enter(&sptd->spt_lock);
1077 1094 /*
1078 1095 * try to find pages in segment page cache with mutex
1079 1096 */
1080 1097 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1081 1098 S_WRITE, SEGP_FORCE_WIRED);
1082 1099 if (pplist != NULL) {
1083 1100 ASSERT(sptd->spt_ppa != NULL);
1084 1101 ASSERT(sptd->spt_ppa == pplist);
1085 1102 ppa = sptd->spt_ppa;
1086 1103 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
1087 1104 if (ppa[an_idx] == NULL) {
1088 1105 mutex_exit(&sptd->spt_lock);
1089 1106 seg_pinactive(seg, NULL, seg->s_base,
1090 1107 sptd->spt_amp->size, ppa,
1091 1108 S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1092 1109 *ppp = NULL;
1093 1110 return (ENOTSUP);
1094 1111 }
1095 1112 if ((szc = ppa[an_idx]->p_szc) != 0) {
1096 1113 npgs = page_get_pagecnt(szc);
1097 1114 an_idx = P2ROUNDUP(an_idx + 1, npgs);
1098 1115 } else {
1099 1116 an_idx++;
1100 1117 }
1101 1118 }
1102 1119 /*
1103 1120 * Since we cache the entire DISM segment, we want to
1104 1121 * set ppp to point to the first slot that corresponds
1105 1122 * to the requested addr, i.e. pg_idx.
1106 1123 */
1107 1124 mutex_exit(&sptd->spt_lock);
1108 1125 *ppp = &(sptd->spt_ppa[pg_idx]);
1109 1126 return (0);
1110 1127 }
1111 1128 if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
1112 1129 SEGP_FORCE_WIRED) == SEGP_FAIL) {
1113 1130 mutex_exit(&sptd->spt_lock);
1114 1131 *ppp = NULL;
1115 1132 return (ENOTSUP);
1116 1133 }
1117 1134
1118 1135 /*
1119 1136 * No need to worry about protections because DISM pages are always rw.
1120 1137 */
1121 1138 pl = pplist = NULL;
1122 1139 amp = sptd->spt_amp;
1123 1140
1124 1141 /*
1125 1142 * Do we need to build the ppa array?
1126 1143 */
1127 1144 if (sptd->spt_ppa == NULL) {
1128 1145 pgcnt_t lpg_cnt = 0;
1129 1146
1130 1147 pl_built = 1;
1131 1148 tot_npages = btopr(sptd->spt_amp->size);
1132 1149
1133 1150 ASSERT(sptd->spt_pcachecnt == 0);
1134 1151 pplist = kmem_zalloc(sizeof (page_t *) * tot_npages, KM_SLEEP);
1135 1152 pl = pplist;
1136 1153
1137 1154 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER);
1138 1155 for (an_idx = 0; an_idx < tot_npages; ) {
1139 1156 ap = anon_get_ptr(amp->ahp, an_idx);
1140 1157 /*
1141 1158 * Cache only mlocked pages. For large pages
1142 1159 * if one (constituent) page is mlocked
1143 1160 * all pages for that large page
1144 1161 * are cached also. This is for quick
1145 1162 * lookups of ppa array;
1146 1163 */
1147 1164 if ((ap != NULL) && (lpg_cnt != 0 ||
1148 1165 (sptd->spt_ppa_lckcnt[an_idx] != 0))) {
1149 1166
1150 1167 swap_xlate(ap, &vp, &off);
1151 1168 pp = page_lookup(vp, off, SE_SHARED);
1152 1169 ASSERT(pp != NULL);
1153 1170 if (lpg_cnt == 0) {
1154 1171 lpg_cnt++;
1155 1172 /*
1156 1173 * For a small page, we are done --
1157 1174 * lpg_count is reset to 0 below.
1158 1175 *
1159 1176 * For a large page, we are guaranteed
1160 1177 * to find the anon structures of all
1161 1178 * constituent pages and a non-zero
1162 1179 * lpg_cnt ensures that we don't test
1163 1180 * for mlock for these. We are done
1164 1181 * when lpg_count reaches (npgs + 1).
1165 1182 * If we are not the first constituent
1166 1183 * page, restart at the first one.
1167 1184 */
1168 1185 npgs = page_get_pagecnt(pp->p_szc);
1169 1186 if (!IS_P2ALIGNED(an_idx, npgs)) {
1170 1187 an_idx = P2ALIGN(an_idx, npgs);
1171 1188 page_unlock(pp);
1172 1189 continue;
1173 1190 }
1174 1191 }
1175 1192 if (++lpg_cnt > npgs)
1176 1193 lpg_cnt = 0;
1177 1194
1178 1195 /*
1179 1196 * availrmem is decremented only
1180 1197 * for unlocked pages
1181 1198 */
1182 1199 if (sptd->spt_ppa_lckcnt[an_idx] == 0)
1183 1200 claim_availrmem++;
1184 1201 pplist[an_idx] = pp;
1185 1202 }
1186 1203 an_idx++;
1187 1204 }
1188 1205 ANON_LOCK_EXIT(&->a_rwlock);
1189 1206
1190 1207 if (claim_availrmem) {
1191 1208 mutex_enter(&freemem_lock);
1192 1209 if (availrmem < tune.t_minarmem + claim_availrmem) {
1193 1210 mutex_exit(&freemem_lock);
1194 1211 ret = ENOTSUP;
1195 1212 claim_availrmem = 0;
1196 1213 goto insert_fail;
1197 1214 } else {
1198 1215 availrmem -= claim_availrmem;
1199 1216 }
1200 1217 mutex_exit(&freemem_lock);
1201 1218 }
1202 1219
1203 1220 sptd->spt_ppa = pl;
1204 1221 } else {
1205 1222 /*
1206 1223 * We already have a valid ppa[].
1207 1224 */
1208 1225 pl = sptd->spt_ppa;
1209 1226 }
1210 1227
1211 1228 ASSERT(pl != NULL);
1212 1229
1213 1230 ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
1214 1231 sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
1215 1232 segspt_reclaim);
1216 1233 if (ret == SEGP_FAIL) {
1217 1234 /*
1218 1235 * seg_pinsert failed. We return
1219 1236 * ENOTSUP, so that the as_pagelock() code will
1220 1237 * then try the slower F_SOFTLOCK path.
1221 1238 */
1222 1239 if (pl_built) {
1223 1240 /*
1224 1241 * No one else has referenced the ppa[].
1225 1242 * We created it and we need to destroy it.
1226 1243 */
1227 1244 sptd->spt_ppa = NULL;
1228 1245 }
1229 1246 ret = ENOTSUP;
1230 1247 goto insert_fail;
1231 1248 }
1232 1249
1233 1250 /*
1234 1251 * In either case, we increment softlockcnt on the 'real' segment.
1235 1252 */
1236 1253 sptd->spt_pcachecnt++;
1237 1254 atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1238 1255
1239 1256 ppa = sptd->spt_ppa;
1240 1257 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
1241 1258 if (ppa[an_idx] == NULL) {
1242 1259 mutex_exit(&sptd->spt_lock);
1243 1260 seg_pinactive(seg, NULL, seg->s_base,
1244 1261 sptd->spt_amp->size,
1245 1262 pl, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1246 1263 *ppp = NULL;
1247 1264 return (ENOTSUP);
1248 1265 }
1249 1266 if ((szc = ppa[an_idx]->p_szc) != 0) {
1250 1267 npgs = page_get_pagecnt(szc);
1251 1268 an_idx = P2ROUNDUP(an_idx + 1, npgs);
1252 1269 } else {
1253 1270 an_idx++;
1254 1271 }
1255 1272 }
1256 1273 /*
1257 1274 * We can now drop the sptd->spt_lock since the ppa[]
1258 1275 * exists and we have incremented pcachecnt.
1259 1276 */
1260 1277 mutex_exit(&sptd->spt_lock);
1261 1278
1262 1279 /*
1263 1280 * Since we cache the entire segment, we want to
1264 1281 * set ppp to point to the first slot that corresponds
1265 1282 * to the requested addr, i.e. pg_idx.
1266 1283 */
1267 1284 *ppp = &(sptd->spt_ppa[pg_idx]);
1268 1285 return (0);
1269 1286
1270 1287 insert_fail:
1271 1288 /*
1272 1289 * We will only reach this code if we tried and failed.
1273 1290 *
1274 1291 * And we can drop the lock on the dummy seg, once we've failed
1275 1292 * to set up a new ppa[].
1276 1293 */
1277 1294 mutex_exit(&sptd->spt_lock);
1278 1295
1279 1296 if (pl_built) {
1280 1297 if (claim_availrmem) {
1281 1298 mutex_enter(&freemem_lock);
1282 1299 availrmem += claim_availrmem;
1283 1300 mutex_exit(&freemem_lock);
1284 1301 }
1285 1302
1286 1303 /*
1287 1304 * We created pl and we need to destroy it.
1288 1305 */
1289 1306 pplist = pl;
1290 1307 for (an_idx = 0; an_idx < tot_npages; an_idx++) {
1291 1308 if (pplist[an_idx] != NULL)
1292 1309 page_unlock(pplist[an_idx]);
1293 1310 }
1294 1311 kmem_free(pl, sizeof (page_t *) * tot_npages);
1295 1312 }
1296 1313
1297 1314 if (shmd->shm_softlockcnt <= 0) {
1298 1315 if (AS_ISUNMAPWAIT(seg->s_as)) {
1299 1316 mutex_enter(&seg->s_as->a_contents);
1300 1317 if (AS_ISUNMAPWAIT(seg->s_as)) {
1301 1318 AS_CLRUNMAPWAIT(seg->s_as);
1302 1319 cv_broadcast(&seg->s_as->a_cv);
1303 1320 }
1304 1321 mutex_exit(&seg->s_as->a_contents);
1305 1322 }
1306 1323 }
1307 1324 *ppp = NULL;
1308 1325 return (ret);
1309 1326 }
1310 1327
1311 1328
1312 1329
1313 1330 /*
1314 1331 * return locked pages over a given range.
1315 1332 *
1316 1333 * We will cache the entire ISM segment and save the pplist for the
1317 1334 * entire segment in the ppa field of the underlying ISM segment structure.
1318 1335 * Later, during a call to segspt_reclaim() we will use this ppa array
1319 1336 * to page_unlock() all of the pages and then we will free this ppa list.
1320 1337 */
1321 1338 /*ARGSUSED*/
1322 1339 static int
1323 1340 segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len,
1324 1341 struct page ***ppp, enum lock_type type, enum seg_rw rw)
1325 1342 {
1326 1343 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1327 1344 struct seg *sptseg = shmd->shm_sptseg;
1328 1345 struct spt_data *sptd = sptseg->s_data;
1329 1346 pgcnt_t np, page_index, npages;
1330 1347 caddr_t a, spt_base;
1331 1348 struct page **pplist, **pl, *pp;
1332 1349 struct anon_map *amp;
1333 1350 ulong_t anon_index;
1334 1351 int ret = ENOTSUP;
1335 1352 uint_t pl_built = 0;
1336 1353 struct anon *ap;
1337 1354 struct vnode *vp;
1338 1355 u_offset_t off;
1339 1356
1340 1357 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
1341 1358 ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);
1342 1359
1343 1360
1344 1361 /*
1345 1362 * We want to lock/unlock the entire ISM segment. Therefore,
1346 1363 * we will be using the underlying sptseg and its base address
1347 1364 * and length for the caching arguments.
1348 1365 */
1349 1366 ASSERT(sptseg);
1350 1367 ASSERT(sptd);
1351 1368
1352 1369 if (sptd->spt_flags & SHM_PAGEABLE) {
1353 1370 return (segspt_dismpagelock(seg, addr, len, ppp, type, rw));
1354 1371 }
1355 1372
1356 1373 page_index = seg_page(seg, addr);
1357 1374 npages = btopr(len);
1358 1375
1359 1376 /*
1360 1377 * check if the request is larger than number of pages covered
1361 1378 * by amp
1362 1379 */
1363 1380 if (page_index + npages > btopr(sptd->spt_amp->size)) {
1364 1381 *ppp = NULL;
1365 1382 return (ENOTSUP);
1366 1383 }
1367 1384
1368 1385 if (type == L_PAGEUNLOCK) {
1369 1386
1370 1387 ASSERT(sptd->spt_ppa != NULL);
1371 1388
1372 1389 seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
1373 1390 sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1374 1391
1375 1392 /*
1376 1393 * If someone is blocked while unmapping, we purge
1377 1394 * segment page cache and thus reclaim pplist synchronously
1378 1395 * without waiting for seg_pasync_thread. This speeds up
1379 1396 * unmapping in cases where munmap(2) is called, while
1380 1397 * raw async i/o is still in progress or where a thread
1381 1398 * exits on data fault in a multithreaded application.
1382 1399 */
1383 1400 if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) {
1384 1401 segspt_purge(seg);
1385 1402 }
1386 1403 return (0);
1387 1404 }
1388 1405
1389 1406 /* The L_PAGELOCK case... */
1390 1407
1391 1408 /*
1392 1409 * First try to find pages in segment page cache, without
1393 1410 * holding the segment lock.
1394 1411 */
1395 1412 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1396 1413 S_WRITE, SEGP_FORCE_WIRED);
1397 1414 if (pplist != NULL) {
1398 1415 ASSERT(sptd->spt_ppa == pplist);
1399 1416 ASSERT(sptd->spt_ppa[page_index]);
1400 1417 /*
1401 1418 * Since we cache the entire ISM segment, we want to
1402 1419 * set ppp to point to the first slot that corresponds
1403 1420 * to the requested addr, i.e. page_index.
1404 1421 */
1405 1422 *ppp = &(sptd->spt_ppa[page_index]);
1406 1423 return (0);
1407 1424 }
1408 1425
1409 1426 mutex_enter(&sptd->spt_lock);
1410 1427
1411 1428 /*
1412 1429 * try to find pages in segment page cache
1413 1430 */
1414 1431 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1415 1432 S_WRITE, SEGP_FORCE_WIRED);
1416 1433 if (pplist != NULL) {
1417 1434 ASSERT(sptd->spt_ppa == pplist);
1418 1435 /*
1419 1436 * Since we cache the entire segment, we want to
1420 1437 * set ppp to point to the first slot that corresponds
1421 1438 * to the requested addr, i.e. page_index.
1422 1439 */
1423 1440 mutex_exit(&sptd->spt_lock);
1424 1441 *ppp = &(sptd->spt_ppa[page_index]);
1425 1442 return (0);
1426 1443 }
1427 1444
1428 1445 if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
1429 1446 SEGP_FORCE_WIRED) == SEGP_FAIL) {
1430 1447 mutex_exit(&sptd->spt_lock);
1431 1448 *ppp = NULL;
1432 1449 return (ENOTSUP);
1433 1450 }
1434 1451
1435 1452 /*
1436 1453 * No need to worry about protections because ISM pages
1437 1454 * are always rw.
1438 1455 */
1439 1456 pl = pplist = NULL;
1440 1457
1441 1458 /*
1442 1459 * Do we need to build the ppa array?
1443 1460 */
1444 1461 if (sptd->spt_ppa == NULL) {
1445 1462 ASSERT(sptd->spt_ppa == pplist);
1446 1463
1447 1464 spt_base = sptseg->s_base;
1448 1465 pl_built = 1;
1449 1466
1450 1467 /*
1451 1468 * availrmem is decremented once during anon_swap_adjust()
1452 1469 * and is incremented during the anon_unresv(), which is
1453 1470 * called from shm_rm_amp() when the segment is destroyed.
1454 1471 */
1455 1472 amp = sptd->spt_amp;
1456 1473 ASSERT(amp != NULL);
1457 1474
1458 1475 /* pcachecnt is protected by sptd->spt_lock */
1459 1476 ASSERT(sptd->spt_pcachecnt == 0);
1460 1477 pplist = kmem_zalloc(sizeof (page_t *)
1461 1478 * btopr(sptd->spt_amp->size), KM_SLEEP);
1462 1479 pl = pplist;
1463 1480
1464 1481 anon_index = seg_page(sptseg, spt_base);
1465 1482
1466 1483 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER);
1467 1484 for (a = spt_base; a < (spt_base + sptd->spt_amp->size);
1468 1485 a += PAGESIZE, anon_index++, pplist++) {
1469 1486 ap = anon_get_ptr(amp->ahp, anon_index);
1470 1487 ASSERT(ap != NULL);
1471 1488 swap_xlate(ap, &vp, &off);
1472 1489 pp = page_lookup(vp, off, SE_SHARED);
1473 1490 ASSERT(pp != NULL);
1474 1491 *pplist = pp;
1475 1492 }
1476 1493 ANON_LOCK_EXIT(&->a_rwlock);
1477 1494
1478 1495 if (a < (spt_base + sptd->spt_amp->size)) {
1479 1496 ret = ENOTSUP;
1480 1497 goto insert_fail;
1481 1498 }
1482 1499 sptd->spt_ppa = pl;
1483 1500 } else {
1484 1501 /*
1485 1502 * We already have a valid ppa[].
1486 1503 */
1487 1504 pl = sptd->spt_ppa;
1488 1505 }
1489 1506
1490 1507 ASSERT(pl != NULL);
1491 1508
1492 1509 ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
1493 1510 sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
1494 1511 segspt_reclaim);
1495 1512 if (ret == SEGP_FAIL) {
1496 1513 /*
1497 1514 * seg_pinsert failed. We return
1498 1515 * ENOTSUP, so that the as_pagelock() code will
1499 1516 * then try the slower F_SOFTLOCK path.
1500 1517 */
1501 1518 if (pl_built) {
1502 1519 /*
1503 1520 * No one else has referenced the ppa[].
1504 1521 * We created it and we need to destroy it.
1505 1522 */
1506 1523 sptd->spt_ppa = NULL;
1507 1524 }
1508 1525 ret = ENOTSUP;
1509 1526 goto insert_fail;
1510 1527 }
1511 1528
1512 1529 /*
1513 1530 * In either case, we increment softlockcnt on the 'real' segment.
1514 1531 */
1515 1532 sptd->spt_pcachecnt++;
1516 1533 atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1517 1534
1518 1535 /*
1519 1536 * We can now drop the sptd->spt_lock since the ppa[]
1520 1537 * exists and we have incremented pcachecnt.
1521 1538 */
1522 1539 mutex_exit(&sptd->spt_lock);
1523 1540
1524 1541 /*
1525 1542 * Since we cache the entire segment, we want to
1526 1543 * set ppp to point to the first slot that corresponds
1527 1544 * to the requested addr, i.e. page_index.
1528 1545 */
1529 1546 *ppp = &(sptd->spt_ppa[page_index]);
1530 1547 return (0);
1531 1548
1532 1549 insert_fail:
1533 1550 /*
1534 1551 * We will only reach this code if we tried and failed.
1535 1552 *
1536 1553 * And we can drop the lock on the dummy seg, once we've failed
1537 1554 * to set up a new ppa[].
1538 1555 */
1539 1556 mutex_exit(&sptd->spt_lock);
1540 1557
1541 1558 if (pl_built) {
1542 1559 /*
1543 1560 * We created pl and we need to destroy it.
1544 1561 */
1545 1562 pplist = pl;
1546 1563 np = (((uintptr_t)(a - spt_base)) >> PAGESHIFT);
1547 1564 while (np) {
1548 1565 page_unlock(*pplist);
1549 1566 np--;
1550 1567 pplist++;
1551 1568 }
1552 1569 kmem_free(pl, sizeof (page_t *) * btopr(sptd->spt_amp->size));
1553 1570 }
1554 1571 if (shmd->shm_softlockcnt <= 0) {
1555 1572 if (AS_ISUNMAPWAIT(seg->s_as)) {
1556 1573 mutex_enter(&seg->s_as->a_contents);
1557 1574 if (AS_ISUNMAPWAIT(seg->s_as)) {
1558 1575 AS_CLRUNMAPWAIT(seg->s_as);
1559 1576 cv_broadcast(&seg->s_as->a_cv);
1560 1577 }
1561 1578 mutex_exit(&seg->s_as->a_contents);
1562 1579 }
1563 1580 }
1564 1581 *ppp = NULL;
1565 1582 return (ret);
1566 1583 }
1567 1584
1568 1585 /*
1569 1586 * purge any cached pages in the I/O page cache
1570 1587 */
1571 1588 static void
1572 1589 segspt_purge(struct seg *seg)
1573 1590 {
1574 1591 seg_ppurge(seg, NULL, SEGP_FORCE_WIRED);
1575 1592 }
1576 1593
1577 1594 static int
1578 1595 segspt_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist,
1579 1596 enum seg_rw rw, int async)
1580 1597 {
1581 1598 struct seg *seg = (struct seg *)ptag;
1582 1599 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1583 1600 struct seg *sptseg;
1584 1601 struct spt_data *sptd;
1585 1602 pgcnt_t npages, i, free_availrmem = 0;
1586 1603 int done = 0;
1587 1604
1588 1605 #ifdef lint
1589 1606 addr = addr;
1590 1607 #endif
1591 1608 sptseg = shmd->shm_sptseg;
1592 1609 sptd = sptseg->s_data;
1593 1610 npages = (len >> PAGESHIFT);
1594 1611 ASSERT(npages);
1595 1612 ASSERT(sptd->spt_pcachecnt != 0);
1596 1613 ASSERT(sptd->spt_ppa == pplist);
1597 1614 ASSERT(npages == btopr(sptd->spt_amp->size));
1598 1615 ASSERT(async || AS_LOCK_HELD(seg->s_as));
1599 1616
1600 1617 /*
1601 1618 * Acquire the lock on the dummy seg and destroy the
1602 1619 * ppa array IF this is the last pcachecnt.
1603 1620 */
1604 1621 mutex_enter(&sptd->spt_lock);
1605 1622 if (--sptd->spt_pcachecnt == 0) {
1606 1623 for (i = 0; i < npages; i++) {
1607 1624 if (pplist[i] == NULL) {
1608 1625 continue;
1609 1626 }
1610 1627 if (rw == S_WRITE) {
1611 1628 hat_setrefmod(pplist[i]);
1612 1629 } else {
1613 1630 hat_setref(pplist[i]);
1614 1631 }
1615 1632 if ((sptd->spt_flags & SHM_PAGEABLE) &&
1616 1633 (sptd->spt_ppa_lckcnt[i] == 0))
1617 1634 free_availrmem++;
1618 1635 page_unlock(pplist[i]);
1619 1636 }
1620 1637 if ((sptd->spt_flags & SHM_PAGEABLE) && free_availrmem) {
1621 1638 mutex_enter(&freemem_lock);
1622 1639 availrmem += free_availrmem;
1623 1640 mutex_exit(&freemem_lock);
1624 1641 }
1625 1642 /*
1626 1643 * Since we want to cache/uncache the entire ISM segment,
1627 1644 * we will track the pplist in a segspt specific field
1628 1645 * ppa, that is initialized at the time we add an entry to
1629 1646 * the cache.
1630 1647 */
1631 1648 ASSERT(sptd->spt_pcachecnt == 0);
1632 1649 kmem_free(pplist, sizeof (page_t *) * npages);
1633 1650 sptd->spt_ppa = NULL;
1634 1651 sptd->spt_flags &= ~DISM_PPA_CHANGED;
1635 1652 sptd->spt_gen++;
1636 1653 cv_broadcast(&sptd->spt_cv);
1637 1654 done = 1;
1638 1655 }
1639 1656 mutex_exit(&sptd->spt_lock);
1640 1657
1641 1658 /*
1642 1659 * If we are pcache async thread or called via seg_ppurge_wiredpp() we
1643 1660 * may not hold AS lock (in this case async argument is not 0). This
1644 1661 * means if softlockcnt drops to 0 after the decrement below address
1645 1662 * space may get freed. We can't allow it since after softlock
1646 1663 * decrement to 0 we still need to access the as structure for possible
1647 1664 * wakeup of unmap waiters. To prevent the disappearance of as we take
1648 1665 * this segment's shm_segfree_syncmtx. segspt_shmfree() also takes
1649 1666 * this mutex as a barrier to make sure this routine completes before
1650 1667 * segment is freed.
1651 1668 *
1652 1669 * The second complication we have to deal with in async case is a
1653 1670 * possibility of missed wake up of unmap wait thread. When we don't
1654 1671 * hold as lock here we may take a_contents lock before unmap wait
1655 1672 * thread that was first to see softlockcnt was still not 0. As a
1656 1673 * result we'll fail to wake up an unmap wait thread. To avoid this
1657 1674 * race we set nounmapwait flag in as structure if we drop softlockcnt
1658 1675 * to 0 if async is not 0. unmapwait thread
1659 1676 * will not block if this flag is set.
1660 1677 */
1661 1678 if (async)
1662 1679 mutex_enter(&shmd->shm_segfree_syncmtx);
1663 1680
1664 1681 /*
1665 1682 * Now decrement softlockcnt.
1666 1683 */
1667 1684 ASSERT(shmd->shm_softlockcnt > 0);
1668 1685 atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1669 1686
1670 1687 if (shmd->shm_softlockcnt <= 0) {
1671 1688 if (async || AS_ISUNMAPWAIT(seg->s_as)) {
1672 1689 mutex_enter(&seg->s_as->a_contents);
1673 1690 if (async)
1674 1691 AS_SETNOUNMAPWAIT(seg->s_as);
1675 1692 if (AS_ISUNMAPWAIT(seg->s_as)) {
1676 1693 AS_CLRUNMAPWAIT(seg->s_as);
1677 1694 cv_broadcast(&seg->s_as->a_cv);
1678 1695 }
1679 1696 mutex_exit(&seg->s_as->a_contents);
1680 1697 }
1681 1698 }
1682 1699
1683 1700 if (async)
1684 1701 mutex_exit(&shmd->shm_segfree_syncmtx);
1685 1702
1686 1703 return (done);
1687 1704 }
1688 1705
1689 1706 /*
1690 1707 * Do a F_SOFTUNLOCK call over the range requested.
1691 1708 * The range must have already been F_SOFTLOCK'ed.
1692 1709 *
1693 1710 * The calls to acquire and release the anon map lock mutex were
1694 1711 * removed in order to avoid a deadly embrace during a DR
1695 1712 	 * memory delete operation. (E.g. DR blocks while waiting for an
1696 1713 * exclusive lock on a page that is being used for kaio; the
1697 1714 * thread that will complete the kaio and call segspt_softunlock
1698 1715 * blocks on the anon map lock; another thread holding the anon
1699 1716 * map lock blocks on another page lock via the segspt_shmfault
1700 1717 * -> page_lookup -> page_lookup_create -> page_lock_es code flow.)
1701 1718 *
1702 1719 * The appropriateness of the removal is based upon the following:
1703 1720 * 1. If we are holding a segment's reader lock and the page is held
1704 1721 * shared, then the corresponding element in anonmap which points to
1705 1722 * anon struct cannot change and there is no need to acquire the
1706 1723 * anonymous map lock.
1707 1724 * 2. Threads in segspt_softunlock have a reader lock on the segment
1708 1725 * and already have the shared page lock, so we are guaranteed that
1709 1726 * the anon map slot cannot change and therefore can call anon_get_ptr()
1710 1727 * without grabbing the anonymous map lock.
1711 1728 * 3. Threads that softlock a shared page break copy-on-write, even if
1712 1729 	 * it's a read. Thus cow faults can be ignored with respect to soft
1713 1730 * unlocking, since the breaking of cow means that the anon slot(s) will
1714 1731 * not be shared.
1715 1732 */
1716 1733 static void
1717 1734 segspt_softunlock(struct seg *seg, caddr_t sptseg_addr,
1718 1735 size_t len, enum seg_rw rw)
1719 1736 {
1720 1737 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1721 1738 struct seg *sptseg;
1722 1739 struct spt_data *sptd;
1723 1740 page_t *pp;
1724 1741 caddr_t adr;
1725 1742 struct vnode *vp;
1726 1743 u_offset_t offset;
1727 1744 ulong_t anon_index;
1728 1745 struct anon_map *amp; /* XXX - for locknest */
1729 1746 struct anon *ap = NULL;
1730 1747 pgcnt_t npages;
1731 1748
1732 1749 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
1733 1750
1734 1751 sptseg = shmd->shm_sptseg;
1735 1752 sptd = sptseg->s_data;
1736 1753
1737 1754 /*
1738 1755 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
1739 1756 * and therefore their pages are SE_SHARED locked
1740 1757 * for the entire life of the segment.
1741 1758 */
1742 1759 if ((!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) &&
1743 1760 ((sptd->spt_flags & SHM_PAGEABLE) == 0)) {
1744 1761 goto softlock_decrement;
1745 1762 }
1746 1763
1747 1764 /*
1748 1765 * Any thread is free to do a page_find and
1749 1766 * page_unlock() on the pages within this seg.
1750 1767 *
1751 1768 * We are already holding the as->a_lock on the user's
1752 1769 * real segment, but we need to hold the a_lock on the
1753 1770 * underlying dummy as. This is mostly to satisfy the
1754 1771 * underlying HAT layer.
1755 1772 */
1756 1773 AS_LOCK_ENTER(sptseg->s_as, RW_READER);
1757 1774 hat_unlock(sptseg->s_as->a_hat, sptseg_addr, len);
1758 1775 AS_LOCK_EXIT(sptseg->s_as);
1759 1776
1760 1777 amp = sptd->spt_amp;
1761 1778 ASSERT(amp != NULL);
1762 1779 anon_index = seg_page(sptseg, sptseg_addr);
1763 1780
1764 1781 for (adr = sptseg_addr; adr < sptseg_addr + len; adr += PAGESIZE) {
1765 1782 ap = anon_get_ptr(amp->ahp, anon_index++);
1766 1783 ASSERT(ap != NULL);
1767 1784 swap_xlate(ap, &vp, &offset);
1768 1785
1769 1786 /*
1770 1787 * Use page_find() instead of page_lookup() to
1771 1788 * find the page since we know that it has a
1772 1789 * "shared" lock.
1773 1790 */
1774 1791 pp = page_find(vp, offset);
1775 1792 ASSERT(ap == anon_get_ptr(amp->ahp, anon_index - 1));
1776 1793 if (pp == NULL) {
1777 1794 panic("segspt_softunlock: "
1778 1795 "addr %p, ap %p, vp %p, off %llx",
1779 1796 (void *)adr, (void *)ap, (void *)vp, offset);
1780 1797 /*NOTREACHED*/
1781 1798 }
1782 1799
1783 1800 if (rw == S_WRITE) {
1784 1801 hat_setrefmod(pp);
1785 1802 } else if (rw != S_OTHER) {
1786 1803 hat_setref(pp);
1787 1804 }
1788 1805 page_unlock(pp);
1789 1806 }
1790 1807
1791 1808 softlock_decrement:
1792 1809 npages = btopr(len);
1793 1810 ASSERT(shmd->shm_softlockcnt >= npages);
1794 1811 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -npages);
1795 1812 if (shmd->shm_softlockcnt == 0) {
1796 1813 /*
1797 1814 * All SOFTLOCKS are gone. Wakeup any waiting
1798 1815 * unmappers so they can try again to unmap.
1799 1816 * Check for waiters first without the mutex
1800 1817 * held so we don't always grab the mutex on
1801 1818 * softunlocks.
1802 1819 */
1803 1820 if (AS_ISUNMAPWAIT(seg->s_as)) {
1804 1821 mutex_enter(&seg->s_as->a_contents);
1805 1822 if (AS_ISUNMAPWAIT(seg->s_as)) {
1806 1823 AS_CLRUNMAPWAIT(seg->s_as);
1807 1824 cv_broadcast(&seg->s_as->a_cv);
1808 1825 }
1809 1826 mutex_exit(&seg->s_as->a_contents);
1810 1827 }
1811 1828 }
1812 1829 }
1813 1830
1814 1831 int
1815 1832 segspt_shmattach(struct seg **segpp, void *argsp)
1816 1833 {
1817 1834 struct seg *seg = *segpp;
1818 1835 struct shm_data *shmd_arg = (struct shm_data *)argsp;
1819 1836 struct shm_data *shmd;
1820 1837 struct anon_map *shm_amp = shmd_arg->shm_amp;
1821 1838 struct spt_data *sptd;
1822 1839 int error = 0;
1823 1840
1824 1841 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
1825 1842
1826 1843 shmd = kmem_zalloc((sizeof (*shmd)), KM_NOSLEEP);
1827 1844 if (shmd == NULL)
1828 1845 return (ENOMEM);
1829 1846
1830 1847 shmd->shm_sptas = shmd_arg->shm_sptas;
1831 1848 shmd->shm_amp = shm_amp;
1832 1849 shmd->shm_sptseg = shmd_arg->shm_sptseg;
1833 1850
1834 1851 (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, shm_amp, 0,
1835 1852 NULL, 0, seg->s_size);
1836 1853
1837 1854 mutex_init(&shmd->shm_segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL);
1838 1855
1839 1856 seg->s_data = (void *)shmd;
1840 1857 seg->s_ops = &segspt_shmops;
1841 1858 seg->s_szc = shmd->shm_sptseg->s_szc;
1842 1859 sptd = shmd->shm_sptseg->s_data;
1843 1860
1844 1861 if (sptd->spt_flags & SHM_PAGEABLE) {
1845 1862 if ((shmd->shm_vpage = kmem_zalloc(btopr(shm_amp->size),
1846 1863 KM_NOSLEEP)) == NULL) {
1847 1864 seg->s_data = (void *)NULL;
1848 1865 kmem_free(shmd, (sizeof (*shmd)));
1849 1866 return (ENOMEM);
1850 1867 }
1851 1868 shmd->shm_lckpgs = 0;
1852 1869 if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
1853 1870 if ((error = hat_share(seg->s_as->a_hat, seg->s_base,
1854 1871 shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
1855 1872 seg->s_size, seg->s_szc)) != 0) {
1856 1873 kmem_free(shmd->shm_vpage,
1857 1874 btopr(shm_amp->size));
1858 1875 }
1859 1876 }
1860 1877 } else {
1861 1878 error = hat_share(seg->s_as->a_hat, seg->s_base,
1862 1879 shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
1863 1880 seg->s_size, seg->s_szc);
1864 1881 }
1865 1882 if (error) {
1866 1883 seg->s_szc = 0;
1867 1884 seg->s_data = (void *)NULL;
1868 1885 kmem_free(shmd, (sizeof (*shmd)));
1869 1886 } else {
1870 1887 ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
1871 1888 shm_amp->refcnt++;
1872 1889 ANON_LOCK_EXIT(&shm_amp->a_rwlock);
1873 1890 }
1874 1891 return (error);
1875 1892 }
1876 1893
1877 1894 int
1878 1895 segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize)
1879 1896 {
1880 1897 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1881 1898 int reclaim = 1;
1882 1899
1883 1900 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
1884 1901 retry:
1885 1902 if (shmd->shm_softlockcnt > 0) {
1886 1903 if (reclaim == 1) {
1887 1904 segspt_purge(seg);
1888 1905 reclaim = 0;
1889 1906 goto retry;
1890 1907 }
1891 1908 return (EAGAIN);
1892 1909 }
1893 1910
1894 1911 if (ssize != seg->s_size) {
1895 1912 #ifdef DEBUG
1896 1913 cmn_err(CE_WARN, "Incompatible ssize %lx s_size %lx\n",
1897 1914 ssize, seg->s_size);
1898 1915 #endif
1899 1916 return (EINVAL);
1900 1917 }
1901 1918
1902 1919 (void) segspt_shmlockop(seg, raddr, shmd->shm_amp->size, 0, MC_UNLOCK,
1903 1920 NULL, 0);
1904 1921 hat_unshare(seg->s_as->a_hat, raddr, ssize, seg->s_szc);
1905 1922
1906 1923 seg_free(seg);
1907 1924
1908 1925 return (0);
1909 1926 }
1910 1927
1911 1928 void
1912 1929 segspt_shmfree(struct seg *seg)
1913 1930 {
1914 1931 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1915 1932 struct anon_map *shm_amp = shmd->shm_amp;
1916 1933
1917 1934 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
1918 1935
1919 1936 (void) segspt_shmlockop(seg, seg->s_base, shm_amp->size, 0,
1920 1937 MC_UNLOCK, NULL, 0);
1921 1938
1922 1939 /*
1923 1940 * Need to increment refcnt when attaching
1924 1941 * and decrement when detaching because of dup().
1925 1942 */
1926 1943 ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
1927 1944 shm_amp->refcnt--;
1928 1945 ANON_LOCK_EXIT(&shm_amp->a_rwlock);
1929 1946
1930 1947 if (shmd->shm_vpage) { /* only for DISM */
1931 1948 kmem_free(shmd->shm_vpage, btopr(shm_amp->size));
1932 1949 shmd->shm_vpage = NULL;
1933 1950 }
1934 1951
1935 1952 /*
1936 1953 * Take shm_segfree_syncmtx lock to let segspt_reclaim() finish if it's
1937 1954 * still working with this segment without holding as lock.
1938 1955 */
1939 1956 ASSERT(shmd->shm_softlockcnt == 0);
1940 1957 mutex_enter(&shmd->shm_segfree_syncmtx);
1941 1958 mutex_destroy(&shmd->shm_segfree_syncmtx);
1942 1959
1943 1960 kmem_free(shmd, sizeof (*shmd));
1944 1961 }
1945 1962
1946 1963 /*ARGSUSED*/
1947 1964 int
1948 1965 segspt_shmsetprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
1949 1966 {
1950 1967 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
1951 1968
1952 1969 /*
1953 1970 * Shared page table is more than shared mapping.
1954 1971 * Individual process sharing page tables can't change prot
1955 1972 * because there is only one set of page tables.
1956 1973 * This will be allowed after private page table is
1957 1974 * supported.
1958 1975 */
1959 1976 /* need to return correct status error? */
1960 1977 return (0);
1961 1978 }
1962 1979
1963 1980
1964 1981 faultcode_t
1965 1982 segspt_dismfault(struct hat *hat, struct seg *seg, caddr_t addr,
1966 1983 size_t len, enum fault_type type, enum seg_rw rw)
1967 1984 {
1968 1985 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1969 1986 struct seg *sptseg = shmd->shm_sptseg;
1970 1987 struct as *curspt = shmd->shm_sptas;
1971 1988 struct spt_data *sptd = sptseg->s_data;
1972 1989 pgcnt_t npages;
1973 1990 size_t size;
1974 1991 caddr_t segspt_addr, shm_addr;
1975 1992 page_t **ppa;
1976 1993 int i;
1977 1994 ulong_t an_idx = 0;
1978 1995 int err = 0;
1979 1996 int dyn_ism_unmap = hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0);
1980 1997 size_t pgsz;
1981 1998 pgcnt_t pgcnt;
1982 1999 caddr_t a;
1983 2000 pgcnt_t pidx;
1984 2001
1985 2002 #ifdef lint
1986 2003 hat = hat;
1987 2004 #endif
1988 2005 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
1989 2006
1990 2007 /*
1991 2008 * Because of the way spt is implemented
1992 2009 * the realsize of the segment does not have to be
1993 2010 * equal to the segment size itself. The segment size is
1994 2011 * often in multiples of a page size larger than PAGESIZE.
1995 2012 * The realsize is rounded up to the nearest PAGESIZE
1996 2013 * based on what the user requested. This is a bit of
1997 2014 	 * ugliness that is historical but not easily fixed
1998 2015 * without re-designing the higher levels of ISM.
1999 2016 */
2000 2017 ASSERT(addr >= seg->s_base);
2001 2018 if (((addr + len) - seg->s_base) > sptd->spt_realsize)
2002 2019 return (FC_NOMAP);
2003 2020 /*
2004 2021 * For all of the following cases except F_PROT, we need to
2005 2022 * make any necessary adjustments to addr and len
2006 2023 * and get all of the necessary page_t's into an array called ppa[].
2007 2024 *
2008 2025 * The code in shmat() forces base addr and len of ISM segment
2009 2026 * to be aligned to largest page size supported. Therefore,
2010 2027 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
2011 2028 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
2012 2029 * in large pagesize chunks, or else we will screw up the HAT
2013 2030 * layer by calling hat_memload_array() with differing page sizes
2014 2031 * over a given virtual range.
2015 2032 */
2016 2033 pgsz = page_get_pagesize(sptseg->s_szc);
2017 2034 pgcnt = page_get_pagecnt(sptseg->s_szc);
2018 2035 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
2019 2036 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
2020 2037 npages = btopr(size);
2021 2038
2022 2039 /*
2023 2040 * Now we need to convert from addr in segshm to addr in segspt.
2024 2041 */
2025 2042 an_idx = seg_page(seg, shm_addr);
2026 2043 segspt_addr = sptseg->s_base + ptob(an_idx);
2027 2044
2028 2045 ASSERT((segspt_addr + ptob(npages)) <=
2029 2046 (sptseg->s_base + sptd->spt_realsize));
2030 2047 ASSERT(segspt_addr < (sptseg->s_base + sptseg->s_size));
2031 2048
2032 2049 switch (type) {
2033 2050
2034 2051 case F_SOFTLOCK:
2035 2052
2036 2053 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
2037 2054 /*
2038 2055 * Fall through to the F_INVAL case to load up the hat layer
2039 2056 * entries with the HAT_LOAD_LOCK flag.
2040 2057 */
2041 2058 /* FALLTHRU */
2042 2059 case F_INVAL:
2043 2060
2044 2061 if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
2045 2062 return (FC_NOMAP);
2046 2063
2047 2064 ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP);
2048 2065
2049 2066 err = spt_anon_getpages(sptseg, segspt_addr, size, ppa);
2050 2067 if (err != 0) {
2051 2068 if (type == F_SOFTLOCK) {
2052 2069 atomic_add_long((ulong_t *)(
2053 2070 &(shmd->shm_softlockcnt)), -npages);
2054 2071 }
2055 2072 goto dism_err;
2056 2073 }
2057 2074 AS_LOCK_ENTER(sptseg->s_as, RW_READER);
2058 2075 a = segspt_addr;
2059 2076 pidx = 0;
2060 2077 if (type == F_SOFTLOCK) {
2061 2078
2062 2079 /*
2063 2080 * Load up the translation keeping it
2064 2081 * locked and don't unlock the page.
2065 2082 */
2066 2083 for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2067 2084 hat_memload_array(sptseg->s_as->a_hat,
2068 2085 a, pgsz, &ppa[pidx], sptd->spt_prot,
2069 2086 HAT_LOAD_LOCK | HAT_LOAD_SHARE);
2070 2087 }
2071 2088 } else {
2072 2089 /*
2073 2090 * Migrate pages marked for migration
2074 2091 */
2075 2092 if (lgrp_optimizations())
2076 2093 page_migrate(seg, shm_addr, ppa, npages);
2077 2094
2078 2095 for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2079 2096 hat_memload_array(sptseg->s_as->a_hat,
2080 2097 a, pgsz, &ppa[pidx],
2081 2098 sptd->spt_prot,
2082 2099 HAT_LOAD_SHARE);
2083 2100 }
2084 2101
2085 2102 /*
2086 2103 * And now drop the SE_SHARED lock(s).
2087 2104 */
2088 2105 if (dyn_ism_unmap) {
2089 2106 for (i = 0; i < npages; i++) {
2090 2107 page_unlock(ppa[i]);
2091 2108 }
2092 2109 }
2093 2110 }
2094 2111
2095 2112 if (!dyn_ism_unmap) {
2096 2113 if (hat_share(seg->s_as->a_hat, shm_addr,
2097 2114 curspt->a_hat, segspt_addr, ptob(npages),
2098 2115 seg->s_szc) != 0) {
2099 2116 panic("hat_share err in DISM fault");
2100 2117 /* NOTREACHED */
2101 2118 }
2102 2119 if (type == F_INVAL) {
2103 2120 for (i = 0; i < npages; i++) {
2104 2121 page_unlock(ppa[i]);
2105 2122 }
2106 2123 }
2107 2124 }
2108 2125 AS_LOCK_EXIT(sptseg->s_as);
2109 2126 dism_err:
2110 2127 kmem_free(ppa, npages * sizeof (page_t *));
2111 2128 return (err);
2112 2129
2113 2130 case F_SOFTUNLOCK:
2114 2131
2115 2132 /*
2116 2133 * This is a bit ugly, we pass in the real seg pointer,
2117 2134 * but the segspt_addr is the virtual address within the
2118 2135 * dummy seg.
2119 2136 */
2120 2137 segspt_softunlock(seg, segspt_addr, size, rw);
2121 2138 return (0);
2122 2139
2123 2140 case F_PROT:
2124 2141
2125 2142 /*
2126 2143 * This takes care of the unusual case where a user
2127 2144 * allocates a stack in shared memory and a register
2128 2145 * window overflow is written to that stack page before
2129 2146 * it is otherwise modified.
2130 2147 *
2131 2148 * We can get away with this because ISM segments are
2132 2149 * always rw. Other than this unusual case, there
2133 2150 * should be no instances of protection violations.
2134 2151 */
2135 2152 return (0);
2136 2153
2137 2154 default:
2138 2155 #ifdef DEBUG
2139 2156 panic("segspt_dismfault default type?");
2140 2157 #else
2141 2158 return (FC_NOMAP);
2142 2159 #endif
2143 2160 }
2144 2161 }
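
The shm_addr/size computation above (repeated in segspt_shmfault() below) expands a fault on any address into a request that covers whole large pages, so the HAT is always loaded in uniform large-page chunks. Below is a minimal userland sketch of that arithmetic, assuming a 4MB large page and made-up addresses; MY_P2ALIGN and MY_P2ROUNDUP are local stand-ins for the kernel's P2ALIGN/P2ROUNDUP macros, not the kernel code itself.

#include <stdio.h>
#include <stdint.h>

/* Local stand-ins for the kernel's P2ALIGN/P2ROUNDUP (power-of-2 align). */
#define	MY_P2ALIGN(x, align)	((x) & -(align))
#define	MY_P2ROUNDUP(x, align)	(-(-(x) & -(align)))

int
main(void)
{
	uintptr_t pgsz = 4UL * 1024 * 1024;	/* assume a 4MB large page */
	uintptr_t addr = 0x80003000UL, len = 0x2000UL;

	uintptr_t shm_addr = MY_P2ALIGN(addr, pgsz);
	uintptr_t size = MY_P2ROUNDUP(addr + len - shm_addr, pgsz);

	/* An 8KB fault at an unaligned address expands to one whole 4MB page. */
	printf("shm_addr=0x%lx size=0x%lx\n",
	    (unsigned long)shm_addr, (unsigned long)size);
	return (0);
}
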
2145 2162
2146 2163
2147 2164 faultcode_t
2148 2165 segspt_shmfault(struct hat *hat, struct seg *seg, caddr_t addr,
2149 2166 size_t len, enum fault_type type, enum seg_rw rw)
2150 2167 {
2151 2168 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2152 2169 struct seg *sptseg = shmd->shm_sptseg;
2153 2170 struct as *curspt = shmd->shm_sptas;
2154 2171 struct spt_data *sptd = sptseg->s_data;
2155 2172 pgcnt_t npages;
2156 2173 size_t size;
2157 2174 caddr_t sptseg_addr, shm_addr;
2158 2175 page_t *pp, **ppa;
2159 2176 int i;
2160 2177 u_offset_t offset;
2161 2178 ulong_t anon_index = 0;
2162 2179 struct vnode *vp;
2163 2180 struct anon_map *amp; /* XXX - for locknest */
2164 2181 struct anon *ap = NULL;
2165 2182 size_t pgsz;
2166 2183 pgcnt_t pgcnt;
2167 2184 caddr_t a;
2168 2185 pgcnt_t pidx;
2169 2186 size_t sz;
2170 2187
2171 2188 #ifdef lint
2172 2189 hat = hat;
2173 2190 #endif
2174 2191
2175 2192 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2176 2193
2177 2194 if (sptd->spt_flags & SHM_PAGEABLE) {
2178 2195 return (segspt_dismfault(hat, seg, addr, len, type, rw));
2179 2196 }
2180 2197
2181 2198 /*
2182 2199 * Because of the way spt is implemented
2183 2200 * the realsize of the segment does not have to be
2184 2201 * equal to the segment size itself. The segment size is
2185 2202 * often in multiples of a page size larger than PAGESIZE.
2186 2203 * The realsize is rounded up to the nearest PAGESIZE
2187 2204 * based on what the user requested. This is a bit of
2188 2205 	 * ugliness that is historical but not easily fixed
2189 2206 * without re-designing the higher levels of ISM.
2190 2207 */
2191 2208 ASSERT(addr >= seg->s_base);
2192 2209 if (((addr + len) - seg->s_base) > sptd->spt_realsize)
2193 2210 return (FC_NOMAP);
2194 2211 /*
2195 2212 * For all of the following cases except F_PROT, we need to
2196 2213 * make any necessary adjustments to addr and len
2197 2214 * and get all of the necessary page_t's into an array called ppa[].
2198 2215 *
2199 2216 * The code in shmat() forces base addr and len of ISM segment
2200 2217 * to be aligned to largest page size supported. Therefore,
2201 2218 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
2202 2219 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
2203 2220 * in large pagesize chunks, or else we will screw up the HAT
2204 2221 * layer by calling hat_memload_array() with differing page sizes
2205 2222 * over a given virtual range.
2206 2223 */
2207 2224 pgsz = page_get_pagesize(sptseg->s_szc);
2208 2225 pgcnt = page_get_pagecnt(sptseg->s_szc);
2209 2226 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
2210 2227 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
2211 2228 npages = btopr(size);
2212 2229
2213 2230 /*
2214 2231 * Now we need to convert from addr in segshm to addr in segspt.
2215 2232 */
2216 2233 anon_index = seg_page(seg, shm_addr);
2217 2234 sptseg_addr = sptseg->s_base + ptob(anon_index);
2218 2235
2219 2236 /*
2220 2237 * And now we may have to adjust npages downward if we have
2221 2238 * exceeded the realsize of the segment or initial anon
2222 2239 * allocations.
2223 2240 */
2224 2241 if ((sptseg_addr + ptob(npages)) >
2225 2242 (sptseg->s_base + sptd->spt_realsize))
2226 2243 size = (sptseg->s_base + sptd->spt_realsize) - sptseg_addr;
2227 2244
2228 2245 npages = btopr(size);
2229 2246
2230 2247 ASSERT(sptseg_addr < (sptseg->s_base + sptseg->s_size));
2231 2248 ASSERT((sptd->spt_flags & SHM_PAGEABLE) == 0);
2232 2249
2233 2250 switch (type) {
2234 2251
2235 2252 case F_SOFTLOCK:
2236 2253
2237 2254 /*
2238 2255 * availrmem is decremented once during anon_swap_adjust()
2239 2256 * and is incremented during the anon_unresv(), which is
2240 2257 * called from shm_rm_amp() when the segment is destroyed.
2241 2258 */
2242 2259 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
2243 2260 /*
2244 2261 * Some platforms assume that ISM pages are SE_SHARED
2245 2262 * locked for the entire life of the segment.
2246 2263 */
2247 2264 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0))
2248 2265 return (0);
2249 2266 /*
2250 2267 * Fall through to the F_INVAL case to load up the hat layer
2251 2268 * entries with the HAT_LOAD_LOCK flag.
2252 2269 */
2253 2270
2254 2271 /* FALLTHRU */
2255 2272 case F_INVAL:
2256 2273
2257 2274 if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
2258 2275 return (FC_NOMAP);
2259 2276
2260 2277 /*
2261 2278 * Some platforms that do NOT support DYNAMIC_ISM_UNMAP
2262 2279 * may still rely on this call to hat_share(). That
2263 2280 		 * would imply that those hats can fault on a
2264 2281 * HAT_LOAD_LOCK translation, which would seem
2265 2282 * contradictory.
2266 2283 */
2267 2284 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
2268 2285 if (hat_share(seg->s_as->a_hat, seg->s_base,
2269 2286 curspt->a_hat, sptseg->s_base,
2270 2287 sptseg->s_size, sptseg->s_szc) != 0) {
2271 2288 panic("hat_share error in ISM fault");
2272 2289 /*NOTREACHED*/
2273 2290 }
2274 2291 return (0);
2275 2292 }
2276 2293 ppa = kmem_zalloc(sizeof (page_t *) * npages, KM_SLEEP);
2277 2294
2278 2295 /*
2279 2296 		 * I see no need to lock the real seg here,
2280 2297 		 * because all of our work will be on the underlying
2281 2298 * dummy seg.
2282 2299 *
2283 2300 * sptseg_addr and npages now account for large pages.
2284 2301 */
2285 2302 amp = sptd->spt_amp;
2286 2303 ASSERT(amp != NULL);
2287 2304 anon_index = seg_page(sptseg, sptseg_addr);
2288 2305
2289 2306 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2290 2307 for (i = 0; i < npages; i++) {
2291 2308 ap = anon_get_ptr(amp->ahp, anon_index++);
2292 2309 ASSERT(ap != NULL);
2293 2310 swap_xlate(ap, &vp, &offset);
2294 2311 pp = page_lookup(vp, offset, SE_SHARED);
2295 2312 ASSERT(pp != NULL);
2296 2313 ppa[i] = pp;
2297 2314 }
2298 2315 		ANON_LOCK_EXIT(&amp->a_rwlock);
2299 2316 ASSERT(i == npages);
2300 2317
2301 2318 /*
2302 2319 * We are already holding the as->a_lock on the user's
2303 2320 * real segment, but we need to hold the a_lock on the
2304 2321 * underlying dummy as. This is mostly to satisfy the
2305 2322 * underlying HAT layer.
2306 2323 */
2307 2324 AS_LOCK_ENTER(sptseg->s_as, RW_READER);
2308 2325 a = sptseg_addr;
2309 2326 pidx = 0;
2310 2327 if (type == F_SOFTLOCK) {
2311 2328 /*
2312 2329 * Load up the translation keeping it
2313 2330 * locked and don't unlock the page.
2314 2331 */
2315 2332 for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2316 2333 sz = MIN(pgsz, ptob(npages - pidx));
2317 2334 hat_memload_array(sptseg->s_as->a_hat, a,
2318 2335 sz, &ppa[pidx], sptd->spt_prot,
2319 2336 HAT_LOAD_LOCK | HAT_LOAD_SHARE);
2320 2337 }
2321 2338 } else {
2322 2339 /*
2323 2340 * Migrate pages marked for migration.
2324 2341 */
2325 2342 if (lgrp_optimizations())
2326 2343 page_migrate(seg, shm_addr, ppa, npages);
2327 2344
2328 2345 for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2329 2346 sz = MIN(pgsz, ptob(npages - pidx));
2330 2347 hat_memload_array(sptseg->s_as->a_hat,
2331 2348 a, sz, &ppa[pidx],
2332 2349 sptd->spt_prot, HAT_LOAD_SHARE);
2333 2350 }
2334 2351
2335 2352 /*
2336 2353 * And now drop the SE_SHARED lock(s).
2337 2354 */
2338 2355 for (i = 0; i < npages; i++)
2339 2356 page_unlock(ppa[i]);
2340 2357 }
2341 2358 AS_LOCK_EXIT(sptseg->s_as);
2342 2359
2343 2360 kmem_free(ppa, sizeof (page_t *) * npages);
2344 2361 return (0);
2345 2362 case F_SOFTUNLOCK:
2346 2363
2347 2364 /*
2348 2365 * This is a bit ugly, we pass in the real seg pointer,
2349 2366 * but the sptseg_addr is the virtual address within the
2350 2367 * dummy seg.
2351 2368 */
2352 2369 segspt_softunlock(seg, sptseg_addr, ptob(npages), rw);
2353 2370 return (0);
2354 2371
2355 2372 case F_PROT:
2356 2373
2357 2374 /*
2358 2375 * This takes care of the unusual case where a user
2359 2376 * allocates a stack in shared memory and a register
2360 2377 * window overflow is written to that stack page before
2361 2378 * it is otherwise modified.
2362 2379 *
2363 2380 * We can get away with this because ISM segments are
2364 2381 * always rw. Other than this unusual case, there
2365 2382 * should be no instances of protection violations.
2366 2383 */
2367 2384 return (0);
2368 2385
2369 2386 default:
2370 2387 #ifdef DEBUG
2371 2388 cmn_err(CE_WARN, "segspt_shmfault default type?");
2372 2389 #endif
2373 2390 return (FC_NOMAP);
2374 2391 }
2375 2392 }
2376 2393
2377 2394 /*ARGSUSED*/
2378 2395 static faultcode_t
2379 2396 segspt_shmfaulta(struct seg *seg, caddr_t addr)
2380 2397 {
2381 2398 return (0);
2382 2399 }
2383 2400
2384 2401 /*ARGSUSED*/
2385 2402 static int
2386 2403 segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta)
2387 2404 {
2388 2405 return (0);
2389 2406 }
2390 2407
2391 2408 /*ARGSUSED*/
2392 2409 static size_t
2393 2410 segspt_shmswapout(struct seg *seg)
2394 2411 {
2395 2412 return (0);
2396 2413 }
2397 2414
2398 2415 /*
2399 2416 * duplicate the shared page tables
2400 2417 */
2401 2418 int
2402 2419 segspt_shmdup(struct seg *seg, struct seg *newseg)
2403 2420 {
2404 2421 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2405 2422 struct anon_map *amp = shmd->shm_amp;
2406 2423 struct shm_data *shmd_new;
2407 2424 struct seg *spt_seg = shmd->shm_sptseg;
2408 2425 struct spt_data *sptd = spt_seg->s_data;
2409 2426 int error = 0;
2410 2427
2411 2428 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
2412 2429
2413 2430 shmd_new = kmem_zalloc((sizeof (*shmd_new)), KM_SLEEP);
2414 2431 newseg->s_data = (void *)shmd_new;
2415 2432 shmd_new->shm_sptas = shmd->shm_sptas;
2416 2433 shmd_new->shm_amp = amp;
2417 2434 shmd_new->shm_sptseg = shmd->shm_sptseg;
2418 2435 newseg->s_ops = &segspt_shmops;
2419 2436 newseg->s_szc = seg->s_szc;
2420 2437 ASSERT(seg->s_szc == shmd->shm_sptseg->s_szc);
2421 2438
2422 2439 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2423 2440 amp->refcnt++;
2424 2441 	ANON_LOCK_EXIT(&amp->a_rwlock);
2425 2442
2426 2443 if (sptd->spt_flags & SHM_PAGEABLE) {
2427 2444 shmd_new->shm_vpage = kmem_zalloc(btopr(amp->size), KM_SLEEP);
2428 2445 shmd_new->shm_lckpgs = 0;
2429 2446 if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
2430 2447 if ((error = hat_share(newseg->s_as->a_hat,
2431 2448 newseg->s_base, shmd->shm_sptas->a_hat, SEGSPTADDR,
2432 2449 seg->s_size, seg->s_szc)) != 0) {
2433 2450 kmem_free(shmd_new->shm_vpage,
2434 2451 btopr(amp->size));
2435 2452 }
2436 2453 }
2437 2454 return (error);
2438 2455 } else {
2439 2456 return (hat_share(newseg->s_as->a_hat, newseg->s_base,
2440 2457 shmd->shm_sptas->a_hat, SEGSPTADDR, seg->s_size,
2441 2458 seg->s_szc));
2442 2459
2443 2460 }
2444 2461 }
2445 2462
2446 2463 /*ARGSUSED*/
2447 2464 int
2448 2465 segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
2449 2466 {
2450 2467 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2451 2468 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2452 2469
2453 2470 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2454 2471
2455 2472 /*
2456 2473 * ISM segment is always rw.
2457 2474 */
2458 2475 return (((sptd->spt_prot & prot) != prot) ? EACCES : 0);
2459 2476 }
2460 2477
2461 2478 /*
2462 2479 * Return an array of locked large pages, for empty slots allocate
2463 2480 * private zero-filled anon pages.
2464 2481 */
2465 2482 static int
2466 2483 spt_anon_getpages(
2467 2484 struct seg *sptseg,
2468 2485 caddr_t sptaddr,
2469 2486 size_t len,
2470 2487 page_t *ppa[])
2471 2488 {
2472 2489 struct spt_data *sptd = sptseg->s_data;
2473 2490 struct anon_map *amp = sptd->spt_amp;
2474 2491 enum seg_rw rw = sptd->spt_prot;
2475 2492 uint_t szc = sptseg->s_szc;
2476 2493 size_t pg_sz, share_sz = page_get_pagesize(szc);
2477 2494 pgcnt_t lp_npgs;
2478 2495 caddr_t lp_addr, e_sptaddr;
2479 2496 uint_t vpprot, ppa_szc = 0;
2480 2497 struct vpage *vpage = NULL;
2481 2498 ulong_t j, ppa_idx;
2482 2499 int err, ierr = 0;
2483 2500 pgcnt_t an_idx;
2484 2501 anon_sync_obj_t cookie;
2485 2502 int anon_locked = 0;
2486 2503 pgcnt_t amp_pgs;
2487 2504
2488 2505
2489 2506 ASSERT(IS_P2ALIGNED(sptaddr, share_sz) && IS_P2ALIGNED(len, share_sz));
2490 2507 ASSERT(len != 0);
2491 2508
2492 2509 pg_sz = share_sz;
2493 2510 lp_npgs = btop(pg_sz);
2494 2511 lp_addr = sptaddr;
2495 2512 e_sptaddr = sptaddr + len;
2496 2513 an_idx = seg_page(sptseg, sptaddr);
2497 2514 ppa_idx = 0;
2498 2515
2499 2516 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2500 2517
2501 2518 amp_pgs = page_get_pagecnt(amp->a_szc);
2502 2519
2503 2520 /*CONSTCOND*/
2504 2521 while (1) {
2505 2522 for (; lp_addr < e_sptaddr;
2506 2523 an_idx += lp_npgs, lp_addr += pg_sz, ppa_idx += lp_npgs) {
2507 2524
2508 2525 /*
2509 2526 * If we're currently locked, and we get to a new
2510 2527 * page, unlock our current anon chunk.
2511 2528 */
2512 2529 if (anon_locked && P2PHASE(an_idx, amp_pgs) == 0) {
2513 2530 anon_array_exit(&cookie);
2514 2531 anon_locked = 0;
2515 2532 }
2516 2533 if (!anon_locked) {
2517 2534 anon_array_enter(amp, an_idx, &cookie);
2518 2535 anon_locked = 1;
2519 2536 }
2520 2537 ppa_szc = (uint_t)-1;
2521 2538 ierr = anon_map_getpages(amp, an_idx, szc, sptseg,
2522 2539 lp_addr, sptd->spt_prot, &vpprot, &ppa[ppa_idx],
2523 2540 &ppa_szc, vpage, rw, 0, segvn_anypgsz, 0, kcred);
2524 2541
2525 2542 if (ierr != 0) {
2526 2543 if (ierr > 0) {
2527 2544 err = FC_MAKE_ERR(ierr);
2528 2545 goto lpgs_err;
2529 2546 }
2530 2547 break;
2531 2548 }
2532 2549 }
2533 2550 if (lp_addr == e_sptaddr) {
2534 2551 break;
2535 2552 }
2536 2553 ASSERT(lp_addr < e_sptaddr);
2537 2554
2538 2555 /*
2539 2556 * ierr == -1 means we failed to allocate a large page.
2540 2557 * so do a size down operation.
2541 2558 *
2542 2559 * ierr == -2 means some other process that privately shares
2543 2560 * pages with this process has allocated a larger page and we
2544 2561 * need to retry with larger pages. So do a size up
2545 2562 * operation. This relies on the fact that large pages are
2546 2563 * never partially shared i.e. if we share any constituent
2547 2564 * page of a large page with another process we must share the
2548 2565 * entire large page. Note this cannot happen for SOFTLOCK
2549 2566 * case, unless current address (lpaddr) is at the beginning
2550 2567 * of the next page size boundary because the other process
2551 2568 * couldn't have relocated locked pages.
2552 2569 */
2553 2570 ASSERT(ierr == -1 || ierr == -2);
2554 2571 if (segvn_anypgsz) {
2555 2572 ASSERT(ierr == -2 || szc != 0);
2556 2573 ASSERT(ierr == -1 || szc < sptseg->s_szc);
2557 2574 szc = (ierr == -1) ? szc - 1 : szc + 1;
2558 2575 } else {
2559 2576 /*
2560 2577 * For faults and segvn_anypgsz == 0
2561 2578 * we need to be careful not to loop forever
2562 2579 * if existing page is found with szc other
2563 2580 * than 0 or seg->s_szc. This could be due
2564 2581 * to page relocations on behalf of DR or
2565 2582 * more likely large page creation. For this
2566 2583 * case simply re-size to existing page's szc
2567 2584 * if returned by anon_map_getpages().
2568 2585 */
2569 2586 if (ppa_szc == (uint_t)-1) {
2570 2587 szc = (ierr == -1) ? 0 : sptseg->s_szc;
2571 2588 } else {
2572 2589 ASSERT(ppa_szc <= sptseg->s_szc);
2573 2590 ASSERT(ierr == -2 || ppa_szc < szc);
2574 2591 ASSERT(ierr == -1 || ppa_szc > szc);
2575 2592 szc = ppa_szc;
2576 2593 }
2577 2594 }
2578 2595 pg_sz = page_get_pagesize(szc);
2579 2596 lp_npgs = btop(pg_sz);
2580 2597 ASSERT(IS_P2ALIGNED(lp_addr, pg_sz));
2581 2598 }
2582 2599 if (anon_locked) {
2583 2600 anon_array_exit(&cookie);
2584 2601 }
2585 2602 	ANON_LOCK_EXIT(&amp->a_rwlock);
2586 2603 return (0);
2587 2604
2588 2605 lpgs_err:
2589 2606 if (anon_locked) {
2590 2607 anon_array_exit(&cookie);
2591 2608 }
2592 2609 	ANON_LOCK_EXIT(&amp->a_rwlock);
2593 2610 for (j = 0; j < ppa_idx; j++)
2594 2611 page_unlock(ppa[j]);
2595 2612 return (err);
2596 2613 }
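
The ierr == -1 / ierr == -2 handling above boils down to a small size-adjustment rule. The sketch below restates that rule in userland C as an illustration only; next_szc(), seg_szc, anypgsz and found_szc are hypothetical names standing in for szc, sptseg->s_szc, segvn_anypgsz and ppa_szc (with found_szc < 0 meaning no existing page size was reported).

#include <stdio.h>
#include <assert.h>

/*
 * ierr == -1: the large-page allocation failed, so size down.
 * ierr == -2: another process already shares a larger page, so size up.
 * With anypgsz == 0, jump directly to 0, the segment size, or the size
 * that was actually found, as the kernel loop does.
 */
static unsigned
next_szc(int ierr, unsigned szc, unsigned seg_szc, int anypgsz, int found_szc)
{
	assert(ierr == -1 || ierr == -2);
	if (anypgsz)
		return ((ierr == -1) ? szc - 1 : szc + 1);
	if (found_szc < 0)
		return ((ierr == -1) ? 0 : seg_szc);
	return ((unsigned)found_szc);
}

int
main(void)
{
	/* allocation of the largest page failed: step one size down */
	printf("%u\n", next_szc(-1, 3, 3, 1, -1));
	/* another process holds a larger page: step one size up */
	printf("%u\n", next_szc(-2, 1, 3, 1, -1));
	/* anypgsz == 0: go straight to the size that was actually found */
	printf("%u\n", next_szc(-2, 0, 3, 0, 2));
	return (0);
}
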
2597 2614
2598 2615 /*
2599 2616 * count the number of bytes in a set of spt pages that are currently not
2600 2617 * locked
2601 2618 */
2602 2619 static rctl_qty_t
2603 2620 spt_unlockedbytes(pgcnt_t npages, page_t **ppa)
2604 2621 {
2605 2622 ulong_t i;
2606 2623 rctl_qty_t unlocked = 0;
2607 2624
2608 2625 for (i = 0; i < npages; i++) {
2609 2626 if (ppa[i]->p_lckcnt == 0)
2610 2627 unlocked += PAGESIZE;
2611 2628 }
2612 2629 return (unlocked);
2613 2630 }
2614 2631
2615 2632 extern u_longlong_t randtick(void);
2616 2633 /* number of locks to reserve/skip by spt_lockpages() and spt_unlockpages() */
2617 2634 #define NLCK (NCPU_P2)
2618 2635 /* Random number with a range [0, n-1], n must be power of two */
2619 2636 #define RAND_P2(n) \
2620 2637 ((((long)curthread >> PTR24_LSB) ^ (long)randtick()) & ((n) - 1))
2621 2638
2622 2639 int
2623 2640 spt_lockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
2624 2641 page_t **ppa, ulong_t *lockmap, size_t pos,
2625 2642 rctl_qty_t *locked)
2626 2643 {
2627 2644 struct shm_data *shmd = seg->s_data;
2628 2645 struct spt_data *sptd = shmd->shm_sptseg->s_data;
2629 2646 ulong_t i;
2630 2647 int kernel;
2631 2648 pgcnt_t nlck = 0;
2632 2649 int rv = 0;
2633 2650 int use_reserved = 1;
2634 2651
2635 2652 /* return the number of bytes actually locked */
2636 2653 *locked = 0;
2637 2654
2638 2655 /*
2639 2656 * To avoid contention on freemem_lock, availrmem and pages_locked
2640 2657 * global counters are updated only every nlck locked pages instead of
2641 2658 * every time. Reserve nlck locks up front and deduct from this
2642 2659 * reservation for each page that requires a lock. When the reservation
2643 2660 * is consumed, reserve again. nlck is randomized, so the competing
2644 2661 * threads do not fall into a cyclic lock contention pattern. When
2645 2662 * memory is low, the lock ahead is disabled, and instead page_pp_lock()
2646 2663 * is used to lock pages.
2647 2664 */
2648 2665 for (i = 0; i < npages; anon_index++, pos++, i++) {
2649 2666 if (nlck == 0 && use_reserved == 1) {
2650 2667 nlck = NLCK + RAND_P2(NLCK);
2651 2668 /* if fewer loops left, decrease nlck */
2652 2669 nlck = MIN(nlck, npages - i);
2653 2670 /*
2654 2671 * Reserve nlck locks up front and deduct from this
2655 2672 * reservation for each page that requires a lock. When
2656 2673 * the reservation is consumed, reserve again.
2657 2674 */
2658 2675 mutex_enter(&freemem_lock);
2659 2676 if ((availrmem - nlck) < pages_pp_maximum) {
2660 2677 /* Do not do advance memory reserves */
2661 2678 use_reserved = 0;
2662 2679 } else {
2663 2680 availrmem -= nlck;
2664 2681 pages_locked += nlck;
2665 2682 }
2666 2683 mutex_exit(&freemem_lock);
2667 2684 }
2668 2685 if (!(shmd->shm_vpage[anon_index] & DISM_PG_LOCKED)) {
2669 2686 if (sptd->spt_ppa_lckcnt[anon_index] <
2670 2687 (ushort_t)DISM_LOCK_MAX) {
2671 2688 if (++sptd->spt_ppa_lckcnt[anon_index] ==
2672 2689 (ushort_t)DISM_LOCK_MAX) {
2673 2690 cmn_err(CE_WARN,
2674 2691 "DISM page lock limit "
2675 2692 "reached on DISM offset 0x%lx\n",
2676 2693 anon_index << PAGESHIFT);
2677 2694 }
2678 2695 kernel = (sptd->spt_ppa &&
2679 2696 sptd->spt_ppa[anon_index]);
2680 2697 if (!page_pp_lock(ppa[i], 0, kernel ||
2681 2698 use_reserved)) {
2682 2699 sptd->spt_ppa_lckcnt[anon_index]--;
2683 2700 rv = EAGAIN;
2684 2701 break;
2685 2702 }
2686 2703 /* if this is a newly locked page, count it */
2687 2704 if (ppa[i]->p_lckcnt == 1) {
2688 2705 if (kernel == 0 && use_reserved == 1)
2689 2706 nlck--;
2690 2707 *locked += PAGESIZE;
2691 2708 }
2692 2709 shmd->shm_lckpgs++;
2693 2710 shmd->shm_vpage[anon_index] |= DISM_PG_LOCKED;
2694 2711 if (lockmap != NULL)
2695 2712 BT_SET(lockmap, pos);
2696 2713 }
2697 2714 }
2698 2715 }
2699 2716 /* Return unused lock reservation */
2700 2717 if (nlck != 0 && use_reserved == 1) {
2701 2718 mutex_enter(&freemem_lock);
2702 2719 availrmem += nlck;
2703 2720 pages_locked -= nlck;
2704 2721 mutex_exit(&freemem_lock);
2705 2722 }
2706 2723
2707 2724 return (rv);
2708 2725 }
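
spt_lockpages() above touches freemem_lock only once per randomized batch of NLCK..2*NLCK-1 pages rather than once per locked page, and the randomization keeps competing threads from reserving in lockstep (spt_unlockpages() below plays the same trick in reverse). The toy program below models just that accounting in userland; NLCK_TOY, availrmem_toy and the page count are arbitrary stand-ins, and rand() stands in for the RAND_P2()/randtick() perturbation.

#include <stdio.h>
#include <stdlib.h>

#define	NLCK_TOY	8

static long availrmem_toy = 1000;	/* the contended global to keep cold */

int
main(void)
{
	long npages = 20, i, nlck = 0, global_updates = 0;

	for (i = 0; i < npages; i++) {
		if (nlck == 0) {
			nlck = NLCK_TOY + (rand() % NLCK_TOY);	/* 8..15 */
			if (nlck > npages - i)
				nlck = npages - i;	/* don't over-reserve */
			availrmem_toy -= nlck;		/* one global update */
			global_updates++;
		}
		/* "lock" page i, consuming one unit of the reservation */
		nlck--;
	}
	if (nlck != 0) {
		availrmem_toy += nlck;	/* return any unused reservation */
		global_updates++;
	}
	printf("availrmem=%ld (expected %ld), global updates=%ld\n",
	    availrmem_toy, 1000L - npages, global_updates);
	return (0);
}
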
2709 2726
2710 2727 int
2711 2728 spt_unlockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
2712 2729 rctl_qty_t *unlocked)
2713 2730 {
2714 2731 struct shm_data *shmd = seg->s_data;
2715 2732 struct spt_data *sptd = shmd->shm_sptseg->s_data;
2716 2733 struct anon_map *amp = sptd->spt_amp;
2717 2734 struct anon *ap;
2718 2735 struct vnode *vp;
2719 2736 u_offset_t off;
2720 2737 struct page *pp;
2721 2738 int kernel;
2722 2739 anon_sync_obj_t cookie;
2723 2740 ulong_t i;
2724 2741 pgcnt_t nlck = 0;
2725 2742 pgcnt_t nlck_limit = NLCK;
2726 2743
2727 2744 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2728 2745 for (i = 0; i < npages; i++, anon_index++) {
2729 2746 if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
2730 2747 anon_array_enter(amp, anon_index, &cookie);
2731 2748 ap = anon_get_ptr(amp->ahp, anon_index);
2732 2749 ASSERT(ap);
2733 2750
2734 2751 swap_xlate(ap, &vp, &off);
2735 2752 anon_array_exit(&cookie);
2736 2753 pp = page_lookup(vp, off, SE_SHARED);
2737 2754 ASSERT(pp);
2738 2755 /*
2739 2756 * availrmem is decremented only for pages which are not
2740 2757 			 * in seg pcache; for pages in seg pcache availrmem was
2741 2758 * decremented in _dismpagelock()
2742 2759 */
2743 2760 kernel = (sptd->spt_ppa && sptd->spt_ppa[anon_index]);
2744 2761 ASSERT(pp->p_lckcnt > 0);
2745 2762
2746 2763 /*
2747 2764 			 * unlock the page but do not change availrmem; we do that
2748 2765 * ourselves every nlck loops.
2749 2766 */
2750 2767 page_pp_unlock(pp, 0, 1);
2751 2768 if (pp->p_lckcnt == 0) {
2752 2769 if (kernel == 0)
2753 2770 nlck++;
2754 2771 *unlocked += PAGESIZE;
2755 2772 }
2756 2773 page_unlock(pp);
2757 2774 shmd->shm_vpage[anon_index] &= ~DISM_PG_LOCKED;
2758 2775 sptd->spt_ppa_lckcnt[anon_index]--;
2759 2776 shmd->shm_lckpgs--;
2760 2777 }
2761 2778
2762 2779 /*
2763 2780 * To reduce freemem_lock contention, do not update availrmem
2764 2781 * until at least NLCK pages have been unlocked.
2765 2782 * 1. No need to update if nlck is zero
2766 2783 * 2. Always update if the last iteration
2767 2784 */
2768 2785 if (nlck > 0 && (nlck == nlck_limit || i == npages - 1)) {
2769 2786 mutex_enter(&freemem_lock);
2770 2787 availrmem += nlck;
2771 2788 pages_locked -= nlck;
2772 2789 mutex_exit(&freemem_lock);
2773 2790 nlck = 0;
2774 2791 nlck_limit = NLCK + RAND_P2(NLCK);
2775 2792 }
2776 2793 }
2777 2794 	ANON_LOCK_EXIT(&amp->a_rwlock);
2778 2795
2779 2796 return (0);
2780 2797 }
2781 2798
2782 2799 /*ARGSUSED*/
2783 2800 static int
2784 2801 segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
2785 2802 int attr, int op, ulong_t *lockmap, size_t pos)
2786 2803 {
2787 2804 struct shm_data *shmd = seg->s_data;
2788 2805 struct seg *sptseg = shmd->shm_sptseg;
2789 2806 struct spt_data *sptd = sptseg->s_data;
2790 2807 struct kshmid *sp = sptd->spt_amp->a_sp;
2791 2808 pgcnt_t npages, a_npages;
2792 2809 page_t **ppa;
2793 2810 pgcnt_t an_idx, a_an_idx, ppa_idx;
2794 2811 caddr_t spt_addr, a_addr; /* spt and aligned address */
2795 2812 size_t a_len; /* aligned len */
2796 2813 size_t share_sz;
2797 2814 ulong_t i;
2798 2815 int sts = 0;
2799 2816 rctl_qty_t unlocked = 0;
2800 2817 rctl_qty_t locked = 0;
2801 2818 struct proc *p = curproc;
2802 2819 kproject_t *proj;
2803 2820
2804 2821 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2805 2822 ASSERT(sp != NULL);
2806 2823
2807 2824 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
2808 2825 return (0);
2809 2826 }
2810 2827
2811 2828 addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2812 2829 an_idx = seg_page(seg, addr);
2813 2830 npages = btopr(len);
2814 2831
2815 2832 if (an_idx + npages > btopr(shmd->shm_amp->size)) {
2816 2833 return (ENOMEM);
2817 2834 }
2818 2835
2819 2836 /*
2820 2837 * A shm's project never changes, so no lock needed.
2821 2838 * The shm has a hold on the project, so it will not go away.
2822 2839 * Since we have a mapping to shm within this zone, we know
2823 2840 * that the zone will not go away.
2824 2841 */
2825 2842 proj = sp->shm_perm.ipc_proj;
2826 2843
2827 2844 if (op == MC_LOCK) {
2828 2845
2829 2846 /*
2830 2847 * Need to align addr and size request if they are not
2831 2848 * aligned so we can always allocate large page(s) however
2832 2849 * we only lock what was requested in initial request.
2833 2850 */
2834 2851 share_sz = page_get_pagesize(sptseg->s_szc);
2835 2852 a_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_sz);
2836 2853 a_len = P2ROUNDUP((uintptr_t)(((addr + len) - a_addr)),
2837 2854 share_sz);
2838 2855 a_npages = btop(a_len);
2839 2856 a_an_idx = seg_page(seg, a_addr);
2840 2857 spt_addr = sptseg->s_base + ptob(a_an_idx);
2841 2858 ppa_idx = an_idx - a_an_idx;
2842 2859
2843 2860 if ((ppa = kmem_zalloc(((sizeof (page_t *)) * a_npages),
2844 2861 KM_NOSLEEP)) == NULL) {
2845 2862 return (ENOMEM);
2846 2863 }
2847 2864
2848 2865 /*
2849 2866 * Don't cache any new pages for IO and
2850 2867 * flush any cached pages.
2851 2868 */
2852 2869 mutex_enter(&sptd->spt_lock);
2853 2870 if (sptd->spt_ppa != NULL)
2854 2871 sptd->spt_flags |= DISM_PPA_CHANGED;
2855 2872
2856 2873 sts = spt_anon_getpages(sptseg, spt_addr, a_len, ppa);
2857 2874 if (sts != 0) {
2858 2875 mutex_exit(&sptd->spt_lock);
2859 2876 kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
2860 2877 return (sts);
2861 2878 }
2862 2879
2863 2880 mutex_enter(&sp->shm_mlock);
2864 2881 /* enforce locked memory rctl */
2865 2882 unlocked = spt_unlockedbytes(npages, &ppa[ppa_idx]);
2866 2883
2867 2884 mutex_enter(&p->p_lock);
2868 2885 if (rctl_incr_locked_mem(p, proj, unlocked, 0)) {
2869 2886 mutex_exit(&p->p_lock);
2870 2887 sts = EAGAIN;
2871 2888 } else {
2872 2889 mutex_exit(&p->p_lock);
2873 2890 sts = spt_lockpages(seg, an_idx, npages,
2874 2891 &ppa[ppa_idx], lockmap, pos, &locked);
2875 2892
2876 2893 /*
2877 2894 * correct locked count if not all pages could be
2878 2895 * locked
2879 2896 */
2880 2897 if ((unlocked - locked) > 0) {
2881 2898 rctl_decr_locked_mem(NULL, proj,
2882 2899 (unlocked - locked), 0);
2883 2900 }
2884 2901 }
2885 2902 /*
2886 2903 * unlock pages
2887 2904 */
2888 2905 for (i = 0; i < a_npages; i++)
2889 2906 page_unlock(ppa[i]);
2890 2907 if (sptd->spt_ppa != NULL)
2891 2908 sptd->spt_flags |= DISM_PPA_CHANGED;
2892 2909 mutex_exit(&sp->shm_mlock);
2893 2910 mutex_exit(&sptd->spt_lock);
2894 2911
2895 2912 kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
2896 2913
2897 2914 } else if (op == MC_UNLOCK) { /* unlock */
2898 2915 page_t **ppa;
2899 2916
2900 2917 mutex_enter(&sptd->spt_lock);
2901 2918 if (shmd->shm_lckpgs == 0) {
2902 2919 mutex_exit(&sptd->spt_lock);
2903 2920 return (0);
2904 2921 }
2905 2922 /*
2906 2923 * Don't cache new IO pages.
2907 2924 */
2908 2925 if (sptd->spt_ppa != NULL)
2909 2926 sptd->spt_flags |= DISM_PPA_CHANGED;
2910 2927
2911 2928 mutex_enter(&sp->shm_mlock);
2912 2929 sts = spt_unlockpages(seg, an_idx, npages, &unlocked);
2913 2930 if ((ppa = sptd->spt_ppa) != NULL)
2914 2931 sptd->spt_flags |= DISM_PPA_CHANGED;
2915 2932 mutex_exit(&sptd->spt_lock);
2916 2933
2917 2934 rctl_decr_locked_mem(NULL, proj, unlocked, 0);
2918 2935 mutex_exit(&sp->shm_mlock);
2919 2936
2920 2937 if (ppa != NULL)
2921 2938 seg_ppurge_wiredpp(ppa);
2922 2939 }
2923 2940 return (sts);
2924 2941 }
2925 2942
2926 2943 /*ARGSUSED*/
2927 2944 int
2928 2945 segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
2929 2946 {
2930 2947 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2931 2948 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2932 2949 spgcnt_t pgno = seg_page(seg, addr+len) - seg_page(seg, addr) + 1;
2933 2950
2934 2951 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2935 2952
2936 2953 /*
2937 2954 * ISM segment is always rw.
2938 2955 */
2939 2956 while (--pgno >= 0)
2940 2957 *protv++ = sptd->spt_prot;
2941 2958 return (0);
2942 2959 }
2943 2960
2944 2961 /*ARGSUSED*/
2945 2962 u_offset_t
2946 2963 segspt_shmgetoffset(struct seg *seg, caddr_t addr)
2947 2964 {
2948 2965 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2949 2966
2950 2967 /* Offset does not matter in ISM memory */
2951 2968
2952 2969 return ((u_offset_t)0);
2953 2970 }
2954 2971
2955 2972 /* ARGSUSED */
2956 2973 int
2957 2974 segspt_shmgettype(struct seg *seg, caddr_t addr)
2958 2975 {
2959 2976 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2960 2977 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2961 2978
2962 2979 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2963 2980
2964 2981 /*
2965 2982 	 * The shared memory mapping is always MAP_SHARED; swap is only
2966 2983 	 * reserved for DISM.
2967 2984 */
2968 2985 return (MAP_SHARED |
2969 2986 ((sptd->spt_flags & SHM_PAGEABLE) ? 0 : MAP_NORESERVE));
2970 2987 }
2971 2988
2972 2989 /*ARGSUSED*/
2973 2990 int
2974 2991 segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
2975 2992 {
2976 2993 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2977 2994 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2978 2995
2979 2996 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2980 2997
2981 2998 *vpp = sptd->spt_vp;
2982 2999 return (0);
2983 3000 }
2984 3001
2985 3002 /*
2986 3003 * We need to wait for pending IO to complete to a DISM segment in order for
2987 3004 * pages to get kicked out of the seg_pcache. 120 seconds should be more
2988 3005 * than enough time to wait.
2989 3006 */
2990 3007 static clock_t spt_pcache_wait = 120;
2991 3008
2992 3009 /*ARGSUSED*/
2993 3010 static int
2994 3011 segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
2995 3012 {
2996 3013 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2997 3014 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2998 3015 struct anon_map *amp;
2999 3016 pgcnt_t pg_idx;
3000 3017 ushort_t gen;
3001 3018 clock_t end_lbolt;
3002 3019 int writer;
3003 3020 page_t **ppa;
3004 3021
3005 3022 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
3006 3023
3007 3024 if (behav == MADV_FREE || behav == MADV_PURGE) {
3008 3025 if ((sptd->spt_flags & SHM_PAGEABLE) == 0)
3009 3026 return (0);
3010 3027
3011 3028 amp = sptd->spt_amp;
3012 3029 pg_idx = seg_page(seg, addr);
3013 3030
3014 3031 mutex_enter(&sptd->spt_lock);
3015 3032 if ((ppa = sptd->spt_ppa) == NULL) {
3016 3033 mutex_exit(&sptd->spt_lock);
3017 3034 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
3018 3035 (void) anon_disclaim(amp, pg_idx, len, behav, NULL);
3019 3036 			ANON_LOCK_EXIT(&amp->a_rwlock);
3020 3037 return (0);
3021 3038 }
3022 3039
3023 3040 sptd->spt_flags |= DISM_PPA_CHANGED;
3024 3041 gen = sptd->spt_gen;
3025 3042
3026 3043 mutex_exit(&sptd->spt_lock);
3027 3044
3028 3045 /*
3029 3046 * Purge all DISM cached pages
3030 3047 */
3031 3048 seg_ppurge_wiredpp(ppa);
3032 3049
3033 3050 /*
3034 3051 * Drop the AS_LOCK so that other threads can grab it
3035 3052 * in the as_pageunlock path and hopefully get the segment
3036 3053 * kicked out of the seg_pcache. We bump the shm_softlockcnt
3037 3054 * to keep this segment resident.
3038 3055 */
3039 3056 writer = AS_WRITE_HELD(seg->s_as);
3040 3057 atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
3041 3058 AS_LOCK_EXIT(seg->s_as);
3042 3059
3043 3060 mutex_enter(&sptd->spt_lock);
3044 3061
3045 3062 end_lbolt = ddi_get_lbolt() + (hz * spt_pcache_wait);
3046 3063
3047 3064 /*
3048 3065 * Try to wait for pages to get kicked out of the seg_pcache.
3049 3066 */
3050 3067 while (sptd->spt_gen == gen &&
3051 3068 (sptd->spt_flags & DISM_PPA_CHANGED) &&
3052 3069 ddi_get_lbolt() < end_lbolt) {
3053 3070 if (!cv_timedwait_sig(&sptd->spt_cv,
3054 3071 &sptd->spt_lock, end_lbolt)) {
3055 3072 break;
3056 3073 }
3057 3074 }
3058 3075
3059 3076 mutex_exit(&sptd->spt_lock);
3060 3077
3061 3078 /* Regrab the AS_LOCK and release our hold on the segment */
3062 3079 AS_LOCK_ENTER(seg->s_as, writer ? RW_WRITER : RW_READER);
3063 3080 atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
3064 3081 if (shmd->shm_softlockcnt <= 0) {
3065 3082 if (AS_ISUNMAPWAIT(seg->s_as)) {
3066 3083 mutex_enter(&seg->s_as->a_contents);
3067 3084 if (AS_ISUNMAPWAIT(seg->s_as)) {
3068 3085 AS_CLRUNMAPWAIT(seg->s_as);
3069 3086 cv_broadcast(&seg->s_as->a_cv);
3070 3087 }
3071 3088 mutex_exit(&seg->s_as->a_contents);
3072 3089 }
3073 3090 }
3074 3091
3075 3092 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
3076 3093 (void) anon_disclaim(amp, pg_idx, len, behav, NULL);
3077 3094 		ANON_LOCK_EXIT(&amp->a_rwlock);
3078 3095 } else if (lgrp_optimizations() && (behav == MADV_ACCESS_LWP ||
3079 3096 behav == MADV_ACCESS_MANY || behav == MADV_ACCESS_DEFAULT)) {
3080 3097 int already_set;
3081 3098 ulong_t anon_index;
3082 3099 lgrp_mem_policy_t policy;
3083 3100 caddr_t shm_addr;
3084 3101 size_t share_size;
3085 3102 size_t size;
3086 3103 struct seg *sptseg = shmd->shm_sptseg;
3087 3104 caddr_t sptseg_addr;
3088 3105
3089 3106 /*
3090 3107 * Align address and length to page size of underlying segment
3091 3108 */
3092 3109 share_size = page_get_pagesize(shmd->shm_sptseg->s_szc);
3093 3110 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_size);
3094 3111 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)),
3095 3112 share_size);
3096 3113
3097 3114 amp = shmd->shm_amp;
3098 3115 anon_index = seg_page(seg, shm_addr);
3099 3116
3100 3117 /*
3101 3118 * And now we may have to adjust size downward if we have
3102 3119 * exceeded the realsize of the segment or initial anon
3103 3120 * allocations.
3104 3121 */
3105 3122 sptseg_addr = sptseg->s_base + ptob(anon_index);
3106 3123 if ((sptseg_addr + size) >
3107 3124 (sptseg->s_base + sptd->spt_realsize))
3108 3125 size = (sptseg->s_base + sptd->spt_realsize) -
3109 3126 sptseg_addr;
3110 3127
3111 3128 /*
3112 3129 * Set memory allocation policy for this segment
3113 3130 */
3114 3131 policy = lgrp_madv_to_policy(behav, len, MAP_SHARED);
3115 3132 already_set = lgrp_shm_policy_set(policy, amp, anon_index,
3116 3133 NULL, 0, len);
3117 3134
3118 3135 /*
3119 3136 * If random memory allocation policy set already,
3120 3137 * don't bother reapplying it.
3121 3138 */
3122 3139 if (already_set && !LGRP_MEM_POLICY_REAPPLICABLE(policy))
3123 3140 return (0);
3124 3141
3125 3142 /*
3126 3143 * Mark any existing pages in the given range for
3127 3144 		 * migration, flush the I/O page cache, and use the
3128 3145 		 * underlying segment to calculate the anon index and to
3129 3146 		 * get the anonmap and vnode pointer.
3130 3147 */
3131 3148 if (shmd->shm_softlockcnt > 0)
3132 3149 segspt_purge(seg);
3133 3150
3134 3151 page_mark_migrate(seg, shm_addr, size, amp, 0, NULL, 0, 0);
3135 3152 }
3136 3153
3137 3154 return (0);
3138 3155 }
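
The MADV_FREE/MADV_PURGE path above waits, bounded by spt_pcache_wait, for either the generation count to move or DISM_PPA_CHANGED to clear once the wired pages leave seg_pcache. A rough pthread sketch of that wait pattern follows; the reclaimer thread, the 2-second deadline and the variable names are assumptions made for illustration, not the kernel implementation.

#include <pthread.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static unsigned gen;
static int ppa_changed = 1;

static void *
reclaimer(void *arg)
{
	(void) arg;
	pthread_mutex_lock(&lock);
	gen++;			/* as segspt_reclaim() bumps spt_gen */
	ppa_changed = 0;	/* as it clears DISM_PPA_CHANGED */
	pthread_cond_broadcast(&cv);
	pthread_mutex_unlock(&lock);
	return (NULL);
}

int
main(void)
{
	pthread_t tid;
	struct timespec deadline;
	unsigned start_gen;

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += 2;	/* analogue of hz * spt_pcache_wait */

	pthread_mutex_lock(&lock);
	start_gen = gen;
	(void) pthread_create(&tid, NULL, reclaimer, NULL);
	while (gen == start_gen && ppa_changed) {
		if (pthread_cond_timedwait(&cv, &lock, &deadline) != 0)
			break;	/* deadline passed; stop waiting */
	}
	pthread_mutex_unlock(&lock);
	(void) pthread_join(tid, NULL);
	printf("gen=%u changed=%d\n", gen, ppa_changed);
	return (0);
}
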
3139 3156
3140 3157 /*ARGSUSED*/
3141 3158 void
3142 3159 segspt_shmdump(struct seg *seg)
3143 3160 {
3144 3161 /* no-op for ISM segment */
3145 3162 }
3146 3163
3147 3164 /*ARGSUSED*/
3148 3165 static int
3149 3166 segspt_shmsetpgsz(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
3150 3167 {
3151 3168 return (ENOTSUP);
3152 3169 }
3153 3170
3154 3171 /*
3155 3172 * get a memory ID for an addr in a given segment
3156 3173 */
3157 3174 static int
3158 3175 segspt_shmgetmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
3159 3176 {
3160 3177 struct shm_data *shmd = (struct shm_data *)seg->s_data;
3161 3178 struct anon *ap;
3162 3179 size_t anon_index;
3163 3180 struct anon_map *amp = shmd->shm_amp;
3164 3181 struct spt_data *sptd = shmd->shm_sptseg->s_data;
3165 3182 struct seg *sptseg = shmd->shm_sptseg;
3166 3183 anon_sync_obj_t cookie;
3167 3184
3168 3185 anon_index = seg_page(seg, addr);
3169 3186
3170 3187 if (addr > (seg->s_base + sptd->spt_realsize)) {
3171 3188 return (EFAULT);
3172 3189 }
3173 3190
3174 3191 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
3175 3192 anon_array_enter(amp, anon_index, &cookie);
3176 3193 ap = anon_get_ptr(amp->ahp, anon_index);
3177 3194 if (ap == NULL) {
3178 3195 struct page *pp;
3179 3196 caddr_t spt_addr = sptseg->s_base + ptob(anon_index);
3180 3197
3181 3198 pp = anon_zero(sptseg, spt_addr, &ap, kcred);
3182 3199 if (pp == NULL) {
3183 3200 anon_array_exit(&cookie);
3184 3201 			ANON_LOCK_EXIT(&amp->a_rwlock);
3185 3202 return (ENOMEM);
3186 3203 }
3187 3204 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP);
3188 3205 page_unlock(pp);
3189 3206 }
3190 3207 anon_array_exit(&cookie);
3191 3208 	ANON_LOCK_EXIT(&amp->a_rwlock);
3192 3209 memidp->val[0] = (uintptr_t)ap;
3193 3210 memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
3194 3211 return (0);
3195 3212 }
3196 3213
3197 3214 /*
3198 3215 * Get memory allocation policy info for specified address in given segment
3199 3216 */
3200 3217 static lgrp_mem_policy_info_t *
3201 3218 segspt_shmgetpolicy(struct seg *seg, caddr_t addr)
3202 3219 {
3203 3220 struct anon_map *amp;
3204 3221 ulong_t anon_index;
3205 3222 lgrp_mem_policy_info_t *policy_info;
3206 3223 struct shm_data *shm_data;
3207 3224
3208 3225 ASSERT(seg != NULL);
3209 3226
3210 3227 /*
3211 3228 * Get anon_map from segshm
3212 3229 *
3213 3230 * Assume that no lock needs to be held on anon_map, since
3214 3231 * it should be protected by its reference count which must be
3215 3232 * nonzero for an existing segment
3216 3233 * Need to grab readers lock on policy tree though
3217 3234 */
3218 3235 shm_data = (struct shm_data *)seg->s_data;
3219 3236 if (shm_data == NULL)
3220 3237 return (NULL);
3221 3238 amp = shm_data->shm_amp;
3222 3239 ASSERT(amp->refcnt != 0);
3223 3240
3224 3241 /*
3225 3242 * Get policy info
3226 3243 *
3227 3244 * Assume starting anon index of 0
3228 3245 */
3229 3246 anon_index = seg_page(seg, addr);
3230 3247 policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
3231 3248
3232 3249 return (policy_info);
3233 3250 }
3234 3251
3235 3252 /*ARGSUSED*/
3236 3253 static int
3237 3254 segspt_shmcapable(struct seg *seg, segcapability_t capability)
3238 3255 {
3239 3256 return (0);
3240 3257 }
↓ open down ↓ |
2895 lines elided |
↑ open up ↑ |