OS-7753 THREAD_KPRI_RELEASE does nothing of the sort
Reviewed by: Bryan Cantrill <bryan@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
--- old/usr/src/uts/common/vm/page_lock.c
+++ new/usr/src/uts/common/vm/page_lock.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23 + * Copyright 2019 Joyent, Inc.
23 24 */
24 25
25 26
26 27 /*
27 28 * VM - page locking primitives
28 29 */
29 30 #include <sys/param.h>
30 31 #include <sys/t_lock.h>
31 32 #include <sys/vtrace.h>
32 33 #include <sys/debug.h>
33 34 #include <sys/cmn_err.h>
34 35 #include <sys/bitmap.h>
35 36 #include <sys/lockstat.h>
36 37 #include <sys/sysmacros.h>
37 38 #include <sys/condvar_impl.h>
38 39 #include <vm/page.h>
39 40 #include <vm/seg_enum.h>
40 41 #include <vm/vm_dep.h>
41 42 #include <vm/seg_kmem.h>
42 43
43 44 /*
44 45 * This global mutex array is for logical page locking.
45 46 * The following fields in the page structure are protected
46 47 * by this lock:
47 48 *
48 49 * p_lckcnt
49 50 * p_cowcnt
50 51 */
51 52 pad_mutex_t page_llocks[8 * NCPU_P2];
52 53
53 54 /*
54 55 * This is a global lock for the logical page free list. The
55 56 * logical free list, in this implementation, is maintained as two
56 57 * separate physical lists - the cache list and the free list.
57 58 */
58 59 kmutex_t page_freelock;
59 60
60 61 /*
61 62 * The hash table, page_hash[], the p_selock fields, and the
62 63 * list of pages associated with vnodes are protected by arrays of mutexes.
63 64 *
64 65 * Unless the hashes are changed radically, the table sizes must be
65 66 * a power of two. Also, we typically need more mutexes for the
66 67 * vnodes since these locks are occasionally held for long periods.
67 68 * And since there seem to be two special vnodes (kvp and swapvp),
68 69 * we make room for private mutexes for them.
69 70 *
70 71 * The pse_mutex[] array holds the mutexes to protect the p_selock
71 72 * fields of all page_t structures.
72 73 *
73 74 * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
74 75 * when given a pointer to a page_t.
75 76 *
76 77 * PIO_TABLE_SIZE must be a power of two. One could argue that we
77 78 * should go to the trouble of setting it up at run time and base it
78 79 * on memory size rather than the number of compile time CPUs.
79 80 *
80 81 * XX64 We should be using physmem size to calculate PIO_SHIFT.
81 82 *
  82   83    * These might break in a 64-bit world.
83 84 */
84 85 #define PIO_SHIFT 7 /* log2(sizeof(page_t)) */
85 86 #define PIO_TABLE_SIZE 128 /* number of io mutexes to have */
86 87
87 88 pad_mutex_t ph_mutex[PH_TABLE_SIZE];
88 89 kmutex_t pio_mutex[PIO_TABLE_SIZE];
89 90
90 91 #define PAGE_IO_MUTEX(pp) \
91 92 &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]
92 93
93 94 /*
94 95 * The pse_mutex[] array is allocated in the platform startup code
95 96 * based on the size of the machine at startup.
96 97 */
97 98 extern pad_mutex_t *pse_mutex; /* Locks protecting pp->p_selock */
98 99 extern size_t pse_table_size; /* Number of mutexes in pse_mutex[] */
99 100 extern int pse_shift; /* log2(pse_table_size) */
100 101 #define PAGE_SE_MUTEX(pp) &pse_mutex[ \
101 102 ((((uintptr_t)(pp) >> pse_shift) ^ ((uintptr_t)(pp))) >> 7) & \
102 103 (pse_table_size - 1)].pad_mutex
103 104
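[Review note] The PAGE_SE_MUTEX() hash can be reproduced in isolation. A minimal user-level sketch of the same arithmetic, assuming pse_shift is 13 (an 8192-entry table, as size_pse_array() below would compute for a mid-sized machine) and a hypothetical page_t address:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
            int pse_shift = 13;                     /* assumed; really set at startup */
            size_t pse_table_size = (size_t)1 << pse_shift;
            uintptr_t pp = 0xfffffe0123456780UL;    /* hypothetical page_t address */

            /* Same computation as PAGE_SE_MUTEX(), minus the array lookup. */
            size_t idx = (((pp >> pse_shift) ^ pp) >> 7) & (pse_table_size - 1);
            printf("pse_mutex index: %zu\n", idx);
            return (0);
    }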
104 105 #define PSZC_MTX_TABLE_SIZE 128
105 106 #define PSZC_MTX_TABLE_SHIFT 7
106 107
107 108 static pad_mutex_t pszc_mutex[PSZC_MTX_TABLE_SIZE];
108 109
109 110 #define PAGE_SZC_MUTEX(_pp) \
110 111 &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
111 112 ((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
112 113 ((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
113 114 (PSZC_MTX_TABLE_SIZE - 1))].pad_mutex
114 115
115 116 /*
116 117 * The vph_mutex[] array holds the mutexes to protect the vnode chains,
117 118 * (i.e., the list of pages anchored by v_pages and connected via p_vpprev
118 119 * and p_vpnext).
119 120 *
120 121 * The page_vnode_mutex(vp) function returns the address of the appropriate
121 122 * mutex from this array given a pointer to a vnode. It is complicated
122 123 * by the fact that the kernel's vnode and the swapfs vnode are referenced
 123  124    * frequently enough to warrant their own mutexes.
124 125 *
125 126 * The VP_HASH_FUNC returns the index into the vph_mutex array given
126 127 * an address of a vnode.
127 128 */
128 129
129 130 #if defined(_LP64)
130 131 #define VPH_TABLE_SIZE (8 * NCPU_P2)
131 132 #else /* 32 bits */
132 133 #define VPH_TABLE_SIZE (2 * NCPU_P2)
133 134 #endif
134 135
135 136 #define VP_HASH_FUNC(vp) \
136 137 ((((uintptr_t)(vp) >> 6) + \
137 138 ((uintptr_t)(vp) >> 8) + \
138 139 ((uintptr_t)(vp) >> 10) + \
139 140 ((uintptr_t)(vp) >> 12)) \
140 141 & (VPH_TABLE_SIZE - 1))
141 142
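[Review note] VP_HASH_FUNC() folds four shifted copies of the vnode address into a table index. A user-level sketch of the computation, assuming NCPU_P2 is 64 (so a 512-entry table on _LP64) and a hypothetical vnode address:

    #include <stdio.h>
    #include <stdint.h>

    #define VPH_TABLE_SIZE  (8 * 64)        /* assumes NCPU_P2 == 64, _LP64 */

    int
    main(void)
    {
            uintptr_t vp = 0xfffffe00225d3c40UL;    /* hypothetical vnode_t address */
            size_t idx = ((vp >> 6) + (vp >> 8) + (vp >> 10) + (vp >> 12)) &
                (VPH_TABLE_SIZE - 1);
            printf("vph_mutex index: %zu\n", idx);
            return (0);
    }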
142 143 /*
143 144 * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes.
144 145 * The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp is
145 146 * VPH_TABLE_SIZE + 1.
146 147 */
147 148
148 149 kmutex_t vph_mutex[VPH_TABLE_SIZE + 2];
149 150
150 151 /*
151 152 * Initialize the locks used by the Virtual Memory Management system.
152 153 */
153 154 void
154 155 page_lock_init()
155 156 {
156 157 }
157 158
158 159 /*
159 160 * Return a value for pse_shift based on npg (the number of physical pages)
160 161 * and ncpu (the maximum number of CPUs). This is called by platform startup
161 162 * code.
162 163 *
163 164 * Lockstat data from TPC-H runs showed that contention on the pse_mutex[]
164 165 * locks grew approximately as the square of the number of threads executing.
165 166 * So the primary scaling factor used is NCPU^2. The size of the machine in
166 167 * megabytes is used as an upper bound, particularly for sun4v machines which
167 168 * all claim to have 256 CPUs maximum, and the old value of PSE_TABLE_SIZE
168 169 * (128) is used as a minimum. Since the size of the table has to be a power
169 170 * of two, the calculated size is rounded up to the next power of two.
170 171 */
171 172 /*ARGSUSED*/
172 173 int
173 174 size_pse_array(pgcnt_t npg, int ncpu)
174 175 {
175 176 size_t size;
176 177 pgcnt_t pp_per_mb = (1024 * 1024) / PAGESIZE;
177 178
178 179 size = MAX(128, MIN(npg / pp_per_mb, 2 * ncpu * ncpu));
179 180 size += (1 << (highbit(size) - 1)) - 1;
180 181 return (highbit(size) - 1);
181 182 }
182 183
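[Review note] Working the sizing policy through for a hypothetical machine: with 4K pages, 16 GB of memory (npg = 4194304) and ncpu = 64, npg / pp_per_mb is 16384 and 2 * ncpu * ncpu is 8192, so size starts at 8192; the round-up then yields pse_shift = 13, i.e. an 8192-entry pse_mutex[] table. A user-level sketch that reproduces the arithmetic (highbit() here mimics the kernel's 1-indexed highest-set-bit routine):

    #include <stdio.h>

    #define MAX(a, b)       ((a) > (b) ? (a) : (b))
    #define MIN(a, b)       ((a) < (b) ? (a) : (b))

    /* 1-indexed position of the highest set bit, like the kernel's highbit() */
    static int
    highbit(unsigned long v)
    {
            int b = 0;
            while (v != 0) {
                    v >>= 1;
                    b++;
            }
            return (b);
    }

    int
    main(void)
    {
            unsigned long npg = 4194304;    /* 16 GB of 4K pages (assumed) */
            unsigned long ncpu = 64;
            unsigned long pp_per_mb = (1024 * 1024) / 4096;
            unsigned long size;

            size = MAX(128, MIN(npg / pp_per_mb, 2 * ncpu * ncpu));
            size += (1UL << (highbit(size) - 1)) - 1;
            printf("pse_shift = %d (%lu mutexes)\n", highbit(size) - 1,
                1UL << (highbit(size) - 1));
            return (0);
    }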
183 184 /*
184 185 * At present we only use page ownership to aid debugging, so it's
185 186 * OK if the owner field isn't exact. In the 32-bit world two thread ids
186 187 * can map to the same owner because we just 'or' in 0x80000000 and
187 188 * then clear the second highest bit, so that (for example) 0x2faced00
188 189 * and 0xafaced00 both map to 0xafaced00.
189 190 * In the 64-bit world, p_selock may not be large enough to hold a full
190 191 * thread pointer. If we ever need precise ownership (e.g. if we implement
191 192 * priority inheritance for page locks) then p_selock should become a
192 193 * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
193 194 */
194 195 #define SE_WRITER (((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
195 196 #define SE_READER 1
196 197
197 198 /*
198 199 * A page that is deleted must be marked as such using the
199 200 * page_lock_delete() function. The page must be exclusively locked.
200 201 * The SE_DELETED marker is put in p_selock when this function is called.
201 202 * SE_DELETED must be distinct from any SE_WRITER value.
202 203 */
203 204 #define SE_DELETED (1 | INT_MIN)
204 205
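[Review note] The 32-bit aliasing described above is easy to demonstrate. A user-level sketch, assuming SE_EWANTED is the second-highest bit (0x40000000, per the "clear the second highest bit" comment); both example thread ids collapse to the same owner value:

    #include <stdio.h>

    #define SE_EWANTED      0x40000000u     /* assumed: the second-highest bit */

    int
    main(void)
    {
            unsigned int t1 = 0x2faced00u, t2 = 0xafaced00u;

            /* Mirror SE_WRITER: or in the sign bit, clear SE_EWANTED. */
            unsigned int w1 = (t1 | 0x80000000u) & ~SE_EWANTED;
            unsigned int w2 = (t2 | 0x80000000u) & ~SE_EWANTED;
            printf("0x%x -> 0x%x\n0x%x -> 0x%x\n", t1, w1, t2, w2);
            return (0);
    }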
205 206 #ifdef VM_STATS
206 207 uint_t vph_kvp_count;
207 208 uint_t vph_swapfsvp_count;
208 209 uint_t vph_other;
209 210 #endif /* VM_STATS */
210 211
211 212 #ifdef VM_STATS
212 213 uint_t page_lock_count;
213 214 uint_t page_lock_miss;
214 215 uint_t page_lock_miss_lock;
215 216 uint_t page_lock_reclaim;
216 217 uint_t page_lock_bad_reclaim;
217 218 uint_t page_lock_same_page;
218 219 uint_t page_lock_upgrade;
219 220 uint_t page_lock_retired;
220 221 uint_t page_lock_upgrade_failed;
221 222 uint_t page_lock_deleted;
222 223
223 224 uint_t page_trylock_locked;
224 225 uint_t page_trylock_failed;
225 226 uint_t page_trylock_missed;
226 227
227 228 uint_t page_try_reclaim_upgrade;
228 229 #endif /* VM_STATS */
229 230
230 231 /*
231 232 * Acquire the "shared/exclusive" lock on a page.
232 233 *
233 234 * Returns 1 on success and locks the page appropriately.
234 235 * 0 on failure and does not lock the page.
235 236 *
236 237 * If `lock' is non-NULL, it will be dropped and reacquired in the
237 238 * failure case. This routine can block, and if it does
238 239 * it will always return a failure since the page identity [vp, off]
239 240 * or state may have changed.
240 241 */
241 242
242 243 int
243 244 page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
244 245 {
245 246 return (page_lock_es(pp, se, lock, reclaim, 0));
246 247 }
247 248
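[Review note] A sketch of the retry pattern the comment above implies. Everything named my_* is a hypothetical stand-in for whatever lookup structure the caller protects with the `lock' argument; the point is that a zero return after blocking forces a fresh lookup:

    kmutex_t *list_lock;            /* hypothetical; protects the lookup */
    page_t *pp;

    again:
    mutex_enter(list_lock);
    pp = my_find_page();            /* hypothetical lookup under list_lock */
    if (pp != NULL && !page_lock(pp, SE_SHARED, list_lock, P_RECLAIM)) {
            /*
             * page_lock() blocked, dropping and reacquiring list_lock;
             * the page identity may have changed, so look it up again.
             */
            mutex_exit(list_lock);
            goto again;
    }
    mutex_exit(list_lock);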
248 249 /*
249 250 * With the addition of reader-writer lock semantics to page_lock_es,
250 251 * callers wanting an exclusive (writer) lock may prevent shared-lock
251 252 * (reader) starvation by setting the es parameter to SE_EXCL_WANTED.
252 253 * In this case, when an exclusive lock cannot be acquired, p_selock's
253 254 * SE_EWANTED bit is set. Shared-lock (reader) requests are also denied
254 255 * if the page is slated for retirement.
255 256 *
256 257 * The se and es parameters determine if the lock should be granted
257 258 * based on the following decision table:
258 259 *
259 260 * Lock wanted es flags p_selock/SE_EWANTED Action
260 261 * ----------- -------------- ------------------- ---------
261 262 * SE_EXCL any [1][2] unlocked/any grant lock, clear SE_EWANTED
262 263 * SE_EXCL SE_EWANTED any lock/any deny, set SE_EWANTED
263 264 * SE_EXCL none any lock/any deny
264 265 * SE_SHARED n/a [2] shared/0 grant
265 266 * SE_SHARED n/a [2] unlocked/0 grant
266 267 * SE_SHARED n/a shared/1 deny
267 268 * SE_SHARED n/a unlocked/1 deny
268 269 * SE_SHARED n/a excl/any deny
269 270 *
270 271 * Notes:
271 272 * [1] The code grants an exclusive lock to the caller and clears the bit
272 273 * SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
273 274 * bit's value. This was deemed acceptable as we are not concerned about
274 275 * exclusive-lock starvation. If this ever becomes an issue, a priority or
275 276 * fifo mechanism should also be implemented. Meantime, the thread that
 276  277    * set SE_EWANTED should be prepared to catch this condition and reset it.
277 278 *
278 279 * [2] Retired pages may not be locked at any time, regardless of the
 279  280    * disposition of se, unless the es parameter has the SE_RETIRED flag set.
280 281 *
281 282 * Notes on values of "es":
282 283 *
283 284 * es & 1: page_lookup_create will attempt page relocation
284 285 * es & SE_EXCL_WANTED: caller wants SE_EWANTED set (eg. delete
285 286 * memory thread); this prevents reader-starvation of waiting
286 287 * writer thread(s) by giving priority to writers over readers.
287 288 * es & SE_RETIRED: caller wants to lock pages even if they are
288 289 * retired. Default is to deny the lock if the page is retired.
289 290 *
290 291 * And yes, we know, the semantics of this function are too complicated.
291 292 * It's on the list to be cleaned up.
292 293 */
293 294 int
294 295 page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
295 296 {
296 297 int retval;
297 298 kmutex_t *pse = PAGE_SE_MUTEX(pp);
298 299 int upgraded;
299 300 int reclaim_it;
300 301
301 302 ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);
302 303
303 304 VM_STAT_ADD(page_lock_count);
304 305
305 306 upgraded = 0;
306 307 reclaim_it = 0;
307 308
308 309 mutex_enter(pse);
309 310
310 311 ASSERT(((es & SE_EXCL_WANTED) == 0) ||
311 312 ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));
312 313
313 314 if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
314 315 mutex_exit(pse);
315 316 VM_STAT_ADD(page_lock_retired);
316 317 return (0);
317 318 }
318 319
319 320 if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
320 321 se = SE_EXCL;
321 322 }
322 323
323 324 if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {
324 325
325 326 reclaim_it = 1;
326 327 if (se == SE_SHARED) {
327 328 /*
328 329 * This is an interesting situation.
329 330 *
330 331 * Remember that p_free can only change if
331 332 * p_selock < 0.
332 333 * p_free does not depend on our holding `pse'.
333 334 * And, since we hold `pse', p_selock can not change.
334 335 * So, if p_free changes on us, the page is already
335 336 * exclusively held, and we would fail to get p_selock
336 337 * regardless.
337 338 *
338 339 * We want to avoid getting the share
339 340 * lock on a free page that needs to be reclaimed.
340 341 * It is possible that some other thread has the share
341 342 * lock and has left the free page on the cache list.
342 343 * pvn_vplist_dirty() does this for brief periods.
343 344 * If the se_share is currently SE_EXCL, we will fail
344 345 * to acquire p_selock anyway. Blocking is the
345 346 * right thing to do.
 346  347                            * If we need to reclaim this page, we must get
 347  348                            * exclusive access to it, so force the upgrade now.
 348  349                            * Again, we will fail to acquire p_selock if the
 349  350                            * page is not free, and will block.
350 351 */
351 352 upgraded = 1;
352 353 se = SE_EXCL;
353 354 VM_STAT_ADD(page_lock_upgrade);
354 355 }
355 356 }
356 357
357 358 if (se == SE_EXCL) {
358 359 if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
359 360 /*
360 361 * if the caller wants a writer lock (but did not
361 362 * specify exclusive access), and there is a pending
362 363 * writer that wants exclusive access, return failure
363 364 */
364 365 retval = 0;
365 366 } else if ((pp->p_selock & ~SE_EWANTED) == 0) {
366 367 /* no reader/writer lock held */
367 - THREAD_KPRI_REQUEST();
368 368 /* this clears our setting of the SE_EWANTED bit */
369 369 pp->p_selock = SE_WRITER;
370 370 retval = 1;
371 371 } else {
372 372 /* page is locked */
373 373 if (es & SE_EXCL_WANTED) {
374 374 /* set the SE_EWANTED bit */
375 375 pp->p_selock |= SE_EWANTED;
376 376 }
377 377 retval = 0;
378 378 }
379 379 } else {
380 380 retval = 0;
381 381 if (pp->p_selock >= 0) {
382 382 if ((pp->p_selock & SE_EWANTED) == 0) {
383 383 pp->p_selock += SE_READER;
384 384 retval = 1;
385 385 }
386 386 }
387 387 }
388 388
389 389 if (retval == 0) {
390 390 if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
391 391 VM_STAT_ADD(page_lock_deleted);
392 392 mutex_exit(pse);
393 393 return (retval);
394 394 }
395 395
396 396 #ifdef VM_STATS
397 397 VM_STAT_ADD(page_lock_miss);
398 398 if (upgraded) {
399 399 VM_STAT_ADD(page_lock_upgrade_failed);
400 400 }
401 401 #endif
402 402 if (lock) {
403 403 VM_STAT_ADD(page_lock_miss_lock);
404 404 mutex_exit(lock);
405 405 }
406 406
407 407 /*
408 408 * Now, wait for the page to be unlocked and
409 409 * release the lock protecting p_cv and p_selock.
410 410 */
411 411 cv_wait(&pp->p_cv, pse);
412 412 mutex_exit(pse);
413 413
414 414 /*
415 415 * The page identity may have changed while we were
416 416 * blocked. If we are willing to depend on "pp"
417 417 * still pointing to a valid page structure (i.e.,
418 418 * assuming page structures are not dynamically allocated
419 419 * or freed), we could try to lock the page if its
420 420 * identity hasn't changed.
421 421 *
422 422 * This needs to be measured, since we come back from
423 423 * cv_wait holding pse (the expensive part of this
424 424 * operation) we might as well try the cheap part.
425 425 * Though we would also have to confirm that dropping
426 426 * `lock' did not cause any grief to the callers.
427 427 */
428 428 if (lock) {
429 429 mutex_enter(lock);
430 430 }
431 431 } else {
432 432 /*
433 433 * We have the page lock.
434 434 * If we needed to reclaim the page, and the page
 435  435                    * needed reclaiming (i.e., it was free), then we
436 436 * have the page exclusively locked. We may need
437 437 * to downgrade the page.
438 438 */
439 439 ASSERT((upgraded) ?
440 440 ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
441 441 mutex_exit(pse);
442 442
443 443 /*
444 444 * We now hold this page's lock, either shared or
445 445 * exclusive. This will prevent its identity from changing.
446 446 * The page, however, may or may not be free. If the caller
447 447 * requested, and it is free, go reclaim it from the
448 448 * free list. If the page can't be reclaimed, return failure
449 449 * so that the caller can start all over again.
450 450 *
451 451 * NOTE:page_reclaim() releases the page lock (p_selock)
452 452 * if it can't be reclaimed.
453 453 */
454 454 if (reclaim_it) {
455 455 if (!page_reclaim(pp, lock)) {
456 456 VM_STAT_ADD(page_lock_bad_reclaim);
457 457 retval = 0;
458 458 } else {
459 459 VM_STAT_ADD(page_lock_reclaim);
460 460 if (upgraded) {
461 461 page_downgrade(pp);
462 462 }
463 463 }
464 464 }
465 465 }
466 466 return (retval);
467 467 }
468 468
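[Review note] A sketch of how the es flags map onto the decision table above: a hypothetical caller that wants a writer lock, refuses to be starved by readers, and is willing to lock retired pages:

    if (!page_lock_es(pp, SE_EXCL, NULL, P_NO_RECLAIM,
        SE_EXCL_WANTED | SE_RETIRED)) {
            /*
             * Denied, but SE_EWANTED is now set, so new readers are
             * turned away.  Retry later, or call
             * page_lock_clr_exclwanted() when giving up so that
             * readers are not shut out forever.
             */
    }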
469 469 /*
470 470 * Clear the SE_EWANTED bit from p_selock. This function allows
471 471 * callers of page_lock_es and page_try_reclaim_lock to clear
472 472 * their setting of this bit if they decide they no longer wish
473 473 * to gain exclusive access to the page. Currently only
474 474 * delete_memory_thread uses this when the delete memory
475 475 * operation is cancelled.
476 476 */
477 477 void
478 478 page_lock_clr_exclwanted(page_t *pp)
479 479 {
480 480 kmutex_t *pse = PAGE_SE_MUTEX(pp);
481 481
482 482 mutex_enter(pse);
483 483 pp->p_selock &= ~SE_EWANTED;
484 484 if (CV_HAS_WAITERS(&pp->p_cv))
485 485 cv_broadcast(&pp->p_cv);
486 486 mutex_exit(pse);
487 487 }
488 488
489 489 /*
490 490 * Read the comments inside of page_lock_es() carefully.
491 491 *
492 492 * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
493 493 * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
494 494 * This is used by threads subject to reader-starvation (eg. memory delete).
495 495 *
496 496 * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
497 497 * it is expected that it will retry at a later time. Threads that will
498 498 * not retry the lock *must* call page_lock_clr_exclwanted to clear the
499 499 * SE_EWANTED bit. (When a thread using SE_EXCL_WANTED obtains the lock,
500 500 * the bit is cleared.)
501 501 */
502 502 int
503 503 page_try_reclaim_lock(page_t *pp, se_t se, int es)
504 504 {
505 505 kmutex_t *pse = PAGE_SE_MUTEX(pp);
506 506 selock_t old;
507 507
508 508 mutex_enter(pse);
509 509
510 510 old = pp->p_selock;
511 511
512 512 ASSERT(((es & SE_EXCL_WANTED) == 0) ||
513 513 ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));
514 514
515 515 if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
516 516 mutex_exit(pse);
517 517 VM_STAT_ADD(page_trylock_failed);
518 518 return (0);
519 519 }
520 520
521 521 if (se == SE_SHARED && es == 1 && old == 0) {
522 522 se = SE_EXCL;
523 523 }
524 524
525 525 if (se == SE_SHARED) {
526 526 if (!PP_ISFREE(pp)) {
527 527 if (old >= 0) {
528 528 /*
529 529 * Readers are not allowed when excl wanted
530 530 */
531 531 if ((old & SE_EWANTED) == 0) {
532 532 pp->p_selock = old + SE_READER;
533 533 mutex_exit(pse);
534 534 return (1);
535 535 }
536 536 }
537 537 mutex_exit(pse);
538 538 return (0);
539 539 }
540 540 /*
541 541 * The page is free, so we really want SE_EXCL (below)
542 542 */
543 543 VM_STAT_ADD(page_try_reclaim_upgrade);
544 544 }
545 545
546 546 /*
547 547 * The caller wants a writer lock. We try for it only if
548 548 * SE_EWANTED is not set, or if the caller specified
549 549 * SE_EXCL_WANTED.
550 550 */
551 551 if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
552 552 if ((old & ~SE_EWANTED) == 0) {
553 553 /* no reader/writer lock held */
554 - THREAD_KPRI_REQUEST();
555 554 /* this clears out our setting of the SE_EWANTED bit */
556 555 pp->p_selock = SE_WRITER;
557 556 mutex_exit(pse);
558 557 return (1);
559 558 }
560 559 }
561 560 if (es & SE_EXCL_WANTED) {
562 561 /* page is locked, set the SE_EWANTED bit */
563 562 pp->p_selock |= SE_EWANTED;
564 563 }
565 564 mutex_exit(pse);
566 565 return (0);
567 566 }
568 567
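[Review note] A sketch of the retry discipline described above, loosely modeled on a memory-delete style caller; the cancellation flag is hypothetical:

    for (;;) {
            if (page_try_reclaim_lock(pp, SE_EXCL,
                SE_EXCL_WANTED | SE_RETIRED))
                    break;          /* lock held; SE_EWANTED was cleared */
            if (cancelled) {        /* hypothetical abort condition */
                    page_lock_clr_exclwanted(pp);
                    return;
            }
            delay(hz);              /* back off before retrying */
    }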
569 568 /*
570 569 * Acquire a page's "shared/exclusive" lock, but never block.
571 570 * Returns 1 on success, 0 on failure.
572 571 */
573 572 int
574 573 page_trylock(page_t *pp, se_t se)
575 574 {
576 575 kmutex_t *pse = PAGE_SE_MUTEX(pp);
577 576
578 577 mutex_enter(pse);
579 578 if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) ||
580 579 (se == SE_SHARED && PP_PR_NOSHARE(pp))) {
581 580 /*
 582  581                    * Fail if a thread wants exclusive access, if the page
 583  582                    * is retired, or if the page is slated for retirement
 584  583                    * and a share lock is requested.
585 584 */
586 585 mutex_exit(pse);
587 586 VM_STAT_ADD(page_trylock_failed);
588 587 return (0);
589 588 }
590 589
591 590 if (se == SE_EXCL) {
592 591 if (pp->p_selock == 0) {
593 - THREAD_KPRI_REQUEST();
594 592 pp->p_selock = SE_WRITER;
595 593 mutex_exit(pse);
596 594 return (1);
597 595 }
598 596 } else {
599 597 if (pp->p_selock >= 0) {
600 598 pp->p_selock += SE_READER;
601 599 mutex_exit(pse);
602 600 return (1);
603 601 }
604 602 }
605 603 mutex_exit(pse);
606 604 return (0);
607 605 }
608 606
609 607 /*
610 608 * Variant of page_unlock() specifically for the page freelist
611 609 * code. The mere existence of this code is a vile hack that
 612  610    * resulted from the backwards locking order of the page
613 611 * freelist manager; please don't call it.
614 612 */
615 613 void
616 614 page_unlock_nocapture(page_t *pp)
617 615 {
618 616 kmutex_t *pse = PAGE_SE_MUTEX(pp);
619 617 selock_t old;
620 618
621 619 mutex_enter(pse);
622 620
623 621 old = pp->p_selock;
624 622 if ((old & ~SE_EWANTED) == SE_READER) {
625 623 pp->p_selock = old & ~SE_READER;
626 624 if (CV_HAS_WAITERS(&pp->p_cv))
627 625 cv_broadcast(&pp->p_cv);
628 626 } else if ((old & ~SE_EWANTED) == SE_DELETED) {
629 627 panic("page_unlock_nocapture: page %p is deleted", (void *)pp);
630 628 } else if (old < 0) {
631 - THREAD_KPRI_RELEASE();
632 629 pp->p_selock &= SE_EWANTED;
633 630 if (CV_HAS_WAITERS(&pp->p_cv))
634 631 cv_broadcast(&pp->p_cv);
635 632 } else if ((old & ~SE_EWANTED) > SE_READER) {
636 633 pp->p_selock = old - SE_READER;
637 634 } else {
638 635 panic("page_unlock_nocapture: page %p is not locked",
639 636 (void *)pp);
640 637 }
641 638
642 639 mutex_exit(pse);
643 640 }
644 641
645 642 /*
646 643 * Release the page's "shared/exclusive" lock and wake up anyone
647 644 * who might be waiting for it.
648 645 */
649 646 void
650 647 page_unlock(page_t *pp)
651 648 {
652 649 kmutex_t *pse = PAGE_SE_MUTEX(pp);
653 650 selock_t old;
654 651
655 652 mutex_enter(pse);
656 653
657 654 old = pp->p_selock;
658 655 if ((old & ~SE_EWANTED) == SE_READER) {
659 656 pp->p_selock = old & ~SE_READER;
660 657 if (CV_HAS_WAITERS(&pp->p_cv))
661 658 cv_broadcast(&pp->p_cv);
662 659 } else if ((old & ~SE_EWANTED) == SE_DELETED) {
663 660 panic("page_unlock: page %p is deleted", (void *)pp);
664 661 } else if (old < 0) {
665 - THREAD_KPRI_RELEASE();
666 662 pp->p_selock &= SE_EWANTED;
667 663 if (CV_HAS_WAITERS(&pp->p_cv))
668 664 cv_broadcast(&pp->p_cv);
669 665 } else if ((old & ~SE_EWANTED) > SE_READER) {
670 666 pp->p_selock = old - SE_READER;
671 667 } else {
672 668 panic("page_unlock: page %p is not locked", (void *)pp);
673 669 }
674 670
675 671 if (pp->p_selock == 0) {
676 672 /*
677 673 * If the T_CAPTURING bit is set, that means that we should
 678  674            * not try to capture the page again, as we could recurse,
 679  675            * which could lead to a stack overflow panic or spending a
680 676 * relatively long time in the kernel making no progress.
681 677 */
682 678 if ((pp->p_toxic & PR_CAPTURE) &&
683 679 !(curthread->t_flag & T_CAPTURING) &&
684 680 !PP_RETIRED(pp)) {
685 - THREAD_KPRI_REQUEST();
686 681 pp->p_selock = SE_WRITER;
687 682 mutex_exit(pse);
688 683 page_unlock_capture(pp);
689 684 } else {
690 685 mutex_exit(pse);
691 686 }
692 687 } else {
693 688 mutex_exit(pse);
694 689 }
695 690 }
696 691
697 692 /*
698 693 * Try to upgrade the lock on the page from a "shared" to an
699 694 * "exclusive" lock. Since this upgrade operation is done while
700 695 * holding the mutex protecting this page, no one else can acquire this page's
701 696 * lock and change the page. Thus, it is safe to drop the "shared"
702 697 * lock and attempt to acquire the "exclusive" lock.
703 698 *
704 699 * Returns 1 on success, 0 on failure.
705 700 */
706 701 int
707 702 page_tryupgrade(page_t *pp)
708 703 {
709 704 kmutex_t *pse = PAGE_SE_MUTEX(pp);
710 705
711 706 mutex_enter(pse);
712 707 if (!(pp->p_selock & SE_EWANTED)) {
713 708 /* no threads want exclusive access, try upgrade */
714 709 if (pp->p_selock == SE_READER) {
715 - THREAD_KPRI_REQUEST();
716 710 /* convert to exclusive lock */
717 711 pp->p_selock = SE_WRITER;
718 712 mutex_exit(pse);
719 713 return (1);
720 714 }
721 715 }
722 716 mutex_exit(pse);
723 717 return (0);
724 718 }
725 719
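[Review note] A sketch of the usual pattern for a shared holder that decides it needs exclusive access: try the cheap upgrade first, and fall back to dropping the shared lock and reacquiring exclusively, after which the page identity must be rechecked:

    if (!page_tryupgrade(pp)) {
            page_unlock(pp);
            if (!page_lock(pp, SE_EXCL, NULL, P_NO_RECLAIM))
                    goto restart;   /* hypothetical restart label */
            /*
             * The lock was dropped, so [vp, off] may have changed;
             * the caller must verify the identity before proceeding.
             */
    }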
726 720 /*
727 721 * Downgrade the "exclusive" lock on the page to a "shared" lock
728 722 * while holding the mutex protecting this page's p_selock field.
729 723 */
730 724 void
731 725 page_downgrade(page_t *pp)
732 726 {
733 727 kmutex_t *pse = PAGE_SE_MUTEX(pp);
734 728 int excl_waiting;
735 729
736 730 ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
737 731 ASSERT(PAGE_EXCL(pp));
738 732
739 733 mutex_enter(pse);
740 734 excl_waiting = pp->p_selock & SE_EWANTED;
741 - THREAD_KPRI_RELEASE();
742 735 pp->p_selock = SE_READER | excl_waiting;
743 736 if (CV_HAS_WAITERS(&pp->p_cv))
744 737 cv_broadcast(&pp->p_cv);
745 738 mutex_exit(pse);
746 739 }
747 740
748 741 void
749 742 page_lock_delete(page_t *pp)
750 743 {
751 744 kmutex_t *pse = PAGE_SE_MUTEX(pp);
752 745
753 746 ASSERT(PAGE_EXCL(pp));
754 747 ASSERT(pp->p_vnode == NULL);
755 748 ASSERT(pp->p_offset == (u_offset_t)-1);
756 749 ASSERT(!PP_ISFREE(pp));
757 750
758 751 mutex_enter(pse);
759 - THREAD_KPRI_RELEASE();
760 752 pp->p_selock = SE_DELETED;
761 753 if (CV_HAS_WAITERS(&pp->p_cv))
762 754 cv_broadcast(&pp->p_cv);
763 755 mutex_exit(pse);
764 756 }
765 757
766 758 int
767 759 page_deleted(page_t *pp)
768 760 {
769 761 return (pp->p_selock == SE_DELETED);
770 762 }
771 763
772 764 /*
773 765 * Implement the io lock for pages
774 766 */
775 767 void
776 768 page_iolock_init(page_t *pp)
777 769 {
778 770 pp->p_iolock_state = 0;
779 771 cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
780 772 }
781 773
782 774 /*
783 775 * Acquire the i/o lock on a page.
784 776 */
785 777 void
786 778 page_io_lock(page_t *pp)
787 779 {
788 780 kmutex_t *pio;
789 781
790 782 pio = PAGE_IO_MUTEX(pp);
791 783 mutex_enter(pio);
792 784 while (pp->p_iolock_state & PAGE_IO_INUSE) {
793 785 cv_wait(&(pp->p_io_cv), pio);
794 786 }
795 787 pp->p_iolock_state |= PAGE_IO_INUSE;
796 788 mutex_exit(pio);
797 789 }
798 790
799 791 /*
800 792 * Release the i/o lock on a page.
801 793 */
802 794 void
803 795 page_io_unlock(page_t *pp)
804 796 {
805 797 kmutex_t *pio;
806 798
807 799 pio = PAGE_IO_MUTEX(pp);
808 800 mutex_enter(pio);
809 801 cv_broadcast(&pp->p_io_cv);
810 802 pp->p_iolock_state &= ~PAGE_IO_INUSE;
811 803 mutex_exit(pio);
812 804 }
813 805
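[Review note] The i/o lock is a plain mutex/condvar gate over the PAGE_IO_INUSE bit, distinct from the shared/exclusive identity lock. A sketch of the intended pairing; the i/o routine itself is hypothetical:

    page_io_lock(pp);               /* blocks while another i/o is in flight */
    err = my_page_io(pp);           /* hypothetical synchronous page i/o */
    page_io_unlock(pp);             /* wakes waiters in page_io_lock/_wait */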
814 806 /*
815 807 * Try to acquire the i/o lock on a page without blocking.
816 808 * Returns 1 on success, 0 on failure.
817 809 */
818 810 int
819 811 page_io_trylock(page_t *pp)
820 812 {
821 813 kmutex_t *pio;
822 814
823 815 if (pp->p_iolock_state & PAGE_IO_INUSE)
824 816 return (0);
825 817
826 818 pio = PAGE_IO_MUTEX(pp);
827 819 mutex_enter(pio);
828 820
829 821 if (pp->p_iolock_state & PAGE_IO_INUSE) {
830 822 mutex_exit(pio);
831 823 return (0);
832 824 }
833 825 pp->p_iolock_state |= PAGE_IO_INUSE;
834 826 mutex_exit(pio);
835 827
836 828 return (1);
837 829 }
838 830
839 831 /*
840 832 * Wait until the i/o lock is not held.
841 833 */
842 834 void
843 835 page_io_wait(page_t *pp)
844 836 {
845 837 kmutex_t *pio;
846 838
847 839 pio = PAGE_IO_MUTEX(pp);
848 840 mutex_enter(pio);
849 841 while (pp->p_iolock_state & PAGE_IO_INUSE) {
850 842 cv_wait(&(pp->p_io_cv), pio);
851 843 }
852 844 mutex_exit(pio);
853 845 }
854 846
855 847 /*
 856  848    * Returns non-zero if the page's i/o lock is held, zero if not.
857 849 */
858 850 int
859 851 page_io_locked(page_t *pp)
860 852 {
861 853 return (pp->p_iolock_state & PAGE_IO_INUSE);
862 854 }
863 855
864 856 /*
 865  857    * Used by callers to assert that the i/o lock on a page is held.
 866  858    * Returns non-zero if the i/o lock is held, zero if not.
867 859 */
868 860 int
869 861 page_iolock_assert(page_t *pp)
870 862 {
871 863 return (page_io_locked(pp));
872 864 }
873 865
874 866 /*
875 867 * Wrapper exported to kernel routines that are built
876 868 * platform-independent (the macro is platform-dependent;
877 869 * the size of vph_mutex[] is based on NCPU).
878 870 *
879 871 * Note that you can do stress testing on this by setting the
880 872 * variable page_vnode_mutex_stress to something other than
881 873 * zero in a DEBUG kernel in a debugger after loading the kernel.
882 874 * Setting it after the kernel is running may not work correctly.
883 875 */
884 876 #ifdef DEBUG
885 877 static int page_vnode_mutex_stress = 0;
886 878 #endif
887 879
888 880 kmutex_t *
889 881 page_vnode_mutex(vnode_t *vp)
890 882 {
891 883 if (vp == &kvp)
892 884 return (&vph_mutex[VPH_TABLE_SIZE + 0]);
893 885
894 886 if (vp == &zvp)
895 887 return (&vph_mutex[VPH_TABLE_SIZE + 1]);
896 888 #ifdef DEBUG
897 889 if (page_vnode_mutex_stress != 0)
898 890 return (&vph_mutex[0]);
899 891 #endif
900 892
901 893 return (&vph_mutex[VP_HASH_FUNC(vp)]);
902 894 }
903 895
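[Review note] A sketch of the intended use: hold the vnode's chain mutex while walking the circular v_pages list (the count is purely illustrative):

    kmutex_t *vphm = page_vnode_mutex(vp);
    page_t *pp;
    ulong_t cnt = 0;

    mutex_enter(vphm);
    if ((pp = vp->v_pages) != NULL) {
            do {
                    cnt++;
                    pp = pp->p_vpnext;
            } while (pp != vp->v_pages);
    }
    mutex_exit(vphm);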
904 896 kmutex_t *
905 897 page_se_mutex(page_t *pp)
906 898 {
907 899 return (PAGE_SE_MUTEX(pp));
908 900 }
909 901
910 902 #ifdef VM_STATS
911 903 uint_t pszclck_stat[4];
912 904 #endif
913 905 /*
914 906 * Find, take and return a mutex held by hat_page_demote().
915 907 * Called by page_demote_vp_pages() before hat_page_demote() call and by
916 908 * routines that want to block hat_page_demote() but can't do it
917 909 * via locking all constituent pages.
918 910 *
919 911 * Return NULL if p_szc is 0.
920 912 *
921 913 * It should only be used for pages that can be demoted by hat_page_demote()
 922  914    * i.e., non-swapfs file system pages. The logic here is lifted from
923 915 * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
924 916 * since the page is locked and not free.
925 917 *
926 918 * Hash of the root page is used to find the lock.
 927  919    * To find the root in the presence of hat_page_demote() changing the location
 928  920    * of the root, this routine relies on the fact that hat_page_demote() changes
929 921 * root last.
930 922 *
 931  923    * If NULL is returned, pp's p_szc is guaranteed to be 0. If non-NULL is
 932  924    * returned, pp's p_szc may be any value.
933 925 */
934 926 kmutex_t *
935 927 page_szc_lock(page_t *pp)
936 928 {
937 929 kmutex_t *mtx;
938 930 page_t *rootpp;
939 931 uint_t szc;
940 932 uint_t rszc;
941 933 uint_t pszc = pp->p_szc;
942 934
943 935 ASSERT(pp != NULL);
944 936 ASSERT(PAGE_LOCKED(pp));
945 937 ASSERT(!PP_ISFREE(pp));
946 938 ASSERT(pp->p_vnode != NULL);
947 939 ASSERT(!IS_SWAPFSVP(pp->p_vnode));
948 940 ASSERT(!PP_ISKAS(pp));
949 941
950 942 again:
951 943 if (pszc == 0) {
952 944 VM_STAT_ADD(pszclck_stat[0]);
953 945 return (NULL);
954 946 }
955 947
956 948 /* The lock lives in the root page */
957 949
958 950 rootpp = PP_GROUPLEADER(pp, pszc);
959 951 mtx = PAGE_SZC_MUTEX(rootpp);
960 952 mutex_enter(mtx);
961 953
962 954 /*
 963  955            * Since p_szc can only decrease if pp == rootpp,
 964  956            * rootpp will always be the same, i.e., we have the right root
 965  957            * regardless of rootpp->p_szc.
 966  958            * If the location of pp's root didn't change after we took
 967  959            * the lock, we have the right root; return the mutex hashed off it.
968 960 */
969 961 if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
970 962 VM_STAT_ADD(pszclck_stat[1]);
971 963 return (mtx);
972 964 }
973 965
974 966 /*
 975  967            * The root location changed because the page was demoted;
 976  968            * locate the new root.
977 969 */
978 970 if (rszc < pszc) {
979 971 szc = pp->p_szc;
980 972 ASSERT(szc < pszc);
981 973 mutex_exit(mtx);
982 974 pszc = szc;
983 975 VM_STAT_ADD(pszclck_stat[2]);
984 976 goto again;
985 977 }
986 978
987 979 VM_STAT_ADD(pszclck_stat[3]);
988 980 /*
 989  981            * The current hat_page_demote() is not done yet;
 990  982            * wait for it to finish.
991 983 */
992 984 mutex_exit(mtx);
993 985 rootpp = PP_GROUPLEADER(rootpp, rszc);
994 986 mtx = PAGE_SZC_MUTEX(rootpp);
995 987 mutex_enter(mtx);
996 988 mutex_exit(mtx);
997 989 ASSERT(rootpp->p_szc < rszc);
998 990 goto again;
999 991 }
1000 992
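[Review note] A sketch of the caller contract: a NULL return proves p_szc is 0, while a non-NULL return keeps hat_page_demote() at bay, so p_szc is stable until the mutex is dropped:

    kmutex_t *mtx = page_szc_lock(pp);

    if (mtx != NULL) {
            /* hat_page_demote() is blocked; pp->p_szc cannot change */
            /* ... examine the large page ... */
            mutex_exit(mtx);
    } else {
            /* pp->p_szc is guaranteed to be 0 */
    }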
1001 993 int
1002 994 page_szc_lock_assert(page_t *pp)
1003 995 {
1004 996 page_t *rootpp = PP_PAGEROOT(pp);
1005 997 kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);
1006 998
1007 999 return (MUTEX_HELD(mtx));
1008 1000 }
1009 1001
1010 1002 /*
1011 1003 * memseg locking
1012 1004 */
1013 1005 static krwlock_t memsegslock;
1014 1006
1015 1007 /*
1016 1008 * memlist (phys_install, phys_avail) locking.
1017 1009 */
1018 1010 static krwlock_t memlists_lock;
1019 1011
1020 1012 int
1021 1013 memsegs_trylock(int writer)
1022 1014 {
1023 1015 return (rw_tryenter(&memsegslock, writer ? RW_WRITER : RW_READER));
1024 1016 }
1025 1017
1026 1018 void
1027 1019 memsegs_lock(int writer)
1028 1020 {
1029 1021 rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER);
1030 1022 }
1031 1023
1032 1024 /*ARGSUSED*/
1033 1025 void
1034 1026 memsegs_unlock(int writer)
1035 1027 {
1036 1028 rw_exit(&memsegslock);
1037 1029 }
1038 1030
1039 1031 int
1040 1032 memsegs_lock_held(void)
1041 1033 {
1042 1034 return (RW_LOCK_HELD(&memsegslock));
1043 1035 }
1044 1036
1045 1037 void
1046 1038 memlist_read_lock(void)
1047 1039 {
1048 1040 rw_enter(&memlists_lock, RW_READER);
1049 1041 }
1050 1042
1051 1043 void
1052 1044 memlist_read_unlock(void)
1053 1045 {
1054 1046 rw_exit(&memlists_lock);
1055 1047 }
1056 1048
1057 1049 void
1058 1050 memlist_write_lock(void)
1059 1051 {
1060 1052 rw_enter(&memlists_lock, RW_WRITER);
1061 1053 }
1062 1054
1063 1055 void
1064 1056 memlist_write_unlock(void)
1065 1057 {
1066 1058 rw_exit(&memlists_lock);
1067 1059 }
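[Review note] A sketch of read-side use, summing installed physical memory; this assumes the illumos struct memlist field names (ml_next, ml_size) and the phys_install global:

    struct memlist *ml;
    uint64_t total = 0;

    memlist_read_lock();
    for (ml = phys_install; ml != NULL; ml = ml->ml_next)
            total += ml->ml_size;
    memlist_read_unlock();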