/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/tuneable.h>
#include <sys/var.h>
#include <sys/cred.h>
#include <sys/systm.h>
#include <sys/prsystm.h>
#include <sys/vnode.h>
#include <sys/session.h>
#include <sys/cpuvar.h>
#include <sys/cmn_err.h>
#include <sys/bitmap.h>
#include <sys/debug.h>
#include <c2/audit.h>
#include <sys/project.h>
#include <sys/task.h>
#include <sys/zone.h>

/*
 * directory entries for /proc
 *
 * Each procdir[] slot is in one of two states.  While it is in use,
 * pe_proc points at the owning process (stored by pid_allocate()).  While
 * it is free, pe_next links it onto the procentfree list (pushed by
 * proc_entry_free(), initially threaded by pid_init()).  pid_entry() tells
 * the two apart by checking whether the stored pointer lies inside
 * procdir[] itself.
 */
union procent {
	proc_t *pe_proc;
	union procent *pe_next;
};

/*
 * Statically initialized pid structure for pid 0.  pid_init() installs it
 * as the head of the hash bucket for pid 0, and pid_rele() asserts that it
 * is never the structure being released.
 */
struct pid pid0 = {
	0,		/* pid_prinactive */
	1,		/* pid_pgorphaned */
	0,		/* pid_padding	*/
	0,		/* pid_prslot	*/
	0,		/* pid_id	*/
	NULL,		/* pid_pglink	*/
	NULL,		/* pid_pgtail	*/
	NULL,		/* pid_link	*/
	3		/* pid_ref	*/
};

static int pid_hashlen = 4;	/* desired average hash chain length
*/ 69 static int pid_hashsz; /* number of buckets in the hash table */ 70 71 #define HASHPID(pid) (pidhash[((pid)&(pid_hashsz-1))]) 72 73 extern uint_t nproc; 74 extern struct kmem_cache *process_cache; 75 static void upcount_init(void); 76 77 kmutex_t pidlock; /* global process lock */ 78 kmutex_t pr_pidlock; /* /proc global process lock */ 79 kcondvar_t *pr_pid_cv; /* for /proc, one per process slot */ 80 struct plock *proc_lock; /* persistent array of p_lock's */ 81 82 /* 83 * See the comment above pid_getlockslot() for a detailed explanation of this 84 * constant. Note that a PLOCK_SHIFT of 3 implies 64-byte coherence 85 * granularity; if the coherence granularity is ever changed, this constant 86 * should be modified to reflect the change to minimize proc_lock false 87 * sharing (correctness, however, is guaranteed regardless of the coherence 88 * granularity). 89 */ 90 #define PLOCK_SHIFT 3 91 92 static kmutex_t pidlinklock; 93 static struct pid **pidhash; 94 static pid_t minpid; 95 static pid_t mpid = FAMOUS_PIDS; /* one more than the last famous pid */ 96 static union procent *procdir; 97 static union procent *procentfree; 98 99 static struct pid * 100 pid_lookup(pid_t pid) 101 { 102 struct pid *pidp; 103 104 ASSERT(MUTEX_HELD(&pidlinklock)); 105 106 for (pidp = HASHPID(pid); pidp; pidp = pidp->pid_link) { 107 if (pidp->pid_id == pid) { 108 ASSERT(pidp->pid_ref > 0); 109 break; 110 } 111 } 112 return (pidp); 113 } 114 115 void 116 pid_setmin(void) 117 { 118 if (jump_pid && jump_pid > mpid) 119 minpid = mpid = jump_pid; 120 else 121 minpid = mpid; 122 } 123 124 /* 125 * When prslots are simply used as an index to determine a process' p_lock, 126 * adjacent prslots share adjacent p_locks. On machines where the size 127 * of a mutex is smaller than that of a cache line (which, as of this writing, 128 * is true for all machines on which Solaris runs), this can potentially 129 * induce false sharing. 
* The standard solution for false sharing is to pad
 * out one's data structures (in this case, struct plock).  However,
 * given the size and (generally) sparse use of the proc_lock array, this
 * is suboptimal.  We therefore stride through the proc_lock array with
 * a stride of PLOCK_SHIFT.  PLOCK_SHIFT should be defined as:
 *
 *   log_2 (coherence_granularity / sizeof (kmutex_t))
 *
 * Under this scheme, false sharing is still possible -- but only when
 * the number of active processes is very large.  Note that the one-to-one
 * mapping between prslots and lockslots is maintained.
 */
static int
pid_getlockslot(int prslot)
{
	/* v.v_proc rounded down to a multiple of (1 << PLOCK_SHIFT) */
	int even = (v.v_proc >> PLOCK_SHIFT) << PLOCK_SHIFT;
	/* 'even' divided by (1 << PLOCK_SHIFT) */
	int perlap = even >> PLOCK_SHIFT;

	/*
	 * Slots at or beyond 'even' map to themselves.  This also covers
	 * the case where 'even' (and therefore 'perlap') is zero, so a
	 * non-negative prslot never reaches the division below with a zero
	 * divisor.
	 */
	if (prslot >= even)
		return (prslot);

	/*
	 * Consecutive prslots (within one run of 'perlap' slots) land
	 * (1 << PLOCK_SHIFT) lock slots apart.
	 */
	return (((prslot % perlap) << PLOCK_SHIFT) + (prslot / perlap));
}

/*
 * This function allocates a pid structure, a free pid, and optionally a
 * slot in the proc table for it.
 *
 * prp   - process to bind to the new proc table slot; only dereferenced
 *         when PID_ALLOC_PROC is set in flags.
 * pid   - if non-zero, the exact pid to use (it must be below mpid and not
 *         already in the hash table, and minpid must still be 0; all three
 *         are VERIFYed).  If zero, the next unused pid is searched for,
 *         starting at mpid and wrapping from maxpid back to minpid.
 * flags - PID_ALLOC_PROC additionally takes an entry off the procentfree
 *         list and points prp->p_pidp and prp->p_lockp at the result.
 *
 * pid_allocate() returns the new pid on success, -1 on failure.  Failure
 * means either the procentfree list was empty (PID_ALLOC_PROC only) or the
 * search wrapped all the way round without finding an unused pid; in both
 * cases the pid structure allocated up front is freed again.
 */
pid_t
pid_allocate(proc_t *prp, pid_t pid, int flags)
{
	struct pid *pidp;
	union procent *pep;
	pid_t newpid, startpid;

	/* allocate before taking pidlinklock; KM_SLEEP is used here */
	pidp = kmem_zalloc(sizeof (struct pid), KM_SLEEP);

	mutex_enter(&pidlinklock);
	/*
	 * pep is only assigned when PID_ALLOC_PROC is set (the && short
	 * circuits otherwise); it is only read again under the same flag.
	 */
	if ((flags & PID_ALLOC_PROC) && (pep = procentfree) == NULL) {
		/*
		 * ran out of /proc directory entries
		 */
		goto failed;
	}

	if (pid != 0) {
		VERIFY(minpid == 0);
		/*
		 * NOTE(review): VERIFY3P is the pointer-flavored comparison,
		 * but pid and mpid are pid_t values here -- presumably a
		 * signed variant was intended; confirm against sys/debug.h
		 * before changing, since the two differ for negative pids.
		 */
		VERIFY3P(pid, <, mpid);
		VERIFY3P(pid_lookup(pid), ==, NULL);
		newpid = pid;
	} else {
		/*
		 * Allocate a pid
		 */
		ASSERT(minpid <= mpid && mpid < maxpid);

		startpid = mpid;
		for (;;) {
			/* candidate is the current mpid; advance with wrap */
			newpid = mpid;
			if (++mpid == maxpid)
				mpid = minpid;

			if (pid_lookup(newpid) == NULL)
				break;

			/* back where we started: every candidate is in use */
			if (mpid == startpid)
				goto failed;
		}
	}

	/*
	 * Put pid into the pid hash table.
	 */
	pidp->pid_link = HASHPID(newpid);
	HASHPID(newpid) = pidp;
	pidp->pid_ref = 1;
	pidp->pid_id = newpid;

	if (flags & PID_ALLOC_PROC) {
		/* pop the free entry and record its index as the prslot */
		procentfree = pep->pe_next;
		pidp->pid_prslot = pep - procdir;
		pep->pe_proc = prp;
		prp->p_pidp = pidp;
		prp->p_lockp = &proc_lock[pid_getlockslot(pidp->pid_prslot)];
	} else {
		pidp->pid_prslot = 0;
	}

	mutex_exit(&pidlinklock);

	return (newpid);

failed:
	mutex_exit(&pidlinklock);
	kmem_free(pidp, sizeof (struct pid));
	return (-1);
}

/*
 * Remove pidp from its hash chain and free it.  Always returns 0.
 *
 * Note that no reference count is examined or modified here; presumably
 * the PID_RELE() macro drops pid_ref and calls this function once the
 * count reaches zero -- confirm against the macro's definition.  pidp must
 * be present on its hash chain and must not be &pid0 (both asserted).
 */
int
pid_rele(struct pid *pidp)
{
	struct pid **pidpp;

	mutex_enter(&pidlinklock);
	ASSERT(pidp != &pid0);

	/* find the link (bucket head or a pid_link field) that points at pidp */
	pidpp = &HASHPID(pidp->pid_id);
	for (;;) {
		ASSERT(*pidpp != NULL);
		if (*pidpp == pidp)
			break;
		pidpp = &(*pidpp)->pid_link;
	}

	*pidpp = pidp->pid_link;
	mutex_exit(&pidlinklock);

	kmem_free(pidp, sizeof (*pidp));
	return (0);
}

/*
 * Mark pidp's proc table entry inactive and push its procdir[] slot onto
 * the head of the procentfree list.  Takes pidlinklock internally.
 */
void
proc_entry_free(struct pid *pidp)
{
	mutex_enter(&pidlinklock);
	pidp->pid_prinactive = 1;
	procdir[pidp->pid_prslot].pe_next = procentfree;
	procentfree = &procdir[pidp->pid_prslot];
	mutex_exit(&pidlinklock);
}

/*
 * Tear down the pid-related state of process prp and free prp itself.
 * The caller must hold pidlock (asserted).
 *
 * The original task needs to be passed in since the process has already been
 * detached from the task at this point in time.
 */
void
pid_exit(proc_t *prp, struct task *tk)
{
	struct pid *pidp;
	/* saved now: prp is freed before the zone counters are updated */
	zone_t *zone = prp->p_zone;

	ASSERT(MUTEX_HELD(&pidlock));

	/*
	 * Exit process group.  If it is NULL, it's because fork failed
	 * before calling pgjoin().
	 */
	ASSERT(prp->p_pgidp != NULL || prp->p_stat == SIDL);
	if (prp->p_pgidp != NULL)
		pgexit(prp);

	sess_rele(prp->p_sessp, B_TRUE);

	pidp = prp->p_pidp;

	/* give the procdir[] slot back before dropping the pid reference */
	proc_entry_free(pidp);

	if (audit_active)
		audit_pfree(prp);

	/* unlink prp from the doubly linked practive list */
	if (practive == prp) {
		practive = prp->p_next;
	}

	if (prp->p_next) {
		prp->p_next->p_prev = prp->p_prev;
	}
	if (prp->p_prev) {
		prp->p_prev->p_next = prp->p_next;
	}

	PID_RELE(pidp);

	mutex_destroy(&prp->p_crlock);
	kmem_cache_free(process_cache, prp);
	nproc--;

	/*
	 * Decrement the process counts of the original task, project and zone.
	 */
	mutex_enter(&zone->zone_nlwps_lock);
	tk->tk_nprocs--;
	tk->tk_proj->kpj_nprocs--;
	zone->zone_nprocs--;
	mutex_exit(&zone->zone_nlwps_lock);
}

/*
 * Find a process visible from the specified zone given its process ID.
 *
 * Returns NULL when the pid is not in the hash table, when its proc entry
 * is marked inactive, or when zoneid is not ALL_ZONES and the process
 * belongs to a different zone.  The caller must hold pidlock (asserted).
 */
proc_t *
prfind_zone(pid_t pid, zoneid_t zoneid)
{
	struct pid *pidp;
	proc_t *p;

	ASSERT(MUTEX_HELD(&pidlock));

	/*
	 * pidlinklock only covers the hash lookup itself; pidp is examined
	 * after it is dropped.  Presumably the caller-held pidlock is what
	 * keeps pidp valid from here on -- TODO confirm.
	 */
	mutex_enter(&pidlinklock);
	pidp = pid_lookup(pid);
	mutex_exit(&pidlinklock);
	if (pidp != NULL && pidp->pid_prinactive == 0) {
		p = procdir[pidp->pid_prslot].pe_proc;
		if (zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid)
			return (p);
	}
	return (NULL);
}

/*
 * Find a process given its process ID.  This obeys zone restrictions,
 * so if the caller is in a non-global zone it won't find processes
 * associated with other zones.  Use prfind_zone(pid, ALL_ZONES) to
 * bypass this restriction.
348 */ 349 proc_t * 350 prfind(pid_t pid) 351 { 352 zoneid_t zoneid; 353 354 if (INGLOBALZONE(curproc)) 355 zoneid = ALL_ZONES; 356 else 357 zoneid = getzoneid(); 358 return (prfind_zone(pid, zoneid)); 359 } 360 361 proc_t * 362 pgfind_zone(pid_t pgid, zoneid_t zoneid) 363 { 364 struct pid *pidp; 365 366 ASSERT(MUTEX_HELD(&pidlock)); 367 368 mutex_enter(&pidlinklock); 369 pidp = pid_lookup(pgid); 370 mutex_exit(&pidlinklock); 371 if (pidp != NULL) { 372 proc_t *p = pidp->pid_pglink; 373 374 if (zoneid == ALL_ZONES || pgid == 0 || p == NULL || 375 p->p_zone->zone_id == zoneid) 376 return (p); 377 } 378 return (NULL); 379 } 380 381 /* 382 * return the head of the list of processes whose process group ID is 'pgid', 383 * or NULL, if no such process group 384 */ 385 proc_t * 386 pgfind(pid_t pgid) 387 { 388 zoneid_t zoneid; 389 390 if (INGLOBALZONE(curproc)) 391 zoneid = ALL_ZONES; 392 else 393 zoneid = getzoneid(); 394 return (pgfind_zone(pgid, zoneid)); 395 } 396 397 /* 398 * Sets P_PR_LOCK on a non-system process. Process must be fully created 399 * and not exiting to succeed. 400 * 401 * Returns 0 on success. 402 * Returns 1 if P_PR_LOCK is set. 403 * Returns -1 if proc is in invalid state. 404 */ 405 int 406 sprtrylock_proc(proc_t *p) 407 { 408 ASSERT(MUTEX_HELD(&p->p_lock)); 409 410 /* skip system and incomplete processes */ 411 if (p->p_stat == SIDL || p->p_stat == SZOMB || 412 (p->p_flag & (SSYS | SEXITING | SEXITLWPS))) { 413 return (-1); 414 } 415 416 if (p->p_proc_flag & P_PR_LOCK) 417 return (1); 418 419 p->p_proc_flag |= P_PR_LOCK; 420 THREAD_KPRI_REQUEST(); 421 422 return (0); 423 } 424 425 /* 426 * Wait for P_PR_LOCK to become clear. Returns with p_lock dropped, 427 * and the proc pointer no longer valid, as the proc may have exited. 
428 */ 429 void 430 sprwaitlock_proc(proc_t *p) 431 { 432 kmutex_t *mp; 433 434 ASSERT(MUTEX_HELD(&p->p_lock)); 435 ASSERT(p->p_proc_flag & P_PR_LOCK); 436 437 /* 438 * p_lock is persistent, but p itself is not -- it could 439 * vanish during cv_wait(). Load p->p_lock now so we can 440 * drop it after cv_wait() without referencing p. 441 */ 442 mp = &p->p_lock; 443 cv_wait(&pr_pid_cv[p->p_slot], mp); 444 mutex_exit(mp); 445 } 446 447 /* 448 * If pid exists, find its proc, acquire its p_lock and mark it P_PR_LOCK. 449 * Returns the proc pointer on success, NULL on failure. sprlock() is 450 * really just a stripped-down version of pr_p_lock() to allow practive 451 * walkers like dofusers() and dumpsys() to synchronize with /proc. 452 */ 453 proc_t * 454 sprlock_zone(pid_t pid, zoneid_t zoneid) 455 { 456 proc_t *p; 457 int ret; 458 459 for (;;) { 460 mutex_enter(&pidlock); 461 if ((p = prfind_zone(pid, zoneid)) == NULL) { 462 mutex_exit(&pidlock); 463 return (NULL); 464 } 465 mutex_enter(&p->p_lock); 466 mutex_exit(&pidlock); 467 468 if (panicstr) 469 return (p); 470 471 ret = sprtrylock_proc(p); 472 if (ret == -1) { 473 mutex_exit(&p->p_lock); 474 return (NULL); 475 } else if (ret == 0) { 476 break; 477 } 478 sprwaitlock_proc(p); 479 } 480 return (p); 481 } 482 483 proc_t * 484 sprlock(pid_t pid) 485 { 486 zoneid_t zoneid; 487 488 if (INGLOBALZONE(curproc)) 489 zoneid = ALL_ZONES; 490 else 491 zoneid = getzoneid(); 492 return (sprlock_zone(pid, zoneid)); 493 } 494 495 void 496 sprlock_proc(proc_t *p) 497 { 498 ASSERT(MUTEX_HELD(&p->p_lock)); 499 500 while (p->p_proc_flag & P_PR_LOCK) { 501 cv_wait(&pr_pid_cv[p->p_slot], &p->p_lock); 502 } 503 504 p->p_proc_flag |= P_PR_LOCK; 505 THREAD_KPRI_REQUEST(); 506 } 507 508 void 509 sprunlock(proc_t *p) 510 { 511 if (panicstr) { 512 mutex_exit(&p->p_lock); 513 return; 514 } 515 516 ASSERT(p->p_proc_flag & P_PR_LOCK); 517 ASSERT(MUTEX_HELD(&p->p_lock)); 518 519 cv_signal(&pr_pid_cv[p->p_slot]); 520 p->p_proc_flag &= 
~P_PR_LOCK; 521 mutex_exit(&p->p_lock); 522 THREAD_KPRI_RELEASE(); 523 } 524 525 void 526 pid_init(void) 527 { 528 int i; 529 530 pid_hashsz = 1 << highbit(v.v_proc / pid_hashlen); 531 532 pidhash = kmem_zalloc(sizeof (struct pid *) * pid_hashsz, KM_SLEEP); 533 procdir = kmem_alloc(sizeof (union procent) * v.v_proc, KM_SLEEP); 534 pr_pid_cv = kmem_zalloc(sizeof (kcondvar_t) * v.v_proc, KM_SLEEP); 535 proc_lock = kmem_zalloc(sizeof (struct plock) * v.v_proc, KM_SLEEP); 536 537 nproc = 1; 538 practive = proc_sched; 539 proc_sched->p_next = NULL; 540 procdir[0].pe_proc = proc_sched; 541 542 procentfree = &procdir[1]; 543 for (i = 1; i < v.v_proc - 1; i++) 544 procdir[i].pe_next = &procdir[i+1]; 545 procdir[i].pe_next = NULL; 546 547 HASHPID(0) = &pid0; 548 549 upcount_init(); 550 } 551 552 proc_t * 553 pid_entry(int slot) 554 { 555 union procent *pep; 556 proc_t *prp; 557 558 ASSERT(MUTEX_HELD(&pidlock)); 559 ASSERT(slot >= 0 && slot < v.v_proc); 560 561 pep = procdir[slot].pe_next; 562 if (pep >= procdir && pep < &procdir[v.v_proc]) 563 return (NULL); 564 prp = procdir[slot].pe_proc; 565 if (prp != 0 && prp->p_stat == SIDL) 566 return (NULL); 567 return (prp); 568 } 569 570 /* 571 * Send the specified signal to all processes whose process group ID is 572 * equal to 'pgid' 573 */ 574 575 void 576 signal(pid_t pgid, int sig) 577 { 578 struct pid *pidp; 579 proc_t *prp; 580 581 mutex_enter(&pidlock); 582 mutex_enter(&pidlinklock); 583 if (pgid == 0 || (pidp = pid_lookup(pgid)) == NULL) { 584 mutex_exit(&pidlinklock); 585 mutex_exit(&pidlock); 586 return; 587 } 588 mutex_exit(&pidlinklock); 589 for (prp = pidp->pid_pglink; prp; prp = prp->p_pglink) { 590 mutex_enter(&prp->p_lock); 591 sigtoproc(prp, NULL, sig); 592 mutex_exit(&prp->p_lock); 593 } 594 mutex_exit(&pidlock); 595 } 596 597 /* 598 * Send the specified signal to the specified process 599 */ 600 601 void 602 prsignal(struct pid *pidp, int sig) 603 { 604 if (!(pidp->pid_prinactive)) 605 
psignal(procdir[pidp->pid_prslot].pe_proc, sig); 606 } 607 608 #include <sys/sunddi.h> 609 610 /* 611 * DDI/DKI interfaces for drivers to send signals to processes 612 */ 613 614 /* 615 * obtain an opaque reference to a process for signaling 616 */ 617 void * 618 proc_ref(void) 619 { 620 struct pid *pidp; 621 622 mutex_enter(&pidlock); 623 pidp = curproc->p_pidp; 624 PID_HOLD(pidp); 625 mutex_exit(&pidlock); 626 627 return (pidp); 628 } 629 630 /* 631 * release a reference to a process 632 * - a process can exit even if a driver has a reference to it 633 * - one proc_unref for every proc_ref 634 */ 635 void 636 proc_unref(void *pref) 637 { 638 mutex_enter(&pidlock); 639 PID_RELE((struct pid *)pref); 640 mutex_exit(&pidlock); 641 } 642 643 /* 644 * send a signal to a process 645 * 646 * - send the process the signal 647 * - if the process went away, return a -1 648 * - if the process is still there return 0 649 */ 650 int 651 proc_signal(void *pref, int sig) 652 { 653 struct pid *pidp = pref; 654 655 prsignal(pidp, sig); 656 return (pidp->pid_prinactive ? -1 : 0); 657 } 658 659 660 static struct upcount **upc_hash; /* a boot time allocated array */ 661 static ulong_t upc_hashmask; 662 #define UPC_HASH(x, y) ((ulong_t)(x ^ y) & upc_hashmask) 663 664 /* 665 * Get us off the ground. Called once at boot. 666 */ 667 void 668 upcount_init(void) 669 { 670 ulong_t upc_hashsize; 671 672 /* 673 * An entry per MB of memory is our current guess 674 */ 675 /* 676 * 2^20 is a meg, so shifting right by 20 - PAGESHIFT 677 * converts pages to megs (without overflowing a u_int 678 * if you have more than 4G of memory, like ptob(physmem)/1M 679 * would). 680 */ 681 upc_hashsize = (1 << highbit(physmem >> (20 - PAGESHIFT))); 682 upc_hashmask = upc_hashsize - 1; 683 upc_hash = kmem_zalloc(upc_hashsize * sizeof (struct upcount *), 684 KM_SLEEP); 685 } 686 687 /* 688 * Increment the number of processes associated with a given uid and zoneid. 
689 */ 690 void 691 upcount_inc(uid_t uid, zoneid_t zoneid) 692 { 693 struct upcount **upc, **hupc; 694 struct upcount *new; 695 696 ASSERT(MUTEX_HELD(&pidlock)); 697 new = NULL; 698 hupc = &upc_hash[UPC_HASH(uid, zoneid)]; 699 top: 700 upc = hupc; 701 while ((*upc) != NULL) { 702 if ((*upc)->up_uid == uid && (*upc)->up_zoneid == zoneid) { 703 (*upc)->up_count++; 704 if (new) { 705 /* 706 * did not need `new' afterall. 707 */ 708 kmem_free(new, sizeof (*new)); 709 } 710 return; 711 } 712 upc = &(*upc)->up_next; 713 } 714 715 /* 716 * There is no entry for this <uid,zoneid> pair. 717 * Allocate one. If we have to drop pidlock, check 718 * again. 719 */ 720 if (new == NULL) { 721 new = (struct upcount *)kmem_alloc(sizeof (*new), KM_NOSLEEP); 722 if (new == NULL) { 723 mutex_exit(&pidlock); 724 new = (struct upcount *)kmem_alloc(sizeof (*new), 725 KM_SLEEP); 726 mutex_enter(&pidlock); 727 goto top; 728 } 729 } 730 731 732 /* 733 * On the assumption that a new user is going to do some 734 * more forks, put the new upcount structure on the front. 735 */ 736 upc = hupc; 737 738 new->up_uid = uid; 739 new->up_zoneid = zoneid; 740 new->up_count = 1; 741 new->up_next = *upc; 742 743 *upc = new; 744 } 745 746 /* 747 * Decrement the number of processes a given uid and zoneid has. 748 */ 749 void 750 upcount_dec(uid_t uid, zoneid_t zoneid) 751 { 752 struct upcount **upc; 753 struct upcount *done; 754 755 ASSERT(MUTEX_HELD(&pidlock)); 756 757 upc = &upc_hash[UPC_HASH(uid, zoneid)]; 758 while ((*upc) != NULL) { 759 if ((*upc)->up_uid == uid && (*upc)->up_zoneid == zoneid) { 760 (*upc)->up_count--; 761 if ((*upc)->up_count == 0) { 762 done = *upc; 763 *upc = (*upc)->up_next; 764 kmem_free(done, sizeof (*done)); 765 } 766 return; 767 } 768 upc = &(*upc)->up_next; 769 } 770 cmn_err(CE_PANIC, "decr_upcount-off the end"); 771 } 772 773 /* 774 * Returns the number of processes a uid has. 775 * Non-existent uid's are assumed to have no processes. 
776 */ 777 int 778 upcount_get(uid_t uid, zoneid_t zoneid) 779 { 780 struct upcount *upc; 781 782 ASSERT(MUTEX_HELD(&pidlock)); 783 784 upc = upc_hash[UPC_HASH(uid, zoneid)]; 785 while (upc != NULL) { 786 if (upc->up_uid == uid && upc->up_zoneid == zoneid) { 787 return (upc->up_count); 788 } 789 upc = upc->up_next; 790 } 791 return (0); 792 }