1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2019 Joyent, Inc. 25 */ 26 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 #include <sys/types.h> 31 #include <sys/param.h> 32 #include <sys/sysmacros.h> 33 #include <sys/proc.h> 34 #include <sys/kmem.h> 35 #include <sys/tuneable.h> 36 #include <sys/var.h> 37 #include <sys/cred.h> 38 #include <sys/systm.h> 39 #include <sys/prsystm.h> 40 #include <sys/vnode.h> 41 #include <sys/session.h> 42 #include <sys/cpuvar.h> 43 #include <sys/cmn_err.h> 44 #include <sys/bitmap.h> 45 #include <sys/debug.h> 46 #include <c2/audit.h> 47 #include <sys/project.h> 48 #include <sys/task.h> 49 #include <sys/zone.h> 50 51 /* directory entries for /proc */ 52 union procent { 53 proc_t *pe_proc; 54 union procent *pe_next; 55 }; 56 57 struct pid pid0 = { 58 0, /* pid_prinactive */ 59 1, /* pid_pgorphaned */ 60 0, /* pid_padding */ 61 0, /* pid_prslot */ 62 0, /* pid_id */ 63 NULL, /* pid_pglink */ 64 NULL, /* pid_pgtail */ 65 NULL, /* pid_link */ 66 3 /* pid_ref */ 67 }; 68 69 static int pid_hashlen = 4; /* 
desired average hash chain length */ 70 static int pid_hashsz; /* number of buckets in the hash table */ 71 72 #define HASHPID(pid) (pidhash[((pid)&(pid_hashsz-1))]) 73 74 extern uint_t nproc; 75 extern struct kmem_cache *process_cache; 76 static void upcount_init(void); 77 78 kmutex_t pidlock; /* global process lock */ 79 kmutex_t pr_pidlock; /* /proc global process lock */ 80 kcondvar_t *pr_pid_cv; /* for /proc, one per process slot */ 81 struct plock *proc_lock; /* persistent array of p_lock's */ 82 83 /* 84 * See the comment above pid_getlockslot() for a detailed explanation of this 85 * constant. Note that a PLOCK_SHIFT of 3 implies 64-byte coherence 86 * granularity; if the coherence granularity is ever changed, this constant 87 * should be modified to reflect the change to minimize proc_lock false 88 * sharing (correctness, however, is guaranteed regardless of the coherence 89 * granularity). 90 */ 91 #define PLOCK_SHIFT 3 92 93 static kmutex_t pidlinklock; 94 static struct pid **pidhash; 95 static pid_t minpid; 96 static pid_t mpid = FAMOUS_PIDS; /* one more than the last famous pid */ 97 static union procent *procdir; 98 static union procent *procentfree; 99 100 static struct pid * 101 pid_lookup(pid_t pid) 102 { 103 struct pid *pidp; 104 105 ASSERT(MUTEX_HELD(&pidlinklock)); 106 107 for (pidp = HASHPID(pid); pidp; pidp = pidp->pid_link) { 108 if (pidp->pid_id == pid) { 109 ASSERT(pidp->pid_ref > 0); 110 break; 111 } 112 } 113 return (pidp); 114 } 115 116 void 117 pid_setmin(void) 118 { 119 if (jump_pid && jump_pid > mpid) 120 minpid = mpid = jump_pid; 121 else 122 minpid = mpid; 123 } 124 125 /* 126 * When prslots are simply used as an index to determine a process' p_lock, 127 * adjacent prslots share adjacent p_locks. On machines where the size 128 * of a mutex is smaller than that of a cache line (which, as of this writing, 129 * is true for all machines on which Solaris runs), this can potentially 130 * induce false sharing. 
The standard solution for false sharing is to pad 131 * out one's data structures (in this case, struct plock). However, 132 * given the size and (generally) sparse use of the proc_lock array, this 133 * is suboptimal. We therefore stride through the proc_lock array with 134 * a stride of PLOCK_SHIFT. PLOCK_SHIFT should be defined as: 135 * 136 * log_2 (coherence_granularity / sizeof (kmutex_t)) 137 * 138 * Under this scheme, false sharing is still possible -- but only when 139 * the number of active processes is very large. Note that the one-to-one 140 * mapping between prslots and lockslots is maintained. 141 */ 142 static int 143 pid_getlockslot(int prslot) 144 { 145 int even = (v.v_proc >> PLOCK_SHIFT) << PLOCK_SHIFT; 146 int perlap = even >> PLOCK_SHIFT; 147 148 if (prslot >= even) 149 return (prslot); 150 151 return (((prslot % perlap) << PLOCK_SHIFT) + (prslot / perlap)); 152 } 153 154 /* 155 * This function allocates a pid structure, a free pid, and optionally a 156 * slot in the proc table for it. 157 * 158 * pid_allocate() returns the new pid on success, -1 on failure. 159 */ 160 pid_t 161 pid_allocate(proc_t *prp, pid_t pid, int flags) 162 { 163 struct pid *pidp; 164 union procent *pep; 165 pid_t newpid, startpid; 166 167 pidp = kmem_zalloc(sizeof (struct pid), KM_SLEEP); 168 169 mutex_enter(&pidlinklock); 170 if ((flags & PID_ALLOC_PROC) && (pep = procentfree) == NULL) { 171 /* 172 * ran out of /proc directory entries 173 */ 174 goto failed; 175 } 176 177 if (pid != 0) { 178 VERIFY(minpid == 0); 179 VERIFY3P(pid, <, mpid); 180 VERIFY3P(pid_lookup(pid), ==, NULL); 181 newpid = pid; 182 } else { 183 /* 184 * Allocate a pid 185 */ 186 ASSERT(minpid <= mpid && mpid < maxpid); 187 188 startpid = mpid; 189 for (;;) { 190 newpid = mpid; 191 if (++mpid == maxpid) 192 mpid = minpid; 193 194 if (pid_lookup(newpid) == NULL) 195 break; 196 197 if (mpid == startpid) 198 goto failed; 199 } 200 } 201 202 /* 203 * Put pid into the pid hash table. 
204 */ 205 pidp->pid_link = HASHPID(newpid); 206 HASHPID(newpid) = pidp; 207 pidp->pid_ref = 1; 208 pidp->pid_id = newpid; 209 210 if (flags & PID_ALLOC_PROC) { 211 procentfree = pep->pe_next; 212 pidp->pid_prslot = pep - procdir; 213 pep->pe_proc = prp; 214 prp->p_pidp = pidp; 215 prp->p_lockp = &proc_lock[pid_getlockslot(pidp->pid_prslot)]; 216 } else { 217 pidp->pid_prslot = 0; 218 } 219 220 mutex_exit(&pidlinklock); 221 222 return (newpid); 223 224 failed: 225 mutex_exit(&pidlinklock); 226 kmem_free(pidp, sizeof (struct pid)); 227 return (-1); 228 } 229 230 /* 231 * decrement the reference count for pid 232 */ 233 int 234 pid_rele(struct pid *pidp) 235 { 236 struct pid **pidpp; 237 238 mutex_enter(&pidlinklock); 239 ASSERT(pidp != &pid0); 240 241 pidpp = &HASHPID(pidp->pid_id); 242 for (;;) { 243 ASSERT(*pidpp != NULL); 244 if (*pidpp == pidp) 245 break; 246 pidpp = &(*pidpp)->pid_link; 247 } 248 249 *pidpp = pidp->pid_link; 250 mutex_exit(&pidlinklock); 251 252 kmem_free(pidp, sizeof (*pidp)); 253 return (0); 254 } 255 256 void 257 proc_entry_free(struct pid *pidp) 258 { 259 mutex_enter(&pidlinklock); 260 pidp->pid_prinactive = 1; 261 procdir[pidp->pid_prslot].pe_next = procentfree; 262 procentfree = &procdir[pidp->pid_prslot]; 263 mutex_exit(&pidlinklock); 264 } 265 266 /* 267 * The original task needs to be passed in since the process has already been 268 * detached from the task at this point in time. 269 */ 270 void 271 pid_exit(proc_t *prp, struct task *tk) 272 { 273 struct pid *pidp; 274 zone_t *zone = prp->p_zone; 275 276 ASSERT(MUTEX_HELD(&pidlock)); 277 278 /* 279 * Exit process group. If it is NULL, it's because fork failed 280 * before calling pgjoin(). 
281 */ 282 ASSERT(prp->p_pgidp != NULL || prp->p_stat == SIDL); 283 if (prp->p_pgidp != NULL) 284 pgexit(prp); 285 286 sess_rele(prp->p_sessp, B_TRUE); 287 288 pidp = prp->p_pidp; 289 290 proc_entry_free(pidp); 291 292 if (audit_active) 293 audit_pfree(prp); 294 295 if (practive == prp) { 296 practive = prp->p_next; 297 } 298 299 if (prp->p_next) { 300 prp->p_next->p_prev = prp->p_prev; 301 } 302 if (prp->p_prev) { 303 prp->p_prev->p_next = prp->p_next; 304 } 305 306 PID_RELE(pidp); 307 308 mutex_destroy(&prp->p_crlock); 309 kmem_cache_free(process_cache, prp); 310 nproc--; 311 312 /* 313 * Decrement the process counts of the original task, project and zone. 314 */ 315 mutex_enter(&zone->zone_nlwps_lock); 316 tk->tk_nprocs--; 317 tk->tk_proj->kpj_nprocs--; 318 zone->zone_nprocs--; 319 mutex_exit(&zone->zone_nlwps_lock); 320 } 321 322 /* 323 * Find a process visible from the specified zone given its process ID. 324 */ 325 proc_t * 326 prfind_zone(pid_t pid, zoneid_t zoneid) 327 { 328 struct pid *pidp; 329 proc_t *p; 330 331 ASSERT(MUTEX_HELD(&pidlock)); 332 333 mutex_enter(&pidlinklock); 334 pidp = pid_lookup(pid); 335 mutex_exit(&pidlinklock); 336 if (pidp != NULL && pidp->pid_prinactive == 0) { 337 p = procdir[pidp->pid_prslot].pe_proc; 338 if (zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) 339 return (p); 340 } 341 return (NULL); 342 } 343 344 /* 345 * Find a process given its process ID. This obeys zone restrictions, 346 * so if the caller is in a non-global zone it won't find processes 347 * associated with other zones. Use prfind_zone(pid, ALL_ZONES) to 348 * bypass this restriction. 
349 */ 350 proc_t * 351 prfind(pid_t pid) 352 { 353 zoneid_t zoneid; 354 355 if (INGLOBALZONE(curproc)) 356 zoneid = ALL_ZONES; 357 else 358 zoneid = getzoneid(); 359 return (prfind_zone(pid, zoneid)); 360 } 361 362 proc_t * 363 pgfind_zone(pid_t pgid, zoneid_t zoneid) 364 { 365 struct pid *pidp; 366 367 ASSERT(MUTEX_HELD(&pidlock)); 368 369 mutex_enter(&pidlinklock); 370 pidp = pid_lookup(pgid); 371 mutex_exit(&pidlinklock); 372 if (pidp != NULL) { 373 proc_t *p = pidp->pid_pglink; 374 375 if (zoneid == ALL_ZONES || pgid == 0 || p == NULL || 376 p->p_zone->zone_id == zoneid) 377 return (p); 378 } 379 return (NULL); 380 } 381 382 /* 383 * return the head of the list of processes whose process group ID is 'pgid', 384 * or NULL, if no such process group 385 */ 386 proc_t * 387 pgfind(pid_t pgid) 388 { 389 zoneid_t zoneid; 390 391 if (INGLOBALZONE(curproc)) 392 zoneid = ALL_ZONES; 393 else 394 zoneid = getzoneid(); 395 return (pgfind_zone(pgid, zoneid)); 396 } 397 398 /* 399 * Sets P_PR_LOCK on a non-system process. Process must be fully created 400 * and not exiting to succeed. 401 * 402 * Returns 0 on success. 403 * Returns 1 if P_PR_LOCK is set. 404 * Returns -1 if proc is in invalid state. 405 */ 406 int 407 sprtrylock_proc(proc_t *p) 408 { 409 ASSERT(MUTEX_HELD(&p->p_lock)); 410 411 /* skip system and incomplete processes */ 412 if (p->p_stat == SIDL || p->p_stat == SZOMB || 413 (p->p_flag & (SSYS | SEXITING | SEXITLWPS))) { 414 return (-1); 415 } 416 417 if (p->p_proc_flag & P_PR_LOCK) 418 return (1); 419 420 p->p_proc_flag |= P_PR_LOCK; 421 422 return (0); 423 } 424 425 /* 426 * Wait for P_PR_LOCK to become clear. Returns with p_lock dropped, 427 * and the proc pointer no longer valid, as the proc may have exited. 
428 */ 429 void 430 sprwaitlock_proc(proc_t *p) 431 { 432 kmutex_t *mp; 433 434 ASSERT(MUTEX_HELD(&p->p_lock)); 435 ASSERT(p->p_proc_flag & P_PR_LOCK); 436 437 /* 438 * p_lock is persistent, but p itself is not -- it could 439 * vanish during cv_wait(). Load p->p_lock now so we can 440 * drop it after cv_wait() without referencing p. 441 */ 442 mp = &p->p_lock; 443 cv_wait(&pr_pid_cv[p->p_slot], mp); 444 mutex_exit(mp); 445 } 446 447 /* 448 * If pid exists, find its proc, acquire its p_lock and mark it P_PR_LOCK. 449 * Returns the proc pointer on success, NULL on failure. sprlock() is 450 * really just a stripped-down version of pr_p_lock() to allow practive 451 * walkers like dofusers() and dumpsys() to synchronize with /proc. 452 */ 453 proc_t * 454 sprlock_zone(pid_t pid, zoneid_t zoneid) 455 { 456 proc_t *p; 457 int ret; 458 459 for (;;) { 460 mutex_enter(&pidlock); 461 if ((p = prfind_zone(pid, zoneid)) == NULL) { 462 mutex_exit(&pidlock); 463 return (NULL); 464 } 465 mutex_enter(&p->p_lock); 466 mutex_exit(&pidlock); 467 468 if (panicstr) 469 return (p); 470 471 ret = sprtrylock_proc(p); 472 if (ret == -1) { 473 mutex_exit(&p->p_lock); 474 return (NULL); 475 } else if (ret == 0) { 476 break; 477 } 478 sprwaitlock_proc(p); 479 } 480 return (p); 481 } 482 483 proc_t * 484 sprlock(pid_t pid) 485 { 486 zoneid_t zoneid; 487 488 if (INGLOBALZONE(curproc)) 489 zoneid = ALL_ZONES; 490 else 491 zoneid = getzoneid(); 492 return (sprlock_zone(pid, zoneid)); 493 } 494 495 void 496 sprlock_proc(proc_t *p) 497 { 498 ASSERT(MUTEX_HELD(&p->p_lock)); 499 500 while (p->p_proc_flag & P_PR_LOCK) { 501 cv_wait(&pr_pid_cv[p->p_slot], &p->p_lock); 502 } 503 504 p->p_proc_flag |= P_PR_LOCK; 505 } 506 507 void 508 sprunlock(proc_t *p) 509 { 510 if (panicstr) { 511 mutex_exit(&p->p_lock); 512 return; 513 } 514 515 ASSERT(p->p_proc_flag & P_PR_LOCK); 516 ASSERT(MUTEX_HELD(&p->p_lock)); 517 518 cv_signal(&pr_pid_cv[p->p_slot]); 519 p->p_proc_flag &= ~P_PR_LOCK; 520 
mutex_exit(&p->p_lock); 521 } 522 523 void 524 pid_init(void) 525 { 526 int i; 527 528 pid_hashsz = 1 << highbit(v.v_proc / pid_hashlen); 529 530 pidhash = kmem_zalloc(sizeof (struct pid *) * pid_hashsz, KM_SLEEP); 531 procdir = kmem_alloc(sizeof (union procent) * v.v_proc, KM_SLEEP); 532 pr_pid_cv = kmem_zalloc(sizeof (kcondvar_t) * v.v_proc, KM_SLEEP); 533 proc_lock = kmem_zalloc(sizeof (struct plock) * v.v_proc, KM_SLEEP); 534 535 nproc = 1; 536 practive = proc_sched; 537 proc_sched->p_next = NULL; 538 procdir[0].pe_proc = proc_sched; 539 540 procentfree = &procdir[1]; 541 for (i = 1; i < v.v_proc - 1; i++) 542 procdir[i].pe_next = &procdir[i+1]; 543 procdir[i].pe_next = NULL; 544 545 HASHPID(0) = &pid0; 546 547 upcount_init(); 548 } 549 550 proc_t * 551 pid_entry(int slot) 552 { 553 union procent *pep; 554 proc_t *prp; 555 556 ASSERT(MUTEX_HELD(&pidlock)); 557 ASSERT(slot >= 0 && slot < v.v_proc); 558 559 pep = procdir[slot].pe_next; 560 if (pep >= procdir && pep < &procdir[v.v_proc]) 561 return (NULL); 562 prp = procdir[slot].pe_proc; 563 if (prp != 0 && prp->p_stat == SIDL) 564 return (NULL); 565 return (prp); 566 } 567 568 /* 569 * Send the specified signal to all processes whose process group ID is 570 * equal to 'pgid' 571 */ 572 573 void 574 signal(pid_t pgid, int sig) 575 { 576 struct pid *pidp; 577 proc_t *prp; 578 579 mutex_enter(&pidlock); 580 mutex_enter(&pidlinklock); 581 if (pgid == 0 || (pidp = pid_lookup(pgid)) == NULL) { 582 mutex_exit(&pidlinklock); 583 mutex_exit(&pidlock); 584 return; 585 } 586 mutex_exit(&pidlinklock); 587 for (prp = pidp->pid_pglink; prp; prp = prp->p_pglink) { 588 mutex_enter(&prp->p_lock); 589 sigtoproc(prp, NULL, sig); 590 mutex_exit(&prp->p_lock); 591 } 592 mutex_exit(&pidlock); 593 } 594 595 /* 596 * Send the specified signal to the specified process 597 */ 598 599 void 600 prsignal(struct pid *pidp, int sig) 601 { 602 if (!(pidp->pid_prinactive)) 603 psignal(procdir[pidp->pid_prslot].pe_proc, sig); 604 } 605 606 
#include <sys/sunddi.h>

/*
 * DDI/DKI interfaces for drivers to send signals to processes
 */

/*
 * obtain an opaque reference to a process for signaling
 */
void *
proc_ref(void)
{
	struct pid *pidp;

	mutex_enter(&pidlock);
	pidp = curproc->p_pidp;
	PID_HOLD(pidp);
	mutex_exit(&pidlock);

	return (pidp);
}

/*
 * release a reference to a process
 * - a process can exit even if a driver has a reference to it
 * - one proc_unref for every proc_ref
 */
void
proc_unref(void *pref)
{
	mutex_enter(&pidlock);
	PID_RELE((struct pid *)pref);
	mutex_exit(&pidlock);
}

/*
 * send a signal to a process
 *
 * - send the process the signal
 * - if the process went away, return a -1
 * - if the process is still there return 0
 */
int
proc_signal(void *pref, int sig)
{
	struct pid *pidp = pref;

	prsignal(pidp, sig);
	return (pidp->pid_prinactive ? -1 : 0);
}


static struct upcount	**upc_hash;	/* a boot time allocated array */
static ulong_t		upc_hashmask;
#define	UPC_HASH(x, y)	((ulong_t)(x ^ y) & upc_hashmask)

/*
 * Get us off the ground.  Called once at boot.
 */
void
upcount_init(void)
{
	ulong_t	upc_hashsize;

	/*
	 * An entry per MB of memory is our current guess
	 */
	/*
	 * 2^20 is a meg, so shifting right by 20 - PAGESHIFT
	 * converts pages to megs (without overflowing a u_int
	 * if you have more than 4G of memory, like ptob(physmem)/1M
	 * would).
	 */
	upc_hashsize = (1 << highbit(physmem >> (20 - PAGESHIFT)));
	upc_hashmask = upc_hashsize - 1;
	upc_hash = kmem_zalloc(upc_hashsize * sizeof (struct upcount *),
	    KM_SLEEP);
}

/*
 * Increment the number of processes associated with a given uid and zoneid.
687 */ 688 void 689 upcount_inc(uid_t uid, zoneid_t zoneid) 690 { 691 struct upcount **upc, **hupc; 692 struct upcount *new; 693 694 ASSERT(MUTEX_HELD(&pidlock)); 695 new = NULL; 696 hupc = &upc_hash[UPC_HASH(uid, zoneid)]; 697 top: 698 upc = hupc; 699 while ((*upc) != NULL) { 700 if ((*upc)->up_uid == uid && (*upc)->up_zoneid == zoneid) { 701 (*upc)->up_count++; 702 if (new) { 703 /* 704 * did not need `new' afterall. 705 */ 706 kmem_free(new, sizeof (*new)); 707 } 708 return; 709 } 710 upc = &(*upc)->up_next; 711 } 712 713 /* 714 * There is no entry for this <uid,zoneid> pair. 715 * Allocate one. If we have to drop pidlock, check 716 * again. 717 */ 718 if (new == NULL) { 719 new = (struct upcount *)kmem_alloc(sizeof (*new), KM_NOSLEEP); 720 if (new == NULL) { 721 mutex_exit(&pidlock); 722 new = (struct upcount *)kmem_alloc(sizeof (*new), 723 KM_SLEEP); 724 mutex_enter(&pidlock); 725 goto top; 726 } 727 } 728 729 730 /* 731 * On the assumption that a new user is going to do some 732 * more forks, put the new upcount structure on the front. 733 */ 734 upc = hupc; 735 736 new->up_uid = uid; 737 new->up_zoneid = zoneid; 738 new->up_count = 1; 739 new->up_next = *upc; 740 741 *upc = new; 742 } 743 744 /* 745 * Decrement the number of processes a given uid and zoneid has. 746 */ 747 void 748 upcount_dec(uid_t uid, zoneid_t zoneid) 749 { 750 struct upcount **upc; 751 struct upcount *done; 752 753 ASSERT(MUTEX_HELD(&pidlock)); 754 755 upc = &upc_hash[UPC_HASH(uid, zoneid)]; 756 while ((*upc) != NULL) { 757 if ((*upc)->up_uid == uid && (*upc)->up_zoneid == zoneid) { 758 (*upc)->up_count--; 759 if ((*upc)->up_count == 0) { 760 done = *upc; 761 *upc = (*upc)->up_next; 762 kmem_free(done, sizeof (*done)); 763 } 764 return; 765 } 766 upc = &(*upc)->up_next; 767 } 768 cmn_err(CE_PANIC, "decr_upcount-off the end"); 769 } 770 771 /* 772 * Returns the number of processes a uid has. 773 * Non-existent uid's are assumed to have no processes. 
774 */ 775 int 776 upcount_get(uid_t uid, zoneid_t zoneid) 777 { 778 struct upcount *upc; 779 780 ASSERT(MUTEX_HELD(&pidlock)); 781 782 upc = upc_hash[UPC_HASH(uid, zoneid)]; 783 while (upc != NULL) { 784 if (upc->up_uid == uid && upc->up_zoneid == zoneid) { 785 return (upc->up_count); 786 } 787 upc = upc->up_next; 788 } 789 return (0); 790 }