1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
28
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/sysmacros.h>
32 #include <sys/proc.h>
33 #include <sys/kmem.h>
34 #include <sys/tuneable.h>
35 #include <sys/var.h>
36 #include <sys/cred.h>
37 #include <sys/systm.h>
38 #include <sys/prsystm.h>
39 #include <sys/vnode.h>
40 #include <sys/session.h>
41 #include <sys/cpuvar.h>
42 #include <sys/cmn_err.h>
43 #include <sys/bitmap.h>
44 #include <sys/debug.h>
45 #include <c2/audit.h>
46 #include <sys/project.h>
47 #include <sys/task.h>
48 #include <sys/zone.h>
49
/*
 * Directory entries for /proc.  procdir[] holds one of these per proc
 * slot.  Each entry is in exactly one of two states: in use, in which
 * case pe_proc points at the process occupying the slot, or free, in
 * which case pe_next links it onto the procentfree list.
 */
union procent {
	proc_t *pe_proc;
	union procent *pe_next;
};
55
/*
 * Statically-allocated pid structure for pid 0.  pid_init() installs it
 * in the hash table; pid_rele() asserts that it is never torn down.
 */
struct pid pid0 = {
	0,	/* pid_prinactive */
	1,	/* pid_pgorphaned */
	0,	/* pid_padding */
	0,	/* pid_prslot */
	0,	/* pid_id */
	NULL,	/* pid_pglink */
	NULL,	/* pid_pgtail */
	NULL,	/* pid_link */
	3	/* pid_ref -- NOTE(review): initial count of 3 is not
		 * derivable from this file; confirm which holders it
		 * accounts for. */
};
67
static int pid_hashlen = 4;	/* desired average hash chain length */
static int pid_hashsz;		/* number of buckets in the hash table */

/*
 * pid_hashsz is a power of two (see pid_init()), so masking with
 * (pid_hashsz - 1) selects a hash bucket.
 */
#define HASHPID(pid) (pidhash[((pid)&(pid_hashsz-1))])

extern uint_t nproc;
extern struct kmem_cache *process_cache;
static void upcount_init(void);

kmutex_t pidlock;	/* global process lock */
kmutex_t pr_pidlock;	/* /proc global process lock */
kcondvar_t *pr_pid_cv;	/* for /proc, one per process slot */
struct plock *proc_lock;	/* persistent array of p_lock's */

/*
 * See the comment above pid_getlockslot() for a detailed explanation of this
 * constant. Note that a PLOCK_SHIFT of 3 implies 64-byte coherence
 * granularity; if the coherence granularity is ever changed, this constant
 * should be modified to reflect the change to minimize proc_lock false
 * sharing (correctness, however, is guaranteed regardless of the coherence
 * granularity).
 */
#define PLOCK_SHIFT 3

/* Protects pidhash, the procdir freelist, and the minpid/mpid counters. */
static kmutex_t pidlinklock;
static struct pid **pidhash;	/* pid hash table buckets */
static pid_t minpid;		/* lowest dynamically-allocatable pid */
static pid_t mpid = FAMOUS_PIDS;	/* one more than the last famous pid */
static union procent *procdir;	/* /proc directory entries, one per slot */
static union procent *procentfree;	/* head of the procdir freelist */
98
99 static struct pid *
100 pid_lookup(pid_t pid)
101 {
102 struct pid *pidp;
103
104 ASSERT(MUTEX_HELD(&pidlinklock));
105
106 for (pidp = HASHPID(pid); pidp; pidp = pidp->pid_link) {
107 if (pidp->pid_id == pid) {
108 ASSERT(pidp->pid_ref > 0);
109 break;
110 }
111 }
112 return (pidp);
113 }
114
115 struct pid *
116 pid_find(pid_t pid)
117 {
118 struct pid *pidp;
119
120 mutex_enter(&pidlinklock);
121 pidp = pid_lookup(pid);
122 mutex_exit(&pidlinklock);
123
124 return (pidp);
125 }
126
127 void
128 pid_setmin(void)
129 {
130 if (jump_pid && jump_pid > mpid)
131 minpid = mpid = jump_pid;
132 else
133 minpid = mpid;
134 }
135
136 /*
137 * When prslots are simply used as an index to determine a process' p_lock,
138 * adjacent prslots share adjacent p_locks. On machines where the size
139 * of a mutex is smaller than that of a cache line (which, as of this writing,
140 * is true for all machines on which Solaris runs), this can potentially
141 * induce false sharing. The standard solution for false sharing is to pad
142 * out one's data structures (in this case, struct plock). However,
143 * given the size and (generally) sparse use of the proc_lock array, this
144 * is suboptimal. We therefore stride through the proc_lock array with
145 * a stride of PLOCK_SHIFT. PLOCK_SHIFT should be defined as:
146 *
147 * log_2 (coherence_granularity / sizeof (kmutex_t))
148 *
149 * Under this scheme, false sharing is still possible -- but only when
150 * the number of active processes is very large. Note that the one-to-one
151 * mapping between prslots and lockslots is maintained.
152 */
153 static int
154 pid_getlockslot(int prslot)
155 {
156 int even = (v.v_proc >> PLOCK_SHIFT) << PLOCK_SHIFT;
157 int perlap = even >> PLOCK_SHIFT;
158
159 if (prslot >= even)
160 return (prslot);
161
162 return (((prslot % perlap) << PLOCK_SHIFT) + (prslot / perlap));
163 }
164
165 /*
166 * This function allocates a pid structure, a free pid, and optionally a
167 * slot in the proc table for it.
168 *
169 * pid_allocate() returns the new pid on success, -1 on failure.
170 */
pid_t
pid_allocate(proc_t *prp, pid_t pid, int flags)
{
	struct pid *pidp;
	union procent *pep;
	pid_t newpid, startpid;

	/* Allocate before taking the lock; KM_SLEEP may block. */
	pidp = kmem_zalloc(sizeof (struct pid), KM_SLEEP);

	mutex_enter(&pidlinklock);
	if ((flags & PID_ALLOC_PROC) && (pep = procentfree) == NULL) {
		/*
		 * ran out of /proc directory entries
		 */
		goto failed;
	}

	if (pid != 0) {
		/*
		 * A specific pid was requested.  This is only legal
		 * while no pids have been handed out dynamically
		 * (minpid still 0); the pid must lie below the next
		 * dynamic pid and must not already be in use.
		 */
		VERIFY(minpid == 0);
		VERIFY3P(pid, <, mpid);
		VERIFY3P(pid_lookup(pid), ==, NULL);
		newpid = pid;
	} else {
		/*
		 * Allocate a pid
		 */
		ASSERT(minpid <= mpid && mpid < maxpid);

		/*
		 * Scan upward from mpid, wrapping back to minpid at
		 * maxpid; fail if we come full circle to startpid
		 * without finding an unused id.
		 */
		startpid = mpid;
		for (;;) {
			newpid = mpid;
			if (++mpid == maxpid)
				mpid = minpid;

			if (pid_lookup(newpid) == NULL)
				break;

			if (mpid == startpid)
				goto failed;
		}
	}

	/*
	 * Put pid into the pid hash table.
	 */
	pidp->pid_link = HASHPID(newpid);
	HASHPID(newpid) = pidp;
	pidp->pid_ref = 1;
	pidp->pid_id = newpid;

	if (flags & PID_ALLOC_PROC) {
		/*
		 * Claim the /proc directory entry and wire the process
		 * to its pid structure and its persistent p_lock
		 * (spread via pid_getlockslot() to limit false sharing).
		 */
		procentfree = pep->pe_next;
		pidp->pid_prslot = pep - procdir;
		pep->pe_proc = prp;
		prp->p_pidp = pidp;
		prp->p_lockp = &proc_lock[pid_getlockslot(pidp->pid_prslot)];
	} else {
		pidp->pid_prslot = 0;
	}

	mutex_exit(&pidlinklock);

	return (newpid);

failed:
	mutex_exit(&pidlinklock);
	kmem_free(pidp, sizeof (struct pid));
	return (-1);
}
240
/*
 * Tear down a pid structure: unhash it and free it.  Note that the
 * reference-count decrement does not happen here -- presumably it lives
 * in the PID_RELE macro, which calls this once the count reaches zero
 * (TODO confirm against the macro's definition in the header).
 */
int
pid_rele(struct pid *pidp)
{
	struct pid **pidpp;

	mutex_enter(&pidlinklock);
	ASSERT(pidp != &pid0);	/* pid0 is statically allocated */

	/* Walk the hash chain to find the link that points at pidp. */
	pidpp = &HASHPID(pidp->pid_id);
	for (;;) {
		ASSERT(*pidpp != NULL);
		if (*pidpp == pidp)
			break;
		pidpp = &(*pidpp)->pid_link;
	}

	/* Unlink it and release the memory. */
	*pidpp = pidp->pid_link;
	mutex_exit(&pidlinklock);

	kmem_free(pidp, sizeof (*pidp));
	return (0);
}
266
267 void
268 proc_entry_free(struct pid *pidp)
269 {
270 mutex_enter(&pidlinklock);
271 pidp->pid_prinactive = 1;
272 procdir[pidp->pid_prslot].pe_next = procentfree;
273 procentfree = &procdir[pidp->pid_prslot];
274 mutex_exit(&pidlinklock);
275 }
276
/*
 * Final teardown of an exiting process.  The original task needs to be
 * passed in since the process has already been detached from the task
 * at this point in time.  Caller holds pidlock.
 */
void
pid_exit(proc_t *prp, struct task *tk)
{
	struct pid *pidp;
	zone_t *zone = prp->p_zone;

	ASSERT(MUTEX_HELD(&pidlock));

	/*
	 * Exit process group. If it is NULL, it's because fork failed
	 * before calling pgjoin().
	 */
	ASSERT(prp->p_pgidp != NULL || prp->p_stat == SIDL);
	if (prp->p_pgidp != NULL)
		pgexit(prp);

	/* Drop the session reference. */
	sess_rele(prp->p_sessp, B_TRUE);

	pidp = prp->p_pidp;

	/* Give the /proc directory slot back to the freelist. */
	proc_entry_free(pidp);

	if (audit_active)
		audit_pfree(prp);

	/* Unlink the process from the practive list. */
	if (practive == prp) {
		practive = prp->p_next;
	}

	if (prp->p_next) {
		prp->p_next->p_prev = prp->p_prev;
	}
	if (prp->p_prev) {
		prp->p_prev->p_next = prp->p_next;
	}

	/* Release the pid structure's reference. */
	PID_RELE(pidp);

	mutex_destroy(&prp->p_crlock);
	kmem_cache_free(process_cache, prp);
	nproc--;

	/*
	 * Decrement the process counts of the original task, project and zone.
	 */
	mutex_enter(&zone->zone_nlwps_lock);
	tk->tk_nprocs--;
	tk->tk_proj->kpj_nprocs--;
	zone->zone_nprocs--;
	mutex_exit(&zone->zone_nlwps_lock);
}
332
333 /*
334 * Find a process visible from the specified zone given its process ID.
335 */
336 proc_t *
337 prfind_zone(pid_t pid, zoneid_t zoneid)
338 {
339 struct pid *pidp;
340 proc_t *p;
341
342 ASSERT(MUTEX_HELD(&pidlock));
343
344 mutex_enter(&pidlinklock);
345 pidp = pid_lookup(pid);
346 mutex_exit(&pidlinklock);
347 if (pidp != NULL && pidp->pid_prinactive == 0) {
348 p = procdir[pidp->pid_prslot].pe_proc;
349 if (zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid)
350 return (p);
351 }
352 return (NULL);
353 }
354
355 /*
356 * Find a process given its process ID. This obeys zone restrictions,
357 * so if the caller is in a non-global zone it won't find processes
358 * associated with other zones. Use prfind_zone(pid, ALL_ZONES) to
359 * bypass this restriction.
360 */
361 proc_t *
362 prfind(pid_t pid)
363 {
364 zoneid_t zoneid;
365
366 if (INGLOBALZONE(curproc))
367 zoneid = ALL_ZONES;
368 else
369 zoneid = getzoneid();
370 return (prfind_zone(pid, zoneid));
371 }
372
373 proc_t *
374 pgfind_zone(pid_t pgid, zoneid_t zoneid)
375 {
376 struct pid *pidp;
377
378 ASSERT(MUTEX_HELD(&pidlock));
379
380 mutex_enter(&pidlinklock);
381 pidp = pid_lookup(pgid);
382 mutex_exit(&pidlinklock);
383 if (pidp != NULL) {
384 proc_t *p = pidp->pid_pglink;
385
386 if (zoneid == ALL_ZONES || pgid == 0 || p == NULL ||
387 p->p_zone->zone_id == zoneid)
388 return (p);
389 }
390 return (NULL);
391 }
392
393 /*
394 * return the head of the list of processes whose process group ID is 'pgid',
395 * or NULL, if no such process group
396 */
397 proc_t *
398 pgfind(pid_t pgid)
399 {
400 zoneid_t zoneid;
401
402 if (INGLOBALZONE(curproc))
403 zoneid = ALL_ZONES;
404 else
405 zoneid = getzoneid();
406 return (pgfind_zone(pgid, zoneid));
407 }
408
409 /*
410 * Sets P_PR_LOCK on a non-system process. Process must be fully created
411 * and not exiting to succeed.
412 *
413 * Returns 0 on success.
414 * Returns 1 if P_PR_LOCK is set.
415 * Returns -1 if proc is in invalid state.
416 */
417 int
418 sprtrylock_proc(proc_t *p)
419 {
420 ASSERT(MUTEX_HELD(&p->p_lock));
421
422 /* skip system and incomplete processes */
423 if (p->p_stat == SIDL || p->p_stat == SZOMB ||
424 (p->p_flag & (SSYS | SEXITING | SEXITLWPS))) {
425 return (-1);
426 }
427
428 if (p->p_proc_flag & P_PR_LOCK)
429 return (1);
430
431 p->p_proc_flag |= P_PR_LOCK;
432 THREAD_KPRI_REQUEST();
433
434 return (0);
435 }
436
437 /*
438 * Wait for P_PR_LOCK to become clear. Returns with p_lock dropped,
439 * and the proc pointer no longer valid, as the proc may have exited.
440 */
441 void
442 sprwaitlock_proc(proc_t *p)
443 {
444 kmutex_t *mp;
445
446 ASSERT(MUTEX_HELD(&p->p_lock));
447 ASSERT(p->p_proc_flag & P_PR_LOCK);
448
449 /*
450 * p_lock is persistent, but p itself is not -- it could
451 * vanish during cv_wait(). Load p->p_lock now so we can
452 * drop it after cv_wait() without referencing p.
453 */
454 mp = &p->p_lock;
455 cv_wait(&pr_pid_cv[p->p_slot], mp);
456 mutex_exit(mp);
457 }
458
459 /*
460 * If pid exists, find its proc, acquire its p_lock and mark it P_PR_LOCK.
461 * Returns the proc pointer on success, NULL on failure. sprlock() is
462 * really just a stripped-down version of pr_p_lock() to allow practive
463 * walkers like dofusers() and dumpsys() to synchronize with /proc.
464 */
465 proc_t *
466 sprlock_zone(pid_t pid, zoneid_t zoneid)
467 {
468 proc_t *p;
469 int ret;
470
471 for (;;) {
472 mutex_enter(&pidlock);
473 if ((p = prfind_zone(pid, zoneid)) == NULL) {
474 mutex_exit(&pidlock);
475 return (NULL);
476 }
477 mutex_enter(&p->p_lock);
478 mutex_exit(&pidlock);
479
480 if (panicstr)
481 return (p);
482
483 ret = sprtrylock_proc(p);
484 if (ret == -1) {
485 mutex_exit(&p->p_lock);
486 return (NULL);
487 } else if (ret == 0) {
488 break;
489 }
490 sprwaitlock_proc(p);
491 }
492 return (p);
493 }
494
495 proc_t *
496 sprlock(pid_t pid)
497 {
498 zoneid_t zoneid;
499
500 if (INGLOBALZONE(curproc))
501 zoneid = ALL_ZONES;
502 else
503 zoneid = getzoneid();
504 return (sprlock_zone(pid, zoneid));
505 }
506
507 void
508 sprlock_proc(proc_t *p)
509 {
510 ASSERT(MUTEX_HELD(&p->p_lock));
511
512 while (p->p_proc_flag & P_PR_LOCK) {
513 cv_wait(&pr_pid_cv[p->p_slot], &p->p_lock);
514 }
515
516 p->p_proc_flag |= P_PR_LOCK;
517 THREAD_KPRI_REQUEST();
518 }
519
/*
 * Release P_PR_LOCK and drop p_lock.
 */
void
sprunlock(proc_t *p)
{
	if (panicstr) {
		/* During panic sprlock_zone() never set P_PR_LOCK. */
		mutex_exit(&p->p_lock);
		return;
	}

	ASSERT(p->p_proc_flag & P_PR_LOCK);
	ASSERT(MUTEX_HELD(&p->p_lock));

	/* Wake waiters in sprlock_proc()/sprwaitlock_proc(). */
	cv_signal(&pr_pid_cv[p->p_slot]);
	p->p_proc_flag &= ~P_PR_LOCK;
	mutex_exit(&p->p_lock);
	/* Balance the THREAD_KPRI_REQUEST() from sprtrylock_proc(). */
	THREAD_KPRI_RELEASE();
}
536
/*
 * Boot-time initialization of the pid hash table, the /proc directory
 * entries, the per-slot condition variables, and the p_lock array.
 */
void
pid_init(void)
{
	int i;

	/* Power-of-two bucket count; HASHPID() relies on this for its mask. */
	pid_hashsz = 1 << highbit(v.v_proc / pid_hashlen);

	pidhash = kmem_zalloc(sizeof (struct pid *) * pid_hashsz, KM_SLEEP);
	procdir = kmem_alloc(sizeof (union procent) * v.v_proc, KM_SLEEP);
	pr_pid_cv = kmem_zalloc(sizeof (kcondvar_t) * v.v_proc, KM_SLEEP);
	proc_lock = kmem_zalloc(sizeof (struct plock) * v.v_proc, KM_SLEEP);

	/* Slot 0 belongs to proc_sched (pid 0). */
	nproc = 1;
	practive = proc_sched;
	proc_sched->p_next = NULL;
	procdir[0].pe_proc = proc_sched;

	/* Chain the remaining directory entries into the freelist. */
	procentfree = &procdir[1];
	for (i = 1; i < v.v_proc - 1; i++)
		procdir[i].pe_next = &procdir[i+1];
	procdir[i].pe_next = NULL;

	HASHPID(0) = &pid0;

	upcount_init();
}
563
/*
 * Return the process occupying proc slot 'slot', or NULL if the slot
 * is free or the process is still being created.
 */
proc_t *
pid_entry(int slot)
{
	union procent *pep;
	proc_t *prp;

	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(slot >= 0 && slot < v.v_proc);

	/*
	 * A free entry's pe_next points at another entry within procdir
	 * (see proc_entry_free()), whereas an in-use entry's pe_proc
	 * points outside the array.  A pointer into the array therefore
	 * identifies a free slot.  (The last freelist entry has
	 * pe_next == NULL, which falls through here; reading it as
	 * pe_proc below yields NULL and we return NULL anyway.)
	 */
	pep = procdir[slot].pe_next;
	if (pep >= procdir && pep < &procdir[v.v_proc])
		return (NULL);
	/* Hide processes that are not fully created yet. */
	prp = procdir[slot].pe_proc;
	if (prp != 0 && prp->p_stat == SIDL)
		return (NULL);
	return (prp);
}
581
/*
 * Send the specified signal to all processes whose process group ID is
 * equal to 'pgid'.  A pgid of 0 or an unknown pgid is silently ignored.
 */

void
signal(pid_t pgid, int sig)
{
	struct pid *pidp;
	proc_t *prp;

	/* pidlock first, then pidlinklock -- the lock order used file-wide. */
	mutex_enter(&pidlock);
	mutex_enter(&pidlinklock);
	if (pgid == 0 || (pidp = pid_lookup(pgid)) == NULL) {
		mutex_exit(&pidlinklock);
		mutex_exit(&pidlock);
		return;
	}
	mutex_exit(&pidlinklock);
	/*
	 * pidlock is held across the walk, keeping the p_pglink chain
	 * (process-group membership) stable while we signal each member.
	 */
	for (prp = pidp->pid_pglink; prp; prp = prp->p_pglink) {
		mutex_enter(&prp->p_lock);
		sigtoproc(prp, NULL, sig);
		mutex_exit(&prp->p_lock);
	}
	mutex_exit(&pidlock);
}
608
609 /*
610 * Send the specified signal to the specified process
611 */
612
613 void
614 prsignal(struct pid *pidp, int sig)
615 {
616 if (!(pidp->pid_prinactive))
617 psignal(procdir[pidp->pid_prslot].pe_proc, sig);
618 }
619
620 #include <sys/sunddi.h>
621
622 /*
623 * DDI/DKI interfaces for drivers to send signals to processes
624 */
625
626 /*
627 * obtain an opaque reference to a process for signaling
628 */
629 void *
630 proc_ref(void)
631 {
632 struct pid *pidp;
633
634 mutex_enter(&pidlock);
635 pidp = curproc->p_pidp;
636 PID_HOLD(pidp);
637 mutex_exit(&pidlock);
638
639 return (pidp);
640 }
641
642 /*
643 * release a reference to a process
644 * - a process can exit even if a driver has a reference to it
645 * - one proc_unref for every proc_ref
646 */
647 void
648 proc_unref(void *pref)
649 {
650 mutex_enter(&pidlock);
651 PID_RELE((struct pid *)pref);
652 mutex_exit(&pidlock);
653 }
654
655 /*
656 * send a signal to a process
657 *
658 * - send the process the signal
659 * - if the process went away, return a -1
660 * - if the process is still there return 0
661 */
662 int
663 proc_signal(void *pref, int sig)
664 {
665 struct pid *pidp = pref;
666
667 prsignal(pidp, sig);
668 return (pidp->pid_prinactive ? -1 : 0);
669 }
670
671
static struct upcount **upc_hash;	/* a boot time allocated array */
static ulong_t upc_hashmask;	/* bucket count (a power of 2) minus one */
#define UPC_HASH(x, y) ((ulong_t)(x ^ y) & upc_hashmask)
675
676 /*
677 * Get us off the ground. Called once at boot.
678 */
679 void
680 upcount_init(void)
681 {
682 ulong_t upc_hashsize;
683
684 /*
685 * An entry per MB of memory is our current guess
686 */
687 /*
688 * 2^20 is a meg, so shifting right by 20 - PAGESHIFT
689 * converts pages to megs (without overflowing a u_int
690 * if you have more than 4G of memory, like ptob(physmem)/1M
691 * would).
692 */
693 upc_hashsize = (1 << highbit(physmem >> (20 - PAGESHIFT)));
694 upc_hashmask = upc_hashsize - 1;
695 upc_hash = kmem_zalloc(upc_hashsize * sizeof (struct upcount *),
696 KM_SLEEP);
697 }
698
/*
 * Increment the number of processes associated with a given uid and zoneid.
 * Caller holds pidlock, though it may be dropped briefly to allocate.
 */
void
upcount_inc(uid_t uid, zoneid_t zoneid)
{
	struct upcount **upc, **hupc;
	struct upcount *new;

	ASSERT(MUTEX_HELD(&pidlock));
	new = NULL;
	hupc = &upc_hash[UPC_HASH(uid, zoneid)];
top:
	/* Search the chain for an existing <uid,zoneid> entry. */
	upc = hupc;
	while ((*upc) != NULL) {
		if ((*upc)->up_uid == uid && (*upc)->up_zoneid == zoneid) {
			(*upc)->up_count++;
			if (new) {
				/*
				 * did not need `new' afterall.
				 */
				kmem_free(new, sizeof (*new));
			}
			return;
		}
		upc = &(*upc)->up_next;
	}

	/*
	 * There is no entry for this <uid,zoneid> pair.
	 * Allocate one. If we have to drop pidlock, check
	 * again.
	 */
	if (new == NULL) {
		/*
		 * Try a non-blocking allocation first so we can keep
		 * pidlock.  If that fails, drop pidlock, sleep for the
		 * memory, and retry from the top -- the chain may have
		 * changed while the lock was released.
		 */
		new = (struct upcount *)kmem_alloc(sizeof (*new), KM_NOSLEEP);
		if (new == NULL) {
			mutex_exit(&pidlock);
			new = (struct upcount *)kmem_alloc(sizeof (*new),
			    KM_SLEEP);
			mutex_enter(&pidlock);
			goto top;
		}
	}


	/*
	 * On the assumption that a new user is going to do some
	 * more forks, put the new upcount structure on the front.
	 */
	upc = hupc;

	new->up_uid = uid;
	new->up_zoneid = zoneid;
	new->up_count = 1;
	new->up_next = *upc;

	*upc = new;
}
757
/*
 * Decrement the number of processes a given uid and zoneid has.
 * Panics if no entry exists -- a decrement must match an increment.
 */
void
upcount_dec(uid_t uid, zoneid_t zoneid)
{
	struct upcount **upc;
	struct upcount *done;

	ASSERT(MUTEX_HELD(&pidlock));

	upc = &upc_hash[UPC_HASH(uid, zoneid)];
	while ((*upc) != NULL) {
		if ((*upc)->up_uid == uid && (*upc)->up_zoneid == zoneid) {
			(*upc)->up_count--;
			if ((*upc)->up_count == 0) {
				/*
				 * Last process for this <uid,zoneid>:
				 * unlink the entry and free it.
				 */
				done = *upc;
				*upc = (*upc)->up_next;
				kmem_free(done, sizeof (*done));
			}
			return;
		}
		upc = &(*upc)->up_next;
	}
	/* No entry found: the caller's bookkeeping is broken. */
	cmn_err(CE_PANIC, "decr_upcount-off the end");
}
784
785 /*
786 * Returns the number of processes a uid has.
787 * Non-existent uid's are assumed to have no processes.
788 */
789 int
790 upcount_get(uid_t uid, zoneid_t zoneid)
791 {
792 struct upcount *upc;
793
794 ASSERT(MUTEX_HELD(&pidlock));
795
796 upc = upc_hash[UPC_HASH(uid, zoneid)];
797 while (upc != NULL) {
798 if (upc->up_uid == uid && upc->up_zoneid == zoneid) {
799 return (upc->up_count);
800 }
801 upc = upc->up_next;
802 }
803 return (0);
804 }