1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/sysmacros.h>
29 #include <sys/signal.h>
30 #include <sys/stack.h>
31 #include <sys/pcb.h>
32 #include <sys/user.h>
33 #include <sys/systm.h>
34 #include <sys/sysinfo.h>
35 #include <sys/errno.h>
36 #include <sys/cmn_err.h>
37 #include <sys/cred.h>
38 #include <sys/resource.h>
39 #include <sys/task.h>
40 #include <sys/project.h>
41 #include <sys/proc.h>
42 #include <sys/debug.h>
43 #include <sys/disp.h>
44 #include <sys/class.h>
45 #include <vm/seg_kmem.h>
46 #include <vm/seg_kp.h>
47 #include <sys/machlock.h>
48 #include <sys/kmem.h>
49 #include <sys/varargs.h>
50 #include <sys/turnstile.h>
51 #include <sys/poll.h>
52 #include <sys/vtrace.h>
53 #include <sys/callb.h>
54 #include <c2/audit.h>
55 #include <sys/tnf.h>
56 #include <sys/sobject.h>
57 #include <sys/cpupart.h>
58 #include <sys/pset.h>
59 #include <sys/door.h>
60 #include <sys/spl.h>
61 #include <sys/copyops.h>
62 #include <sys/rctl.h>
63 #include <sys/brand.h>
64 #include <sys/pool.h>
65 #include <sys/zone.h>
66 #include <sys/tsol/label.h>
67 #include <sys/tsol/tndb.h>
68 #include <sys/cpc_impl.h>
69 #include <sys/sdt.h>
70 #include <sys/reboot.h>
71 #include <sys/kdi.h>
72 #include <sys/schedctl.h>
73 #include <sys/waitq.h>
74 #include <sys/cpucaps.h>
75 #include <sys/kiconv.h>
76
77 struct kmem_cache *thread_cache; /* cache of free threads */
78 struct kmem_cache *lwp_cache; /* cache of free lwps */
79 struct kmem_cache *turnstile_cache; /* cache of free turnstiles */
80
81 /*
82 * allthreads is only for use by kmem_readers. All kernel loops can use
83 * the current thread as a start/end point.
84 */
85 static kthread_t *allthreads = &t0; /* circular list of all threads */
86
87 static kcondvar_t reaper_cv; /* synchronization var */
88 kthread_t *thread_deathrow; /* circular list of reapable threads */
89 kthread_t *lwp_deathrow; /* circular list of reapable threads */
90 kmutex_t reaplock; /* protects lwp and thread deathrows */
91 int thread_reapcnt = 0; /* number of threads on deathrow */
92 int lwp_reapcnt = 0; /* number of lwps on deathrow */
93 int reaplimit = 16; /* delay reaping until reaplimit */
94
95 thread_free_lock_t *thread_free_lock;
96 /* protects tick thread from reaper */
97
98 extern int nthread;
99
100 /* System Scheduling classes. */
101 id_t syscid; /* system scheduling class ID */
102 id_t sysdccid = CLASS_UNUSED; /* reset when SDC loads */
103
104 void *segkp_thread; /* cookie for segkp pool */
105
106 int lwp_cache_sz = 32;
107 int t_cache_sz = 8;
108 static kt_did_t next_t_id = 1;
109
110 /* Default mode for thread binding to CPUs and processor sets */
111 int default_binding_mode = TB_ALLHARD;
112
113 /*
114 * Min/Max stack sizes for stack size parameters
115 */
116 #define MAX_STKSIZE (32 * DEFAULTSTKSZ)
117 #define MIN_STKSIZE DEFAULTSTKSZ
118
119 /*
120 * default_stksize overrides lwp_default_stksize if it is set.
121 */
122 int default_stksize;
123 int lwp_default_stksize;
124
125 static zone_key_t zone_thread_key;
126
127 unsigned int kmem_stackinfo; /* stackinfo feature on-off */
128 kmem_stkinfo_t *kmem_stkinfo_log; /* stackinfo circular log */
129 static kmutex_t kmem_stkinfo_lock; /* protects kmem_stkinfo_log */
130
131 /*
132 * forward declarations for internal thread specific data (tsd)
133 */
134 static void *tsd_realloc(void *, size_t, size_t);
135
136 void thread_reaper(void);
137
138 /* forward declarations for stackinfo feature */
139 static void stkinfo_begin(kthread_t *);
140 static void stkinfo_end(kthread_t *);
141 static size_t stkinfo_percent(caddr_t, caddr_t, caddr_t);
142
143 /*ARGSUSED*/
144 static int
145 turnstile_constructor(void *buf, void *cdrarg, int kmflags)
146 {
147 bzero(buf, sizeof (turnstile_t));
148 return (0);
149 }
150
151 /*ARGSUSED*/
152 static void
153 turnstile_destructor(void *buf, void *cdrarg)
154 {
155 turnstile_t *ts = buf;
156
157 ASSERT(ts->ts_free == NULL);
158 ASSERT(ts->ts_waiters == 0);
159 ASSERT(ts->ts_inheritor == NULL);
160 ASSERT(ts->ts_sleepq[0].sq_first == NULL);
161 ASSERT(ts->ts_sleepq[1].sq_first == NULL);
162 }
163
164 void
165 thread_init(void)
166 {
167 kthread_t *tp;
168 extern char sys_name[];
169 extern void idle();
170 struct cpu *cpu = CPU;
171 int i;
172 kmutex_t *lp;
173
174 mutex_init(&reaplock, NULL, MUTEX_SPIN, (void *)ipltospl(DISP_LEVEL));
175 thread_free_lock =
176 kmem_alloc(sizeof (thread_free_lock_t) * THREAD_FREE_NUM, KM_SLEEP);
177 for (i = 0; i < THREAD_FREE_NUM; i++) {
178 lp = &thread_free_lock[i].tf_lock;
179 mutex_init(lp, NULL, MUTEX_DEFAULT, NULL);
180 }
181
182 #if defined(__i386) || defined(__amd64)
183 thread_cache = kmem_cache_create("thread_cache", sizeof (kthread_t),
184 PTR24_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
185
186 /*
187 * "struct _klwp" includes a "struct pcb", which includes a
188 * "struct fpu", which needs to be 64-byte aligned on amd64
189 * (and even on i386) for xsave/xrstor.
190 */
191 lwp_cache = kmem_cache_create("lwp_cache", sizeof (klwp_t),
192 64, NULL, NULL, NULL, NULL, NULL, 0);
193 #else
194 /*
195 * Allocate thread structures from static_arena. This prevents
196 * issues where a thread tries to relocate its own thread
197 * structure and touches it after the mapping has been suspended.
198 */
199 thread_cache = kmem_cache_create("thread_cache", sizeof (kthread_t),
200 PTR24_ALIGN, NULL, NULL, NULL, NULL, static_arena, 0);
201
202 lwp_stk_cache_init();
203
204 lwp_cache = kmem_cache_create("lwp_cache", sizeof (klwp_t),
205 0, NULL, NULL, NULL, NULL, NULL, 0);
206 #endif
207
208 turnstile_cache = kmem_cache_create("turnstile_cache",
209 sizeof (turnstile_t), 0,
210 turnstile_constructor, turnstile_destructor, NULL, NULL, NULL, 0);
211
212 label_init();
213 cred_init();
214
215 /*
216 * Initialize various resource management facilities.
217 */
218 rctl_init();
219 cpucaps_init();
220 /*
221 * Zone_init() should be called before project_init() so that the project ID
222 * for the first project is initialized correctly.
223 */
224 zone_init();
225 project_init();
226 brand_init();
227 kiconv_init();
228 task_init();
229 tcache_init();
230 pool_init();
231
232 curthread->t_ts = kmem_cache_alloc(turnstile_cache, KM_SLEEP);
233
234 /*
235 * Originally, we had two parameters to set default stack
236 * size: one for lwp's (lwp_default_stksize), and one for
237 * kernel-only threads (DEFAULTSTKSZ, a.k.a. _defaultstksz).
238 * Now we have a third parameter that overrides both if it is
239 * set to a legal stack size, called default_stksize.
240 */
241
242 if (default_stksize == 0) {
243 default_stksize = DEFAULTSTKSZ;
244 } else if (default_stksize % PAGESIZE != 0 ||
245 default_stksize > MAX_STKSIZE ||
246 default_stksize < MIN_STKSIZE) {
247 cmn_err(CE_WARN, "Illegal stack size. Using %d",
248 (int)DEFAULTSTKSZ);
249 default_stksize = DEFAULTSTKSZ;
250 } else {
251 lwp_default_stksize = default_stksize;
252 }
253
254 if (lwp_default_stksize == 0) {
255 lwp_default_stksize = default_stksize;
256 } else if (lwp_default_stksize % PAGESIZE != 0 ||
257 lwp_default_stksize > MAX_STKSIZE ||
258 lwp_default_stksize < MIN_STKSIZE) {
259 cmn_err(CE_WARN, "Illegal stack size. Using %d",
260 default_stksize);
261 lwp_default_stksize = default_stksize;
262 }
263
264 segkp_lwp = segkp_cache_init(segkp, lwp_cache_sz,
265 lwp_default_stksize,
266 (KPD_NOWAIT | KPD_HASREDZONE | KPD_LOCKED));
267
268 segkp_thread = segkp_cache_init(segkp, t_cache_sz,
269 default_stksize, KPD_HASREDZONE | KPD_LOCKED | KPD_NO_ANON);
270
271 (void) getcid(sys_name, &syscid);
272 curthread->t_cid = syscid; /* current thread is t0 */
273
274 /*
275 * Set up the first CPU's idle thread.
276 * It runs whenever the CPU has nothing worthwhile to do.
277 */
278 tp = thread_create(NULL, 0, idle, NULL, 0, &p0, TS_STOPPED, -1);
279 cpu->cpu_idle_thread = tp;
280 tp->t_preempt = 1;
281 tp->t_disp_queue = cpu->cpu_disp;
282 ASSERT(tp->t_disp_queue != NULL);
283 tp->t_bound_cpu = cpu;
284 tp->t_affinitycnt = 1;
285
286 /*
287 * Registering a thread in the callback table is usually
288 * done in the initialization code of the thread. In this
289 * case, we do it right after thread creation to avoid
290 * blocking the idle thread while it registers itself. It also
291 * avoids the possibility of reregistration in case a CPU
292 * restarts its idle thread.
293 */
294 CALLB_CPR_INIT_SAFE(tp, "idle");
295
296 /*
297 * Create the thread_reaper daemon. From this point on, exited
298 * threads will get reaped.
299 */
300 (void) thread_create(NULL, 0, (void (*)())thread_reaper,
301 NULL, 0, &p0, TS_RUN, minclsyspri);
302
303 /*
304 * Finish initializing the kernel memory allocator now that
305 * thread_create() is available.
306 */
307 kmem_thread_init();
308
309 if (boothowto & RB_DEBUG)
310 kdi_dvec_thravail();
311 }
312
313 /*
314 * Create a thread.
315 *
316 * thread_create() blocks for memory if necessary. It never fails.
317 *
318 * If stk is NULL, the thread is created at the base of the stack
319 * and cannot be swapped.
320 */
321 kthread_t *
322 thread_create(caddr_t stk, size_t stksize, void (*proc)(), void *arg,
323 size_t len, proc_t *pp, int state, pri_t pri)
324 {
325 kthread_t *t;
326 extern struct classfuncs sys_classfuncs;
327 turnstile_t *ts;
328
329 /*
330 * Every thread keeps a turnstile around in case it needs to block.
331 * The only reason the turnstile is not simply part of the thread
332 * structure is that we may have to break the association whenever
333 * more than one thread blocks on a given synchronization object.
334 * From a memory-management standpoint, turnstiles are like the
335 * "attached mblks" that hang off dblks in the streams allocator.
336 */
337 ts = kmem_cache_alloc(turnstile_cache, KM_SLEEP);
338
339 if (stk == NULL) {
340 /*
341 * alloc both thread and stack in segkp chunk
342 */
343
344 if (stksize < default_stksize)
345 stksize = default_stksize;
346
347 if (stksize == default_stksize) {
348 stk = (caddr_t)segkp_cache_get(segkp_thread);
349 } else {
350 stksize = roundup(stksize, PAGESIZE);
351 stk = (caddr_t)segkp_get(segkp, stksize,
352 (KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED));
353 }
354
355 ASSERT(stk != NULL);
356
357 /*
358 * The machine-dependent mutex code may require that
359 * thread pointers (since they may be used for mutex owner
360 * fields) satisfy certain alignment constraints.
361 * PTR24_ALIGN is the size of the alignment quanta.
362 * XXX - assumes stack grows toward low addresses.
363 */
364 if (stksize <= sizeof (kthread_t) + PTR24_ALIGN)
365 cmn_err(CE_PANIC, "thread_create: proposed stack size"
366 " too small to hold thread.");
367 #ifdef STACK_GROWTH_DOWN
368 stksize -= SA(sizeof (kthread_t) + PTR24_ALIGN - 1);
369 stksize &= -PTR24_ALIGN; /* make thread aligned */
370 t = (kthread_t *)(stk + stksize);
371 bzero(t, sizeof (kthread_t));
372 if (audit_active)
373 audit_thread_create(t);
374 t->t_stk = stk + stksize;
375 t->t_stkbase = stk;
376 #else /* stack grows to larger addresses */
377 stksize -= SA(sizeof (kthread_t));
378 t = (kthread_t *)(stk);
379 bzero(t, sizeof (kthread_t));
380 t->t_stk = stk + sizeof (kthread_t);
381 t->t_stkbase = stk + stksize + sizeof (kthread_t);
382 #endif /* STACK_GROWTH_DOWN */
383 t->t_flag |= T_TALLOCSTK;
384 t->t_swap = stk;
385 } else {
386 t = kmem_cache_alloc(thread_cache, KM_SLEEP);
387 bzero(t, sizeof (kthread_t));
388 ASSERT(((uintptr_t)t & (PTR24_ALIGN - 1)) == 0);
389 if (audit_active)
390 audit_thread_create(t);
391 /*
392 * Initialize t_stk to the kernel stack pointer to use
393 * upon entry to the kernel
394 */
395 #ifdef STACK_GROWTH_DOWN
396 t->t_stk = stk + stksize;
397 t->t_stkbase = stk;
398 #else
399 t->t_stk = stk; /* 3b2-like */
400 t->t_stkbase = stk + stksize;
401 #endif /* STACK_GROWTH_DOWN */
402 }
403
404 if (kmem_stackinfo != 0) {
405 stkinfo_begin(t);
406 }
407
408 t->t_ts = ts;
409
410 /*
411 * p_cred could be NULL if thread_create() is called before cred_init()
412 * is called in main().
413 */
414 mutex_enter(&pp->p_crlock);
415 if (pp->p_cred)
416 crhold(t->t_cred = pp->p_cred);
417 mutex_exit(&pp->p_crlock);
418 t->t_start = gethrestime_sec();
419 t->t_startpc = proc;
420 t->t_procp = pp;
421 t->t_clfuncs = &sys_classfuncs.thread;
422 t->t_cid = syscid;
423 t->t_pri = pri;
424 t->t_stime = ddi_get_lbolt();
425 t->t_schedflag = TS_LOAD | TS_DONT_SWAP;
426 t->t_bind_cpu = PBIND_NONE;
427 t->t_bindflag = (uchar_t)default_binding_mode;
428 t->t_bind_pset = PS_NONE;
429 t->t_plockp = &pp->p_lock;
430 t->t_copyops = NULL;
431 t->t_taskq = NULL;
432 t->t_anttime = 0;
433 t->t_hatdepth = 0;
434
435 t->t_dtrace_vtime = 1; /* assure vtimestamp is always non-zero */
436
437 CPU_STATS_ADDQ(CPU, sys, nthreads, 1);
438 #ifndef NPROBE
439 /* Kernel probe */
440 tnf_thread_create(t);
441 #endif /* NPROBE */
442 LOCK_INIT_CLEAR(&t->t_lock);
443
444 /*
445 * Callers who give us a NULL proc must do their own
446 * stack initialization. e.g. lwp_create()
447 */
448 if (proc != NULL) {
449 t->t_stk = thread_stk_init(t->t_stk);
450 thread_load(t, proc, arg, len);
451 }
452
453 /*
454 * Put a hold on project0. If this thread is actually in a
455 * different project, then t_proj will be changed later in
456 * lwp_create(). All kernel-only threads must be in project 0.
457 */
458 t->t_proj = project_hold(proj0p);
459
460 lgrp_affinity_init(&t->t_lgrp_affinity);
461
462 mutex_enter(&pidlock);
463 nthread++;
464 t->t_did = next_t_id++;
465 t->t_prev = curthread->t_prev;
466 t->t_next = curthread;
467
468 /*
469 * Add the thread to the list of all threads, and initialize
470 * its t_cpu pointer. We need to block preemption since
471 * cpu_offline walks the thread list looking for threads
472 * with t_cpu pointing to the CPU being offlined. We want
473 * to make sure that the list is consistent and that if t_cpu
474 * is set, the thread is on the list.
475 */
476 kpreempt_disable();
477 curthread->t_prev->t_next = t;
478 curthread->t_prev = t;
479
480 /*
481 * Threads should never have a NULL t_cpu pointer so assign it
482 * here. If the thread is being created with state TS_RUN a
483 * better CPU may be chosen when it is placed on the run queue.
484 *
485 * We need to keep kernel preemption disabled when setting all
486 * three fields to keep them in sync. Also, always create in
487 * the default partition since that's where kernel threads go
488 * (if this isn't a kernel thread, t_cpupart will be changed
489 * in lwp_create before setting the thread runnable).
490 */
491 t->t_cpupart = &cp_default;
492
493 /*
494 * For now, affiliate this thread with the root lgroup.
495 * Since the kernel does not (presently) allocate its memory
496 * in a locality aware fashion, the root is an appropriate home.
497 * If this thread is later associated with an lwp, it will have
498 * its lgroup re-assigned at that time.
499 */
500 lgrp_move_thread(t, &cp_default.cp_lgrploads[LGRP_ROOTID], 1);
501
502 /*
503 * Inherit the current cpu. If this cpu isn't part of the chosen
504 * lgroup, a new cpu will be chosen by cpu_choose when the thread
505 * is ready to run.
506 */
507 if (CPU->cpu_part == &cp_default)
508 t->t_cpu = CPU;
509 else
510 t->t_cpu = disp_lowpri_cpu(cp_default.cp_cpulist, t->t_lpl,
511 t->t_pri, NULL);
512
513 t->t_disp_queue = t->t_cpu->cpu_disp;
514 kpreempt_enable();
515
516 /*
517 * Initialize thread state and the dispatcher lock pointer.
518 * Need to hold onto pidlock to block allthreads walkers until
519 * the state is set.
520 */
521 switch (state) {
522 case TS_RUN:
523 curthread->t_oldspl = splhigh(); /* get dispatcher spl */
524 THREAD_SET_STATE(t, TS_STOPPED, &transition_lock);
525 CL_SETRUN(t);
526 thread_unlock(t);
527 break;
528
529 case TS_ONPROC:
530 THREAD_ONPROC(t, t->t_cpu);
531 break;
532
533 case TS_FREE:
534 /*
535 * Free state will be used for intr threads.
536 * The interrupt routine must set the thread dispatcher
537 * lock pointer (t_lockp) if starting on a CPU
538 * other than the current one.
539 */
540 THREAD_FREEINTR(t, CPU);
541 break;
542
543 case TS_STOPPED:
544 THREAD_SET_STATE(t, TS_STOPPED, &stop_lock);
545 break;
546
547 default: /* TS_SLEEP, TS_ZOMB or TS_TRANS */
548 cmn_err(CE_PANIC, "thread_create: invalid state %d", state);
549 }
550 mutex_exit(&pidlock);
551 return (t);
552 }
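
/*
 * A typical use, as in thread_init() above, lets thread_create() allocate
 * the stack and starts the thread immediately (my_daemon is illustrative
 * only):
 *
 *	(void) thread_create(NULL, 0, my_daemon, NULL, 0, &p0,
 *	    TS_RUN, minclsyspri);
 */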
553
554 /*
555 * Move thread to project0 and take care of project reference counters.
556 */
557 void
558 thread_rele(kthread_t *t)
559 {
560 kproject_t *kpj;
561
562 thread_lock(t);
563
564 ASSERT(t == curthread || t->t_state == TS_FREE || t->t_procp == &p0);
565 kpj = ttoproj(t);
566 t->t_proj = proj0p;
567
568 thread_unlock(t);
569
570 if (kpj != proj0p) {
571 project_rele(kpj);
572 (void) project_hold(proj0p);
573 }
574 }
575
576 void
577 thread_exit(void)
578 {
579 kthread_t *t = curthread;
580
581 if ((t->t_proc_flag & TP_ZTHREAD) != 0)
582 cmn_err(CE_PANIC, "thread_exit: zthread_exit() not called");
583
584 tsd_exit(); /* Clean up this thread's TSD */
585
586 kcpc_passivate(); /* clean up performance counter state */
587
588 /*
589 * No kernel thread should have called poll() without arranging for
590 * pollcleanup() to be called here.
591 */
592 ASSERT(t->t_pollstate == NULL);
593 ASSERT(t->t_schedctl == NULL);
594 if (t->t_door)
595 door_slam(); /* in case thread did an upcall */
596
597 #ifndef NPROBE
598 /* Kernel probe */
599 if (t->t_tnf_tpdp)
600 tnf_thread_exit();
601 #endif /* NPROBE */
602
603 thread_rele(t);
604 t->t_preempt++;
605
606 /*
607 * remove thread from the all threads list so that
608 * death-row can use the same pointers.
609 */
610 mutex_enter(&pidlock);
611 t->t_next->t_prev = t->t_prev;
612 t->t_prev->t_next = t->t_next;
613 ASSERT(allthreads != t); /* t0 never exits */
614 cv_broadcast(&t->t_joincv); /* wake up anyone in thread_join */
615 mutex_exit(&pidlock);
616
617 if (t->t_ctx != NULL)
618 exitctx(t);
619 if (t->t_procp->p_pctx != NULL)
620 exitpctx(t->t_procp);
621
622 if (kmem_stackinfo != 0) {
623 stkinfo_end(t);
624 }
625
626 t->t_state = TS_ZOMB; /* set zombie thread */
627
628 swtch_from_zombie(); /* give up the CPU */
629 /* NOTREACHED */
630 }
631
632 /*
633 * Check to see if the specified thread is active (defined as being on
634 * the thread list). This is certainly a slow way to do this; if there's
635 * ever a reason to speed it up, we could maintain a hash table of active
636 * threads indexed by their t_did.
637 */
638 static kthread_t *
639 did_to_thread(kt_did_t tid)
640 {
641 kthread_t *t;
642
643 ASSERT(MUTEX_HELD(&pidlock));
644 for (t = curthread->t_next; t != curthread; t = t->t_next) {
645 if (t->t_did == tid)
646 break;
647 }
648 if (t->t_did == tid)
649 return (t);
650 else
651 return (NULL);
652 }
653
654 /*
655 * Wait for specified thread to exit. Returns immediately if the thread
656 * could not be found, meaning that it has either already exited or never
657 * existed.
658 */
659 void
660 thread_join(kt_did_t tid)
661 {
662 kthread_t *t;
663
664 ASSERT(tid != curthread->t_did);
665 ASSERT(tid != t0.t_did);
666
667 mutex_enter(&pidlock);
668 /*
669 * Make sure we check that the thread is on the thread list
670 * before blocking on it; otherwise we could end up blocking on
671 * a cv that's already been freed. In other words, don't cache
672 * the thread pointer across calls to cv_wait.
673 *
674 * The choice of loop invariant means that whenever a thread
675 * is taken off the allthreads list, a cv_broadcast must be
676 * performed on that thread's t_joincv to wake up any waiters.
677 * The broadcast doesn't have to happen right away, but it
678 * shouldn't be postponed indefinitely (e.g., by doing it in
679 * thread_free(), which may only be executed when the deathrow
680 * queue is processed).
681 */
682 while (t = did_to_thread(tid))
683 cv_wait(&t->t_joincv, &pidlock);
684 mutex_exit(&pidlock);
685 }
686
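/*
 * Prevent a thread from being freed: take the hashed thread_free lock
 * covering this thread and return with it held.  thread_free() calls
 * thread_free_barrier(), which cannot complete until the holder calls
 * thread_free_allow().
 */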
687 void
688 thread_free_prevent(kthread_t *t)
689 {
690 kmutex_t *lp;
691
692 lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock;
693 mutex_enter(lp);
694 }
695
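/*
 * Drop the hashed thread_free lock taken by thread_free_prevent(),
 * allowing the thread to be freed again.
 */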
696 void
697 thread_free_allow(kthread_t *t)
698 {
699 kmutex_t *lp;
700
701 lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock;
702 mutex_exit(lp);
703 }
704
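/*
 * Wait for any holder of the hashed thread_free lock (see
 * thread_free_prevent()) to drop it before the thread is freed.
 */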
705 static void
706 thread_free_barrier(kthread_t *t)
707 {
708 kmutex_t *lp;
709
710 lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock;
711 mutex_enter(lp);
712 mutex_exit(lp);
713 }
714
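/*
 * Free a TS_FREE thread: release its credentials, class data, context ops,
 * turnstile and project hold, then free the lwp, the stack, and (unless the
 * thread structure was carved out of that stack) the thread itself.
 */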
715 void
716 thread_free(kthread_t *t)
717 {
718 boolean_t allocstk = (t->t_flag & T_TALLOCSTK);
719 klwp_t *lwp = t->t_lwp;
720 caddr_t swap = t->t_swap;
721
722 ASSERT(t != &t0 && t->t_state == TS_FREE);
723 ASSERT(t->t_door == NULL);
724 ASSERT(t->t_schedctl == NULL);
725 ASSERT(t->t_pollstate == NULL);
726
727 t->t_pri = 0;
728 t->t_pc = 0;
729 t->t_sp = 0;
730 t->t_wchan0 = NULL;
731 t->t_wchan = NULL;
732 if (t->t_cred != NULL) {
733 crfree(t->t_cred);
734 t->t_cred = 0;
735 }
736 if (t->t_pdmsg) {
737 kmem_free(t->t_pdmsg, strlen(t->t_pdmsg) + 1);
738 t->t_pdmsg = NULL;
739 }
740 if (audit_active)
741 audit_thread_free(t);
742 #ifndef NPROBE
743 if (t->t_tnf_tpdp)
744 tnf_thread_free(t);
745 #endif /* NPROBE */
746 if (t->t_cldata) {
747 CL_EXITCLASS(t->t_cid, (caddr_t *)t->t_cldata);
748 }
749 if (t->t_rprof != NULL) {
750 kmem_free(t->t_rprof, sizeof (*t->t_rprof));
751 t->t_rprof = NULL;
752 }
753 t->t_lockp = NULL; /* nothing should try to lock this thread now */
754 if (lwp)
755 lwp_freeregs(lwp, 0);
756 if (t->t_ctx)
757 freectx(t, 0);
758 t->t_stk = NULL;
759 if (lwp)
760 lwp_stk_fini(lwp);
761 lock_clear(&t->t_lock);
762
763 if (t->t_ts->ts_waiters > 0)
764 panic("thread_free: turnstile still active");
765
766 kmem_cache_free(turnstile_cache, t->t_ts);
767
768 free_afd(&t->t_activefd);
769
770 /*
771 * Barrier for the tick accounting code. The tick accounting code
772 * holds this lock to keep the thread from going away while it's
773 * looking at it.
774 */
775 thread_free_barrier(t);
776
777 ASSERT(ttoproj(t) == proj0p);
778 project_rele(ttoproj(t));
779
780 lgrp_affinity_free(&t->t_lgrp_affinity);
781
782 mutex_enter(&pidlock);
783 nthread--;
784 mutex_exit(&pidlock);
785
786 /*
787 * Free thread, lwp and stack. This needs to be done carefully, since
788 * if T_TALLOCSTK is set, the thread is part of the stack.
789 */
790 t->t_lwp = NULL;
791 t->t_swap = NULL;
792
793 if (swap) {
794 segkp_release(segkp, swap);
795 }
796 if (lwp) {
797 kmem_cache_free(lwp_cache, lwp);
798 }
799 if (!allocstk) {
800 kmem_cache_free(thread_cache, t);
801 }
802 }
803
804 /*
805 * Removes threads associated with the given zone from a deathrow queue.
806 * tp is a pointer to the head of the deathrow queue, and countp is a
807 * pointer to the current deathrow count. Returns a linked list of
808 * threads removed from the list.
809 */
810 static kthread_t *
811 thread_zone_cleanup(kthread_t **tp, int *countp, zoneid_t zoneid)
812 {
813 kthread_t *tmp, *list = NULL;
814 cred_t *cr;
815
816 ASSERT(MUTEX_HELD(&reaplock));
817 while (*tp != NULL) {
818 if ((cr = (*tp)->t_cred) != NULL && crgetzoneid(cr) == zoneid) {
819 tmp = *tp;
820 *tp = tmp->t_forw;
821 tmp->t_forw = list;
822 list = tmp;
823 (*countp)--;
824 } else {
825 tp = &(*tp)->t_forw;
826 }
827 }
828 return (list);
829 }
830
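/*
 * Free each thread on a singly linked (t_forw) list of dead threads.
 */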
831 static void
832 thread_reap_list(kthread_t *t)
833 {
834 kthread_t *next;
835
836 while (t != NULL) {
837 next = t->t_forw;
838 thread_free(t);
839 t = next;
840 }
841 }
842
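/*
 * Zone destroy callback: pull any threads and lwps that belong to the
 * dying zone off the deathrow lists and reap them immediately.
 */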
843 /* ARGSUSED */
844 static void
845 thread_zone_destroy(zoneid_t zoneid, void *unused)
846 {
847 kthread_t *t, *l;
848
849 mutex_enter(&reaplock);
850 /*
851 * Pull threads and lwps associated with zone off deathrow lists.
852 */
853 t = thread_zone_cleanup(&thread_deathrow, &thread_reapcnt, zoneid);
854 l = thread_zone_cleanup(&lwp_deathrow, &lwp_reapcnt, zoneid);
855 mutex_exit(&reaplock);
856
857 /*
858 * Guard against race condition in mutex_owner_running:
859 * thread=owner(mutex)
860 * <interrupt>
861 * thread exits mutex
862 * thread exits
863 * thread reaped
864 * thread struct freed
865 * cpu = thread->t_cpu <- BAD POINTER DEREFERENCE.
866 * A cross call to all cpus will cause the interrupt handler
867 * to reset the PC if it is in mutex_owner_running, refreshing
868 * stale thread pointers.
869 */
870 mutex_sync(); /* sync with mutex code */
871
872 /*
873 * Reap threads
874 */
875 thread_reap_list(t);
876
877 /*
878 * Reap lwps
879 */
880 thread_reap_list(l);
881 }
882
883 /*
884 * Clean up zombie threads that are on deathrow.
885 */
886 void
887 thread_reaper(void)
888 {
889 kthread_t *t, *l;
890 callb_cpr_t cprinfo;
891
892 /*
893 * Register callback to clean up threads when zone is destroyed.
894 */
895 zone_key_create(&zone_thread_key, NULL, NULL, thread_zone_destroy);
896
897 CALLB_CPR_INIT(&cprinfo, &reaplock, callb_generic_cpr, "t_reaper");
898 for (;;) {
899 mutex_enter(&reaplock);
900 while (thread_deathrow == NULL && lwp_deathrow == NULL) {
901 CALLB_CPR_SAFE_BEGIN(&cprinfo);
902 cv_wait(&reaper_cv, &reaplock);
903 CALLB_CPR_SAFE_END(&cprinfo, &reaplock);
904 }
905 /*
906 * mutex_sync() needs to be called when reaping, but
907 * not too often, so we limit the reaping rate to once
908 * per second. reaplimit is the queue depth at which the
909 * reaper is woken; it does not impact thread destruction/creation.
910 */
911 t = thread_deathrow;
912 l = lwp_deathrow;
913 thread_deathrow = NULL;
914 lwp_deathrow = NULL;
915 thread_reapcnt = 0;
916 lwp_reapcnt = 0;
917 mutex_exit(&reaplock);
918
919 /*
920 * Guard against race condition in mutex_owner_running:
921 * thread=owner(mutex)
922 * <interrupt>
923 * thread exits mutex
924 * thread exits
925 * thread reaped
926 * thread struct freed
927 * cpu = thread->t_cpu <- BAD POINTER DEREFERENCE.
928 * A cross call to all cpus will cause the interrupt handler
929 * to reset the PC if it is in mutex_owner_running, refreshing
930 * stale thread pointers.
931 */
932 mutex_sync(); /* sync with mutex code */
933 /*
934 * Reap threads
935 */
936 thread_reap_list(t);
937
938 /*
939 * Reap lwps
940 */
941 thread_reap_list(l);
942 delay(hz);
943 }
944 }
945
946 /*
947 * This is called by lwp_create(), etc., to put an lwp_deathrow thread onto
948 * thread_deathrow. The thread's state has already been changed to TS_FREE
949 * to indicate that it is reapable. The reaplock is already held, and the
950 * thread has already been freed.
951 */
952 void
953 reapq_move_lq_to_tq(kthread_t *t)
954 {
955 ASSERT(t->t_state == TS_FREE);
956 ASSERT(MUTEX_HELD(&reaplock));
957 t->t_forw = thread_deathrow;
958 thread_deathrow = t;
959 thread_reapcnt++;
960 if (lwp_reapcnt + thread_reapcnt > reaplimit)
961 cv_signal(&reaper_cv); /* wake the reaper */
962 }
963
964 /*
965 * This is called by resume() to put a zombie thread onto deathrow.
966 * The thread's state is changed to TS_FREE to indicate that it is reapable.
967 * This is called from the idle thread so it must not block - just spin.
968 */
969 void
970 reapq_add(kthread_t *t)
971 {
972 mutex_enter(&reaplock);
973
974 /*
975 * lwp_deathrow contains threads with lwp linkage and
976 * swappable thread stacks which have the default stacksize.
977 * These threads' lwps and stacks may be reused by lwp_create().
978 *
979 * Anything else goes on thread_deathrow, where it will eventually
980 * be thread_free()d.
981 */
982 if (t->t_flag & T_LWPREUSE) {
983 ASSERT(ttolwp(t) != NULL);
984 t->t_forw = lwp_deathrow;
985 lwp_deathrow = t;
986 lwp_reapcnt++;
987 } else {
988 t->t_forw = thread_deathrow;
989 thread_deathrow = t;
990 thread_reapcnt++;
991 }
992 if (lwp_reapcnt + thread_reapcnt > reaplimit)
993 cv_signal(&reaper_cv); /* wake the reaper */
994 t->t_state = TS_FREE;
995 lock_clear(&t->t_lock);
996
997 /*
998 * Before we return, we need to grab and drop the thread lock for
999 * the dead thread. At this point, the current thread is the idle
1000 * thread, and the dead thread's CPU lock points to the current
1001 * CPU -- and we must grab and drop the lock to synchronize with
1002 * a racing thread walking a blocking chain that the zombie thread
1003 * was recently in. By this point, that blocking chain is (by
1004 * definition) stale: the dead thread is not holding any locks, and
1005 * is therefore not in any blocking chains -- but if we do not regrab
1006 * our lock before freeing the dead thread's data structures, the
1007 * thread walking the (stale) blocking chain will die on memory
1008 * corruption when it attempts to drop the dead thread's lock. We
1009 * only need do this once because there is no way for the dead thread
1010 * to ever again be on a blocking chain: once we have grabbed and
1011 * dropped the thread lock, we are guaranteed that anyone that could
1012 * have seen this thread in a blocking chain can no longer see it.
1013 */
1014 thread_lock(t);
1015 thread_unlock(t);
1016
1017 mutex_exit(&reaplock);
1018 }
1019
1020 /*
1021 * Install thread context ops for the given thread.
1022 */
1023 void
1024 installctx(kthread_t *t, void *arg, void (*save)(void *),
1025 void (*restore)(void *), void (*fork)(void *, void *),
1026 void (*lwp_create)(void *, void *), void (*exit)(void *),
1027 void (*free)(void *, int))
1028 {
1029 struct ctxop *ctx;
1030
1031 ctx = kmem_alloc(sizeof (struct ctxop), KM_SLEEP);
1032 ctx->save_op = save;
1033 ctx->restore_op = restore;
1034 ctx->fork_op = fork;
1035 ctx->lwp_create_op = lwp_create;
1036 ctx->exit_op = exit;
1037 ctx->free_op = free;
1038 ctx->arg = arg;
1039 ctx->next = t->t_ctx;
1040 t->t_ctx = ctx;
1041 }
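
/*
 * Illustrative only: a subsystem that keeps per-thread state might pair
 * installctx() with removectx() (below) as follows (my_arg, my_save and
 * my_restore are hypothetical):
 *
 *	installctx(curthread, my_arg, my_save, my_restore,
 *	    NULL, NULL, NULL, NULL);
 *	...
 *	(void) removectx(curthread, my_arg, my_save, my_restore,
 *	    NULL, NULL, NULL, NULL);
 */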
1042
1043 /*
1044 * Remove the thread context ops from a thread.
1045 */
1046 int
1047 removectx(kthread_t *t, void *arg, void (*save)(void *),
1048 void (*restore)(void *), void (*fork)(void *, void *),
1049 void (*lwp_create)(void *, void *), void (*exit)(void *),
1050 void (*free)(void *, int))
1051 {
1052 struct ctxop *ctx, *prev_ctx;
1053
1054 /*
1055 * The incoming kthread_t (which is the thread for which the
1056 * context ops will be removed) should be one of the following:
1057 *
1058 * a) the current thread,
1059 *
1060 * b) a thread of a process that's being forked (SIDL),
1061 *
1062 * c) a thread that belongs to the same process as the current
1063 * thread and for which the current thread is the agent thread,
1064 *
1065 * d) a thread that is TS_STOPPED, which (if curthread is not an
1066 * agent) indicates a thread being created as part of an lwp
1067 * creation.
1068 */
1069 ASSERT(t == curthread || ttoproc(t)->p_stat == SIDL ||
1070 ttoproc(t)->p_agenttp == curthread || t->t_state == TS_STOPPED);
1071
1072 /*
1073 * Serialize modifications to t->t_ctx to prevent the agent thread
1074 * and the target thread from racing with each other during lwp exit.
1075 */
1076 mutex_enter(&t->t_ctx_lock);
1077 prev_ctx = NULL;
1078 for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next) {
1079 if (ctx->save_op == save && ctx->restore_op == restore &&
1080 ctx->fork_op == fork && ctx->lwp_create_op == lwp_create &&
1081 ctx->exit_op == exit && ctx->free_op == free &&
1082 ctx->arg == arg) {
1083 if (prev_ctx)
1084 prev_ctx->next = ctx->next;
1085 else
1086 t->t_ctx = ctx->next;
1087 mutex_exit(&t->t_ctx_lock);
1088 if (ctx->free_op != NULL)
1089 (ctx->free_op)(ctx->arg, 0);
1090 kmem_free(ctx, sizeof (struct ctxop));
1091 return (1);
1092 }
1093 prev_ctx = ctx;
1094 }
1095 mutex_exit(&t->t_ctx_lock);
1096
1097 return (0);
1098 }
1099
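/*
 * Invoke the save_op of each context op installed on the current thread.
 */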
1100 void
1101 savectx(kthread_t *t)
1102 {
1103 struct ctxop *ctx;
1104
1105 ASSERT(t == curthread);
1106 for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next)
1107 if (ctx->save_op != NULL)
1108 (ctx->save_op)(ctx->arg);
1109 }
1110
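/*
 * Invoke the restore_op of each context op installed on the current thread.
 */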
1111 void
1112 restorectx(kthread_t *t)
1113 {
1114 struct ctxop *ctx;
1115
1116 ASSERT(t == curthread);
1117 for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next)
1118 if (ctx->restore_op != NULL)
1119 (ctx->restore_op)(ctx->arg);
1120 }
1121
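/*
 * Invoke the fork_op of each context op installed on thread t, passing the
 * parent (t) and child (ct) threads.
 */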
1122 void
1123 forkctx(kthread_t *t, kthread_t *ct)
1124 {
1125 struct ctxop *ctx;
1126
1127 for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
1128 if (ctx->fork_op != NULL)
1129 (ctx->fork_op)(t, ct);
1130 }
1131
1132 /*
1133 * Note that this operator is only invoked via the _lwp_create
1134 * system call. The system may have other reasons to create lwps,
1135 * e.g. the agent lwp or the doors unreferenced lwp.
1136 */
1137 void
1138 lwp_createctx(kthread_t *t, kthread_t *ct)
1139 {
1140 struct ctxop *ctx;
1141
1142 for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
1143 if (ctx->lwp_create_op != NULL)
1144 (ctx->lwp_create_op)(t, ct);
1145 }
1146
1147 /*
1148 * exitctx is called from thread_exit() and lwp_exit() to perform any actions
1149 * needed when the thread/LWP leaves the processor for the last time. This
1150 * routine is not intended to deal with freeing memory; freectx() is used for
1151 * that purpose during thread_free(). This routine is provided to allow for
1152 * clean-up that can't wait until thread_free().
1153 */
1154 void
1155 exitctx(kthread_t *t)
1156 {
1157 struct ctxop *ctx;
1158
1159 for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
1160 if (ctx->exit_op != NULL)
1161 (ctx->exit_op)(t);
1162 }
1163
1164 /*
1165 * freectx is called from thread_free() and exec() to get
1166 * rid of old thread context ops.
1167 */
1168 void
1169 freectx(kthread_t *t, int isexec)
1170 {
1171 struct ctxop *ctx;
1172
1173 while ((ctx = t->t_ctx) != NULL) {
1174 t->t_ctx = ctx->next;
1175 if (ctx->free_op != NULL)
1176 (ctx->free_op)(ctx->arg, isexec);
1177 kmem_free(ctx, sizeof (struct ctxop));
1178 }
1179 }
1180
1181 /*
1182 * freectx_ctx is called from lwp_create() when lwp is reused from
1183 * lwp_deathrow and its thread structure is added to thread_deathrow.
1184 * The thread structure to which this ctx was attached may already have been
1185 * freed by the thread reaper, so free_op implementations shouldn't rely
1186 * on that thread structure still being around.
1187 */
1188 void
1189 freectx_ctx(struct ctxop *ctx)
1190 {
1191 struct ctxop *nctx;
1192
1193 ASSERT(ctx != NULL);
1194
1195 do {
1196 nctx = ctx->next;
1197 if (ctx->free_op != NULL)
1198 (ctx->free_op)(ctx->arg, 0);
1199 kmem_free(ctx, sizeof (struct ctxop));
1200 } while ((ctx = nctx) != NULL);
1201 }
1202
1203 /*
1204 * Set the thread running; arrange for it to be swapped in if necessary.
1205 */
1206 void
1207 setrun_locked(kthread_t *t)
1208 {
1209 ASSERT(THREAD_LOCK_HELD(t));
1210 if (t->t_state == TS_SLEEP) {
1211 /*
1212 * Take off sleep queue.
1213 */
1214 SOBJ_UNSLEEP(t->t_sobj_ops, t);
1215 } else if (t->t_state & (TS_RUN | TS_ONPROC)) {
1216 /*
1217 * Already on dispatcher queue.
1218 */
1219 return;
1220 } else if (t->t_state == TS_WAIT) {
1221 waitq_setrun(t);
1222 } else if (t->t_state == TS_STOPPED) {
1223 /*
1224 * All of the senders of SIGCONT (TC_XSTART) and /proc
1225 * (TC_PSTART) and lwp_continue() (TC_CSTART) must have
1226 * requested that the thread be run.
1227 * Just calling setrun() is not sufficient to set a stopped
1228 * thread running. TP_TXSTART is always set if the thread
1229 * is not stopped by a jobcontrol stop signal.
1230 * TP_TPSTART is always set if /proc is not controlling it.
1231 * TP_TCSTART is always set if lwp_suspend() didn't stop it.
1232 * The thread won't be stopped unless one of these
1233 * three mechanisms did it.
1234 *
1235 * These flags must be set before calling setrun_locked(t).
1236 * They can't be passed as arguments because the streams
1237 * code calls setrun() indirectly and the mechanism for
1238 * doing so admits only one argument. Note that the
1239 * thread must be locked in order to change t_schedflags.
1240 */
1241 if ((t->t_schedflag & TS_ALLSTART) != TS_ALLSTART)
1242 return;
1243 /*
1244 * Process is no longer stopped (a thread is running).
1245 */
1246 t->t_whystop = 0;
1247 t->t_whatstop = 0;
1248 /*
1249 * Strictly speaking, we do not have to clear these
1250 * flags here; they are cleared on entry to stop().
1251 * However, they are confusing when doing kernel
1252 * debugging or when they are revealed by ps(1).
1253 */
1254 t->t_schedflag &= ~TS_ALLSTART;
1255 THREAD_TRANSITION(t); /* drop stopped-thread lock */
1256 ASSERT(t->t_lockp == &transition_lock);
1257 ASSERT(t->t_wchan0 == NULL && t->t_wchan == NULL);
1258 /*
1259 * Let the class put the process on the dispatcher queue.
1260 */
1261 CL_SETRUN(t);
1262 }
1263 }
1264
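/*
 * Lock the thread, set it running via setrun_locked(), and drop the lock.
 */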
1265 void
1266 setrun(kthread_t *t)
1267 {
1268 thread_lock(t);
1269 setrun_locked(t);
1270 thread_unlock(t);
1271 }
1272
1273 /*
1274 * Unpin an interrupted thread.
1275 * When an interrupt occurs, the interrupt is handled on the stack
1276 * of an interrupt thread, taken from a pool linked to the CPU structure.
1277 *
1278 * When swtch() is switching away from an interrupt thread because it
1279 * blocked or was preempted, this routine is called to complete the
1280 * saving of the interrupted thread state, and returns the interrupted
1281 * thread pointer so it may be resumed.
1282 *
1283 * Called by swtch() only at high spl.
1284 */
1285 kthread_t *
1286 thread_unpin(void)
1287 {
1288 kthread_t *t = curthread; /* current thread */
1289 kthread_t *itp; /* interrupted thread */
1290 int i; /* interrupt level */
1291 extern int intr_passivate();
1292
1293 ASSERT(t->t_intr != NULL);
1294
1295 itp = t->t_intr; /* interrupted thread */
1296 t->t_intr = NULL; /* clear interrupt ptr */
1297
1298 /*
1299 * Get state from interrupt thread for the one
1300 * it interrupted.
1301 */
1302
1303 i = intr_passivate(t, itp);
1304
1305 TRACE_5(TR_FAC_INTR, TR_INTR_PASSIVATE,
1306 "intr_passivate:level %d curthread %p (%T) ithread %p (%T)",
1307 i, t, t, itp, itp);
1308
1309 /*
1310 * Dissociate the current thread from the interrupted thread's LWP.
1311 */
1312 t->t_lwp = NULL;
1313
1314 /*
1315 * Interrupt handlers above the level at which spinlocks block must
1316 * not block.
1317 */
1318 #if DEBUG
1319 if (i < 0 || i > LOCK_LEVEL)
1320 cmn_err(CE_PANIC, "thread_unpin: ipl out of range %x", i);
1321 #endif
1322
1323 /*
1324 * Compute the CPU's base interrupt level based on the active
1325 * interrupts.
1326 */
1327 ASSERT(CPU->cpu_intr_actv & (1 << i));
1328 set_base_spl();
1329
1330 return (itp);
1331 }
1332
1333 /*
1334 * TSD -- THREAD SPECIFIC DATA
1335 */
1336 static kmutex_t tsd_mutex; /* linked list spin lock */
1337 static uint_t tsd_nkeys; /* size of destructor array */
1338 /* per-key destructor funcs */
1339 static void (**tsd_destructor)(void *);
1340 /* list of tsd_thread's */
1341 static struct tsd_thread *tsd_list;
1342
1343 /*
1344 * Default destructor
1345 * Needed because NULL destructor means that the key is unused
1346 */
1347 /* ARGSUSED */
1348 void
1349 tsd_defaultdestructor(void *value)
1350 {}
1351
1352 /*
1353 * Create a key (index into per thread array)
1354 * Locks out tsd_create, tsd_destroy, and tsd_exit
1355 * May allocate memory with lock held
1356 */
1357 void
1358 tsd_create(uint_t *keyp, void (*destructor)(void *))
1359 {
1360 int i;
1361 uint_t nkeys;
1362
1363 /*
1364 * if key is allocated, do nothing
1365 */
1366 mutex_enter(&tsd_mutex);
1367 if (*keyp) {
1368 mutex_exit(&tsd_mutex);
1369 return;
1370 }
1371 /*
1372 * find an unused key
1373 */
1374 if (destructor == NULL)
1375 destructor = tsd_defaultdestructor;
1376
1377 for (i = 0; i < tsd_nkeys; ++i)
1378 if (tsd_destructor[i] == NULL)
1379 break;
1380
1381 /*
1382 * if no unused keys, increase the size of the destructor array
1383 */
1384 if (i == tsd_nkeys) {
1385 if ((nkeys = (tsd_nkeys << 1)) == 0)
1386 nkeys = 1;
1387 tsd_destructor =
1388 (void (**)(void *))tsd_realloc((void *)tsd_destructor,
1389 (size_t)(tsd_nkeys * sizeof (void (*)(void *))),
1390 (size_t)(nkeys * sizeof (void (*)(void *))));
1391 tsd_nkeys = nkeys;
1392 }
1393
1394 /*
1395 * allocate the next available unused key
1396 */
1397 tsd_destructor[i] = destructor;
1398 *keyp = i + 1;
1399 mutex_exit(&tsd_mutex);
1400 }
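
/*
 * Illustrative only (my_key, my_destroy and my_data are hypothetical): a
 * subsystem typically creates a key once and then gets and sets values from
 * any thread:
 *
 *	static uint_t my_key;
 *
 *	tsd_create(&my_key, my_destroy);
 *	(void) tsd_set(my_key, my_data);
 *	my_data = tsd_get(my_key);
 *	tsd_destroy(&my_key);		(at module unload, for example)
 */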
1401
1402 /*
1403 * Destroy a key -- this is for unloadable modules
1404 *
1405 * Assumes that the caller is preventing tsd_set and tsd_get
1406 * Locks out tsd_create, tsd_destroy, and tsd_exit
1407 * May free memory with lock held
1408 */
1409 void
1410 tsd_destroy(uint_t *keyp)
1411 {
1412 uint_t key;
1413 struct tsd_thread *tsd;
1414
1415 /*
1416 * protect the key namespace and our destructor lists
1417 */
1418 mutex_enter(&tsd_mutex);
1419 key = *keyp;
1420 *keyp = 0;
1421
1422 ASSERT(key <= tsd_nkeys);
1423
1424 /*
1425 * if the key is valid
1426 */
1427 if (key != 0) {
1428 uint_t k = key - 1;
1429 /*
1430 * for every thread with TSD, call key's destructor
1431 */
1432 for (tsd = tsd_list; tsd; tsd = tsd->ts_next) {
1433 /*
1434 * no TSD for key in this thread
1435 */
1436 if (key > tsd->ts_nkeys)
1437 continue;
1438 /*
1439 * call destructor for key
1440 */
1441 if (tsd->ts_value[k] && tsd_destructor[k])
1442 (*tsd_destructor[k])(tsd->ts_value[k]);
1443 /*
1444 * reset value for key
1445 */
1446 tsd->ts_value[k] = NULL;
1447 }
1448 /*
1449 * actually free the key (NULL destructor == unused)
1450 */
1451 tsd_destructor[k] = NULL;
1452 }
1453
1454 mutex_exit(&tsd_mutex);
1455 }
1456
1457 /*
1458 * Quickly return the per thread value that was stored with the specified key
1459 * Assumes the caller is protecting key from tsd_create and tsd_destroy
1460 */
1461 void *
1462 tsd_get(uint_t key)
1463 {
1464 return (tsd_agent_get(curthread, key));
1465 }
1466
1467 /*
1468 * Set a per thread value indexed with the specified key
1469 */
1470 int
1471 tsd_set(uint_t key, void *value)
1472 {
1473 return (tsd_agent_set(curthread, key, value));
1474 }
1475
1476 /*
1477 * Like tsd_get(), except that the agent lwp can get the tsd of
1478 * another thread in the same process (the agent thread only runs when the
1479 * process is completely stopped by /proc), or while syslwp is creating a new lwp.
1480 */
1481 void *
1482 tsd_agent_get(kthread_t *t, uint_t key)
1483 {
1484 struct tsd_thread *tsd = t->t_tsd;
1485
1486 ASSERT(t == curthread ||
1487 ttoproc(t)->p_agenttp == curthread || t->t_state == TS_STOPPED);
1488
1489 if (key && tsd != NULL && key <= tsd->ts_nkeys)
1490 return (tsd->ts_value[key - 1]);
1491 return (NULL);
1492 }
1493
1494 /*
1495 * Like tsd_set(), except that the agent lwp can set the tsd of
1496 * another thread in the same process, or syslwp can set the tsd
1497 * of a thread it's in the middle of creating.
1498 *
1499 * Assumes the caller is protecting key from tsd_create and tsd_destroy
1500 * May lock out tsd_destroy (and tsd_create), may allocate memory with
1501 * lock held
1502 */
1503 int
1504 tsd_agent_set(kthread_t *t, uint_t key, void *value)
1505 {
1506 struct tsd_thread *tsd = t->t_tsd;
1507
1508 ASSERT(t == curthread ||
1509 ttoproc(t)->p_agenttp == curthread || t->t_state == TS_STOPPED);
1510
1511 if (key == 0)
1512 return (EINVAL);
1513 if (tsd == NULL)
1514 tsd = t->t_tsd = kmem_zalloc(sizeof (*tsd), KM_SLEEP);
1515 if (key <= tsd->ts_nkeys) {
1516 tsd->ts_value[key - 1] = value;
1517 return (0);
1518 }
1519
1520 ASSERT(key <= tsd_nkeys);
1521
1522 /*
1523 * lock out tsd_destroy()
1524 */
1525 mutex_enter(&tsd_mutex);
1526 if (tsd->ts_nkeys == 0) {
1527 /*
1528 * Link onto list of threads with TSD
1529 */
1530 if ((tsd->ts_next = tsd_list) != NULL)
1531 tsd_list->ts_prev = tsd;
1532 tsd_list = tsd;
1533 }
1534
1535 /*
1536 * Allocate thread local storage and set the value for key
1537 */
1538 tsd->ts_value = tsd_realloc(tsd->ts_value,
1539 tsd->ts_nkeys * sizeof (void *),
1540 key * sizeof (void *));
1541 tsd->ts_nkeys = key;
1542 tsd->ts_value[key - 1] = value;
1543 mutex_exit(&tsd_mutex);
1544
1545 return (0);
1546 }
1547
1548
1549 /*
1550 * Return the per thread value that was stored with the specified key
1551 * If necessary, create the key and the value
1552 * Assumes the caller is protecting *keyp from tsd_destroy
1553 */
1554 void *
1555 tsd_getcreate(uint_t *keyp, void (*destroy)(void *), void *(*allocate)(void))
1556 {
1557 void *value;
1558 uint_t key = *keyp;
1559 struct tsd_thread *tsd = curthread->t_tsd;
1560
1561 if (tsd == NULL)
1562 tsd = curthread->t_tsd = kmem_zalloc(sizeof (*tsd), KM_SLEEP);
1563 if (key && key <= tsd->ts_nkeys && (value = tsd->ts_value[key - 1]))
1564 return (value);
1565 if (key == 0)
1566 tsd_create(keyp, destroy);
1567 (void) tsd_set(*keyp, value = (*allocate)());
1568
1569 return (value);
1570 }
1571
1572 /*
1573 * Called from thread_exit() to run the destructor function for each tsd
1574 * Locks out tsd_create and tsd_destroy
1575 * Assumes that the destructor *DOES NOT* use tsd
1576 */
1577 void
1578 tsd_exit(void)
1579 {
1580 int i;
1581 struct tsd_thread *tsd = curthread->t_tsd;
1582
1583 if (tsd == NULL)
1584 return;
1585
1586 if (tsd->ts_nkeys == 0) {
1587 kmem_free(tsd, sizeof (*tsd));
1588 curthread->t_tsd = NULL;
1589 return;
1590 }
1591
1592 /*
1593 * lock out tsd_create and tsd_destroy, call
1594 * the destructor, and mark the value as destroyed.
1595 */
1596 mutex_enter(&tsd_mutex);
1597
1598 for (i = 0; i < tsd->ts_nkeys; i++) {
1599 if (tsd->ts_value[i] && tsd_destructor[i])
1600 (*tsd_destructor[i])(tsd->ts_value[i]);
1601 tsd->ts_value[i] = NULL;
1602 }
1603
1604 /*
1605 * remove from linked list of threads with TSD
1606 */
1607 if (tsd->ts_next)
1608 tsd->ts_next->ts_prev = tsd->ts_prev;
1609 if (tsd->ts_prev)
1610 tsd->ts_prev->ts_next = tsd->ts_next;
1611 if (tsd_list == tsd)
1612 tsd_list = tsd->ts_next;
1613
1614 mutex_exit(&tsd_mutex);
1615
1616 /*
1617 * free up the TSD
1618 */
1619 kmem_free(tsd->ts_value, tsd->ts_nkeys * sizeof (void *));
1620 kmem_free(tsd, sizeof (struct tsd_thread));
1621 curthread->t_tsd = NULL;
1622 }
1623
1624 /*
1625 * realloc: copy the old contents into a larger zeroed buffer and free the old one.
1626 */
1627 static void *
1628 tsd_realloc(void *old, size_t osize, size_t nsize)
1629 {
1630 void *new;
1631
1632 new = kmem_zalloc(nsize, KM_SLEEP);
1633 if (old) {
1634 bcopy(old, new, osize);
1635 kmem_free(old, osize);
1636 }
1637 return (new);
1638 }
1639
1640 /*
1641 * Return non-zero if an interrupt is being serviced.
1642 */
1643 int
1644 servicing_interrupt()
1645 {
1646 int onintr = 0;
1647
1648 /* Are we an interrupt thread */
1649 if (curthread->t_flag & T_INTR_THREAD)
1650 return (1);
1651 /* Are we servicing a high level interrupt? */
1652 if (CPU_ON_INTR(CPU)) {
1653 kpreempt_disable();
1654 onintr = CPU_ON_INTR(CPU);
1655 kpreempt_enable();
1656 }
1657 return (onintr);
1658 }
1659
1660
1661 /*
1662 * Change the dispatch priority of a thread in the system.
1663 * Used when raising or lowering a thread's priority.
1664 * (E.g., priority inheritance)
1665 *
1666 * Since threads are queued according to their priority, we
1667 * must check the thread's state to determine whether it
1668 * is on a queue somewhere. If it is, we've got to:
1669 *
1670 * o Dequeue the thread.
1671 * o Change its effective priority.
1672 * o Enqueue the thread.
1673 *
1674 * Assumptions: The thread whose priority we wish to change
1675 * must be locked before we call thread_change_(e)pri().
1676 * The thread_change(e)pri() function doesn't drop the thread
1677 * lock--that must be done by its caller.
1678 */
1679 void
1680 thread_change_epri(kthread_t *t, pri_t disp_pri)
1681 {
1682 uint_t state;
1683
1684 ASSERT(THREAD_LOCK_HELD(t));
1685
1686 /*
1687 * If the inherited priority hasn't actually changed,
1688 * just return.
1689 */
1690 if (t->t_epri == disp_pri)
1691 return;
1692
1693 state = t->t_state;
1694
1695 /*
1696 * If it's not on a queue, change the priority with impunity.
1697 */
1698 if ((state & (TS_SLEEP | TS_RUN | TS_WAIT)) == 0) {
1699 t->t_epri = disp_pri;
1700 if (state == TS_ONPROC) {
1701 cpu_t *cp = t->t_disp_queue->disp_cpu;
1702
1703 if (t == cp->cpu_dispthread)
1704 cp->cpu_dispatch_pri = DISP_PRIO(t);
1705 }
1706 } else if (state == TS_SLEEP) {
1707 /*
1708 * Take the thread out of its sleep queue.
1709 * Change the inherited priority.
1710 * Re-enqueue the thread.
1711 * Each synchronization object exports a function
1712 * to do this in an appropriate manner.
1713 */
1714 SOBJ_CHANGE_EPRI(t->t_sobj_ops, t, disp_pri);
1715 } else if (state == TS_WAIT) {
1716 /*
1717 * Re-enqueue a thread on the wait queue if its
1718 * effective priority needs to change.
1719 */
1720 if (disp_pri != t->t_epri)
1721 waitq_change_pri(t, disp_pri);
1722 } else {
1723 /*
1724 * The thread is on a run queue.
1725 * Note: setbackdq() may not put the thread
1726 * back on the same run queue where it originally
1727 * resided.
1728 */
1729 (void) dispdeq(t);
1730 t->t_epri = disp_pri;
1731 setbackdq(t);
1732 }
1733 schedctl_set_cidpri(t);
1734 }
1735
1736 /*
1737 * Function: Change the t_pri field of a thread.
1738 * Side Effects: Adjust the thread ordering on a run queue
1739 * or sleep queue, if necessary.
1740 * Returns: 1 if the thread was on a run queue, else 0.
1741 */
1742 int
1743 thread_change_pri(kthread_t *t, pri_t disp_pri, int front)
1744 {
1745 uint_t state;
1746 int on_rq = 0;
1747
1748 ASSERT(THREAD_LOCK_HELD(t));
1749
1750 state = t->t_state;
1751 THREAD_WILLCHANGE_PRI(t, disp_pri);
1752
1753 /*
1754 * If it's not on a queue, change the priority with impunity.
1755 */
1756 if ((state & (TS_SLEEP | TS_RUN | TS_WAIT)) == 0) {
1757 t->t_pri = disp_pri;
1758
1759 if (state == TS_ONPROC) {
1760 cpu_t *cp = t->t_disp_queue->disp_cpu;
1761
1762 if (t == cp->cpu_dispthread)
1763 cp->cpu_dispatch_pri = DISP_PRIO(t);
1764 }
1765 } else if (state == TS_SLEEP) {
1766 /*
1767 * If the priority has changed, take the thread out of
1768 * its sleep queue and change the priority.
1769 * Re-enqueue the thread.
1770 * Each synchronization object exports a function
1771 * to do this in an appropriate manner.
1772 */
1773 if (disp_pri != t->t_pri)
1774 SOBJ_CHANGE_PRI(t->t_sobj_ops, t, disp_pri);
1775 } else if (state == TS_WAIT) {
1776 /*
1777 * Re-enqueue a thread on the wait queue if its
1778 * priority needs to change.
1779 */
1780 if (disp_pri != t->t_pri)
1781 waitq_change_pri(t, disp_pri);
1782 } else {
1783 /*
1784 * The thread is on a run queue.
1785 * Note: setbackdq() may not put the thread
1786 * back on the same run queue where it originally
1787 * resided.
1788 *
1789 * We still requeue the thread even if the priority
1790 * is unchanged to preserve round-robin (and other)
1791 * effects between threads of the same priority.
1792 */
1793 on_rq = dispdeq(t);
1794 ASSERT(on_rq);
1795 t->t_pri = disp_pri;
1796 if (front) {
1797 setfrontdq(t);
1798 } else {
1799 setbackdq(t);
1800 }
1801 }
1802 schedctl_set_cidpri(t);
1803 return (on_rq);
1804 }
1805
1806 /*
1807 * Called when the tunable kmem_stackinfo is set: fill the kernel thread
1808 * stack with a specific pattern.
1809 */
1810 static void
1811 stkinfo_begin(kthread_t *t)
1812 {
1813 caddr_t start; /* stack start */
1814 caddr_t end; /* stack end */
1815 uint64_t *ptr; /* pattern pointer */
1816
1817 /*
1818 * Stack grows up or down, see thread_create(),
1819 * compute stack memory area start and end (start < end).
1820 */
1821 if (t->t_stk > t->t_stkbase) {
1822 /* stack grows down */
1823 start = t->t_stkbase;
1824 end = t->t_stk;
1825 } else {
1826 /* stack grows up */
1827 start = t->t_stk;
1828 end = t->t_stkbase;
1829 }
1830
1831 /*
1832 * Stackinfo pattern size is 8 bytes. Ensure proper 8-byte
1833 * alignment for start and end within the stack area boundaries
1834 * (protection against corrupt t_stkbase/t_stk data).
1835 */
1836 if ((((uintptr_t)start) & 0x7) != 0) {
1837 start = (caddr_t)((((uintptr_t)start) & (~0x7)) + 8);
1838 }
1839 end = (caddr_t)(((uintptr_t)end) & (~0x7));
1840
1841 if ((end <= start) || (end - start) > (1024 * 1024)) {
1842 /* negative or stack size > 1 meg, assume bogus */
1843 return;
1844 }
1845
1846 /* fill stack area with a pattern (instead of zeros) */
1847 ptr = (uint64_t *)((void *)start);
1848 while (ptr < (uint64_t *)((void *)end)) {
1849 *ptr++ = KMEM_STKINFO_PATTERN;
1850 }
1851 }
1852
1853
1854 /*
1855 * Called when the tunable kmem_stackinfo is set: create the stackinfo log
1856 * if it doesn't already exist, compute the percentage of kernel stack
1857 * actually used, and record it in the log if it ranks among the highest seen.
1858 */
1859 static void
1860 stkinfo_end(kthread_t *t)
1861 {
1862 caddr_t start; /* stack start */
1863 caddr_t end; /* stack end */
1864 uint64_t *ptr; /* pattern pointer */
1865 size_t stksz; /* stack size */
1866 size_t smallest = 0;
1867 size_t percent = 0;
1868 uint_t index = 0;
1869 uint_t i;
1870 static size_t smallest_percent = (size_t)-1;
1871 static uint_t full = 0;
1872
1873 /* create the stackinfo log, if it doesn't already exist */
1874 mutex_enter(&kmem_stkinfo_lock);
1875 if (kmem_stkinfo_log == NULL) {
1876 kmem_stkinfo_log = (kmem_stkinfo_t *)
1877 kmem_zalloc(KMEM_STKINFO_LOG_SIZE *
1878 (sizeof (kmem_stkinfo_t)), KM_NOSLEEP);
1879 if (kmem_stkinfo_log == NULL) {
1880 mutex_exit(&kmem_stkinfo_lock);
1881 return;
1882 }
1883 }
1884 mutex_exit(&kmem_stkinfo_lock);
1885
1886 /*
1887 * Stack grows up or down, see thread_create(),
1888 * compute stack memory area start and end (start < end).
1889 */
1890 if (t->t_stk > t->t_stkbase) {
1891 /* stack grows down */
1892 start = t->t_stkbase;
1893 end = t->t_stk;
1894 } else {
1895 /* stack grows up */
1896 start = t->t_stk;
1897 end = t->t_stkbase;
1898 }
1899
1900 /* stack size as found in kthread_t */
1901 stksz = end - start;
1902
1903 /*
1904 * Stackinfo pattern size is 8 bytes. Ensure proper 8-byte
1905 * alignment for start and end within the stack area boundaries
1906 * (protection against corrupt t_stkbase/t_stk data).
1907 */
1908 if ((((uintptr_t)start) & 0x7) != 0) {
1909 start = (caddr_t)((((uintptr_t)start) & (~0x7)) + 8);
1910 }
1911 end = (caddr_t)(((uintptr_t)end) & (~0x7));
1912
1913 if ((end <= start) || (end - start) > (1024 * 1024)) {
1914 /* negative or stack size > 1 meg, assume bogus */
1915 return;
1916 }
1917
1918 /* search until no pattern in the stack */
1919 if (t->t_stk > t->t_stkbase) {
1920 /* stack grows down */
1921 #if defined(__i386) || defined(__amd64)
1922 /*
1923 * 6 longs are pushed on stack, see thread_load(). Skip
1924 * them, so if the kthread has never run, percent is zero.
1925 * 8-byte alignment is preserved for a 32-bit kernel:
1926 * 6 x 4 = 24, and 24 is a multiple of 8.
1927 *
1928 */
1929 end -= (6 * sizeof (long));
1930 #endif
1931 ptr = (uint64_t *)((void *)start);
1932 while (ptr < (uint64_t *)((void *)end)) {
1933 if (*ptr != KMEM_STKINFO_PATTERN) {
1934 percent = stkinfo_percent(end,
1935 start, (caddr_t)ptr);
1936 break;
1937 }
1938 ptr++;
1939 }
1940 } else {
1941 /* stack grows up */
1942 ptr = (uint64_t *)((void *)end);
1943 ptr--;
1944 while (ptr >= (uint64_t *)((void *)start)) {
1945 if (*ptr != KMEM_STKINFO_PATTERN) {
1946 percent = stkinfo_percent(start,
1947 end, (caddr_t)ptr);
1948 break;
1949 }
1950 ptr--;
1951 }
1952 }
1953
1954 DTRACE_PROBE3(stack__usage, kthread_t *, t,
1955 size_t, stksz, size_t, percent);
1956
1957 if (percent == 0) {
1958 return;
1959 }
1960
1961 mutex_enter(&kmem_stkinfo_lock);
1962 if (full == KMEM_STKINFO_LOG_SIZE && percent < smallest_percent) {
1963 /*
1964 * The log is full and already contains the highest values
1965 */
1966 mutex_exit(&kmem_stkinfo_lock);
1967 return;
1968 }
1969
1970 /* keep a log of the highest used stack */
1971 for (i = 0; i < KMEM_STKINFO_LOG_SIZE; i++) {
1972 if (kmem_stkinfo_log[i].percent == 0) {
1973 index = i;
1974 full++;
1975 break;
1976 }
1977 if (smallest == 0) {
1978 smallest = kmem_stkinfo_log[i].percent;
1979 index = i;
1980 continue;
1981 }
1982 if (kmem_stkinfo_log[i].percent < smallest) {
1983 smallest = kmem_stkinfo_log[i].percent;
1984 index = i;
1985 }
1986 }
1987
1988 if (percent >= kmem_stkinfo_log[index].percent) {
1989 kmem_stkinfo_log[index].kthread = (caddr_t)t;
1990 kmem_stkinfo_log[index].t_startpc = (caddr_t)t->t_startpc;
1991 kmem_stkinfo_log[index].start = start;
1992 kmem_stkinfo_log[index].stksz = stksz;
1993 kmem_stkinfo_log[index].percent = percent;
1994 kmem_stkinfo_log[index].t_tid = t->t_tid;
1995 kmem_stkinfo_log[index].cmd[0] = '\0';
1996 if (t->t_tid != 0) {
1997 stksz = strlen((t->t_procp)->p_user.u_comm);
1998 if (stksz >= KMEM_STKINFO_STR_SIZE) {
1999 stksz = KMEM_STKINFO_STR_SIZE - 1;
2000 kmem_stkinfo_log[index].cmd[stksz] = '\0';
2001 } else {
2002 stksz += 1;
2003 }
2004 (void) memcpy(kmem_stkinfo_log[index].cmd,
2005 (t->t_procp)->p_user.u_comm, stksz);
2006 }
2007 if (percent < smallest_percent) {
2008 smallest_percent = percent;
2009 }
2010 }
2011 mutex_exit(&kmem_stkinfo_lock);
2012 }
2013
2014 /*
2015 * Called when kmem_stackinfo is set: compute the stack utilization percentage.
2016 */
2017 static size_t
2018 stkinfo_percent(caddr_t t_stk, caddr_t t_stkbase, caddr_t sp)
2019 {
2020 size_t percent;
2021 size_t s;
2022
2023 if (t_stk > t_stkbase) {
2024 /* stack grows down */
2025 if (sp > t_stk) {
2026 return (0);
2027 }
2028 if (sp < t_stkbase) {
2029 return (100);
2030 }
2031 percent = t_stk - sp + 1;
2032 s = t_stk - t_stkbase + 1;
2033 } else {
2034 /* stack grows up */
2035 if (sp < t_stk) {
2036 return (0);
2037 }
2038 if (sp > t_stkbase) {
2039 return (100);
2040 }
2041 percent = sp - t_stk + 1;
2042 s = t_stkbase - t_stk + 1;
2043 }
2044 percent = ((100 * percent) / s) + 1;
2045 if (percent > 100) {
2046 percent = 100;
2047 }
2048 return (percent);
2049 }