10923 thread_affinity_set(CPU_CURRENT) can skip cpu_lock
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: John Levon <john.levon@joyent.com>
--- old/usr/src/uts/common/os/cpu.c
+++ new/usr/src/uts/common/os/cpu.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2012 by Delphix. All rights reserved.
24 + * Copyright 2018 Joyent, Inc.
24 25 */
25 26
26 27 /*
27 28 * Architecture-independent CPU control functions.
28 29 */
29 30
30 31 #include <sys/types.h>
31 32 #include <sys/param.h>
32 33 #include <sys/var.h>
33 34 #include <sys/thread.h>
34 35 #include <sys/cpuvar.h>
35 36 #include <sys/cpu_event.h>
36 37 #include <sys/kstat.h>
37 38 #include <sys/uadmin.h>
38 39 #include <sys/systm.h>
39 40 #include <sys/errno.h>
40 41 #include <sys/cmn_err.h>
41 42 #include <sys/procset.h>
42 43 #include <sys/processor.h>
43 44 #include <sys/debug.h>
44 45 #include <sys/cpupart.h>
45 46 #include <sys/lgrp.h>
46 47 #include <sys/pset.h>
47 48 #include <sys/pghw.h>
48 49 #include <sys/kmem.h>
49 50 #include <sys/kmem_impl.h> /* to set per-cpu kmem_cache offset */
50 51 #include <sys/atomic.h>
51 52 #include <sys/callb.h>
52 53 #include <sys/vtrace.h>
53 54 #include <sys/cyclic.h>
54 55 #include <sys/bitmap.h>
55 56 #include <sys/nvpair.h>
56 57 #include <sys/pool_pset.h>
57 58 #include <sys/msacct.h>
58 59 #include <sys/time.h>
59 60 #include <sys/archsystm.h>
60 61 #include <sys/sdt.h>
61 62 #if defined(__x86) || defined(__amd64)
62 63 #include <sys/x86_archext.h>
63 64 #endif
64 65 #include <sys/callo.h>
65 66
66 67 extern int mp_cpu_start(cpu_t *);
67 68 extern int mp_cpu_stop(cpu_t *);
68 69 extern int mp_cpu_poweron(cpu_t *);
69 70 extern int mp_cpu_poweroff(cpu_t *);
70 71 extern int mp_cpu_configure(int);
71 72 extern int mp_cpu_unconfigure(int);
72 73 extern void mp_cpu_faulted_enter(cpu_t *);
73 74 extern void mp_cpu_faulted_exit(cpu_t *);
74 75
75 76 extern int cmp_cpu_to_chip(processorid_t cpuid);
76 77 #ifdef __sparcv9
77 78 extern char *cpu_fru_fmri(cpu_t *cp);
78 79 #endif
79 80
80 81 static void cpu_add_active_internal(cpu_t *cp);
81 82 static void cpu_remove_active(cpu_t *cp);
82 83 static void cpu_info_kstat_create(cpu_t *cp);
83 84 static void cpu_info_kstat_destroy(cpu_t *cp);
84 85 static void cpu_stats_kstat_create(cpu_t *cp);
85 86 static void cpu_stats_kstat_destroy(cpu_t *cp);
86 87
87 88 static int cpu_sys_stats_ks_update(kstat_t *ksp, int rw);
88 89 static int cpu_vm_stats_ks_update(kstat_t *ksp, int rw);
89 90 static int cpu_stat_ks_update(kstat_t *ksp, int rw);
90 91 static int cpu_state_change_hooks(int, cpu_setup_t, cpu_setup_t);
91 92
92 93 /*
93 94 * cpu_lock protects ncpus, ncpus_online, cpu_flag, cpu_list, cpu_active,
94 95 * max_cpu_seqid_ever, and dispatch queue reallocations. The lock ordering with
95 96 * respect to related locks is:
96 97 *
97 98 * cpu_lock --> thread_free_lock ---> p_lock ---> thread_lock()
98 99 *
99 100 * Warning: Certain sections of code do not use the cpu_lock when
100 101 * traversing the cpu_list (e.g. mutex_vector_enter(), clock()). Since
101 102 * all cpus are paused during modifications to this list, a solution
 102 103 * to protect the list is to either disable kernel preemption while
103 104 * walking the list, *or* recheck the cpu_next pointer at each
104 105 * iteration in the loop. Note that in no cases can any cached
105 106 * copies of the cpu pointers be kept as they may become invalid.
106 107 */
107 108 kmutex_t cpu_lock;
108 109 cpu_t *cpu_list; /* list of all CPUs */
109 110 cpu_t *clock_cpu_list; /* used by clock to walk CPUs */
110 111 cpu_t *cpu_active; /* list of active CPUs */
111 112 static cpuset_t cpu_available; /* set of available CPUs */
112 113 cpuset_t cpu_seqid_inuse; /* which cpu_seqids are in use */
113 114
114 115 cpu_t **cpu_seq; /* ptrs to CPUs, indexed by seq_id */
115 116
116 117 /*
117 118 * max_ncpus keeps the max cpus the system can have. Initially
118 119 * it's NCPU, but since most archs scan the devtree for cpus
119 120 * fairly early on during boot, the real max can be known before
120 121 * ncpus is set (useful for early NCPU based allocations).
121 122 */
122 123 int max_ncpus = NCPU;
123 124 /*
 124 125 * platforms that set max_ncpus to the maximum number of cpus that can be
125 126 * dynamically added will set boot_max_ncpus to the number of cpus found
126 127 * at device tree scan time during boot.
127 128 */
128 129 int boot_max_ncpus = -1;
129 130 int boot_ncpus = -1;
130 131 /*
131 132 * Maximum possible CPU id. This can never be >= NCPU since NCPU is
132 133 * used to size arrays that are indexed by CPU id.
133 134 */
134 135 processorid_t max_cpuid = NCPU - 1;
135 136
136 137 /*
137 138 * Maximum cpu_seqid was given. This number can only grow and never shrink. It
138 139 * can be used to optimize NCPU loops to avoid going through CPUs which were
139 140 * never on-line.
140 141 */
141 142 processorid_t max_cpu_seqid_ever = 0;
142 143
143 144 int ncpus = 1;
144 145 int ncpus_online = 1;
145 146
146 147 /*
147 148 * CPU that we're trying to offline. Protected by cpu_lock.
148 149 */
149 150 cpu_t *cpu_inmotion;
150 151
151 152 /*
 152 153 * Can be raised to suppress further weakbindings, which are instead
153 154 * satisfied by disabling preemption. Must be raised/lowered under cpu_lock,
154 155 * while individual thread weakbinding synchronization is done under thread
155 156 * lock.
156 157 */
157 158 int weakbindingbarrier;
158 159
159 160 /*
160 161 * Variables used in pause_cpus().
161 162 */
162 163 static volatile char safe_list[NCPU];
163 164
164 165 static struct _cpu_pause_info {
165 166 int cp_spl; /* spl saved in pause_cpus() */
166 167 volatile int cp_go; /* Go signal sent after all ready */
167 168 int cp_count; /* # of CPUs to pause */
168 169 ksema_t cp_sem; /* synch pause_cpus & cpu_pause */
169 170 kthread_id_t cp_paused;
170 171 void *(*cp_func)(void *);
171 172 } cpu_pause_info;
172 173
173 174 static kmutex_t pause_free_mutex;
174 175 static kcondvar_t pause_free_cv;
175 176
176 177
177 178 static struct cpu_sys_stats_ks_data {
178 179 kstat_named_t cpu_ticks_idle;
179 180 kstat_named_t cpu_ticks_user;
180 181 kstat_named_t cpu_ticks_kernel;
181 182 kstat_named_t cpu_ticks_wait;
182 183 kstat_named_t cpu_nsec_idle;
183 184 kstat_named_t cpu_nsec_user;
184 185 kstat_named_t cpu_nsec_kernel;
185 186 kstat_named_t cpu_nsec_dtrace;
186 187 kstat_named_t cpu_nsec_intr;
187 188 kstat_named_t cpu_load_intr;
188 189 kstat_named_t wait_ticks_io;
189 190 kstat_named_t dtrace_probes;
190 191 kstat_named_t bread;
191 192 kstat_named_t bwrite;
192 193 kstat_named_t lread;
193 194 kstat_named_t lwrite;
194 195 kstat_named_t phread;
195 196 kstat_named_t phwrite;
196 197 kstat_named_t pswitch;
197 198 kstat_named_t trap;
198 199 kstat_named_t intr;
199 200 kstat_named_t syscall;
200 201 kstat_named_t sysread;
201 202 kstat_named_t syswrite;
202 203 kstat_named_t sysfork;
203 204 kstat_named_t sysvfork;
204 205 kstat_named_t sysexec;
205 206 kstat_named_t readch;
206 207 kstat_named_t writech;
207 208 kstat_named_t rcvint;
208 209 kstat_named_t xmtint;
209 210 kstat_named_t mdmint;
210 211 kstat_named_t rawch;
211 212 kstat_named_t canch;
212 213 kstat_named_t outch;
213 214 kstat_named_t msg;
214 215 kstat_named_t sema;
215 216 kstat_named_t namei;
216 217 kstat_named_t ufsiget;
217 218 kstat_named_t ufsdirblk;
218 219 kstat_named_t ufsipage;
219 220 kstat_named_t ufsinopage;
220 221 kstat_named_t procovf;
221 222 kstat_named_t intrthread;
222 223 kstat_named_t intrblk;
223 224 kstat_named_t intrunpin;
224 225 kstat_named_t idlethread;
225 226 kstat_named_t inv_swtch;
226 227 kstat_named_t nthreads;
227 228 kstat_named_t cpumigrate;
228 229 kstat_named_t xcalls;
229 230 kstat_named_t mutex_adenters;
230 231 kstat_named_t rw_rdfails;
231 232 kstat_named_t rw_wrfails;
232 233 kstat_named_t modload;
233 234 kstat_named_t modunload;
234 235 kstat_named_t bawrite;
235 236 kstat_named_t iowait;
236 237 } cpu_sys_stats_ks_data_template = {
237 238 { "cpu_ticks_idle", KSTAT_DATA_UINT64 },
238 239 { "cpu_ticks_user", KSTAT_DATA_UINT64 },
239 240 { "cpu_ticks_kernel", KSTAT_DATA_UINT64 },
240 241 { "cpu_ticks_wait", KSTAT_DATA_UINT64 },
241 242 { "cpu_nsec_idle", KSTAT_DATA_UINT64 },
242 243 { "cpu_nsec_user", KSTAT_DATA_UINT64 },
243 244 { "cpu_nsec_kernel", KSTAT_DATA_UINT64 },
244 245 { "cpu_nsec_dtrace", KSTAT_DATA_UINT64 },
245 246 { "cpu_nsec_intr", KSTAT_DATA_UINT64 },
246 247 { "cpu_load_intr", KSTAT_DATA_UINT64 },
247 248 { "wait_ticks_io", KSTAT_DATA_UINT64 },
248 249 { "dtrace_probes", KSTAT_DATA_UINT64 },
249 250 { "bread", KSTAT_DATA_UINT64 },
250 251 { "bwrite", KSTAT_DATA_UINT64 },
251 252 { "lread", KSTAT_DATA_UINT64 },
252 253 { "lwrite", KSTAT_DATA_UINT64 },
253 254 { "phread", KSTAT_DATA_UINT64 },
254 255 { "phwrite", KSTAT_DATA_UINT64 },
255 256 { "pswitch", KSTAT_DATA_UINT64 },
256 257 { "trap", KSTAT_DATA_UINT64 },
257 258 { "intr", KSTAT_DATA_UINT64 },
258 259 { "syscall", KSTAT_DATA_UINT64 },
259 260 { "sysread", KSTAT_DATA_UINT64 },
260 261 { "syswrite", KSTAT_DATA_UINT64 },
261 262 { "sysfork", KSTAT_DATA_UINT64 },
262 263 { "sysvfork", KSTAT_DATA_UINT64 },
263 264 { "sysexec", KSTAT_DATA_UINT64 },
264 265 { "readch", KSTAT_DATA_UINT64 },
265 266 { "writech", KSTAT_DATA_UINT64 },
266 267 { "rcvint", KSTAT_DATA_UINT64 },
267 268 { "xmtint", KSTAT_DATA_UINT64 },
268 269 { "mdmint", KSTAT_DATA_UINT64 },
269 270 { "rawch", KSTAT_DATA_UINT64 },
270 271 { "canch", KSTAT_DATA_UINT64 },
271 272 { "outch", KSTAT_DATA_UINT64 },
272 273 { "msg", KSTAT_DATA_UINT64 },
273 274 { "sema", KSTAT_DATA_UINT64 },
274 275 { "namei", KSTAT_DATA_UINT64 },
275 276 { "ufsiget", KSTAT_DATA_UINT64 },
276 277 { "ufsdirblk", KSTAT_DATA_UINT64 },
277 278 { "ufsipage", KSTAT_DATA_UINT64 },
278 279 { "ufsinopage", KSTAT_DATA_UINT64 },
279 280 { "procovf", KSTAT_DATA_UINT64 },
280 281 { "intrthread", KSTAT_DATA_UINT64 },
281 282 { "intrblk", KSTAT_DATA_UINT64 },
282 283 { "intrunpin", KSTAT_DATA_UINT64 },
283 284 { "idlethread", KSTAT_DATA_UINT64 },
284 285 { "inv_swtch", KSTAT_DATA_UINT64 },
285 286 { "nthreads", KSTAT_DATA_UINT64 },
286 287 { "cpumigrate", KSTAT_DATA_UINT64 },
287 288 { "xcalls", KSTAT_DATA_UINT64 },
288 289 { "mutex_adenters", KSTAT_DATA_UINT64 },
289 290 { "rw_rdfails", KSTAT_DATA_UINT64 },
290 291 { "rw_wrfails", KSTAT_DATA_UINT64 },
291 292 { "modload", KSTAT_DATA_UINT64 },
292 293 { "modunload", KSTAT_DATA_UINT64 },
293 294 { "bawrite", KSTAT_DATA_UINT64 },
294 295 { "iowait", KSTAT_DATA_UINT64 },
295 296 };
296 297
297 298 static struct cpu_vm_stats_ks_data {
298 299 kstat_named_t pgrec;
299 300 kstat_named_t pgfrec;
300 301 kstat_named_t pgin;
301 302 kstat_named_t pgpgin;
302 303 kstat_named_t pgout;
303 304 kstat_named_t pgpgout;
304 305 kstat_named_t swapin;
305 306 kstat_named_t pgswapin;
306 307 kstat_named_t swapout;
307 308 kstat_named_t pgswapout;
308 309 kstat_named_t zfod;
309 310 kstat_named_t dfree;
310 311 kstat_named_t scan;
311 312 kstat_named_t rev;
312 313 kstat_named_t hat_fault;
313 314 kstat_named_t as_fault;
314 315 kstat_named_t maj_fault;
315 316 kstat_named_t cow_fault;
316 317 kstat_named_t prot_fault;
317 318 kstat_named_t softlock;
318 319 kstat_named_t kernel_asflt;
319 320 kstat_named_t pgrrun;
320 321 kstat_named_t execpgin;
321 322 kstat_named_t execpgout;
322 323 kstat_named_t execfree;
323 324 kstat_named_t anonpgin;
324 325 kstat_named_t anonpgout;
325 326 kstat_named_t anonfree;
326 327 kstat_named_t fspgin;
327 328 kstat_named_t fspgout;
328 329 kstat_named_t fsfree;
329 330 } cpu_vm_stats_ks_data_template = {
330 331 { "pgrec", KSTAT_DATA_UINT64 },
331 332 { "pgfrec", KSTAT_DATA_UINT64 },
332 333 { "pgin", KSTAT_DATA_UINT64 },
333 334 { "pgpgin", KSTAT_DATA_UINT64 },
334 335 { "pgout", KSTAT_DATA_UINT64 },
335 336 { "pgpgout", KSTAT_DATA_UINT64 },
336 337 { "swapin", KSTAT_DATA_UINT64 },
337 338 { "pgswapin", KSTAT_DATA_UINT64 },
338 339 { "swapout", KSTAT_DATA_UINT64 },
339 340 { "pgswapout", KSTAT_DATA_UINT64 },
340 341 { "zfod", KSTAT_DATA_UINT64 },
341 342 { "dfree", KSTAT_DATA_UINT64 },
342 343 { "scan", KSTAT_DATA_UINT64 },
343 344 { "rev", KSTAT_DATA_UINT64 },
344 345 { "hat_fault", KSTAT_DATA_UINT64 },
345 346 { "as_fault", KSTAT_DATA_UINT64 },
346 347 { "maj_fault", KSTAT_DATA_UINT64 },
347 348 { "cow_fault", KSTAT_DATA_UINT64 },
348 349 { "prot_fault", KSTAT_DATA_UINT64 },
349 350 { "softlock", KSTAT_DATA_UINT64 },
350 351 { "kernel_asflt", KSTAT_DATA_UINT64 },
351 352 { "pgrrun", KSTAT_DATA_UINT64 },
352 353 { "execpgin", KSTAT_DATA_UINT64 },
353 354 { "execpgout", KSTAT_DATA_UINT64 },
354 355 { "execfree", KSTAT_DATA_UINT64 },
355 356 { "anonpgin", KSTAT_DATA_UINT64 },
356 357 { "anonpgout", KSTAT_DATA_UINT64 },
357 358 { "anonfree", KSTAT_DATA_UINT64 },
358 359 { "fspgin", KSTAT_DATA_UINT64 },
359 360 { "fspgout", KSTAT_DATA_UINT64 },
360 361 { "fsfree", KSTAT_DATA_UINT64 },
361 362 };
362 363
363 364 /*
364 365 * Force the specified thread to migrate to the appropriate processor.
365 366 * Called with thread lock held, returns with it dropped.
366 367 */
367 368 static void
368 369 force_thread_migrate(kthread_id_t tp)
369 370 {
370 371 ASSERT(THREAD_LOCK_HELD(tp));
371 372 if (tp == curthread) {
372 373 THREAD_TRANSITION(tp);
373 374 CL_SETRUN(tp);
374 375 thread_unlock_nopreempt(tp);
375 376 swtch();
376 377 } else {
377 378 if (tp->t_state == TS_ONPROC) {
378 379 cpu_surrender(tp);
379 380 } else if (tp->t_state == TS_RUN) {
380 381 (void) dispdeq(tp);
381 382 setbackdq(tp);
382 383 }
383 384 thread_unlock(tp);
384 385 }
385 386 }
386 387
387 388 /*
388 389 * Set affinity for a specified CPU.
389 - * A reference count is incremented and the affinity is held until the
390 - * reference count is decremented to zero by thread_affinity_clear().
391 - * This is so regions of code requiring affinity can be nested.
392 - * Caller needs to ensure that cpu_id remains valid, which can be
393 - * done by holding cpu_lock across this call, unless the caller
394 - * specifies CPU_CURRENT in which case the cpu_lock will be acquired
395 - * by thread_affinity_set and CPU->cpu_id will be the target CPU.
390 + *
391 + * Specifying a cpu_id of CPU_CURRENT, allowed _only_ when setting affinity for
392 + * curthread, will set affinity to the CPU on which the thread is currently
393 + * running. For other cpu_id values, the caller must ensure that the
394 + * referenced CPU remains valid, which can be done by holding cpu_lock across
395 + * this call.
396 + *
397 + * CPU affinity is guaranteed after return of thread_affinity_set(). If a
398 + * caller setting affinity to CPU_CURRENT requires that its thread not migrate
399 + * CPUs prior to a successful return, it should take extra precautions (such as
400 + * their own call to kpreempt_disable) to ensure that safety.
401 + *
402 + * A CPU affinity reference count is maintained by thread_affinity_set and
403 + * thread_affinity_clear (incrementing and decrementing it, respectively),
404 + * maintaining CPU affinity while the count is non-zero, and allowing regions
405 + * of code which require affinity to be nested.
396 406 */
397 407 void
398 408 thread_affinity_set(kthread_id_t t, int cpu_id)
399 409 {
400 - cpu_t *cp;
401 - int c;
410 + cpu_t *cp;
402 411
403 412 ASSERT(!(t == curthread && t->t_weakbound_cpu != NULL));
404 413
405 - if ((c = cpu_id) == CPU_CURRENT) {
406 - mutex_enter(&cpu_lock);
407 - cpu_id = CPU->cpu_id;
414 + if (cpu_id == CPU_CURRENT) {
415 + VERIFY3P(t, ==, curthread);
416 + kpreempt_disable();
417 + cp = CPU;
418 + } else {
419 + /*
420 + * We should be asserting that cpu_lock is held here, but
421 + * the NCA code doesn't acquire it. The following assert
422 + * should be uncommented when the NCA code is fixed.
423 + *
424 + * ASSERT(MUTEX_HELD(&cpu_lock));
425 + */
426 + VERIFY((cpu_id >= 0) && (cpu_id < NCPU));
427 + cp = cpu[cpu_id];
428 +
429 + /* user must provide a good cpu_id */
430 + VERIFY(cp != NULL);
408 431 }
432 +
409 433 /*
410 - * We should be asserting that cpu_lock is held here, but
411 - * the NCA code doesn't acquire it. The following assert
412 - * should be uncommented when the NCA code is fixed.
413 - *
414 - * ASSERT(MUTEX_HELD(&cpu_lock));
415 - */
416 - ASSERT((cpu_id >= 0) && (cpu_id < NCPU));
417 - cp = cpu[cpu_id];
418 - ASSERT(cp != NULL); /* user must provide a good cpu_id */
419 - /*
420 434 * If there is already a hard affinity requested, and this affinity
421 435 * conflicts with that, panic.
422 436 */
423 437 thread_lock(t);
424 438 if (t->t_affinitycnt > 0 && t->t_bound_cpu != cp) {
425 439 panic("affinity_set: setting %p but already bound to %p",
426 440 (void *)cp, (void *)t->t_bound_cpu);
427 441 }
428 442 t->t_affinitycnt++;
429 443 t->t_bound_cpu = cp;
430 444
431 445 /*
432 446 * Make sure we're running on the right CPU.
433 447 */
434 448 if (cp != t->t_cpu || t != curthread) {
449 + ASSERT(cpu_id != CPU_CURRENT);
435 450 force_thread_migrate(t); /* drops thread lock */
436 451 } else {
437 452 thread_unlock(t);
438 453 }
439 454
440 - if (c == CPU_CURRENT)
441 - mutex_exit(&cpu_lock);
455 + if (cpu_id == CPU_CURRENT) {
456 + kpreempt_enable();
457 + }
442 458 }
443 459
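A minimal usage sketch of the revised CPU_CURRENT path (the helper name example_self_affine_work is hypothetical and not part of this change; headers are assumed to be those already included at the top of cpu.c): affinity is taken to whichever CPU the calling thread happens to be running on, without acquiring cpu_lock, and is held until the matching thread_affinity_clear() drops the reference count to zero.

	/* Sketch only: a hypothetical consumer of thread_affinity_set(CPU_CURRENT). */
	static void
	example_self_affine_work(void)
	{
		/*
		 * Allowed only for curthread; pins the thread to the CPU it is
		 * currently on.  Calls may nest; affinity drops when the
		 * reference count kept in t_affinitycnt returns to zero.
		 */
		thread_affinity_set(curthread, CPU_CURRENT);

		/* ... work that must stay on this CPU ... */

		thread_affinity_clear(curthread);
	}

Per the block comment above, a caller that must not migrate before thread_affinity_set() returns should add its own kpreempt_disable()/kpreempt_enable() pair around the call.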
444 460 /*
445 461 * Wrapper for backward compatibility.
446 462 */
447 463 void
448 464 affinity_set(int cpu_id)
449 465 {
450 466 thread_affinity_set(curthread, cpu_id);
451 467 }
452 468
453 469 /*
454 470 * Decrement the affinity reservation count and if it becomes zero,
455 471 * clear the CPU affinity for the current thread, or set it to the user's
456 472 * software binding request.
457 473 */
458 474 void
459 475 thread_affinity_clear(kthread_id_t t)
460 476 {
461 477 register processorid_t binding;
462 478
463 479 thread_lock(t);
464 480 if (--t->t_affinitycnt == 0) {
465 481 if ((binding = t->t_bind_cpu) == PBIND_NONE) {
466 482 /*
467 483 * Adjust disp_max_unbound_pri if necessary.
468 484 */
469 485 disp_adjust_unbound_pri(t);
470 486 t->t_bound_cpu = NULL;
471 487 if (t->t_cpu->cpu_part != t->t_cpupart) {
472 488 force_thread_migrate(t);
473 489 return;
474 490 }
475 491 } else {
476 492 t->t_bound_cpu = cpu[binding];
477 493 /*
478 494 * Make sure the thread is running on the bound CPU.
479 495 */
480 496 if (t->t_cpu != t->t_bound_cpu) {
481 497 force_thread_migrate(t);
482 498 return; /* already dropped lock */
483 499 }
484 500 }
485 501 }
486 502 thread_unlock(t);
487 503 }
488 504
489 505 /*
490 506 * Wrapper for backward compatibility.
491 507 */
492 508 void
493 509 affinity_clear(void)
494 510 {
495 511 thread_affinity_clear(curthread);
496 512 }
497 513
498 514 /*
499 515 * Weak cpu affinity. Bind to the "current" cpu for short periods
500 516 * of time during which the thread must not block (but may be preempted).
501 517 * Use this instead of kpreempt_disable() when it is only "no migration"
502 518 * rather than "no preemption" semantics that are required - disabling
503 519 * preemption holds higher priority threads off of cpu and if the
504 520 * operation that is protected is more than momentary this is not good
505 521 * for realtime etc.
506 522 *
507 523 * Weakly bound threads will not prevent a cpu from being offlined -
508 524 * we'll only run them on the cpu to which they are weakly bound but
509 525 * (because they do not block) we'll always be able to move them on to
510 526 * another cpu at offline time if we give them just a short moment to
511 527 * run during which they will unbind. To give a cpu a chance of offlining,
512 528 * however, we require a barrier to weak bindings that may be raised for a
513 529 * given cpu (offline/move code may set this and then wait a short time for
514 530 * existing weak bindings to drop); the cpu_inmotion pointer is that barrier.
515 531 *
516 532 * There are few restrictions on the calling context of thread_nomigrate.
517 533 * The caller must not hold the thread lock. Calls may be nested.
518 534 *
519 535 * After weakbinding a thread must not perform actions that may block.
520 536 * In particular it must not call thread_affinity_set; calling that when
521 537 * already weakbound is nonsensical anyway.
522 538 *
523 539 * If curthread is prevented from migrating for other reasons
524 540 * (kernel preemption disabled; high pil; strongly bound; interrupt thread)
525 541 * then the weak binding will succeed even if this cpu is the target of an
526 542 * offline/move request.
527 543 */
528 544 void
529 545 thread_nomigrate(void)
530 546 {
531 547 cpu_t *cp;
532 548 kthread_id_t t = curthread;
533 549
534 550 again:
535 551 kpreempt_disable();
536 552 cp = CPU;
537 553
538 554 /*
539 555 * A highlevel interrupt must not modify t_nomigrate or
540 556 * t_weakbound_cpu of the thread it has interrupted. A lowlevel
541 557 * interrupt thread cannot migrate and we can avoid the
542 558 * thread_lock call below by short-circuiting here. In either
543 559 * case we can just return since no migration is possible and
544 560 * the condition will persist (ie, when we test for these again
545 561 * in thread_allowmigrate they can't have changed). Migration
546 562 * is also impossible if we're at or above DISP_LEVEL pil.
547 563 */
548 564 if (CPU_ON_INTR(cp) || t->t_flag & T_INTR_THREAD ||
549 565 getpil() >= DISP_LEVEL) {
550 566 kpreempt_enable();
551 567 return;
552 568 }
553 569
554 570 /*
555 571 * We must be consistent with existing weak bindings. Since we
556 572 * may be interrupted between the increment of t_nomigrate and
557 573 * the store to t_weakbound_cpu below we cannot assume that
558 574 * t_weakbound_cpu will be set if t_nomigrate is. Note that we
559 575 * cannot assert t_weakbound_cpu == t_bind_cpu since that is not
560 576 * always the case.
561 577 */
562 578 if (t->t_nomigrate && t->t_weakbound_cpu && t->t_weakbound_cpu != cp) {
563 579 if (!panicstr)
564 580 panic("thread_nomigrate: binding to %p but already "
565 581 "bound to %p", (void *)cp,
566 582 (void *)t->t_weakbound_cpu);
567 583 }
568 584
569 585 /*
570 586 * At this point we have preemption disabled and we don't yet hold
571 587 * the thread lock. So it's possible that somebody else could
572 588 * set t_bind_cpu here and not be able to force us across to the
573 589 * new cpu (since we have preemption disabled).
574 590 */
575 591 thread_lock(curthread);
576 592
577 593 /*
578 594 * If further weak bindings are being (temporarily) suppressed then
579 595 * we'll settle for disabling kernel preemption (which assures
580 596 * no migration provided the thread does not block which it is
581 597 * not allowed to if using thread_nomigrate). We must remember
582 598 * this disposition so we can take appropriate action in
583 599 * thread_allowmigrate. If this is a nested call and the
584 600 * thread is already weakbound then fall through as normal.
585 601 * We remember the decision to settle for kpreempt_disable through
586 602 * negative nesting counting in t_nomigrate. Once a thread has had one
587 603 * weakbinding request satisfied in this way any further (nested)
588 604 * requests will continue to be satisfied in the same way,
589 605 * even if weak bindings have recommenced.
590 606 */
591 607 if (t->t_nomigrate < 0 || weakbindingbarrier && t->t_nomigrate == 0) {
592 608 --t->t_nomigrate;
593 609 thread_unlock(curthread);
594 610 return; /* with kpreempt_disable still active */
595 611 }
596 612
597 613 /*
598 614 * We hold thread_lock so t_bind_cpu cannot change. We could,
599 615 * however, be running on a different cpu to which we are t_bound_cpu
600 616 * to (as explained above). If we grant the weak binding request
601 617 * in that case then the dispatcher must favour our weak binding
602 618 * over our strong (in which case, just as when preemption is
603 619 * disabled, we can continue to run on a cpu other than the one to
604 620 * which we are strongbound; the difference in this case is that
605 621 * this thread can be preempted and so can appear on the dispatch
606 622 * queues of a cpu other than the one it is strongbound to).
607 623 *
608 624 * If the cpu we are running on does not appear to be a current
609 625 * offline target (we check cpu_inmotion to determine this - since
610 626 * we don't hold cpu_lock we may not see a recent store to that,
611 627 * so it's possible that we at times can grant a weak binding to a
612 628 * cpu that is an offline target, but that one request will not
613 629 * prevent the offline from succeeding) then we will always grant
614 630 * the weak binding request. This includes the case above where
615 631 * we grant a weakbinding not commensurate with our strong binding.
616 632 *
617 633 * If our cpu does appear to be an offline target then we're inclined
618 634 * not to grant the weakbinding request just yet - we'd prefer to
619 635 * migrate to another cpu and grant the request there. The
620 636 * exceptions are those cases where going through preemption code
621 637 * will not result in us changing cpu:
622 638 *
623 639 * . interrupts have already bypassed this case (see above)
624 640 * . we are already weakbound to this cpu (dispatcher code will
625 641 * always return us to the weakbound cpu)
626 642 * . preemption was disabled even before we disabled it above
627 643 * . we are strongbound to this cpu (if we're strongbound to
628 644 * another and not yet running there the trip through the
629 645 * dispatcher will move us to the strongbound cpu and we
630 646 * will grant the weak binding there)
631 647 */
632 648 if (cp != cpu_inmotion || t->t_nomigrate > 0 || t->t_preempt > 1 ||
633 649 t->t_bound_cpu == cp) {
634 650 /*
635 651 * Don't be tempted to store to t_weakbound_cpu only on
636 652 * the first nested bind request - if we're interrupted
637 653 * after the increment of t_nomigrate and before the
638 654 * store to t_weakbound_cpu and the interrupt calls
639 655 * thread_nomigrate then the assertion in thread_allowmigrate
640 656 * would fail.
641 657 */
642 658 t->t_nomigrate++;
643 659 t->t_weakbound_cpu = cp;
644 660 membar_producer();
645 661 thread_unlock(curthread);
646 662 /*
647 663 * Now that we have dropped the thread_lock another thread
648 664 * can set our t_weakbound_cpu, and will try to migrate us
649 665 * to the strongbound cpu (which will not be prevented by
650 666 * preemption being disabled since we're about to enable
651 667 * preemption). We have granted the weakbinding to the current
 652 668 * cpu, so again we are in the position that it is possible
653 669 * that our weak and strong bindings differ. Again this
654 670 * is catered for by dispatcher code which will favour our
655 671 * weak binding.
656 672 */
657 673 kpreempt_enable();
658 674 } else {
659 675 /*
660 676 * Move to another cpu before granting the request by
661 677 * forcing this thread through preemption code. When we
662 678 * get to set{front,back}dq called from CL_PREEMPT()
663 679 * cpu_choose() will be used to select a cpu to queue
664 680 * us on - that will see cpu_inmotion and take
665 681 * steps to avoid returning us to this cpu.
666 682 */
667 683 cp->cpu_kprunrun = 1;
668 684 thread_unlock(curthread);
669 685 kpreempt_enable(); /* will call preempt() */
670 686 goto again;
671 687 }
672 688 }
673 689
674 690 void
675 691 thread_allowmigrate(void)
676 692 {
677 693 kthread_id_t t = curthread;
678 694
679 695 ASSERT(t->t_weakbound_cpu == CPU ||
680 696 (t->t_nomigrate < 0 && t->t_preempt > 0) ||
681 697 CPU_ON_INTR(CPU) || t->t_flag & T_INTR_THREAD ||
682 698 getpil() >= DISP_LEVEL);
683 699
684 700 if (CPU_ON_INTR(CPU) || (t->t_flag & T_INTR_THREAD) ||
685 701 getpil() >= DISP_LEVEL)
686 702 return;
687 703
688 704 if (t->t_nomigrate < 0) {
689 705 /*
690 706 * This thread was granted "weak binding" in the
691 707 * stronger form of kernel preemption disabling.
692 708 * Undo a level of nesting for both t_nomigrate
693 709 * and t_preempt.
694 710 */
695 711 ++t->t_nomigrate;
696 712 kpreempt_enable();
697 713 } else if (--t->t_nomigrate == 0) {
698 714 /*
699 715 * Time to drop the weak binding. We need to cater
700 716 * for the case where we're weakbound to a different
701 717 * cpu than that to which we're strongbound (a very
702 718 * temporary arrangement that must only persist until
703 719 * weak binding drops). We don't acquire thread_lock
704 720 * here so even as this code executes t_bound_cpu
705 721 * may be changing. So we disable preemption and
706 722 * a) in the case that t_bound_cpu changes while we
707 723 * have preemption disabled kprunrun will be set
708 724 * asynchronously, and b) if before disabling
709 725 * preemption we were already on a different cpu to
710 726 * our t_bound_cpu then we set kprunrun ourselves
711 727 * to force a trip through the dispatcher when
712 728 * preemption is enabled.
713 729 */
714 730 kpreempt_disable();
715 731 if (t->t_bound_cpu &&
716 732 t->t_weakbound_cpu != t->t_bound_cpu)
717 733 CPU->cpu_kprunrun = 1;
718 734 t->t_weakbound_cpu = NULL;
719 735 membar_producer();
720 736 kpreempt_enable();
721 737 }
722 738 }
723 739
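The weak-binding rules described above reduce to the following pattern; a minimal sketch, with the helper name example_current_cpuid invented for illustration and headers assumed to be those at the top of cpu.c.

	/*
	 * Sketch only: the thread may be preempted inside the region, but it
	 * will not migrate, so CPU-local reads stay consistent.
	 */
	static processorid_t
	example_current_cpuid(void)
	{
		processorid_t id;

		thread_nomigrate();		/* weakly bind to the current CPU */
		id = CPU->cpu_id;		/* stable while weakly bound */
		thread_allowmigrate();

		return (id);
	}

Unlike kpreempt_disable(), this does not hold higher-priority threads off the CPU, and it does not prevent the CPU from being offlined.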
724 740 /*
725 741 * weakbinding_stop can be used to temporarily cause weakbindings made
726 742 * with thread_nomigrate to be satisfied through the stronger action of
727 743 * kpreempt_disable. weakbinding_start recommences normal weakbinding.
728 744 */
729 745
730 746 void
731 747 weakbinding_stop(void)
732 748 {
733 749 ASSERT(MUTEX_HELD(&cpu_lock));
734 750 weakbindingbarrier = 1;
735 751 membar_producer(); /* make visible before subsequent thread_lock */
736 752 }
737 753
738 754 void
739 755 weakbinding_start(void)
740 756 {
741 757 ASSERT(MUTEX_HELD(&cpu_lock));
742 758 weakbindingbarrier = 0;
743 759 }
744 760
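A sketch of how the barrier pair might be used (the helper name is hypothetical; the delay(hz / 100) drain interval mirrors the one cpu_offline() uses below):

	/*
	 * Sketch only: force new weak bindings to be satisfied via
	 * kpreempt_disable, give existing ones a moment to drain, then
	 * resume normal weak binding.  Caller must hold cpu_lock.
	 */
	static void
	example_weakbind_barrier(void)
	{
		ASSERT(MUTEX_HELD(&cpu_lock));

		weakbinding_stop();
		delay(hz / 100);	/* brief window for existing weak binds to drop */
		weakbinding_start();
	}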
745 761 void
746 762 null_xcall(void)
747 763 {
748 764 }
749 765
750 766 /*
751 767 * This routine is called to place the CPUs in a safe place so that
752 768 * one of them can be taken off line or placed on line. What we are
753 769 * trying to do here is prevent a thread from traversing the list
754 770 * of active CPUs while we are changing it or from getting placed on
755 771 * the run queue of a CPU that has just gone off line. We do this by
756 772 * creating a thread with the highest possible prio for each CPU and
757 773 * having it call this routine. The advantage of this method is that
758 774 * we can eliminate all checks for CPU_ACTIVE in the disp routines.
759 775 * This makes disp faster at the expense of making p_online() slower
760 776 * which is a good trade off.
761 777 */
762 778 static void
763 779 cpu_pause(int index)
764 780 {
765 781 int s;
766 782 struct _cpu_pause_info *cpi = &cpu_pause_info;
767 783 volatile char *safe = &safe_list[index];
768 784 long lindex = index;
769 785
770 786 ASSERT((curthread->t_bound_cpu != NULL) || (*safe == PAUSE_DIE));
771 787
772 788 while (*safe != PAUSE_DIE) {
773 789 *safe = PAUSE_READY;
774 790 membar_enter(); /* make sure stores are flushed */
775 791 sema_v(&cpi->cp_sem); /* signal requesting thread */
776 792
777 793 /*
778 794 * Wait here until all pause threads are running. That
779 795 * indicates that it's safe to do the spl. Until
780 796 * cpu_pause_info.cp_go is set, we don't want to spl
781 797 * because that might block clock interrupts needed
782 798 * to preempt threads on other CPUs.
783 799 */
784 800 while (cpi->cp_go == 0)
785 801 ;
786 802 /*
787 803 * Even though we are at the highest disp prio, we need
788 804 * to block out all interrupts below LOCK_LEVEL so that
789 805 * an intr doesn't come in, wake up a thread, and call
790 806 * setbackdq/setfrontdq.
791 807 */
792 808 s = splhigh();
793 809 /*
794 810 * if cp_func has been set then call it using index as the
795 811 * argument, currently only used by cpr_suspend_cpus().
796 812 * This function is used as the code to execute on the
797 813 * "paused" cpu's when a machine comes out of a sleep state
798 814 * and CPU's were powered off. (could also be used for
799 815 * hotplugging CPU's).
800 816 */
801 817 if (cpi->cp_func != NULL)
802 818 (*cpi->cp_func)((void *)lindex);
803 819
804 820 mach_cpu_pause(safe);
805 821
806 822 splx(s);
807 823 /*
808 824 * Waiting is at an end. Switch out of cpu_pause
809 825 * loop and resume useful work.
810 826 */
811 827 swtch();
812 828 }
813 829
814 830 mutex_enter(&pause_free_mutex);
815 831 *safe = PAUSE_DEAD;
816 832 cv_broadcast(&pause_free_cv);
817 833 mutex_exit(&pause_free_mutex);
818 834 }
819 835
820 836 /*
821 837 * Allow the cpus to start running again.
822 838 */
823 839 void
824 840 start_cpus()
825 841 {
826 842 int i;
827 843
828 844 ASSERT(MUTEX_HELD(&cpu_lock));
829 845 ASSERT(cpu_pause_info.cp_paused);
830 846 cpu_pause_info.cp_paused = NULL;
831 847 for (i = 0; i < NCPU; i++)
832 848 safe_list[i] = PAUSE_IDLE;
833 849 membar_enter(); /* make sure stores are flushed */
834 850 affinity_clear();
835 851 splx(cpu_pause_info.cp_spl);
836 852 kpreempt_enable();
837 853 }
838 854
839 855 /*
840 856 * Allocate a pause thread for a CPU.
841 857 */
842 858 static void
843 859 cpu_pause_alloc(cpu_t *cp)
844 860 {
845 861 kthread_id_t t;
846 862 long cpun = cp->cpu_id;
847 863
848 864 /*
849 865 * Note, v.v_nglobpris will not change value as long as I hold
850 866 * cpu_lock.
851 867 */
852 868 t = thread_create(NULL, 0, cpu_pause, (void *)cpun,
853 869 0, &p0, TS_STOPPED, v.v_nglobpris - 1);
854 870 thread_lock(t);
855 871 t->t_bound_cpu = cp;
856 872 t->t_disp_queue = cp->cpu_disp;
857 873 t->t_affinitycnt = 1;
858 874 t->t_preempt = 1;
859 875 thread_unlock(t);
860 876 cp->cpu_pause_thread = t;
861 877 /*
862 878 * Registering a thread in the callback table is usually done
863 879 * in the initialization code of the thread. In this
864 880 * case, we do it right after thread creation because the
865 881 * thread itself may never run, and we need to register the
866 882 * fact that it is safe for cpr suspend.
867 883 */
868 884 CALLB_CPR_INIT_SAFE(t, "cpu_pause");
869 885 }
870 886
871 887 /*
872 888 * Free a pause thread for a CPU.
873 889 */
874 890 static void
875 891 cpu_pause_free(cpu_t *cp)
876 892 {
877 893 kthread_id_t t;
878 894 int cpun = cp->cpu_id;
879 895
880 896 ASSERT(MUTEX_HELD(&cpu_lock));
881 897 /*
882 898 * We have to get the thread and tell it to die.
883 899 */
884 900 if ((t = cp->cpu_pause_thread) == NULL) {
885 901 ASSERT(safe_list[cpun] == PAUSE_IDLE);
886 902 return;
887 903 }
888 904 thread_lock(t);
889 905 t->t_cpu = CPU; /* disp gets upset if last cpu is quiesced. */
890 906 t->t_bound_cpu = NULL; /* Must un-bind; cpu may not be running. */
891 907 t->t_pri = v.v_nglobpris - 1;
892 908 ASSERT(safe_list[cpun] == PAUSE_IDLE);
893 909 safe_list[cpun] = PAUSE_DIE;
894 910 THREAD_TRANSITION(t);
895 911 setbackdq(t);
896 912 thread_unlock_nopreempt(t);
897 913
898 914 /*
899 915 * If we don't wait for the thread to actually die, it may try to
900 916 * run on the wrong cpu as part of an actual call to pause_cpus().
901 917 */
902 918 mutex_enter(&pause_free_mutex);
903 919 while (safe_list[cpun] != PAUSE_DEAD) {
904 920 cv_wait(&pause_free_cv, &pause_free_mutex);
905 921 }
906 922 mutex_exit(&pause_free_mutex);
907 923 safe_list[cpun] = PAUSE_IDLE;
908 924
909 925 cp->cpu_pause_thread = NULL;
910 926 }
911 927
912 928 /*
913 929 * Initialize basic structures for pausing CPUs.
914 930 */
915 931 void
916 932 cpu_pause_init()
917 933 {
918 934 sema_init(&cpu_pause_info.cp_sem, 0, NULL, SEMA_DEFAULT, NULL);
919 935 /*
920 936 * Create initial CPU pause thread.
921 937 */
922 938 cpu_pause_alloc(CPU);
923 939 }
924 940
925 941 /*
926 942 * Start the threads used to pause another CPU.
927 943 */
928 944 static int
929 945 cpu_pause_start(processorid_t cpu_id)
930 946 {
931 947 int i;
932 948 int cpu_count = 0;
933 949
934 950 for (i = 0; i < NCPU; i++) {
935 951 cpu_t *cp;
936 952 kthread_id_t t;
937 953
938 954 cp = cpu[i];
939 955 if (!CPU_IN_SET(cpu_available, i) || (i == cpu_id)) {
940 956 safe_list[i] = PAUSE_WAIT;
941 957 continue;
942 958 }
943 959
944 960 /*
945 961 * Skip CPU if it is quiesced or not yet started.
946 962 */
947 963 if ((cp->cpu_flags & (CPU_QUIESCED | CPU_READY)) != CPU_READY) {
948 964 safe_list[i] = PAUSE_WAIT;
949 965 continue;
950 966 }
951 967
952 968 /*
953 969 * Start this CPU's pause thread.
954 970 */
955 971 t = cp->cpu_pause_thread;
956 972 thread_lock(t);
957 973 /*
958 974 * Reset the priority, since nglobpris may have
959 975 * changed since the thread was created, if someone
960 976 * has loaded the RT (or some other) scheduling
961 977 * class.
962 978 */
963 979 t->t_pri = v.v_nglobpris - 1;
964 980 THREAD_TRANSITION(t);
965 981 setbackdq(t);
966 982 thread_unlock_nopreempt(t);
967 983 ++cpu_count;
968 984 }
969 985 return (cpu_count);
970 986 }
971 987
972 988
973 989 /*
974 990 * Pause all of the CPUs except the one we are on by creating a high
975 991 * priority thread bound to those CPUs.
976 992 *
977 993 * Note that one must be extremely careful regarding code
978 994 * executed while CPUs are paused. Since a CPU may be paused
979 995 * while a thread scheduling on that CPU is holding an adaptive
980 996 * lock, code executed with CPUs paused must not acquire adaptive
981 997 * (or low-level spin) locks. Also, such code must not block,
982 998 * since the thread that is supposed to initiate the wakeup may
983 999 * never run.
984 1000 *
985 1001 * With a few exceptions, the restrictions on code executed with CPUs
986 1002 * paused match those for code executed at high-level interrupt
987 1003 * context.
988 1004 */
989 1005 void
990 1006 pause_cpus(cpu_t *off_cp, void *(*func)(void *))
991 1007 {
992 1008 processorid_t cpu_id;
993 1009 int i;
994 1010 struct _cpu_pause_info *cpi = &cpu_pause_info;
995 1011
996 1012 ASSERT(MUTEX_HELD(&cpu_lock));
997 1013 ASSERT(cpi->cp_paused == NULL);
998 1014 cpi->cp_count = 0;
999 1015 cpi->cp_go = 0;
1000 1016 for (i = 0; i < NCPU; i++)
1001 1017 safe_list[i] = PAUSE_IDLE;
1002 1018 kpreempt_disable();
1003 1019
1004 1020 cpi->cp_func = func;
1005 1021
1006 1022 /*
1007 1023 * If running on the cpu that is going offline, get off it.
1008 1024 * This is so that it won't be necessary to rechoose a CPU
1009 1025 * when done.
1010 1026 */
1011 1027 if (CPU == off_cp)
1012 1028 cpu_id = off_cp->cpu_next_part->cpu_id;
1013 1029 else
1014 1030 cpu_id = CPU->cpu_id;
1015 1031 affinity_set(cpu_id);
1016 1032
1017 1033 /*
1018 1034 * Start the pause threads and record how many were started
1019 1035 */
1020 1036 cpi->cp_count = cpu_pause_start(cpu_id);
1021 1037
1022 1038 /*
1023 1039 * Now wait for all CPUs to be running the pause thread.
1024 1040 */
1025 1041 while (cpi->cp_count > 0) {
1026 1042 /*
1027 1043 * Spin reading the count without grabbing the disp
1028 1044 * lock to make sure we don't prevent the pause
1029 1045 * threads from getting the lock.
1030 1046 */
1031 1047 while (sema_held(&cpi->cp_sem))
1032 1048 ;
1033 1049 if (sema_tryp(&cpi->cp_sem))
1034 1050 --cpi->cp_count;
1035 1051 }
1036 1052 cpi->cp_go = 1; /* all have reached cpu_pause */
1037 1053
1038 1054 /*
1039 1055 * Now wait for all CPUs to spl. (Transition from PAUSE_READY
1040 1056 * to PAUSE_WAIT.)
1041 1057 */
1042 1058 for (i = 0; i < NCPU; i++) {
1043 1059 while (safe_list[i] != PAUSE_WAIT)
1044 1060 ;
1045 1061 }
1046 1062 cpi->cp_spl = splhigh(); /* block dispatcher on this CPU */
1047 1063 cpi->cp_paused = curthread;
1048 1064 }
1049 1065
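A sketch of the pause/start pattern that cpu_online() and cpu_offline() follow below (the helper name and the nature of the update are illustrative only):

	static void
	example_update_active_list(void)
	{
		ASSERT(MUTEX_HELD(&cpu_lock));

		pause_cpus(NULL, NULL);		/* park every other CPU in cpu_pause() */
		/*
		 * Modify cpu_list or dispatch structures here.  Code in this
		 * window must not block and must not acquire adaptive or
		 * low-level spin locks.
		 */
		start_cpus();			/* let the paused CPUs resume */
	}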
1050 1066 /*
1051 1067 * Check whether the current thread has CPUs paused
1052 1068 */
1053 1069 int
1054 1070 cpus_paused(void)
1055 1071 {
1056 1072 if (cpu_pause_info.cp_paused != NULL) {
1057 1073 ASSERT(cpu_pause_info.cp_paused == curthread);
1058 1074 return (1);
1059 1075 }
1060 1076 return (0);
1061 1077 }
1062 1078
1063 1079 static cpu_t *
1064 1080 cpu_get_all(processorid_t cpun)
1065 1081 {
1066 1082 ASSERT(MUTEX_HELD(&cpu_lock));
1067 1083
1068 1084 if (cpun >= NCPU || cpun < 0 || !CPU_IN_SET(cpu_available, cpun))
1069 1085 return (NULL);
1070 1086 return (cpu[cpun]);
1071 1087 }
1072 1088
1073 1089 /*
1074 1090 * Check whether cpun is a valid processor id and whether it should be
1075 1091 * visible from the current zone. If it is, return a pointer to the
1076 1092 * associated CPU structure.
1077 1093 */
1078 1094 cpu_t *
1079 1095 cpu_get(processorid_t cpun)
1080 1096 {
1081 1097 cpu_t *c;
1082 1098
1083 1099 ASSERT(MUTEX_HELD(&cpu_lock));
1084 1100 c = cpu_get_all(cpun);
1085 1101 if (c != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() &&
1086 1102 zone_pset_get(curproc->p_zone) != cpupart_query_cpu(c))
1087 1103 return (NULL);
1088 1104 return (c);
1089 1105 }
1090 1106
1091 1107 /*
1092 1108 * The following functions should be used to check CPU states in the kernel.
1093 1109 * They should be invoked with cpu_lock held. Kernel subsystems interested
1094 1110 * in CPU states should *not* use cpu_get_state() and various P_ONLINE/etc
1095 1111 * states. Those are for user-land (and system call) use only.
1096 1112 */
1097 1113
1098 1114 /*
1099 1115 * Determine whether the CPU is online and handling interrupts.
1100 1116 */
1101 1117 int
1102 1118 cpu_is_online(cpu_t *cpu)
1103 1119 {
1104 1120 ASSERT(MUTEX_HELD(&cpu_lock));
1105 1121 return (cpu_flagged_online(cpu->cpu_flags));
1106 1122 }
1107 1123
1108 1124 /*
1109 1125 * Determine whether the CPU is offline (this includes spare and faulted).
1110 1126 */
1111 1127 int
1112 1128 cpu_is_offline(cpu_t *cpu)
1113 1129 {
1114 1130 ASSERT(MUTEX_HELD(&cpu_lock));
1115 1131 return (cpu_flagged_offline(cpu->cpu_flags));
1116 1132 }
1117 1133
1118 1134 /*
1119 1135 * Determine whether the CPU is powered off.
1120 1136 */
1121 1137 int
1122 1138 cpu_is_poweredoff(cpu_t *cpu)
1123 1139 {
1124 1140 ASSERT(MUTEX_HELD(&cpu_lock));
1125 1141 return (cpu_flagged_poweredoff(cpu->cpu_flags));
1126 1142 }
1127 1143
1128 1144 /*
1129 1145 * Determine whether the CPU is handling interrupts.
1130 1146 */
1131 1147 int
1132 1148 cpu_is_nointr(cpu_t *cpu)
1133 1149 {
1134 1150 ASSERT(MUTEX_HELD(&cpu_lock));
1135 1151 return (cpu_flagged_nointr(cpu->cpu_flags));
1136 1152 }
1137 1153
1138 1154 /*
1139 1155 * Determine whether the CPU is active (scheduling threads).
1140 1156 */
1141 1157 int
1142 1158 cpu_is_active(cpu_t *cpu)
1143 1159 {
1144 1160 ASSERT(MUTEX_HELD(&cpu_lock));
1145 1161 return (cpu_flagged_active(cpu->cpu_flags));
1146 1162 }
1147 1163
1148 1164 /*
1149 1165 * Same as above, but these require cpu_flags instead of cpu_t pointers.
1150 1166 */
1151 1167 int
1152 1168 cpu_flagged_online(cpu_flag_t cpu_flags)
1153 1169 {
1154 1170 return (cpu_flagged_active(cpu_flags) &&
1155 1171 (cpu_flags & CPU_ENABLE));
1156 1172 }
1157 1173
1158 1174 int
1159 1175 cpu_flagged_offline(cpu_flag_t cpu_flags)
1160 1176 {
1161 1177 return (((cpu_flags & CPU_POWEROFF) == 0) &&
1162 1178 ((cpu_flags & (CPU_READY | CPU_OFFLINE)) != CPU_READY));
1163 1179 }
1164 1180
1165 1181 int
1166 1182 cpu_flagged_poweredoff(cpu_flag_t cpu_flags)
1167 1183 {
1168 1184 return ((cpu_flags & CPU_POWEROFF) == CPU_POWEROFF);
1169 1185 }
1170 1186
1171 1187 int
1172 1188 cpu_flagged_nointr(cpu_flag_t cpu_flags)
1173 1189 {
1174 1190 return (cpu_flagged_active(cpu_flags) &&
1175 1191 (cpu_flags & CPU_ENABLE) == 0);
1176 1192 }
1177 1193
1178 1194 int
1179 1195 cpu_flagged_active(cpu_flag_t cpu_flags)
1180 1196 {
1181 1197 return (((cpu_flags & (CPU_POWEROFF | CPU_FAULTED | CPU_SPARE)) == 0) &&
1182 1198 ((cpu_flags & (CPU_READY | CPU_OFFLINE)) == CPU_READY));
1183 1199 }
1184 1200
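A sketch tying the accessors together (the helper name is hypothetical): look the CPU up under cpu_lock and test its state with the in-kernel predicates rather than the user-visible P_ONLINE-style states.

	static boolean_t
	example_cpu_ready(processorid_t id)
	{
		cpu_t		*cp;
		boolean_t	online = B_FALSE;

		mutex_enter(&cpu_lock);
		cp = cpu_get(id);	/* NULL if id is invalid or hidden from this zone */
		if (cp != NULL && cpu_is_online(cp))
			online = B_TRUE;
		mutex_exit(&cpu_lock);

		return (online);
	}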
1185 1201 /*
1186 1202 * Bring the indicated CPU online.
1187 1203 */
1188 1204 int
1189 1205 cpu_online(cpu_t *cp)
1190 1206 {
1191 1207 int error = 0;
1192 1208
1193 1209 /*
1194 1210 * Handle on-line request.
1195 1211 * This code must put the new CPU on the active list before
1196 1212 * starting it because it will not be paused, and will start
1197 1213 * using the active list immediately. The real start occurs
1198 1214 * when the CPU_QUIESCED flag is turned off.
1199 1215 */
1200 1216
1201 1217 ASSERT(MUTEX_HELD(&cpu_lock));
1202 1218
1203 1219 /*
1204 1220 * Put all the cpus into a known safe place.
1205 1221 * No mutexes can be entered while CPUs are paused.
1206 1222 */
1207 1223 error = mp_cpu_start(cp); /* arch-dep hook */
1208 1224 if (error == 0) {
1209 1225 pg_cpupart_in(cp, cp->cpu_part);
1210 1226 pause_cpus(NULL, NULL);
1211 1227 cpu_add_active_internal(cp);
1212 1228 if (cp->cpu_flags & CPU_FAULTED) {
1213 1229 cp->cpu_flags &= ~CPU_FAULTED;
1214 1230 mp_cpu_faulted_exit(cp);
1215 1231 }
1216 1232 cp->cpu_flags &= ~(CPU_QUIESCED | CPU_OFFLINE | CPU_FROZEN |
1217 1233 CPU_SPARE);
1218 1234 CPU_NEW_GENERATION(cp);
1219 1235 start_cpus();
1220 1236 cpu_stats_kstat_create(cp);
1221 1237 cpu_create_intrstat(cp);
1222 1238 lgrp_kstat_create(cp);
1223 1239 cpu_state_change_notify(cp->cpu_id, CPU_ON);
1224 1240 cpu_intr_enable(cp); /* arch-dep hook */
1225 1241 cpu_state_change_notify(cp->cpu_id, CPU_INTR_ON);
1226 1242 cpu_set_state(cp);
1227 1243 cyclic_online(cp);
1228 1244 /*
1229 1245 * This has to be called only after cyclic_online(). This
1230 1246 * function uses cyclics.
1231 1247 */
1232 1248 callout_cpu_online(cp);
1233 1249 poke_cpu(cp->cpu_id);
1234 1250 }
1235 1251
1236 1252 return (error);
1237 1253 }
1238 1254
1239 1255 /*
1240 1256 * Take the indicated CPU offline.
1241 1257 */
1242 1258 int
1243 1259 cpu_offline(cpu_t *cp, int flags)
1244 1260 {
1245 1261 cpupart_t *pp;
1246 1262 int error = 0;
1247 1263 cpu_t *ncp;
1248 1264 int intr_enable;
1249 1265 int cyclic_off = 0;
1250 1266 int callout_off = 0;
1251 1267 int loop_count;
1252 1268 int no_quiesce = 0;
1253 1269 int (*bound_func)(struct cpu *, int);
1254 1270 kthread_t *t;
1255 1271 lpl_t *cpu_lpl;
1256 1272 proc_t *p;
1257 1273 int lgrp_diff_lpl;
1258 1274 boolean_t unbind_all_threads = (flags & CPU_FORCED) != 0;
1259 1275
1260 1276 ASSERT(MUTEX_HELD(&cpu_lock));
1261 1277
1262 1278 /*
1263 1279 * If we're going from faulted or spare to offline, just
1264 1280 * clear these flags and update CPU state.
1265 1281 */
1266 1282 if (cp->cpu_flags & (CPU_FAULTED | CPU_SPARE)) {
1267 1283 if (cp->cpu_flags & CPU_FAULTED) {
1268 1284 cp->cpu_flags &= ~CPU_FAULTED;
1269 1285 mp_cpu_faulted_exit(cp);
1270 1286 }
1271 1287 cp->cpu_flags &= ~CPU_SPARE;
1272 1288 cpu_set_state(cp);
1273 1289 return (0);
1274 1290 }
1275 1291
1276 1292 /*
1277 1293 * Handle off-line request.
1278 1294 */
1279 1295 pp = cp->cpu_part;
1280 1296 /*
1281 1297 * Don't offline last online CPU in partition
1282 1298 */
1283 1299 if (ncpus_online <= 1 || pp->cp_ncpus <= 1 || cpu_intr_count(cp) < 2)
1284 1300 return (EBUSY);
1285 1301 /*
1286 1302 * Unbind all soft-bound threads bound to our CPU and hard bound threads
1287 1303 * if we were asked to.
1288 1304 */
1289 1305 error = cpu_unbind(cp->cpu_id, unbind_all_threads);
1290 1306 if (error != 0)
1291 1307 return (error);
1292 1308 /*
1293 1309 * We shouldn't be bound to this CPU ourselves.
1294 1310 */
1295 1311 if (curthread->t_bound_cpu == cp)
1296 1312 return (EBUSY);
1297 1313
1298 1314 /*
1299 1315 * Tell interested parties that this CPU is going offline.
1300 1316 */
1301 1317 CPU_NEW_GENERATION(cp);
1302 1318 cpu_state_change_notify(cp->cpu_id, CPU_OFF);
1303 1319
1304 1320 /*
1305 1321 * Tell the PG subsystem that the CPU is leaving the partition
1306 1322 */
1307 1323 pg_cpupart_out(cp, pp);
1308 1324
1309 1325 /*
1310 1326 * Take the CPU out of interrupt participation so we won't find
1311 1327 * bound kernel threads. If the architecture cannot completely
1312 1328 * shut off interrupts on the CPU, don't quiesce it, but don't
1313 1329 * run anything but interrupt thread... this is indicated by
 1314 1330 * the CPU_OFFLINE flag being on but the CPU_QUIESCED flag being
1315 1331 * off.
1316 1332 */
1317 1333 intr_enable = cp->cpu_flags & CPU_ENABLE;
1318 1334 if (intr_enable)
1319 1335 no_quiesce = cpu_intr_disable(cp);
1320 1336
1321 1337 /*
1322 1338 * Record that we are aiming to offline this cpu. This acts as
1323 1339 * a barrier to further weak binding requests in thread_nomigrate
1324 1340 * and also causes cpu_choose, disp_lowpri_cpu and setfrontdq to
1325 1341 * lean away from this cpu. Further strong bindings are already
1326 1342 * avoided since we hold cpu_lock. Since threads that are set
1327 1343 * runnable around now and others coming off the target cpu are
1328 1344 * directed away from the target, existing strong and weak bindings
1329 1345 * (especially the latter) to the target cpu stand maximum chance of
1330 1346 * being able to unbind during the short delay loop below (if other
1331 1347 * unbound threads compete they may not see cpu in time to unbind
 1332 1348 * even if they would do so immediately).
1333 1349 */
1334 1350 cpu_inmotion = cp;
1335 1351 membar_enter();
1336 1352
1337 1353 /*
1338 1354 * Check for kernel threads (strong or weak) bound to that CPU.
1339 1355 * Strongly bound threads may not unbind, and we'll have to return
1340 1356 * EBUSY. Weakly bound threads should always disappear - we've
1341 1357 * stopped more weak binding with cpu_inmotion and existing
1342 1358 * bindings will drain imminently (they may not block). Nonetheless
1343 1359 * we will wait for a fixed period for all bound threads to disappear.
1344 1360 * Inactive interrupt threads are OK (they'll be in TS_FREE
1345 1361 * state). If test finds some bound threads, wait a few ticks
1346 1362 * to give short-lived threads (such as interrupts) chance to
1347 1363 * complete. Note that if no_quiesce is set, i.e. this cpu
1348 1364 * is required to service interrupts, then we take the route
1349 1365 * that permits interrupt threads to be active (or bypassed).
1350 1366 */
1351 1367 bound_func = no_quiesce ? disp_bound_threads : disp_bound_anythreads;
1352 1368
1353 1369 again: for (loop_count = 0; (*bound_func)(cp, 0); loop_count++) {
1354 1370 if (loop_count >= 5) {
1355 1371 error = EBUSY; /* some threads still bound */
1356 1372 break;
1357 1373 }
1358 1374
1359 1375 /*
1360 1376 * If some threads were assigned, give them
1361 1377 * a chance to complete or move.
1362 1378 *
1363 1379 * This assumes that the clock_thread is not bound
1364 1380 * to any CPU, because the clock_thread is needed to
1365 1381 * do the delay(hz/100).
1366 1382 *
1367 1383 * Note: we still hold the cpu_lock while waiting for
1368 1384 * the next clock tick. This is OK since it isn't
1369 1385 * needed for anything else except processor_bind(2),
1370 1386 * and system initialization. If we drop the lock,
1371 1387 * we would risk another p_online disabling the last
1372 1388 * processor.
1373 1389 */
1374 1390 delay(hz/100);
1375 1391 }
1376 1392
1377 1393 if (error == 0 && callout_off == 0) {
1378 1394 callout_cpu_offline(cp);
1379 1395 callout_off = 1;
1380 1396 }
1381 1397
1382 1398 if (error == 0 && cyclic_off == 0) {
1383 1399 if (!cyclic_offline(cp)) {
1384 1400 /*
1385 1401 * We must have bound cyclics...
1386 1402 */
1387 1403 error = EBUSY;
1388 1404 goto out;
1389 1405 }
1390 1406 cyclic_off = 1;
1391 1407 }
1392 1408
1393 1409 /*
1394 1410 * Call mp_cpu_stop() to perform any special operations
1395 1411 * needed for this machine architecture to offline a CPU.
1396 1412 */
1397 1413 if (error == 0)
1398 1414 error = mp_cpu_stop(cp); /* arch-dep hook */
1399 1415
1400 1416 /*
1401 1417 * If that all worked, take the CPU offline and decrement
1402 1418 * ncpus_online.
1403 1419 */
1404 1420 if (error == 0) {
1405 1421 /*
1406 1422 * Put all the cpus into a known safe place.
1407 1423 * No mutexes can be entered while CPUs are paused.
1408 1424 */
1409 1425 pause_cpus(cp, NULL);
1410 1426 /*
1411 1427 * Repeat the operation, if necessary, to make sure that
1412 1428 * all outstanding low-level interrupts run to completion
1413 1429 * before we set the CPU_QUIESCED flag. It's also possible
1414 1430 * that a thread has weak bound to the cpu despite our raising
1415 1431 * cpu_inmotion above since it may have loaded that
1416 1432 * value before the barrier became visible (this would have
1417 1433 * to be the thread that was on the target cpu at the time
1418 1434 * we raised the barrier).
1419 1435 */
1420 1436 if ((!no_quiesce && cp->cpu_intr_actv != 0) ||
1421 1437 (*bound_func)(cp, 1)) {
1422 1438 start_cpus();
1423 1439 (void) mp_cpu_start(cp);
1424 1440 goto again;
1425 1441 }
1426 1442 ncp = cp->cpu_next_part;
1427 1443 cpu_lpl = cp->cpu_lpl;
1428 1444 ASSERT(cpu_lpl != NULL);
1429 1445
1430 1446 /*
1431 1447 * Remove the CPU from the list of active CPUs.
1432 1448 */
1433 1449 cpu_remove_active(cp);
1434 1450
1435 1451 /*
1436 1452 * Walk the active process list and look for threads
1437 1453 * whose home lgroup needs to be updated, or
1438 1454 * the last CPU they run on is the one being offlined now.
1439 1455 */
1440 1456
1441 1457 ASSERT(curthread->t_cpu != cp);
1442 1458 for (p = practive; p != NULL; p = p->p_next) {
1443 1459
1444 1460 t = p->p_tlist;
1445 1461
1446 1462 if (t == NULL)
1447 1463 continue;
1448 1464
1449 1465 lgrp_diff_lpl = 0;
1450 1466
1451 1467 do {
1452 1468 ASSERT(t->t_lpl != NULL);
1453 1469 /*
1454 1470 * Taking last CPU in lpl offline
1455 1471 * Rehome thread if it is in this lpl
1456 1472 * Otherwise, update the count of how many
1457 1473 * threads are in this CPU's lgroup but have
1458 1474 * a different lpl.
1459 1475 */
1460 1476
1461 1477 if (cpu_lpl->lpl_ncpu == 0) {
1462 1478 if (t->t_lpl == cpu_lpl)
1463 1479 lgrp_move_thread(t,
1464 1480 lgrp_choose(t,
1465 1481 t->t_cpupart), 0);
1466 1482 else if (t->t_lpl->lpl_lgrpid ==
1467 1483 cpu_lpl->lpl_lgrpid)
1468 1484 lgrp_diff_lpl++;
1469 1485 }
1470 1486 ASSERT(t->t_lpl->lpl_ncpu > 0);
1471 1487
1472 1488 /*
1473 1489 * Update CPU last ran on if it was this CPU
1474 1490 */
1475 1491 if (t->t_cpu == cp && t->t_bound_cpu != cp)
1476 1492 t->t_cpu = disp_lowpri_cpu(ncp,
1477 1493 t->t_lpl, t->t_pri, NULL);
1478 1494 ASSERT(t->t_cpu != cp || t->t_bound_cpu == cp ||
1479 1495 t->t_weakbound_cpu == cp);
1480 1496
1481 1497 t = t->t_forw;
1482 1498 } while (t != p->p_tlist);
1483 1499
1484 1500 /*
1485 1501 * Didn't find any threads in the same lgroup as this
1486 1502 * CPU with a different lpl, so remove the lgroup from
1487 1503 * the process lgroup bitmask.
1488 1504 */
1489 1505
1490 1506 if (lgrp_diff_lpl == 0)
1491 1507 klgrpset_del(p->p_lgrpset, cpu_lpl->lpl_lgrpid);
1492 1508 }
1493 1509
1494 1510 /*
1495 1511 * Walk thread list looking for threads that need to be
1496 1512 * rehomed, since there are some threads that are not in
1497 1513 * their process's p_tlist.
1498 1514 */
1499 1515
1500 1516 t = curthread;
1501 1517 do {
1502 1518 ASSERT(t != NULL && t->t_lpl != NULL);
1503 1519
1504 1520 /*
1505 1521 * Rehome threads with same lpl as this CPU when this
1506 1522 * is the last CPU in the lpl.
1507 1523 */
1508 1524
1509 1525 if ((cpu_lpl->lpl_ncpu == 0) && (t->t_lpl == cpu_lpl))
1510 1526 lgrp_move_thread(t,
1511 1527 lgrp_choose(t, t->t_cpupart), 1);
1512 1528
1513 1529 ASSERT(t->t_lpl->lpl_ncpu > 0);
1514 1530
1515 1531 /*
1516 1532 * Update CPU last ran on if it was this CPU
1517 1533 */
1518 1534
1519 1535 if (t->t_cpu == cp && t->t_bound_cpu != cp) {
1520 1536 t->t_cpu = disp_lowpri_cpu(ncp,
1521 1537 t->t_lpl, t->t_pri, NULL);
1522 1538 }
1523 1539 ASSERT(t->t_cpu != cp || t->t_bound_cpu == cp ||
1524 1540 t->t_weakbound_cpu == cp);
1525 1541 t = t->t_next;
1526 1542
1527 1543 } while (t != curthread);
1528 1544 ASSERT((cp->cpu_flags & (CPU_FAULTED | CPU_SPARE)) == 0);
1529 1545 cp->cpu_flags |= CPU_OFFLINE;
1530 1546 disp_cpu_inactive(cp);
1531 1547 if (!no_quiesce)
1532 1548 cp->cpu_flags |= CPU_QUIESCED;
1533 1549 ncpus_online--;
1534 1550 cpu_set_state(cp);
1535 1551 cpu_inmotion = NULL;
1536 1552 start_cpus();
1537 1553 cpu_stats_kstat_destroy(cp);
1538 1554 cpu_delete_intrstat(cp);
1539 1555 lgrp_kstat_destroy(cp);
1540 1556 }
1541 1557
1542 1558 out:
1543 1559 cpu_inmotion = NULL;
1544 1560
1545 1561 /*
1546 1562 * If we failed, re-enable interrupts.
1547 1563 * Do this even if cpu_intr_disable returned an error, because
1548 1564 * it may have partially disabled interrupts.
1549 1565 */
1550 1566 if (error && intr_enable)
1551 1567 cpu_intr_enable(cp);
1552 1568
1553 1569 /*
1554 1570 * If we failed, but managed to offline the cyclic subsystem on this
1555 1571 * CPU, bring it back online.
1556 1572 */
1557 1573 if (error && cyclic_off)
1558 1574 cyclic_online(cp);
1559 1575
1560 1576 /*
1561 1577 * If we failed, but managed to offline callouts on this CPU,
1562 1578 * bring it back online.
1563 1579 */
1564 1580 if (error && callout_off)
1565 1581 callout_cpu_online(cp);
1566 1582
1567 1583 /*
1568 1584 * If we failed, tell the PG subsystem that the CPU is back
1569 1585 */
1570 1586 pg_cpupart_in(cp, pp);
1571 1587
1572 1588 /*
1573 1589 * If we failed, we need to notify everyone that this CPU is back on.
1574 1590 */
1575 1591 if (error != 0) {
1576 1592 CPU_NEW_GENERATION(cp);
1577 1593 cpu_state_change_notify(cp->cpu_id, CPU_ON);
1578 1594 cpu_state_change_notify(cp->cpu_id, CPU_INTR_ON);
1579 1595 }
1580 1596
1581 1597 return (error);
1582 1598 }
1583 1599
1584 1600 /*
1585 1601 * Mark the indicated CPU as faulted, taking it offline.
1586 1602 */
1587 1603 int
1588 1604 cpu_faulted(cpu_t *cp, int flags)
1589 1605 {
1590 1606 int error = 0;
1591 1607
1592 1608 ASSERT(MUTEX_HELD(&cpu_lock));
1593 1609 ASSERT(!cpu_is_poweredoff(cp));
1594 1610
1595 1611 if (cpu_is_offline(cp)) {
1596 1612 cp->cpu_flags &= ~CPU_SPARE;
1597 1613 cp->cpu_flags |= CPU_FAULTED;
1598 1614 mp_cpu_faulted_enter(cp);
1599 1615 cpu_set_state(cp);
1600 1616 return (0);
1601 1617 }
1602 1618
1603 1619 if ((error = cpu_offline(cp, flags)) == 0) {
1604 1620 cp->cpu_flags |= CPU_FAULTED;
1605 1621 mp_cpu_faulted_enter(cp);
1606 1622 cpu_set_state(cp);
1607 1623 }
1608 1624
1609 1625 return (error);
1610 1626 }
1611 1627
1612 1628 /*
1613 1629 * Mark the indicated CPU as a spare, taking it offline.
1614 1630 */
1615 1631 int
1616 1632 cpu_spare(cpu_t *cp, int flags)
1617 1633 {
1618 1634 int error = 0;
1619 1635
1620 1636 ASSERT(MUTEX_HELD(&cpu_lock));
1621 1637 ASSERT(!cpu_is_poweredoff(cp));
1622 1638
1623 1639 if (cpu_is_offline(cp)) {
1624 1640 if (cp->cpu_flags & CPU_FAULTED) {
1625 1641 cp->cpu_flags &= ~CPU_FAULTED;
1626 1642 mp_cpu_faulted_exit(cp);
1627 1643 }
1628 1644 cp->cpu_flags |= CPU_SPARE;
1629 1645 cpu_set_state(cp);
1630 1646 return (0);
1631 1647 }
1632 1648
1633 1649 if ((error = cpu_offline(cp, flags)) == 0) {
1634 1650 cp->cpu_flags |= CPU_SPARE;
1635 1651 cpu_set_state(cp);
1636 1652 }
1637 1653
1638 1654 return (error);
1639 1655 }
1640 1656
1641 1657 /*
1642 1658 * Take the indicated CPU from poweroff to offline.
1643 1659 */
1644 1660 int
1645 1661 cpu_poweron(cpu_t *cp)
1646 1662 {
1647 1663 int error = ENOTSUP;
1648 1664
1649 1665 ASSERT(MUTEX_HELD(&cpu_lock));
1650 1666 ASSERT(cpu_is_poweredoff(cp));
1651 1667
1652 1668 error = mp_cpu_poweron(cp); /* arch-dep hook */
1653 1669 if (error == 0)
1654 1670 cpu_set_state(cp);
1655 1671
1656 1672 return (error);
1657 1673 }
1658 1674
1659 1675 /*
1660 1676 * Take the indicated CPU from any inactive state to powered off.
1661 1677 */
1662 1678 int
1663 1679 cpu_poweroff(cpu_t *cp)
1664 1680 {
1665 1681 int error = ENOTSUP;
1666 1682
1667 1683 ASSERT(MUTEX_HELD(&cpu_lock));
1668 1684 ASSERT(cpu_is_offline(cp));
1669 1685
1670 1686 if (!(cp->cpu_flags & CPU_QUIESCED))
1671 1687 return (EBUSY); /* not completely idle */
1672 1688
1673 1689 error = mp_cpu_poweroff(cp); /* arch-dep hook */
1674 1690 if (error == 0)
1675 1691 cpu_set_state(cp);
1676 1692
1677 1693 return (error);
1678 1694 }
1679 1695
1680 1696 /*
1681 1697 * Initialize the Sequential CPU id lookup table
1682 1698 */
1683 1699 void
1684 1700 cpu_seq_tbl_init()
1685 1701 {
1686 1702 cpu_t **tbl;
1687 1703
1688 1704 tbl = kmem_zalloc(sizeof (struct cpu *) * max_ncpus, KM_SLEEP);
1689 1705 tbl[0] = CPU;
1690 1706
1691 1707 cpu_seq = tbl;
1692 1708 }
1693 1709
1694 1710 /*
1695 1711 * Initialize the CPU lists for the first CPU.
1696 1712 */
1697 1713 void
1698 1714 cpu_list_init(cpu_t *cp)
1699 1715 {
1700 1716 cp->cpu_next = cp;
1701 1717 cp->cpu_prev = cp;
1702 1718 cpu_list = cp;
1703 1719 clock_cpu_list = cp;
1704 1720
1705 1721 cp->cpu_next_onln = cp;
1706 1722 cp->cpu_prev_onln = cp;
1707 1723 cpu_active = cp;
1708 1724
1709 1725 cp->cpu_seqid = 0;
1710 1726 CPUSET_ADD(cpu_seqid_inuse, 0);
1711 1727
1712 1728 /*
1713 1729 * Bootstrap cpu_seq using cpu_list
1714 1730 * The cpu_seq[] table will be dynamically allocated
1715 1731 * when kmem later becomes available (but before going MP)
1716 1732 */
1717 1733 cpu_seq = &cpu_list;
1718 1734
1719 1735 cp->cpu_cache_offset = KMEM_CPU_CACHE_OFFSET(cp->cpu_seqid);
1720 1736 cp_default.cp_cpulist = cp;
1721 1737 cp_default.cp_ncpus = 1;
1722 1738 cp->cpu_next_part = cp;
1723 1739 cp->cpu_prev_part = cp;
1724 1740 cp->cpu_part = &cp_default;
1725 1741
1726 1742 CPUSET_ADD(cpu_available, cp->cpu_id);
1727 1743 }
1728 1744
1729 1745 /*
1730 1746 * Insert a CPU into the list of available CPUs.
1731 1747 */
1732 1748 void
1733 1749 cpu_add_unit(cpu_t *cp)
1734 1750 {
1735 1751 int seqid;
1736 1752
1737 1753 ASSERT(MUTEX_HELD(&cpu_lock));
1738 1754 ASSERT(cpu_list != NULL); /* list started in cpu_list_init */
1739 1755
1740 1756 lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)cp, 0);
1741 1757
1742 1758 /*
1743 1759 * Note: most users of the cpu_list will grab the
1744 1760 	 * cpu_lock to ensure that it isn't modified. However,
1745 1761 * certain users can't or won't do that. To allow this
1746 1762 * we pause the other cpus. Users who walk the list
1747 1763 	 * without cpu_lock must disable kernel preemption
1748 1764 	 * to ensure that the list isn't modified underneath
1749 1765 * them. Also, any cached pointers to cpu structures
1750 1766 * must be revalidated by checking to see if the
1751 1767 * cpu_next pointer points to itself. This check must
1752 1768 * be done with the cpu_lock held or kernel preemption
1753 1769 * disabled. This check relies upon the fact that
1754 1770 	 * old cpu structures are not freed or cleared after
1755 1771 	 * they are removed from the cpu_list.
1756 1772 *
1757 1773 * Note that the clock code walks the cpu list dereferencing
1758 1774 * the cpu_part pointer, so we need to initialize it before
1759 1775 * adding the cpu to the list.
1760 1776 */
1761 1777 cp->cpu_part = &cp_default;
1762 1778 pause_cpus(NULL, NULL);
1763 1779 cp->cpu_next = cpu_list;
1764 1780 cp->cpu_prev = cpu_list->cpu_prev;
1765 1781 cpu_list->cpu_prev->cpu_next = cp;
1766 1782 cpu_list->cpu_prev = cp;
1767 1783 start_cpus();
1768 1784
1769 1785 for (seqid = 0; CPU_IN_SET(cpu_seqid_inuse, seqid); seqid++)
1770 1786 continue;
1771 1787 CPUSET_ADD(cpu_seqid_inuse, seqid);
1772 1788 cp->cpu_seqid = seqid;
1773 1789
1774 1790 if (seqid > max_cpu_seqid_ever)
1775 1791 max_cpu_seqid_ever = seqid;
1776 1792
1777 1793 ASSERT(ncpus < max_ncpus);
1778 1794 ncpus++;
1779 1795 cp->cpu_cache_offset = KMEM_CPU_CACHE_OFFSET(cp->cpu_seqid);
1780 1796 cpu[cp->cpu_id] = cp;
1781 1797 CPUSET_ADD(cpu_available, cp->cpu_id);
1782 1798 cpu_seq[cp->cpu_seqid] = cp;
1783 1799
1784 1800 /*
1785 1801 * allocate a pause thread for this CPU.
1786 1802 */
1787 1803 cpu_pause_alloc(cp);
1788 1804
1789 1805 /*
1790 1806 * So that new CPUs won't have NULL prev_onln and next_onln pointers,
1791 1807 * link them into a list of just that CPU.
1792 1808 * This is so that disp_lowpri_cpu will work for thread_create in
1793 1809 * pause_cpus() when called from the startup thread in a new CPU.
1794 1810 */
1795 1811 cp->cpu_next_onln = cp;
1796 1812 cp->cpu_prev_onln = cp;
1797 1813 cpu_info_kstat_create(cp);
1798 1814 cp->cpu_next_part = cp;
1799 1815 cp->cpu_prev_part = cp;
1800 1816
1801 1817 init_cpu_mstate(cp, CMS_SYSTEM);
1802 1818
1803 1819 pool_pset_mod = gethrtime();
1804 1820 }
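
The note inside cpu_add_unit() above describes how a consumer may walk cpu_list without holding cpu_lock: disable kernel preemption for the duration of the walk, and revalidate any cached cpu pointers. A minimal sketch of such a walker follows; it is illustrative only and not part of this file, the function name walk_cpus_nopreempt is hypothetical, and it assumes kernel context with <sys/cpuvar.h> available.

	static void
	walk_cpus_nopreempt(void (*visit)(cpu_t *))
	{
		cpu_t *start, *cp;

		kpreempt_disable();	/* keep cpu_list stable while we walk it */
		start = cp = CPU;	/* the current CPU is always on the list */
		do {
			visit(cp);
			cp = cp->cpu_next;
		} while (cp != NULL && cp != start);
		kpreempt_enable();
	}

The cp->cpu_next != NULL test mirrors the revalidation rule spelled out in cpu_del_unit() below, where a deleted cpu's cpu_next is set to NULL as the deletion signal.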
1805 1821
1806 1822 /*
1807 1823 * Do the opposite of cpu_add_unit().
1808 1824 */
1809 1825 void
1810 1826 cpu_del_unit(int cpuid)
1811 1827 {
1812 1828 struct cpu *cp, *cpnext;
1813 1829
1814 1830 ASSERT(MUTEX_HELD(&cpu_lock));
1815 1831 cp = cpu[cpuid];
1816 1832 ASSERT(cp != NULL);
1817 1833
1818 1834 ASSERT(cp->cpu_next_onln == cp);
1819 1835 ASSERT(cp->cpu_prev_onln == cp);
1820 1836 ASSERT(cp->cpu_next_part == cp);
1821 1837 ASSERT(cp->cpu_prev_part == cp);
1822 1838
1823 1839 /*
1824 1840 * Tear down the CPU's physical ID cache, and update any
1825 1841 * processor groups
1826 1842 */
1827 1843 pg_cpu_fini(cp, NULL);
1828 1844 pghw_physid_destroy(cp);
1829 1845
1830 1846 /*
1831 1847 * Destroy kstat stuff.
1832 1848 */
1833 1849 cpu_info_kstat_destroy(cp);
1834 1850 term_cpu_mstate(cp);
1835 1851 /*
1836 1852 * Free up pause thread.
1837 1853 */
1838 1854 cpu_pause_free(cp);
1839 1855 CPUSET_DEL(cpu_available, cp->cpu_id);
1840 1856 cpu[cp->cpu_id] = NULL;
1841 1857 cpu_seq[cp->cpu_seqid] = NULL;
1842 1858
1843 1859 /*
1844 1860 * The clock thread and mutex_vector_enter cannot hold the
1845 1861 	 * cpu_lock while traversing the cpu list; therefore we pause
1846 1862 * all other threads by pausing the other cpus. These, and any
1847 1863 * other routines holding cpu pointers while possibly sleeping
1848 1864 * must be sure to call kpreempt_disable before processing the
1849 1865 * list and be sure to check that the cpu has not been deleted
1850 1866 * after any sleeps (check cp->cpu_next != NULL). We guarantee
1851 1867 * to keep the deleted cpu structure around.
1852 1868 *
1853 1869 * Note that this MUST be done AFTER cpu_available
1854 1870 * has been updated so that we don't waste time
1855 1871 * trying to pause the cpu we're trying to delete.
1856 1872 */
1857 1873 pause_cpus(NULL, NULL);
1858 1874
1859 1875 cpnext = cp->cpu_next;
1860 1876 cp->cpu_prev->cpu_next = cp->cpu_next;
1861 1877 cp->cpu_next->cpu_prev = cp->cpu_prev;
1862 1878 if (cp == cpu_list)
1863 1879 cpu_list = cpnext;
1864 1880
1865 1881 /*
1866 1882 * Signals that the cpu has been deleted (see above).
1867 1883 */
1868 1884 cp->cpu_next = NULL;
1869 1885 cp->cpu_prev = NULL;
1870 1886
1871 1887 start_cpus();
1872 1888
1873 1889 CPUSET_DEL(cpu_seqid_inuse, cp->cpu_seqid);
1874 1890 ncpus--;
1875 1891 lgrp_config(LGRP_CONFIG_CPU_DEL, (uintptr_t)cp, 0);
1876 1892
1877 1893 pool_pset_mod = gethrtime();
1878 1894 }
1879 1895
1880 1896 /*
1881 1897 * Add a CPU to the list of active CPUs.
1882 1898 * This routine must not get any locks, because other CPUs are paused.
1883 1899 */
1884 1900 static void
1885 1901 cpu_add_active_internal(cpu_t *cp)
1886 1902 {
1887 1903 cpupart_t *pp = cp->cpu_part;
1888 1904
1889 1905 ASSERT(MUTEX_HELD(&cpu_lock));
1890 1906 ASSERT(cpu_list != NULL); /* list started in cpu_list_init */
1891 1907
1892 1908 ncpus_online++;
1893 1909 cpu_set_state(cp);
1894 1910 cp->cpu_next_onln = cpu_active;
1895 1911 cp->cpu_prev_onln = cpu_active->cpu_prev_onln;
1896 1912 cpu_active->cpu_prev_onln->cpu_next_onln = cp;
1897 1913 cpu_active->cpu_prev_onln = cp;
1898 1914
1899 1915 if (pp->cp_cpulist) {
1900 1916 cp->cpu_next_part = pp->cp_cpulist;
1901 1917 cp->cpu_prev_part = pp->cp_cpulist->cpu_prev_part;
1902 1918 pp->cp_cpulist->cpu_prev_part->cpu_next_part = cp;
1903 1919 pp->cp_cpulist->cpu_prev_part = cp;
1904 1920 } else {
1905 1921 ASSERT(pp->cp_ncpus == 0);
1906 1922 pp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
1907 1923 }
1908 1924 pp->cp_ncpus++;
1909 1925 if (pp->cp_ncpus == 1) {
1910 1926 cp_numparts_nonempty++;
1911 1927 ASSERT(cp_numparts_nonempty != 0);
1912 1928 }
1913 1929
1914 1930 pg_cpu_active(cp);
1915 1931 lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)cp, 0);
1916 1932
1917 1933 bzero(&cp->cpu_loadavg, sizeof (cp->cpu_loadavg));
1918 1934 }
1919 1935
1920 1936 /*
1921 1937 * Add a CPU to the list of active CPUs.
1922 1938 * This is called from machine-dependent layers when a new CPU is started.
1923 1939 */
1924 1940 void
1925 1941 cpu_add_active(cpu_t *cp)
1926 1942 {
1927 1943 pg_cpupart_in(cp, cp->cpu_part);
1928 1944
1929 1945 pause_cpus(NULL, NULL);
1930 1946 cpu_add_active_internal(cp);
1931 1947 start_cpus();
1932 1948
1933 1949 cpu_stats_kstat_create(cp);
1934 1950 cpu_create_intrstat(cp);
1935 1951 lgrp_kstat_create(cp);
1936 1952 cpu_state_change_notify(cp->cpu_id, CPU_INIT);
1937 1953 }
1938 1954
1939 1955
1940 1956 /*
1941 1957 * Remove a CPU from the list of active CPUs.
1942 1958 * This routine must not get any locks, because other CPUs are paused.
1943 1959 */
1944 1960 /* ARGSUSED */
1945 1961 static void
1946 1962 cpu_remove_active(cpu_t *cp)
1947 1963 {
1948 1964 cpupart_t *pp = cp->cpu_part;
1949 1965
1950 1966 ASSERT(MUTEX_HELD(&cpu_lock));
1951 1967 ASSERT(cp->cpu_next_onln != cp); /* not the last one */
1952 1968 ASSERT(cp->cpu_prev_onln != cp); /* not the last one */
1953 1969
1954 1970 pg_cpu_inactive(cp);
1955 1971
1956 1972 lgrp_config(LGRP_CONFIG_CPU_OFFLINE, (uintptr_t)cp, 0);
1957 1973
1958 1974 if (cp == clock_cpu_list)
1959 1975 clock_cpu_list = cp->cpu_next_onln;
1960 1976
1961 1977 cp->cpu_prev_onln->cpu_next_onln = cp->cpu_next_onln;
1962 1978 cp->cpu_next_onln->cpu_prev_onln = cp->cpu_prev_onln;
1963 1979 if (cpu_active == cp) {
1964 1980 cpu_active = cp->cpu_next_onln;
1965 1981 }
1966 1982 cp->cpu_next_onln = cp;
1967 1983 cp->cpu_prev_onln = cp;
1968 1984
1969 1985 cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
1970 1986 cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
1971 1987 if (pp->cp_cpulist == cp) {
1972 1988 pp->cp_cpulist = cp->cpu_next_part;
1973 1989 ASSERT(pp->cp_cpulist != cp);
1974 1990 }
1975 1991 cp->cpu_next_part = cp;
1976 1992 cp->cpu_prev_part = cp;
1977 1993 pp->cp_ncpus--;
1978 1994 if (pp->cp_ncpus == 0) {
1979 1995 cp_numparts_nonempty--;
1980 1996 ASSERT(cp_numparts_nonempty != 0);
1981 1997 }
1982 1998 }
1983 1999
1984 2000 /*
1985 2001 * Routine used to setup a newly inserted CPU in preparation for starting
1986 2002 * it running code.
1987 2003 */
1988 2004 int
1989 2005 cpu_configure(int cpuid)
1990 2006 {
1991 2007 int retval = 0;
1992 2008
1993 2009 ASSERT(MUTEX_HELD(&cpu_lock));
1994 2010
1995 2011 /*
1996 2012 * Some structures are statically allocated based upon
1997 2013 * the maximum number of cpus the system supports. Do not
1998 2014 * try to add anything beyond this limit.
1999 2015 */
2000 2016 if (cpuid < 0 || cpuid >= NCPU) {
2001 2017 return (EINVAL);
2002 2018 }
2003 2019
2004 2020 if ((cpu[cpuid] != NULL) && (cpu[cpuid]->cpu_flags != 0)) {
2005 2021 return (EALREADY);
2006 2022 }
2007 2023
2008 2024 if ((retval = mp_cpu_configure(cpuid)) != 0) {
2009 2025 return (retval);
2010 2026 }
2011 2027
2012 2028 cpu[cpuid]->cpu_flags = CPU_QUIESCED | CPU_OFFLINE | CPU_POWEROFF;
2013 2029 cpu_set_state(cpu[cpuid]);
2014 2030 retval = cpu_state_change_hooks(cpuid, CPU_CONFIG, CPU_UNCONFIG);
2015 2031 if (retval != 0)
2016 2032 (void) mp_cpu_unconfigure(cpuid);
2017 2033
2018 2034 return (retval);
2019 2035 }
2020 2036
2021 2037 /*
2022 2038 * Routine used to cleanup a CPU that has been powered off. This will
2023 2039 * destroy all per-cpu information related to this cpu.
2024 2040 */
2025 2041 int
2026 2042 cpu_unconfigure(int cpuid)
2027 2043 {
2028 2044 int error;
2029 2045
2030 2046 ASSERT(MUTEX_HELD(&cpu_lock));
2031 2047
2032 2048 if (cpu[cpuid] == NULL) {
2033 2049 return (ENODEV);
2034 2050 }
2035 2051
2036 2052 if (cpu[cpuid]->cpu_flags == 0) {
2037 2053 return (EALREADY);
2038 2054 }
2039 2055
2040 2056 if ((cpu[cpuid]->cpu_flags & CPU_POWEROFF) == 0) {
2041 2057 return (EBUSY);
2042 2058 }
2043 2059
2044 2060 if (cpu[cpuid]->cpu_props != NULL) {
2045 2061 (void) nvlist_free(cpu[cpuid]->cpu_props);
2046 2062 cpu[cpuid]->cpu_props = NULL;
2047 2063 }
2048 2064
2049 2065 error = cpu_state_change_hooks(cpuid, CPU_UNCONFIG, CPU_CONFIG);
2050 2066
2051 2067 if (error != 0)
2052 2068 return (error);
2053 2069
2054 2070 return (mp_cpu_unconfigure(cpuid));
2055 2071 }
2056 2072
2057 2073 /*
2058 2074 * Routines for registering and de-registering cpu_setup callback functions.
2059 2075 *
2060 2076 * Caller's context
2061 2077 * These routines must not be called from a driver's attach(9E) or
2062 2078 * detach(9E) entry point.
2063 2079 *
2064 2080 * NOTE: CPU callbacks should not block. They are called with cpu_lock held.
2065 2081 */
2066 2082
2067 2083 /*
2068 2084 * Ideally, these would be dynamically allocated and put into a linked
2069 2085  * list; however, that is not feasible because the registration routine
2070 2086 * has to be available before the kmem allocator is working (in fact,
2071 2087 * it is called by the kmem allocator init code). In any case, there
2072 2088 * are quite a few extra entries for future users.
2073 2089 */
2074 2090 #define NCPU_SETUPS 20
2075 2091
2076 2092 struct cpu_setup {
2077 2093 cpu_setup_func_t *func;
2078 2094 void *arg;
2079 2095 } cpu_setups[NCPU_SETUPS];
2080 2096
2081 2097 void
2082 2098 register_cpu_setup_func(cpu_setup_func_t *func, void *arg)
2083 2099 {
2084 2100 int i;
2085 2101
2086 2102 ASSERT(MUTEX_HELD(&cpu_lock));
2087 2103
2088 2104 for (i = 0; i < NCPU_SETUPS; i++)
2089 2105 if (cpu_setups[i].func == NULL)
2090 2106 break;
2091 2107 if (i >= NCPU_SETUPS)
2092 2108 cmn_err(CE_PANIC, "Ran out of cpu_setup callback entries");
2093 2109
2094 2110 cpu_setups[i].func = func;
2095 2111 cpu_setups[i].arg = arg;
2096 2112 }
2097 2113
2098 2114 void
2099 2115 unregister_cpu_setup_func(cpu_setup_func_t *func, void *arg)
2100 2116 {
2101 2117 int i;
2102 2118
2103 2119 ASSERT(MUTEX_HELD(&cpu_lock));
2104 2120
2105 2121 for (i = 0; i < NCPU_SETUPS; i++)
2106 2122 if ((cpu_setups[i].func == func) &&
2107 2123 (cpu_setups[i].arg == arg))
2108 2124 break;
2109 2125 if (i >= NCPU_SETUPS)
2110 2126 cmn_err(CE_PANIC, "Could not find cpu_setup callback to "
2111 2127 "deregister");
2112 2128
2113 2129 cpu_setups[i].func = NULL;
2114 2130 cpu_setups[i].arg = 0;
2115 2131 }
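
For illustration, a subsystem would typically register one of these callbacks as sketched below. This is a hedged example, not code from this file: example_cpu_setup and example_init are hypothetical names, and the callback body is a placeholder; only the cpu_setup_func_t signature, the CPU_ON/CPU_OFF events, and the register-under-cpu_lock pattern come from the interface above.

	/* Hypothetical sketch: registering a cpu_setup callback. */
	static int
	example_cpu_setup(cpu_setup_t what, int id, void *arg)
	{
		/* Called with cpu_lock held; must not block. */
		switch (what) {
		case CPU_ON:
			/* set up per-CPU state for CPU `id' */
			break;
		case CPU_OFF:
			/* tear down per-CPU state for CPU `id' */
			break;
		default:
			break;
		}
		return (0);
	}

	static void
	example_init(void)
	{
		mutex_enter(&cpu_lock);
		register_cpu_setup_func(example_cpu_setup, NULL);
		mutex_exit(&cpu_lock);
	}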
2116 2132
2117 2133 /*
2118 2134 * Call any state change hooks for this CPU, ignore any errors.
2119 2135 */
2120 2136 void
2121 2137 cpu_state_change_notify(int id, cpu_setup_t what)
2122 2138 {
2123 2139 int i;
2124 2140
2125 2141 ASSERT(MUTEX_HELD(&cpu_lock));
2126 2142
2127 2143 for (i = 0; i < NCPU_SETUPS; i++) {
2128 2144 if (cpu_setups[i].func != NULL) {
2129 2145 cpu_setups[i].func(what, id, cpu_setups[i].arg);
2130 2146 }
2131 2147 }
2132 2148 }
2133 2149
2134 2150 /*
2135 2151 * Call any state change hooks for this CPU, undo it if error found.
2136 2152 */
2137 2153 static int
2138 2154 cpu_state_change_hooks(int id, cpu_setup_t what, cpu_setup_t undo)
2139 2155 {
2140 2156 int i;
2141 2157 int retval = 0;
2142 2158
2143 2159 ASSERT(MUTEX_HELD(&cpu_lock));
2144 2160
2145 2161 for (i = 0; i < NCPU_SETUPS; i++) {
2146 2162 if (cpu_setups[i].func != NULL) {
2147 2163 retval = cpu_setups[i].func(what, id,
2148 2164 cpu_setups[i].arg);
2149 2165 if (retval) {
2150 2166 for (i--; i >= 0; i--) {
2151 2167 if (cpu_setups[i].func != NULL)
2152 2168 cpu_setups[i].func(undo,
2153 2169 id, cpu_setups[i].arg);
2154 2170 }
2155 2171 break;
2156 2172 }
2157 2173 }
2158 2174 }
2159 2175 return (retval);
2160 2176 }
2161 2177
2162 2178 /*
2163 2179 * Export information about this CPU via the kstat mechanism.
2164 2180 */
2165 2181 static struct {
2166 2182 kstat_named_t ci_state;
2167 2183 kstat_named_t ci_state_begin;
2168 2184 kstat_named_t ci_cpu_type;
2169 2185 kstat_named_t ci_fpu_type;
2170 2186 kstat_named_t ci_clock_MHz;
2171 2187 kstat_named_t ci_chip_id;
2172 2188 kstat_named_t ci_implementation;
2173 2189 kstat_named_t ci_brandstr;
2174 2190 kstat_named_t ci_core_id;
2175 2191 kstat_named_t ci_curr_clock_Hz;
2176 2192 kstat_named_t ci_supp_freq_Hz;
2177 2193 kstat_named_t ci_pg_id;
2178 2194 #if defined(__sparcv9)
2179 2195 kstat_named_t ci_device_ID;
2180 2196 kstat_named_t ci_cpu_fru;
2181 2197 #endif
2182 2198 #if defined(__x86)
2183 2199 kstat_named_t ci_vendorstr;
2184 2200 kstat_named_t ci_family;
2185 2201 kstat_named_t ci_model;
2186 2202 kstat_named_t ci_step;
2187 2203 kstat_named_t ci_clogid;
2188 2204 kstat_named_t ci_pkg_core_id;
2189 2205 kstat_named_t ci_ncpuperchip;
2190 2206 kstat_named_t ci_ncoreperchip;
2191 2207 kstat_named_t ci_max_cstates;
2192 2208 kstat_named_t ci_curr_cstate;
2193 2209 kstat_named_t ci_cacheid;
2194 2210 kstat_named_t ci_sktstr;
2195 2211 #endif
2196 2212 } cpu_info_template = {
2197 2213 { "state", KSTAT_DATA_CHAR },
2198 2214 { "state_begin", KSTAT_DATA_LONG },
2199 2215 { "cpu_type", KSTAT_DATA_CHAR },
2200 2216 { "fpu_type", KSTAT_DATA_CHAR },
2201 2217 { "clock_MHz", KSTAT_DATA_LONG },
2202 2218 { "chip_id", KSTAT_DATA_LONG },
2203 2219 { "implementation", KSTAT_DATA_STRING },
2204 2220 { "brand", KSTAT_DATA_STRING },
2205 2221 { "core_id", KSTAT_DATA_LONG },
2206 2222 { "current_clock_Hz", KSTAT_DATA_UINT64 },
2207 2223 { "supported_frequencies_Hz", KSTAT_DATA_STRING },
2208 2224 { "pg_id", KSTAT_DATA_LONG },
2209 2225 #if defined(__sparcv9)
2210 2226 { "device_ID", KSTAT_DATA_UINT64 },
2211 2227 { "cpu_fru", KSTAT_DATA_STRING },
2212 2228 #endif
2213 2229 #if defined(__x86)
2214 2230 { "vendor_id", KSTAT_DATA_STRING },
2215 2231 { "family", KSTAT_DATA_INT32 },
2216 2232 { "model", KSTAT_DATA_INT32 },
2217 2233 { "stepping", KSTAT_DATA_INT32 },
2218 2234 { "clog_id", KSTAT_DATA_INT32 },
2219 2235 { "pkg_core_id", KSTAT_DATA_LONG },
2220 2236 { "ncpu_per_chip", KSTAT_DATA_INT32 },
2221 2237 { "ncore_per_chip", KSTAT_DATA_INT32 },
2222 2238 { "supported_max_cstates", KSTAT_DATA_INT32 },
2223 2239 { "current_cstate", KSTAT_DATA_INT32 },
2224 2240 { "cache_id", KSTAT_DATA_INT32 },
2225 2241 { "socket_type", KSTAT_DATA_STRING },
2226 2242 #endif
2227 2243 };
2228 2244
2229 2245 static kmutex_t cpu_info_template_lock;
2230 2246
2231 2247 static int
2232 2248 cpu_info_kstat_update(kstat_t *ksp, int rw)
2233 2249 {
2234 2250 cpu_t *cp = ksp->ks_private;
2235 2251 const char *pi_state;
2236 2252
2237 2253 if (rw == KSTAT_WRITE)
2238 2254 return (EACCES);
2239 2255
2240 2256 #if defined(__x86)
2241 2257 /* Is the cpu still initialising itself? */
2242 2258 if (cpuid_checkpass(cp, 1) == 0)
2243 2259 return (ENXIO);
2244 2260 #endif
2245 2261 switch (cp->cpu_type_info.pi_state) {
2246 2262 case P_ONLINE:
2247 2263 pi_state = PS_ONLINE;
2248 2264 break;
2249 2265 case P_POWEROFF:
2250 2266 pi_state = PS_POWEROFF;
2251 2267 break;
2252 2268 case P_NOINTR:
2253 2269 pi_state = PS_NOINTR;
2254 2270 break;
2255 2271 case P_FAULTED:
2256 2272 pi_state = PS_FAULTED;
2257 2273 break;
2258 2274 case P_SPARE:
2259 2275 pi_state = PS_SPARE;
2260 2276 break;
2261 2277 case P_OFFLINE:
2262 2278 pi_state = PS_OFFLINE;
2263 2279 break;
2264 2280 default:
2265 2281 pi_state = "unknown";
2266 2282 }
2267 2283 (void) strcpy(cpu_info_template.ci_state.value.c, pi_state);
2268 2284 cpu_info_template.ci_state_begin.value.l = cp->cpu_state_begin;
2269 2285 (void) strncpy(cpu_info_template.ci_cpu_type.value.c,
2270 2286 cp->cpu_type_info.pi_processor_type, 15);
2271 2287 (void) strncpy(cpu_info_template.ci_fpu_type.value.c,
2272 2288 cp->cpu_type_info.pi_fputypes, 15);
2273 2289 cpu_info_template.ci_clock_MHz.value.l = cp->cpu_type_info.pi_clock;
2274 2290 cpu_info_template.ci_chip_id.value.l =
2275 2291 pg_plat_hw_instance_id(cp, PGHW_CHIP);
2276 2292 kstat_named_setstr(&cpu_info_template.ci_implementation,
2277 2293 cp->cpu_idstr);
2278 2294 kstat_named_setstr(&cpu_info_template.ci_brandstr, cp->cpu_brandstr);
2279 2295 cpu_info_template.ci_core_id.value.l = pg_plat_get_core_id(cp);
2280 2296 cpu_info_template.ci_curr_clock_Hz.value.ui64 =
2281 2297 cp->cpu_curr_clock;
2282 2298 cpu_info_template.ci_pg_id.value.l =
2283 2299 cp->cpu_pg && cp->cpu_pg->cmt_lineage ?
2284 2300 cp->cpu_pg->cmt_lineage->pg_id : -1;
2285 2301 kstat_named_setstr(&cpu_info_template.ci_supp_freq_Hz,
2286 2302 cp->cpu_supp_freqs);
2287 2303 #if defined(__sparcv9)
2288 2304 cpu_info_template.ci_device_ID.value.ui64 =
2289 2305 cpunodes[cp->cpu_id].device_id;
2290 2306 kstat_named_setstr(&cpu_info_template.ci_cpu_fru, cpu_fru_fmri(cp));
2291 2307 #endif
2292 2308 #if defined(__x86)
2293 2309 kstat_named_setstr(&cpu_info_template.ci_vendorstr,
2294 2310 cpuid_getvendorstr(cp));
2295 2311 cpu_info_template.ci_family.value.l = cpuid_getfamily(cp);
2296 2312 cpu_info_template.ci_model.value.l = cpuid_getmodel(cp);
2297 2313 cpu_info_template.ci_step.value.l = cpuid_getstep(cp);
2298 2314 cpu_info_template.ci_clogid.value.l = cpuid_get_clogid(cp);
2299 2315 cpu_info_template.ci_ncpuperchip.value.l = cpuid_get_ncpu_per_chip(cp);
2300 2316 cpu_info_template.ci_ncoreperchip.value.l =
2301 2317 cpuid_get_ncore_per_chip(cp);
2302 2318 cpu_info_template.ci_pkg_core_id.value.l = cpuid_get_pkgcoreid(cp);
2303 2319 cpu_info_template.ci_max_cstates.value.l = cp->cpu_m.max_cstates;
2304 2320 cpu_info_template.ci_curr_cstate.value.l = cpu_idle_get_cpu_state(cp);
2305 2321 cpu_info_template.ci_cacheid.value.i32 = cpuid_get_cacheid(cp);
2306 2322 kstat_named_setstr(&cpu_info_template.ci_sktstr,
2307 2323 cpuid_getsocketstr(cp));
2308 2324 #endif
2309 2325
2310 2326 return (0);
2311 2327 }
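
From userland, the fields filled in above surface as the cpu_info kstat and can be read with libkstat(3LIB). The following is an illustrative sketch only (reading two fields of instance 0), not part of cpu.c:

	#include <kstat.h>
	#include <stdio.h>

	int
	main(void)
	{
		kstat_ctl_t *kc;
		kstat_t *ksp;
		kstat_named_t *kn;

		if ((kc = kstat_open()) == NULL)
			return (1);
		if ((ksp = kstat_lookup(kc, "cpu_info", 0, NULL)) != NULL &&
		    kstat_read(kc, ksp, NULL) != -1) {
			if ((kn = kstat_data_lookup(ksp, "brand")) != NULL)
				(void) printf("brand: %s\n",
				    KSTAT_NAMED_STR_PTR(kn));
			if ((kn = kstat_data_lookup(ksp, "clock_MHz")) != NULL)
				(void) printf("clock_MHz: %ld\n", kn->value.l);
		}
		(void) kstat_close(kc);
		return (0);
	}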
2312 2328
2313 2329 static void
2314 2330 cpu_info_kstat_create(cpu_t *cp)
2315 2331 {
2316 2332 zoneid_t zoneid;
2317 2333
2318 2334 ASSERT(MUTEX_HELD(&cpu_lock));
2319 2335
2320 2336 if (pool_pset_enabled())
2321 2337 zoneid = GLOBAL_ZONEID;
2322 2338 else
2323 2339 zoneid = ALL_ZONES;
2324 2340 if ((cp->cpu_info_kstat = kstat_create_zone("cpu_info", cp->cpu_id,
2325 2341 NULL, "misc", KSTAT_TYPE_NAMED,
2326 2342 sizeof (cpu_info_template) / sizeof (kstat_named_t),
2327 2343 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_VAR_SIZE, zoneid)) != NULL) {
2328 2344 cp->cpu_info_kstat->ks_data_size += 2 * CPU_IDSTRLEN;
2329 2345 #if defined(__sparcv9)
2330 2346 cp->cpu_info_kstat->ks_data_size +=
2331 2347 strlen(cpu_fru_fmri(cp)) + 1;
2332 2348 #endif
2333 2349 #if defined(__x86)
2334 2350 cp->cpu_info_kstat->ks_data_size += X86_VENDOR_STRLEN;
2335 2351 #endif
2336 2352 if (cp->cpu_supp_freqs != NULL)
2337 2353 cp->cpu_info_kstat->ks_data_size +=
2338 2354 strlen(cp->cpu_supp_freqs) + 1;
2339 2355 cp->cpu_info_kstat->ks_lock = &cpu_info_template_lock;
2340 2356 cp->cpu_info_kstat->ks_data = &cpu_info_template;
2341 2357 cp->cpu_info_kstat->ks_private = cp;
2342 2358 cp->cpu_info_kstat->ks_update = cpu_info_kstat_update;
2343 2359 kstat_install(cp->cpu_info_kstat);
2344 2360 }
2345 2361 }
2346 2362
2347 2363 static void
2348 2364 cpu_info_kstat_destroy(cpu_t *cp)
2349 2365 {
2350 2366 ASSERT(MUTEX_HELD(&cpu_lock));
2351 2367
2352 2368 kstat_delete(cp->cpu_info_kstat);
2353 2369 cp->cpu_info_kstat = NULL;
2354 2370 }
2355 2371
2356 2372 /*
2357 2373 * Create and install kstats for the boot CPU.
2358 2374 */
2359 2375 void
2360 2376 cpu_kstat_init(cpu_t *cp)
2361 2377 {
2362 2378 mutex_enter(&cpu_lock);
2363 2379 cpu_info_kstat_create(cp);
2364 2380 cpu_stats_kstat_create(cp);
2365 2381 cpu_create_intrstat(cp);
2366 2382 cpu_set_state(cp);
2367 2383 mutex_exit(&cpu_lock);
2368 2384 }
2369 2385
2370 2386 /*
2371 2387 * Make visible to the zone that subset of the cpu information that would be
2372 2388 * initialized when a cpu is configured (but still offline).
2373 2389 */
2374 2390 void
2375 2391 cpu_visibility_configure(cpu_t *cp, zone_t *zone)
2376 2392 {
2377 2393 zoneid_t zoneid = zone ? zone->zone_id : ALL_ZONES;
2378 2394
2379 2395 ASSERT(MUTEX_HELD(&cpu_lock));
2380 2396 ASSERT(pool_pset_enabled());
2381 2397 ASSERT(cp != NULL);
2382 2398
2383 2399 if (zoneid != ALL_ZONES && zoneid != GLOBAL_ZONEID) {
2384 2400 zone->zone_ncpus++;
2385 2401 ASSERT(zone->zone_ncpus <= ncpus);
2386 2402 }
2387 2403 if (cp->cpu_info_kstat != NULL)
2388 2404 kstat_zone_add(cp->cpu_info_kstat, zoneid);
2389 2405 }
2390 2406
2391 2407 /*
2392 2408 * Make visible to the zone that subset of the cpu information that would be
2393 2409 * initialized when a previously configured cpu is onlined.
2394 2410 */
2395 2411 void
2396 2412 cpu_visibility_online(cpu_t *cp, zone_t *zone)
2397 2413 {
2398 2414 kstat_t *ksp;
2399 2415 char name[sizeof ("cpu_stat") + 10]; /* enough for 32-bit cpuids */
2400 2416 zoneid_t zoneid = zone ? zone->zone_id : ALL_ZONES;
2401 2417 processorid_t cpun;
2402 2418
2403 2419 ASSERT(MUTEX_HELD(&cpu_lock));
2404 2420 ASSERT(pool_pset_enabled());
2405 2421 ASSERT(cp != NULL);
2406 2422 ASSERT(cpu_is_active(cp));
2407 2423
2408 2424 cpun = cp->cpu_id;
2409 2425 if (zoneid != ALL_ZONES && zoneid != GLOBAL_ZONEID) {
2410 2426 zone->zone_ncpus_online++;
2411 2427 ASSERT(zone->zone_ncpus_online <= ncpus_online);
2412 2428 }
2413 2429 (void) snprintf(name, sizeof (name), "cpu_stat%d", cpun);
2414 2430 if ((ksp = kstat_hold_byname("cpu_stat", cpun, name, ALL_ZONES))
2415 2431 != NULL) {
2416 2432 kstat_zone_add(ksp, zoneid);
2417 2433 kstat_rele(ksp);
2418 2434 }
2419 2435 if ((ksp = kstat_hold_byname("cpu", cpun, "sys", ALL_ZONES)) != NULL) {
2420 2436 kstat_zone_add(ksp, zoneid);
2421 2437 kstat_rele(ksp);
2422 2438 }
2423 2439 if ((ksp = kstat_hold_byname("cpu", cpun, "vm", ALL_ZONES)) != NULL) {
2424 2440 kstat_zone_add(ksp, zoneid);
2425 2441 kstat_rele(ksp);
2426 2442 }
2427 2443 if ((ksp = kstat_hold_byname("cpu", cpun, "intrstat", ALL_ZONES)) !=
2428 2444 NULL) {
2429 2445 kstat_zone_add(ksp, zoneid);
2430 2446 kstat_rele(ksp);
2431 2447 }
2432 2448 }
2433 2449
2434 2450 /*
2435 2451 * Update relevant kstats such that cpu is now visible to processes
2436 2452 * executing in specified zone.
2437 2453 */
2438 2454 void
2439 2455 cpu_visibility_add(cpu_t *cp, zone_t *zone)
2440 2456 {
2441 2457 cpu_visibility_configure(cp, zone);
2442 2458 if (cpu_is_active(cp))
2443 2459 cpu_visibility_online(cp, zone);
2444 2460 }
2445 2461
2446 2462 /*
2447 2463 * Make invisible to the zone that subset of the cpu information that would be
2448 2464 * torn down when a previously offlined cpu is unconfigured.
2449 2465 */
2450 2466 void
2451 2467 cpu_visibility_unconfigure(cpu_t *cp, zone_t *zone)
2452 2468 {
2453 2469 zoneid_t zoneid = zone ? zone->zone_id : ALL_ZONES;
2454 2470
2455 2471 ASSERT(MUTEX_HELD(&cpu_lock));
2456 2472 ASSERT(pool_pset_enabled());
2457 2473 ASSERT(cp != NULL);
2458 2474
2459 2475 if (zoneid != ALL_ZONES && zoneid != GLOBAL_ZONEID) {
2460 2476 ASSERT(zone->zone_ncpus != 0);
2461 2477 zone->zone_ncpus--;
2462 2478 }
2463 2479 if (cp->cpu_info_kstat)
2464 2480 kstat_zone_remove(cp->cpu_info_kstat, zoneid);
2465 2481 }
2466 2482
2467 2483 /*
2468 2484 * Make invisible to the zone that subset of the cpu information that would be
2469 2485 * torn down when a cpu is offlined (but still configured).
2470 2486 */
2471 2487 void
2472 2488 cpu_visibility_offline(cpu_t *cp, zone_t *zone)
2473 2489 {
2474 2490 kstat_t *ksp;
2475 2491 char name[sizeof ("cpu_stat") + 10]; /* enough for 32-bit cpuids */
2476 2492 zoneid_t zoneid = zone ? zone->zone_id : ALL_ZONES;
2477 2493 processorid_t cpun;
2478 2494
2479 2495 ASSERT(MUTEX_HELD(&cpu_lock));
2480 2496 ASSERT(pool_pset_enabled());
2481 2497 ASSERT(cp != NULL);
2482 2498 ASSERT(cpu_is_active(cp));
2483 2499
2484 2500 cpun = cp->cpu_id;
2485 2501 if (zoneid != ALL_ZONES && zoneid != GLOBAL_ZONEID) {
2486 2502 ASSERT(zone->zone_ncpus_online != 0);
2487 2503 zone->zone_ncpus_online--;
2488 2504 }
2489 2505
2490 2506 if ((ksp = kstat_hold_byname("cpu", cpun, "intrstat", ALL_ZONES)) !=
2491 2507 NULL) {
2492 2508 kstat_zone_remove(ksp, zoneid);
2493 2509 kstat_rele(ksp);
2494 2510 }
2495 2511 if ((ksp = kstat_hold_byname("cpu", cpun, "vm", ALL_ZONES)) != NULL) {
2496 2512 kstat_zone_remove(ksp, zoneid);
2497 2513 kstat_rele(ksp);
2498 2514 }
2499 2515 if ((ksp = kstat_hold_byname("cpu", cpun, "sys", ALL_ZONES)) != NULL) {
2500 2516 kstat_zone_remove(ksp, zoneid);
2501 2517 kstat_rele(ksp);
2502 2518 }
2503 2519 (void) snprintf(name, sizeof (name), "cpu_stat%d", cpun);
2504 2520 if ((ksp = kstat_hold_byname("cpu_stat", cpun, name, ALL_ZONES))
2505 2521 != NULL) {
2506 2522 kstat_zone_remove(ksp, zoneid);
2507 2523 kstat_rele(ksp);
2508 2524 }
2509 2525 }
2510 2526
2511 2527 /*
2512 2528 * Update relevant kstats such that cpu is no longer visible to processes
2513 2529 * executing in specified zone.
2514 2530 */
2515 2531 void
2516 2532 cpu_visibility_remove(cpu_t *cp, zone_t *zone)
2517 2533 {
2518 2534 if (cpu_is_active(cp))
2519 2535 cpu_visibility_offline(cp, zone);
2520 2536 cpu_visibility_unconfigure(cp, zone);
2521 2537 }
2522 2538
2523 2539 /*
2524 2540 * Bind a thread to a CPU as requested.
2525 2541 */
2526 2542 int
2527 2543 cpu_bind_thread(kthread_id_t tp, processorid_t bind, processorid_t *obind,
2528 2544 int *error)
2529 2545 {
2530 2546 processorid_t binding;
2531 2547 cpu_t *cp = NULL;
2532 2548
2533 2549 ASSERT(MUTEX_HELD(&cpu_lock));
2534 2550 ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
2535 2551
2536 2552 thread_lock(tp);
2537 2553
2538 2554 /*
2539 2555 * Record old binding, but change the obind, which was initialized
2540 2556 * to PBIND_NONE, only if this thread has a binding. This avoids
2541 2557 * reporting PBIND_NONE for a process when some LWPs are bound.
2542 2558 */
2543 2559 binding = tp->t_bind_cpu;
2544 2560 if (binding != PBIND_NONE)
2545 2561 *obind = binding; /* record old binding */
2546 2562
2547 2563 switch (bind) {
2548 2564 case PBIND_QUERY:
2549 2565 /* Just return the old binding */
2550 2566 thread_unlock(tp);
2551 2567 return (0);
2552 2568
2553 2569 case PBIND_QUERY_TYPE:
2554 2570 /* Return the binding type */
2555 2571 *obind = TB_CPU_IS_SOFT(tp) ? PBIND_SOFT : PBIND_HARD;
2556 2572 thread_unlock(tp);
2557 2573 return (0);
2558 2574
2559 2575 case PBIND_SOFT:
2560 2576 /*
2561 2577 * Set soft binding for this thread and return the actual
2562 2578 * binding
2563 2579 */
2564 2580 TB_CPU_SOFT_SET(tp);
2565 2581 thread_unlock(tp);
2566 2582 return (0);
2567 2583
2568 2584 case PBIND_HARD:
2569 2585 /*
2570 2586 * Set hard binding for this thread and return the actual
2571 2587 * binding
2572 2588 */
2573 2589 TB_CPU_HARD_SET(tp);
2574 2590 thread_unlock(tp);
2575 2591 return (0);
2576 2592
2577 2593 default:
2578 2594 break;
2579 2595 }
2580 2596
2581 2597 /*
2582 2598 * If this thread/LWP cannot be bound because of permission
2583 2599 * problems, just note that and return success so that the
2584 2600 * other threads/LWPs will be bound. This is the way
2585 2601 * processor_bind() is defined to work.
2586 2602 *
2587 2603 * Binding will get EPERM if the thread is of system class
2588 2604 * or hasprocperm() fails.
2589 2605 */
2590 2606 if (tp->t_cid == 0 || !hasprocperm(tp->t_cred, CRED())) {
2591 2607 *error = EPERM;
2592 2608 thread_unlock(tp);
2593 2609 return (0);
2594 2610 }
2595 2611
2596 2612 binding = bind;
2597 2613 if (binding != PBIND_NONE) {
2598 2614 cp = cpu_get((processorid_t)binding);
2599 2615 /*
2600 2616 * Make sure binding is valid and is in right partition.
2601 2617 */
2602 2618 if (cp == NULL || tp->t_cpupart != cp->cpu_part) {
2603 2619 *error = EINVAL;
2604 2620 thread_unlock(tp);
2605 2621 return (0);
2606 2622 }
2607 2623 }
2608 2624 tp->t_bind_cpu = binding; /* set new binding */
2609 2625
2610 2626 /*
2611 2627 * If there is no system-set reason for affinity, set
2612 2628 * the t_bound_cpu field to reflect the binding.
2613 2629 */
2614 2630 if (tp->t_affinitycnt == 0) {
2615 2631 if (binding == PBIND_NONE) {
2616 2632 /*
2617 2633 * We may need to adjust disp_max_unbound_pri
2618 2634 * since we're becoming unbound.
2619 2635 */
2620 2636 disp_adjust_unbound_pri(tp);
2621 2637
2622 2638 tp->t_bound_cpu = NULL; /* set new binding */
2623 2639
2624 2640 /*
2625 2641 * Move thread to lgroup with strongest affinity
2626 2642 * after unbinding
2627 2643 */
2628 2644 if (tp->t_lgrp_affinity)
2629 2645 lgrp_move_thread(tp,
2630 2646 lgrp_choose(tp, tp->t_cpupart), 1);
2631 2647
2632 2648 if (tp->t_state == TS_ONPROC &&
2633 2649 tp->t_cpu->cpu_part != tp->t_cpupart)
2634 2650 cpu_surrender(tp);
2635 2651 } else {
2636 2652 lpl_t *lpl;
2637 2653
2638 2654 tp->t_bound_cpu = cp;
2639 2655 ASSERT(cp->cpu_lpl != NULL);
2640 2656
2641 2657 /*
2642 2658 * Set home to lgroup with most affinity containing CPU
2643 2659 			 * that the thread is being bound to, or minimum bounding
2644 2660 * lgroup if no affinities set
2645 2661 */
2646 2662 if (tp->t_lgrp_affinity)
2647 2663 lpl = lgrp_affinity_best(tp, tp->t_cpupart,
2648 2664 LGRP_NONE, B_FALSE);
2649 2665 else
2650 2666 lpl = cp->cpu_lpl;
2651 2667
2652 2668 if (tp->t_lpl != lpl) {
2653 2669 /* can't grab cpu_lock */
2654 2670 lgrp_move_thread(tp, lpl, 1);
2655 2671 }
2656 2672
2657 2673 /*
2658 2674 * Make the thread switch to the bound CPU.
2659 2675 * If the thread is runnable, we need to
2660 2676 * requeue it even if t_cpu is already set
2661 2677 * to the right CPU, since it may be on a
2662 2678 * kpreempt queue and need to move to a local
2663 2679 * queue. We could check t_disp_queue to
2664 2680 * avoid unnecessary overhead if it's already
2665 2681 * on the right queue, but since this isn't
2666 2682 * a performance-critical operation it doesn't
2667 2683 * seem worth the extra code and complexity.
2668 2684 *
2669 2685 * If the thread is weakbound to the cpu then it will
2670 2686 * resist the new binding request until the weak
2671 2687 * binding drops. The cpu_surrender or requeueing
2672 2688 * below could be skipped in such cases (since it
2673 2689 * will have no effect), but that would require
2674 2690 * thread_allowmigrate to acquire thread_lock so
2675 2691 * we'll take the very occasional hit here instead.
2676 2692 */
2677 2693 if (tp->t_state == TS_ONPROC) {
2678 2694 cpu_surrender(tp);
2679 2695 } else if (tp->t_state == TS_RUN) {
2680 2696 cpu_t *ocp = tp->t_cpu;
2681 2697
2682 2698 (void) dispdeq(tp);
2683 2699 setbackdq(tp);
2684 2700 /*
2685 2701 * Either on the bound CPU's disp queue now,
2686 2702 * or swapped out or on the swap queue.
2687 2703 */
2688 2704 ASSERT(tp->t_disp_queue == cp->cpu_disp ||
2689 2705 tp->t_weakbound_cpu == ocp ||
2690 2706 (tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ))
2691 2707 != TS_LOAD);
2692 2708 }
2693 2709 }
2694 2710 }
2695 2711
2696 2712 /*
2697 2713 * Our binding has changed; set TP_CHANGEBIND.
2698 2714 */
2699 2715 tp->t_proc_flag |= TP_CHANGEBIND;
2700 2716 aston(tp);
2701 2717
2702 2718 thread_unlock(tp);
2703 2719
2704 2720 return (0);
2705 2721 }
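
cpu_bind_thread() is what ultimately services processor_bind(2). A userland sketch of setting and then querying a binding for the calling LWP is shown below; it is illustrative only and assumes processor 0 is online and in the caller's partition.

	#include <sys/types.h>
	#include <sys/processor.h>
	#include <sys/procset.h>
	#include <stdio.h>

	int
	main(void)
	{
		processorid_t obind = PBIND_NONE;

		/* Hard-bind the calling LWP to processor 0. */
		if (processor_bind(P_LWPID, P_MYID, 0, &obind) != 0)
			perror("processor_bind");

		/* PBIND_QUERY reports the current binding without changing it. */
		if (processor_bind(P_LWPID, P_MYID, PBIND_QUERY, &obind) == 0)
			(void) printf("bound to %d\n", (int)obind);
		return (0);
	}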
2706 2722
2707 2723 #if CPUSET_WORDS > 1
2708 2724
2709 2725 /*
2710 2726 * Functions for implementing cpuset operations when a cpuset is more
2711 2727 * than one word. On platforms where a cpuset is a single word these
2712 2728 * are implemented as macros in cpuvar.h.
2713 2729 */
2714 2730
2715 2731 void
2716 2732 cpuset_all(cpuset_t *s)
2717 2733 {
2718 2734 int i;
2719 2735
2720 2736 for (i = 0; i < CPUSET_WORDS; i++)
2721 2737 s->cpub[i] = ~0UL;
2722 2738 }
2723 2739
2724 2740 void
2725 2741 cpuset_all_but(cpuset_t *s, uint_t cpu)
2726 2742 {
2727 2743 cpuset_all(s);
2728 2744 CPUSET_DEL(*s, cpu);
2729 2745 }
2730 2746
2731 2747 void
2732 2748 cpuset_only(cpuset_t *s, uint_t cpu)
2733 2749 {
2734 2750 CPUSET_ZERO(*s);
2735 2751 CPUSET_ADD(*s, cpu);
2736 2752 }
2737 2753
2738 2754 int
2739 2755 cpuset_isnull(cpuset_t *s)
2740 2756 {
2741 2757 int i;
2742 2758
2743 2759 for (i = 0; i < CPUSET_WORDS; i++)
2744 2760 if (s->cpub[i] != 0)
2745 2761 return (0);
2746 2762 return (1);
2747 2763 }
2748 2764
2749 2765 int
2750 2766 cpuset_cmp(cpuset_t *s1, cpuset_t *s2)
2751 2767 {
2752 2768 int i;
2753 2769
2754 2770 for (i = 0; i < CPUSET_WORDS; i++)
2755 2771 if (s1->cpub[i] != s2->cpub[i])
2756 2772 return (0);
2757 2773 return (1);
2758 2774 }
2759 2775
2760 2776 uint_t
2761 2777 cpuset_find(cpuset_t *s)
2762 2778 {
2763 2779
2764 2780 uint_t i;
2765 2781 uint_t cpu = (uint_t)-1;
2766 2782
2767 2783 /*
2768 2784 * Find a cpu in the cpuset
2769 2785 */
2770 2786 for (i = 0; i < CPUSET_WORDS; i++) {
2771 2787 cpu = (uint_t)(lowbit(s->cpub[i]) - 1);
2772 2788 if (cpu != (uint_t)-1) {
2773 2789 cpu += i * BT_NBIPUL;
2774 2790 break;
2775 2791 }
2776 2792 }
2777 2793 return (cpu);
2778 2794 }
2779 2795
2780 2796 void
2781 2797 cpuset_bounds(cpuset_t *s, uint_t *smallestid, uint_t *largestid)
2782 2798 {
2783 2799 int i, j;
2784 2800 uint_t bit;
2785 2801
2786 2802 /*
2787 2803 * First, find the smallest cpu id in the set.
2788 2804 */
2789 2805 for (i = 0; i < CPUSET_WORDS; i++) {
2790 2806 if (s->cpub[i] != 0) {
2791 2807 bit = (uint_t)(lowbit(s->cpub[i]) - 1);
2792 2808 ASSERT(bit != (uint_t)-1);
2793 2809 *smallestid = bit + (i * BT_NBIPUL);
2794 2810
2795 2811 /*
2796 2812 * Now find the largest cpu id in
2797 2813 * the set and return immediately.
2798 2814 * Done in an inner loop to avoid
2799 2815 * having to break out of the first
2800 2816 * loop.
2801 2817 */
2802 2818 for (j = CPUSET_WORDS - 1; j >= i; j--) {
2803 2819 if (s->cpub[j] != 0) {
2804 2820 bit = (uint_t)(highbit(s->cpub[j]) - 1);
2805 2821 ASSERT(bit != (uint_t)-1);
2806 2822 *largestid = bit + (j * BT_NBIPUL);
2807 2823 ASSERT(*largestid >= *smallestid);
2808 2824 return;
2809 2825 }
2810 2826 }
2811 2827
2812 2828 /*
2813 2829 * If this code is reached, a
2814 2830 * smallestid was found, but not a
2815 2831 * largestid. The cpuset must have
2816 2832 * been changed during the course
2817 2833 * of this function call.
2818 2834 */
2819 2835 ASSERT(0);
2820 2836 }
2821 2837 }
2822 2838 *smallestid = *largestid = CPUSET_NOTINSET;
2823 2839 }
2824 2840
2825 2841 #endif /* CPUSET_WORDS */
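
As a usage illustration of the cpuset operations above (whether the macro forms or the functions compiled here), a caller building the set of currently active CPUs might look like the sketch below. The helper active_cpuset is hypothetical and assumes cpu_lock is held so the online list stays stable:

	static void
	active_cpuset(cpuset_t *set)
	{
		cpu_t *cp;

		ASSERT(MUTEX_HELD(&cpu_lock));
		CPUSET_ZERO(*set);
		cp = cpu_active;
		do {
			CPUSET_ADD(*set, cp->cpu_id);
			cp = cp->cpu_next_onln;
		} while (cp != cpu_active);
	}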
2826 2842
2827 2843 /*
2828 2844 * Unbind threads bound to specified CPU.
2829 2845 *
2830 2846 * If `unbind_all_threads' is true, unbind all user threads bound to a given
2831 2847 * CPU. Otherwise unbind all soft-bound user threads.
2832 2848 */
2833 2849 int
2834 2850 cpu_unbind(processorid_t cpu, boolean_t unbind_all_threads)
2835 2851 {
2836 2852 processorid_t obind;
2837 2853 kthread_t *tp;
2838 2854 int ret = 0;
2839 2855 proc_t *pp;
2840 2856 int err, berr = 0;
2841 2857
2842 2858 ASSERT(MUTEX_HELD(&cpu_lock));
2843 2859
2844 2860 mutex_enter(&pidlock);
2845 2861 for (pp = practive; pp != NULL; pp = pp->p_next) {
2846 2862 mutex_enter(&pp->p_lock);
2847 2863 tp = pp->p_tlist;
2848 2864 /*
2849 2865 * Skip zombies, kernel processes, and processes in
2850 2866 * other zones, if called from a non-global zone.
2851 2867 */
2852 2868 if (tp == NULL || (pp->p_flag & SSYS) ||
2853 2869 !HASZONEACCESS(curproc, pp->p_zone->zone_id)) {
2854 2870 mutex_exit(&pp->p_lock);
2855 2871 continue;
2856 2872 }
2857 2873 do {
2858 2874 if (tp->t_bind_cpu != cpu)
2859 2875 continue;
2860 2876 /*
2861 2877 * Skip threads with hard binding when
2862 2878 * `unbind_all_threads' is not specified.
2863 2879 */
2864 2880 if (!unbind_all_threads && TB_CPU_IS_HARD(tp))
2865 2881 continue;
2866 2882 err = cpu_bind_thread(tp, PBIND_NONE, &obind, &berr);
2867 2883 if (ret == 0)
2868 2884 ret = err;
2869 2885 } while ((tp = tp->t_forw) != pp->p_tlist);
2870 2886 mutex_exit(&pp->p_lock);
2871 2887 }
2872 2888 mutex_exit(&pidlock);
2873 2889 if (ret == 0)
2874 2890 ret = berr;
2875 2891 return (ret);
2876 2892 }
2877 2893
2878 2894
2879 2895 /*
2880 2896 * Destroy all remaining bound threads on a cpu.
2881 2897 */
2882 2898 void
2883 2899 cpu_destroy_bound_threads(cpu_t *cp)
2884 2900 {
2885 2901 extern id_t syscid;
2886 2902 register kthread_id_t t, tlist, tnext;
2887 2903
2888 2904 /*
2889 2905 * Destroy all remaining bound threads on the cpu. This
2890 2906 * should include both the interrupt threads and the idle thread.
2891 2907 * This requires some care, since we need to traverse the
2892 2908 * thread list with the pidlock mutex locked, but thread_free
2893 2909 * also locks the pidlock mutex. So, we collect the threads
2894 2910 * we're going to reap in a list headed by "tlist", then we
2895 2911 * unlock the pidlock mutex and traverse the tlist list,
2896 2912 	 * doing thread_free's on the threads. Simple, n'est-ce pas?
2897 2913 * Also, this depends on thread_free not mucking with the
2898 2914 * t_next and t_prev links of the thread.
2899 2915 */
2900 2916
2901 2917 if ((t = curthread) != NULL) {
2902 2918
2903 2919 tlist = NULL;
2904 2920 mutex_enter(&pidlock);
2905 2921 do {
2906 2922 tnext = t->t_next;
2907 2923 if (t->t_bound_cpu == cp) {
2908 2924
2909 2925 /*
2910 2926 * We've found a bound thread, carefully unlink
2911 2927 * it out of the thread list, and add it to
2912 2928 * our "tlist". We "know" we don't have to
2913 2929 * worry about unlinking curthread (the thread
2914 2930 * that is executing this code).
2915 2931 */
2916 2932 t->t_next->t_prev = t->t_prev;
2917 2933 t->t_prev->t_next = t->t_next;
2918 2934 t->t_next = tlist;
2919 2935 tlist = t;
2920 2936 ASSERT(t->t_cid == syscid);
2921 2937 /* wake up anyone blocked in thread_join */
2922 2938 cv_broadcast(&t->t_joincv);
2923 2939 /*
2924 2940 * t_lwp set by interrupt threads and not
2925 2941 * cleared.
2926 2942 */
2927 2943 t->t_lwp = NULL;
2928 2944 /*
2929 2945 * Pause and idle threads always have
2930 2946 * t_state set to TS_ONPROC.
2931 2947 */
2932 2948 t->t_state = TS_FREE;
2933 2949 t->t_prev = NULL; /* Just in case */
2934 2950 }
2935 2951
2936 2952 } while ((t = tnext) != curthread);
2937 2953
2938 2954 mutex_exit(&pidlock);
2939 2955
2940 2956 mutex_sync();
2941 2957 for (t = tlist; t != NULL; t = tnext) {
2942 2958 tnext = t->t_next;
2943 2959 thread_free(t);
2944 2960 }
2945 2961 }
2946 2962 }
2947 2963
2948 2964 /*
2949 2965 * Update the cpu_supp_freqs of this cpu. This information is returned
2950 2966 * as part of cpu_info kstats. If the cpu_info_kstat exists already, then
2951 2967 * maintain the kstat data size.
2952 2968 */
2953 2969 void
2954 2970 cpu_set_supp_freqs(cpu_t *cp, const char *freqs)
2955 2971 {
2956 2972 char clkstr[sizeof ("18446744073709551615") + 1]; /* ui64 MAX */
2957 2973 const char *lfreqs = clkstr;
2958 2974 boolean_t kstat_exists = B_FALSE;
2959 2975 kstat_t *ksp;
2960 2976 size_t len;
2961 2977
2962 2978 /*
2963 2979 * A NULL pointer means we only support one speed.
2964 2980 */
2965 2981 if (freqs == NULL)
2966 2982 (void) snprintf(clkstr, sizeof (clkstr), "%"PRIu64,
2967 2983 cp->cpu_curr_clock);
2968 2984 else
2969 2985 lfreqs = freqs;
2970 2986
2971 2987 /*
2972 2988 * Make sure the frequency doesn't change while a snapshot is
2973 2989 * going on. Of course, we only need to worry about this if
2974 2990 * the kstat exists.
2975 2991 */
2976 2992 if ((ksp = cp->cpu_info_kstat) != NULL) {
2977 2993 mutex_enter(ksp->ks_lock);
2978 2994 kstat_exists = B_TRUE;
2979 2995 }
2980 2996
2981 2997 /*
2982 2998 * Free any previously allocated string and if the kstat
2983 2999 * already exists, then update its data size.
2984 3000 */
2985 3001 if (cp->cpu_supp_freqs != NULL) {
2986 3002 len = strlen(cp->cpu_supp_freqs) + 1;
2987 3003 kmem_free(cp->cpu_supp_freqs, len);
2988 3004 if (kstat_exists)
2989 3005 ksp->ks_data_size -= len;
2990 3006 }
2991 3007
2992 3008 /*
2993 3009 * Allocate the new string and set the pointer.
2994 3010 */
2995 3011 len = strlen(lfreqs) + 1;
2996 3012 cp->cpu_supp_freqs = kmem_alloc(len, KM_SLEEP);
2997 3013 (void) strcpy(cp->cpu_supp_freqs, lfreqs);
2998 3014
2999 3015 /*
3000 3016 * If the kstat already exists then update the data size and
3001 3017 * free the lock.
3002 3018 */
3003 3019 if (kstat_exists) {
3004 3020 ksp->ks_data_size += len;
3005 3021 mutex_exit(ksp->ks_lock);
3006 3022 }
3007 3023 }
3008 3024
3009 3025 /*
3010 3026  * Indicate the current CPU's clock frequency (in Hz).
3011 3027 * The calling context must be such that CPU references are safe.
3012 3028 */
3013 3029 void
3014 3030 cpu_set_curr_clock(uint64_t new_clk)
3015 3031 {
3016 3032 uint64_t old_clk;
3017 3033
3018 3034 old_clk = CPU->cpu_curr_clock;
3019 3035 CPU->cpu_curr_clock = new_clk;
3020 3036
3021 3037 /*
3022 3038 * The cpu-change-speed DTrace probe exports the frequency in Hz
3023 3039 */
3024 3040 DTRACE_PROBE3(cpu__change__speed, processorid_t, CPU->cpu_id,
3025 3041 uint64_t, old_clk, uint64_t, new_clk);
3026 3042 }
3027 3043
3028 3044 /*
3029 3045 * processor_info(2) and p_online(2) status support functions
3030 3046  * The constants returned by cpu_get_state() and cpu_get_state_str() are
3031 3047 * for use in communicating processor state information to userland. Kernel
3032 3048 * subsystems should only be using the cpu_flags value directly. Subsystems
3033 3049  * modifying cpu_flags should record the state change via a call to
3034 3050 * cpu_set_state().
3035 3051 */
3036 3052
3037 3053 /*
3038 3054 * Update the pi_state of this CPU. This function provides the CPU status for
3039 3055 * the information returned by processor_info(2).
3040 3056 */
3041 3057 void
3042 3058 cpu_set_state(cpu_t *cpu)
3043 3059 {
3044 3060 ASSERT(MUTEX_HELD(&cpu_lock));
3045 3061 cpu->cpu_type_info.pi_state = cpu_get_state(cpu);
3046 3062 cpu->cpu_state_begin = gethrestime_sec();
3047 3063 pool_cpu_mod = gethrtime();
3048 3064 }
3049 3065
3050 3066 /*
3051 3067 * Return offline/online/other status for the indicated CPU. Use only for
3052 3068 * communication with user applications; cpu_flags provides the in-kernel
3053 3069 * interface.
3054 3070 */
3055 3071 int
3056 3072 cpu_get_state(cpu_t *cpu)
3057 3073 {
3058 3074 ASSERT(MUTEX_HELD(&cpu_lock));
3059 3075 if (cpu->cpu_flags & CPU_POWEROFF)
3060 3076 return (P_POWEROFF);
3061 3077 else if (cpu->cpu_flags & CPU_FAULTED)
3062 3078 return (P_FAULTED);
3063 3079 else if (cpu->cpu_flags & CPU_SPARE)
3064 3080 return (P_SPARE);
3065 3081 else if ((cpu->cpu_flags & (CPU_READY | CPU_OFFLINE)) != CPU_READY)
3066 3082 return (P_OFFLINE);
3067 3083 else if (cpu->cpu_flags & CPU_ENABLE)
3068 3084 return (P_ONLINE);
3069 3085 else
3070 3086 return (P_NOINTR);
3071 3087 }
3072 3088
3073 3089 /*
3074 3090 * Return processor_info(2) state as a string.
3075 3091 */
3076 3092 const char *
3077 3093 cpu_get_state_str(cpu_t *cpu)
3078 3094 {
3079 3095 const char *string;
3080 3096
3081 3097 switch (cpu_get_state(cpu)) {
3082 3098 case P_ONLINE:
3083 3099 string = PS_ONLINE;
3084 3100 break;
3085 3101 case P_POWEROFF:
3086 3102 string = PS_POWEROFF;
3087 3103 break;
3088 3104 case P_NOINTR:
3089 3105 string = PS_NOINTR;
3090 3106 break;
3091 3107 case P_SPARE:
3092 3108 string = PS_SPARE;
3093 3109 break;
3094 3110 case P_FAULTED:
3095 3111 string = PS_FAULTED;
3096 3112 break;
3097 3113 case P_OFFLINE:
3098 3114 string = PS_OFFLINE;
3099 3115 break;
3100 3116 default:
3101 3117 string = "unknown";
3102 3118 break;
3103 3119 }
3104 3120 return (string);
3105 3121 }
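
The pi_state kept up to date by cpu_set_state() above is what processor_info(2) reports to userland. A short illustrative sketch of querying processor 0 (not part of this file):

	#include <sys/types.h>
	#include <sys/processor.h>
	#include <stdio.h>

	int
	main(void)
	{
		processor_info_t pi;

		if (processor_info(0, &pi) == 0)
			(void) printf("cpu0: state %d (%s), %d MHz\n",
			    pi.pi_state,
			    pi.pi_state == P_ONLINE ? "online" : "not online",
			    pi.pi_clock);
		return (0);
	}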
3106 3122
3107 3123 /*
3108 3124 * Export this CPU's statistics (cpu_stat_t and cpu_stats_t) as raw and named
3109 3125 * kstats, respectively. This is done when a CPU is initialized or placed
3110 3126 * online via p_online(2).
3111 3127 */
3112 3128 static void
3113 3129 cpu_stats_kstat_create(cpu_t *cp)
3114 3130 {
3115 3131 int instance = cp->cpu_id;
3116 3132 char *module = "cpu";
3117 3133 char *class = "misc";
3118 3134 kstat_t *ksp;
3119 3135 zoneid_t zoneid;
3120 3136
3121 3137 ASSERT(MUTEX_HELD(&cpu_lock));
3122 3138
3123 3139 if (pool_pset_enabled())
3124 3140 zoneid = GLOBAL_ZONEID;
3125 3141 else
3126 3142 zoneid = ALL_ZONES;
3127 3143 /*
3128 3144 * Create named kstats
3129 3145 */
3130 3146 #define CPU_STATS_KS_CREATE(name, tsize, update_func) \
3131 3147 ksp = kstat_create_zone(module, instance, (name), class, \
3132 3148 KSTAT_TYPE_NAMED, (tsize) / sizeof (kstat_named_t), 0, \
3133 3149 zoneid); \
3134 3150 if (ksp != NULL) { \
3135 3151 ksp->ks_private = cp; \
3136 3152 ksp->ks_update = (update_func); \
3137 3153 kstat_install(ksp); \
3138 3154 } else \
3139 3155 cmn_err(CE_WARN, "cpu: unable to create %s:%d:%s kstat", \
3140 3156 module, instance, (name));
3141 3157
3142 3158 CPU_STATS_KS_CREATE("sys", sizeof (cpu_sys_stats_ks_data_template),
3143 3159 cpu_sys_stats_ks_update);
3144 3160 CPU_STATS_KS_CREATE("vm", sizeof (cpu_vm_stats_ks_data_template),
3145 3161 cpu_vm_stats_ks_update);
3146 3162
3147 3163 /*
3148 3164 * Export the familiar cpu_stat_t KSTAT_TYPE_RAW kstat.
3149 3165 */
3150 3166 ksp = kstat_create_zone("cpu_stat", cp->cpu_id, NULL,
3151 3167 "misc", KSTAT_TYPE_RAW, sizeof (cpu_stat_t), 0, zoneid);
3152 3168 if (ksp != NULL) {
3153 3169 ksp->ks_update = cpu_stat_ks_update;
3154 3170 ksp->ks_private = cp;
3155 3171 kstat_install(ksp);
3156 3172 }
3157 3173 }
3158 3174
3159 3175 static void
3160 3176 cpu_stats_kstat_destroy(cpu_t *cp)
3161 3177 {
3162 3178 char ks_name[KSTAT_STRLEN];
3163 3179
3164 3180 (void) sprintf(ks_name, "cpu_stat%d", cp->cpu_id);
3165 3181 kstat_delete_byname("cpu_stat", cp->cpu_id, ks_name);
3166 3182
3167 3183 kstat_delete_byname("cpu", cp->cpu_id, "sys");
3168 3184 kstat_delete_byname("cpu", cp->cpu_id, "vm");
3169 3185 }
3170 3186
3171 3187 static int
3172 3188 cpu_sys_stats_ks_update(kstat_t *ksp, int rw)
3173 3189 {
3174 3190 cpu_t *cp = (cpu_t *)ksp->ks_private;
3175 3191 struct cpu_sys_stats_ks_data *csskd;
3176 3192 cpu_sys_stats_t *css;
3177 3193 hrtime_t msnsecs[NCMSTATES];
3178 3194 int i;
3179 3195
3180 3196 if (rw == KSTAT_WRITE)
3181 3197 return (EACCES);
3182 3198
3183 3199 csskd = ksp->ks_data;
3184 3200 css = &cp->cpu_stats.sys;
3185 3201
3186 3202 /*
3187 3203 * Read CPU mstate, but compare with the last values we
3188 3204 * received to make sure that the returned kstats never
3189 3205 * decrease.
3190 3206 */
3191 3207
3192 3208 get_cpu_mstate(cp, msnsecs);
3193 3209 if (csskd->cpu_nsec_idle.value.ui64 > msnsecs[CMS_IDLE])
3194 3210 msnsecs[CMS_IDLE] = csskd->cpu_nsec_idle.value.ui64;
3195 3211 if (csskd->cpu_nsec_user.value.ui64 > msnsecs[CMS_USER])
3196 3212 msnsecs[CMS_USER] = csskd->cpu_nsec_user.value.ui64;
3197 3213 if (csskd->cpu_nsec_kernel.value.ui64 > msnsecs[CMS_SYSTEM])
3198 3214 msnsecs[CMS_SYSTEM] = csskd->cpu_nsec_kernel.value.ui64;
3199 3215
3200 3216 bcopy(&cpu_sys_stats_ks_data_template, ksp->ks_data,
3201 3217 sizeof (cpu_sys_stats_ks_data_template));
3202 3218
3203 3219 csskd->cpu_ticks_wait.value.ui64 = 0;
3204 3220 csskd->wait_ticks_io.value.ui64 = 0;
3205 3221
3206 3222 csskd->cpu_nsec_idle.value.ui64 = msnsecs[CMS_IDLE];
3207 3223 csskd->cpu_nsec_user.value.ui64 = msnsecs[CMS_USER];
3208 3224 csskd->cpu_nsec_kernel.value.ui64 = msnsecs[CMS_SYSTEM];
3209 3225 csskd->cpu_ticks_idle.value.ui64 =
3210 3226 NSEC_TO_TICK(csskd->cpu_nsec_idle.value.ui64);
3211 3227 csskd->cpu_ticks_user.value.ui64 =
3212 3228 NSEC_TO_TICK(csskd->cpu_nsec_user.value.ui64);
3213 3229 csskd->cpu_ticks_kernel.value.ui64 =
3214 3230 NSEC_TO_TICK(csskd->cpu_nsec_kernel.value.ui64);
3215 3231 csskd->cpu_nsec_dtrace.value.ui64 = cp->cpu_dtrace_nsec;
3216 3232 csskd->dtrace_probes.value.ui64 = cp->cpu_dtrace_probes;
3217 3233 csskd->cpu_nsec_intr.value.ui64 = cp->cpu_intrlast;
3218 3234 csskd->cpu_load_intr.value.ui64 = cp->cpu_intrload;
3219 3235 csskd->bread.value.ui64 = css->bread;
3220 3236 csskd->bwrite.value.ui64 = css->bwrite;
3221 3237 csskd->lread.value.ui64 = css->lread;
3222 3238 csskd->lwrite.value.ui64 = css->lwrite;
3223 3239 csskd->phread.value.ui64 = css->phread;
3224 3240 csskd->phwrite.value.ui64 = css->phwrite;
3225 3241 csskd->pswitch.value.ui64 = css->pswitch;
3226 3242 csskd->trap.value.ui64 = css->trap;
3227 3243 csskd->intr.value.ui64 = 0;
3228 3244 for (i = 0; i < PIL_MAX; i++)
3229 3245 csskd->intr.value.ui64 += css->intr[i];
3230 3246 csskd->syscall.value.ui64 = css->syscall;
3231 3247 csskd->sysread.value.ui64 = css->sysread;
3232 3248 csskd->syswrite.value.ui64 = css->syswrite;
3233 3249 csskd->sysfork.value.ui64 = css->sysfork;
3234 3250 csskd->sysvfork.value.ui64 = css->sysvfork;
3235 3251 csskd->sysexec.value.ui64 = css->sysexec;
3236 3252 csskd->readch.value.ui64 = css->readch;
3237 3253 csskd->writech.value.ui64 = css->writech;
3238 3254 csskd->rcvint.value.ui64 = css->rcvint;
3239 3255 csskd->xmtint.value.ui64 = css->xmtint;
3240 3256 csskd->mdmint.value.ui64 = css->mdmint;
3241 3257 csskd->rawch.value.ui64 = css->rawch;
3242 3258 csskd->canch.value.ui64 = css->canch;
3243 3259 csskd->outch.value.ui64 = css->outch;
3244 3260 csskd->msg.value.ui64 = css->msg;
3245 3261 csskd->sema.value.ui64 = css->sema;
3246 3262 csskd->namei.value.ui64 = css->namei;
3247 3263 csskd->ufsiget.value.ui64 = css->ufsiget;
3248 3264 csskd->ufsdirblk.value.ui64 = css->ufsdirblk;
3249 3265 csskd->ufsipage.value.ui64 = css->ufsipage;
3250 3266 csskd->ufsinopage.value.ui64 = css->ufsinopage;
3251 3267 csskd->procovf.value.ui64 = css->procovf;
3252 3268 csskd->intrthread.value.ui64 = 0;
3253 3269 for (i = 0; i < LOCK_LEVEL - 1; i++)
3254 3270 csskd->intrthread.value.ui64 += css->intr[i];
3255 3271 csskd->intrblk.value.ui64 = css->intrblk;
3256 3272 csskd->intrunpin.value.ui64 = css->intrunpin;
3257 3273 csskd->idlethread.value.ui64 = css->idlethread;
3258 3274 csskd->inv_swtch.value.ui64 = css->inv_swtch;
3259 3275 csskd->nthreads.value.ui64 = css->nthreads;
3260 3276 csskd->cpumigrate.value.ui64 = css->cpumigrate;
3261 3277 csskd->xcalls.value.ui64 = css->xcalls;
3262 3278 csskd->mutex_adenters.value.ui64 = css->mutex_adenters;
3263 3279 csskd->rw_rdfails.value.ui64 = css->rw_rdfails;
3264 3280 csskd->rw_wrfails.value.ui64 = css->rw_wrfails;
3265 3281 csskd->modload.value.ui64 = css->modload;
3266 3282 csskd->modunload.value.ui64 = css->modunload;
3267 3283 csskd->bawrite.value.ui64 = css->bawrite;
3268 3284 csskd->iowait.value.ui64 = css->iowait;
3269 3285
3270 3286 return (0);
3271 3287 }
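
Note: the comparisons at the top of cpu_sys_stats_ks_update() are what enforce the "never decrease" promise in the comment -- each freshly sampled mstate time is clamped so it is at least the value last exported through the kstat. A minimal standalone sketch of that pattern (hypothetical helper name, not part of cpu.c) could look like:

    #include <stdint.h>

    /*
     * Illustration only: return the larger of the value last handed out and
     * the newly sampled one, so a counter exported via repeated kstat reads
     * never appears to move backwards.
     */
    static uint64_t
    kstat_clamp_nondecreasing(uint64_t last_exported, uint64_t sampled)
    {
            return (sampled > last_exported ? sampled : last_exported);
    }

In cpu_sys_stats_ks_update() the same max is applied in place to msnsecs[CMS_IDLE], msnsecs[CMS_USER] and msnsecs[CMS_SYSTEM] before they are copied back into the kstat data.
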
3272 3288
3273 3289 static int
3274 3290 cpu_vm_stats_ks_update(kstat_t *ksp, int rw)
3275 3291 {
3276 3292 cpu_t *cp = (cpu_t *)ksp->ks_private;
3277 3293 struct cpu_vm_stats_ks_data *cvskd;
3278 3294 cpu_vm_stats_t *cvs;
3279 3295
3280 3296 if (rw == KSTAT_WRITE)
3281 3297 return (EACCES);
3282 3298
3283 3299 cvs = &cp->cpu_stats.vm;
3284 3300 cvskd = ksp->ks_data;
3285 3301
3286 3302 bcopy(&cpu_vm_stats_ks_data_template, ksp->ks_data,
3287 3303 sizeof (cpu_vm_stats_ks_data_template));
3288 3304 cvskd->pgrec.value.ui64 = cvs->pgrec;
3289 3305 cvskd->pgfrec.value.ui64 = cvs->pgfrec;
3290 3306 cvskd->pgin.value.ui64 = cvs->pgin;
3291 3307 cvskd->pgpgin.value.ui64 = cvs->pgpgin;
3292 3308 cvskd->pgout.value.ui64 = cvs->pgout;
3293 3309 cvskd->pgpgout.value.ui64 = cvs->pgpgout;
3294 3310 cvskd->swapin.value.ui64 = cvs->swapin;
3295 3311 cvskd->pgswapin.value.ui64 = cvs->pgswapin;
3296 3312 cvskd->swapout.value.ui64 = cvs->swapout;
3297 3313 cvskd->pgswapout.value.ui64 = cvs->pgswapout;
3298 3314 cvskd->zfod.value.ui64 = cvs->zfod;
3299 3315 cvskd->dfree.value.ui64 = cvs->dfree;
3300 3316 cvskd->scan.value.ui64 = cvs->scan;
3301 3317 cvskd->rev.value.ui64 = cvs->rev;
3302 3318 cvskd->hat_fault.value.ui64 = cvs->hat_fault;
3303 3319 cvskd->as_fault.value.ui64 = cvs->as_fault;
3304 3320 cvskd->maj_fault.value.ui64 = cvs->maj_fault;
3305 3321 cvskd->cow_fault.value.ui64 = cvs->cow_fault;
3306 3322 cvskd->prot_fault.value.ui64 = cvs->prot_fault;
3307 3323 cvskd->softlock.value.ui64 = cvs->softlock;
3308 3324 cvskd->kernel_asflt.value.ui64 = cvs->kernel_asflt;
3309 3325 cvskd->pgrrun.value.ui64 = cvs->pgrrun;
3310 3326 cvskd->execpgin.value.ui64 = cvs->execpgin;
3311 3327 cvskd->execpgout.value.ui64 = cvs->execpgout;
3312 3328 cvskd->execfree.value.ui64 = cvs->execfree;
3313 3329 cvskd->anonpgin.value.ui64 = cvs->anonpgin;
3314 3330 cvskd->anonpgout.value.ui64 = cvs->anonpgout;
3315 3331 cvskd->anonfree.value.ui64 = cvs->anonfree;
3316 3332 cvskd->fspgin.value.ui64 = cvs->fspgin;
3317 3333 cvskd->fspgout.value.ui64 = cvs->fspgout;
3318 3334 cvskd->fsfree.value.ui64 = cvs->fsfree;
3319 3335
3320 3336 return (0);
3321 3337 }
3322 3338
3323 3339 static int
3324 3340 cpu_stat_ks_update(kstat_t *ksp, int rw)
3325 3341 {
3326 3342 cpu_stat_t *cso;
3327 3343 cpu_t *cp;
3328 3344 int i;
3329 3345 hrtime_t msnsecs[NCMSTATES];
3330 3346
3331 3347 cso = (cpu_stat_t *)ksp->ks_data;
3332 3348 cp = (cpu_t *)ksp->ks_private;
3333 3349
3334 3350 if (rw == KSTAT_WRITE)
3335 3351 return (EACCES);
3336 3352
3337 3353 /*
3338 3354 * Read CPU mstate, but compare with the last values we
3339 3355 * received to make sure that the returned kstats never
3340 3356 * decrease.
3341 3357 */
3342 3358
3343 3359 get_cpu_mstate(cp, msnsecs);
3344 3360 msnsecs[CMS_IDLE] = NSEC_TO_TICK(msnsecs[CMS_IDLE]);
3345 3361 msnsecs[CMS_USER] = NSEC_TO_TICK(msnsecs[CMS_USER]);
3346 3362 msnsecs[CMS_SYSTEM] = NSEC_TO_TICK(msnsecs[CMS_SYSTEM]);
3347 3363 if (cso->cpu_sysinfo.cpu[CPU_IDLE] < msnsecs[CMS_IDLE])
3348 3364 cso->cpu_sysinfo.cpu[CPU_IDLE] = msnsecs[CMS_IDLE];
3349 3365 if (cso->cpu_sysinfo.cpu[CPU_USER] < msnsecs[CMS_USER])
3350 3366 cso->cpu_sysinfo.cpu[CPU_USER] = msnsecs[CMS_USER];
3351 3367 if (cso->cpu_sysinfo.cpu[CPU_KERNEL] < msnsecs[CMS_SYSTEM])
3352 3368 cso->cpu_sysinfo.cpu[CPU_KERNEL] = msnsecs[CMS_SYSTEM];
3353 3369 cso->cpu_sysinfo.cpu[CPU_WAIT] = 0;
3354 3370 cso->cpu_sysinfo.wait[W_IO] = 0;
3355 3371 cso->cpu_sysinfo.wait[W_SWAP] = 0;
3356 3372 cso->cpu_sysinfo.wait[W_PIO] = 0;
3357 3373 cso->cpu_sysinfo.bread = CPU_STATS(cp, sys.bread);
3358 3374 cso->cpu_sysinfo.bwrite = CPU_STATS(cp, sys.bwrite);
3359 3375 cso->cpu_sysinfo.lread = CPU_STATS(cp, sys.lread);
3360 3376 cso->cpu_sysinfo.lwrite = CPU_STATS(cp, sys.lwrite);
3361 3377 cso->cpu_sysinfo.phread = CPU_STATS(cp, sys.phread);
3362 3378 cso->cpu_sysinfo.phwrite = CPU_STATS(cp, sys.phwrite);
3363 3379 cso->cpu_sysinfo.pswitch = CPU_STATS(cp, sys.pswitch);
3364 3380 cso->cpu_sysinfo.trap = CPU_STATS(cp, sys.trap);
3365 3381 cso->cpu_sysinfo.intr = 0;
3366 3382 for (i = 0; i < PIL_MAX; i++)
3367 3383 cso->cpu_sysinfo.intr += CPU_STATS(cp, sys.intr[i]);
3368 3384 cso->cpu_sysinfo.syscall = CPU_STATS(cp, sys.syscall);
3369 3385 cso->cpu_sysinfo.sysread = CPU_STATS(cp, sys.sysread);
3370 3386 cso->cpu_sysinfo.syswrite = CPU_STATS(cp, sys.syswrite);
3371 3387 cso->cpu_sysinfo.sysfork = CPU_STATS(cp, sys.sysfork);
3372 3388 cso->cpu_sysinfo.sysvfork = CPU_STATS(cp, sys.sysvfork);
3373 3389 cso->cpu_sysinfo.sysexec = CPU_STATS(cp, sys.sysexec);
3374 3390 cso->cpu_sysinfo.readch = CPU_STATS(cp, sys.readch);
3375 3391 cso->cpu_sysinfo.writech = CPU_STATS(cp, sys.writech);
3376 3392 cso->cpu_sysinfo.rcvint = CPU_STATS(cp, sys.rcvint);
3377 3393 cso->cpu_sysinfo.xmtint = CPU_STATS(cp, sys.xmtint);
3378 3394 cso->cpu_sysinfo.mdmint = CPU_STATS(cp, sys.mdmint);
3379 3395 cso->cpu_sysinfo.rawch = CPU_STATS(cp, sys.rawch);
3380 3396 cso->cpu_sysinfo.canch = CPU_STATS(cp, sys.canch);
3381 3397 cso->cpu_sysinfo.outch = CPU_STATS(cp, sys.outch);
3382 3398 cso->cpu_sysinfo.msg = CPU_STATS(cp, sys.msg);
3383 3399 cso->cpu_sysinfo.sema = CPU_STATS(cp, sys.sema);
3384 3400 cso->cpu_sysinfo.namei = CPU_STATS(cp, sys.namei);
3385 3401 cso->cpu_sysinfo.ufsiget = CPU_STATS(cp, sys.ufsiget);
3386 3402 cso->cpu_sysinfo.ufsdirblk = CPU_STATS(cp, sys.ufsdirblk);
3387 3403 cso->cpu_sysinfo.ufsipage = CPU_STATS(cp, sys.ufsipage);
3388 3404 cso->cpu_sysinfo.ufsinopage = CPU_STATS(cp, sys.ufsinopage);
3389 3405 cso->cpu_sysinfo.inodeovf = 0;
3390 3406 cso->cpu_sysinfo.fileovf = 0;
3391 3407 cso->cpu_sysinfo.procovf = CPU_STATS(cp, sys.procovf);
3392 3408 cso->cpu_sysinfo.intrthread = 0;
3393 3409 for (i = 0; i < LOCK_LEVEL - 1; i++)
3394 3410 cso->cpu_sysinfo.intrthread += CPU_STATS(cp, sys.intr[i]);
3395 3411 cso->cpu_sysinfo.intrblk = CPU_STATS(cp, sys.intrblk);
3396 3412 cso->cpu_sysinfo.idlethread = CPU_STATS(cp, sys.idlethread);
3397 3413 cso->cpu_sysinfo.inv_swtch = CPU_STATS(cp, sys.inv_swtch);
3398 3414 cso->cpu_sysinfo.nthreads = CPU_STATS(cp, sys.nthreads);
3399 3415 cso->cpu_sysinfo.cpumigrate = CPU_STATS(cp, sys.cpumigrate);
3400 3416 cso->cpu_sysinfo.xcalls = CPU_STATS(cp, sys.xcalls);
3401 3417 cso->cpu_sysinfo.mutex_adenters = CPU_STATS(cp, sys.mutex_adenters);
3402 3418 cso->cpu_sysinfo.rw_rdfails = CPU_STATS(cp, sys.rw_rdfails);
3403 3419 cso->cpu_sysinfo.rw_wrfails = CPU_STATS(cp, sys.rw_wrfails);
3404 3420 cso->cpu_sysinfo.modload = CPU_STATS(cp, sys.modload);
3405 3421 cso->cpu_sysinfo.modunload = CPU_STATS(cp, sys.modunload);
3406 3422 cso->cpu_sysinfo.bawrite = CPU_STATS(cp, sys.bawrite);
3407 3423 cso->cpu_sysinfo.rw_enters = 0;
3408 3424 cso->cpu_sysinfo.win_uo_cnt = 0;
3409 3425 cso->cpu_sysinfo.win_uu_cnt = 0;
3410 3426 cso->cpu_sysinfo.win_so_cnt = 0;
3411 3427 cso->cpu_sysinfo.win_su_cnt = 0;
3412 3428 cso->cpu_sysinfo.win_suo_cnt = 0;
3413 3429
3414 3430 cso->cpu_syswait.iowait = CPU_STATS(cp, sys.iowait);
3415 3431 cso->cpu_syswait.swap = 0;
3416 3432 cso->cpu_syswait.physio = 0;
3417 3433
3418 3434 cso->cpu_vminfo.pgrec = CPU_STATS(cp, vm.pgrec);
3419 3435 cso->cpu_vminfo.pgfrec = CPU_STATS(cp, vm.pgfrec);
3420 3436 cso->cpu_vminfo.pgin = CPU_STATS(cp, vm.pgin);
3421 3437 cso->cpu_vminfo.pgpgin = CPU_STATS(cp, vm.pgpgin);
3422 3438 cso->cpu_vminfo.pgout = CPU_STATS(cp, vm.pgout);
3423 3439 cso->cpu_vminfo.pgpgout = CPU_STATS(cp, vm.pgpgout);
3424 3440 cso->cpu_vminfo.swapin = CPU_STATS(cp, vm.swapin);
3425 3441 cso->cpu_vminfo.pgswapin = CPU_STATS(cp, vm.pgswapin);
3426 3442 cso->cpu_vminfo.swapout = CPU_STATS(cp, vm.swapout);
3427 3443 cso->cpu_vminfo.pgswapout = CPU_STATS(cp, vm.pgswapout);
3428 3444 cso->cpu_vminfo.zfod = CPU_STATS(cp, vm.zfod);
3429 3445 cso->cpu_vminfo.dfree = CPU_STATS(cp, vm.dfree);
3430 3446 cso->cpu_vminfo.scan = CPU_STATS(cp, vm.scan);
3431 3447 cso->cpu_vminfo.rev = CPU_STATS(cp, vm.rev);
3432 3448 cso->cpu_vminfo.hat_fault = CPU_STATS(cp, vm.hat_fault);
3433 3449 cso->cpu_vminfo.as_fault = CPU_STATS(cp, vm.as_fault);
3434 3450 cso->cpu_vminfo.maj_fault = CPU_STATS(cp, vm.maj_fault);
3435 3451 cso->cpu_vminfo.cow_fault = CPU_STATS(cp, vm.cow_fault);
3436 3452 cso->cpu_vminfo.prot_fault = CPU_STATS(cp, vm.prot_fault);
3437 3453 cso->cpu_vminfo.softlock = CPU_STATS(cp, vm.softlock);
3438 3454 cso->cpu_vminfo.kernel_asflt = CPU_STATS(cp, vm.kernel_asflt);
3439 3455 cso->cpu_vminfo.pgrrun = CPU_STATS(cp, vm.pgrrun);
3440 3456 cso->cpu_vminfo.execpgin = CPU_STATS(cp, vm.execpgin);
3441 3457 cso->cpu_vminfo.execpgout = CPU_STATS(cp, vm.execpgout);
3442 3458 cso->cpu_vminfo.execfree = CPU_STATS(cp, vm.execfree);
3443 3459 cso->cpu_vminfo.anonpgin = CPU_STATS(cp, vm.anonpgin);
3444 3460 cso->cpu_vminfo.anonpgout = CPU_STATS(cp, vm.anonpgout);
3445 3461 cso->cpu_vminfo.anonfree = CPU_STATS(cp, vm.anonfree);
3446 3462 cso->cpu_vminfo.fspgin = CPU_STATS(cp, vm.fspgin);
3447 3463 cso->cpu_vminfo.fspgout = CPU_STATS(cp, vm.fspgout);
3448 3464 cso->cpu_vminfo.fsfree = CPU_STATS(cp, vm.fsfree);
3449 3465
3450 3466 return (0);
3451 3467 }
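
These update routines only populate ks_data when a consumer performs a read; from userland that read arrives through libkstat. A minimal sketch of such a consumer is below -- the module/name/statistic strings "cpu", "sys" and "cpu_nsec_idle" are assumed from the template fields above and are not verified against this diff:

    #include <stdio.h>
    #include <kstat.h>

    /* Sketch of a libkstat consumer; build with -lkstat on illumos. */
    int
    main(void)
    {
            kstat_ctl_t *kc;
            kstat_t *ksp;
            kstat_named_t *kn;

            if ((kc = kstat_open()) == NULL)
                    return (1);
            /* Assumed kstat coordinates: module "cpu", instance 0, name "sys". */
            if ((ksp = kstat_lookup(kc, "cpu", 0, "sys")) == NULL ||
                kstat_read(kc, ksp, NULL) == -1) {
                    (void) kstat_close(kc);
                    return (1);
            }
            kn = kstat_data_lookup(ksp, "cpu_nsec_idle");
            if (kn != NULL)
                    (void) printf("cpu 0 idle nsec: %llu\n",
                        (unsigned long long)kn->value.ui64);
            (void) kstat_close(kc);
            return (0);
    }

Each such read lands in cpu_sys_stats_ks_update() (or in cpu_stat_ks_update() for the legacy cpu_stat kstats), which is why the clamping is done at read time rather than when the statistics are accumulated.
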
↓ open down ↓ |
3000 lines elided |
↑ open up ↑ |