8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25 /*
26 * Copyright (c) 2010, Intel Corporation.
27 * All rights reserved.
28 */
29
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/t_lock.h>
33 #include <sys/thread.h>
34 #include <sys/cpuvar.h>
35 #include <sys/x_call.h>
36 #include <sys/xc_levels.h>
37 #include <sys/cpu.h>
38 #include <sys/psw.h>
39 #include <sys/sunddi.h>
40 #include <sys/debug.h>
41 #include <sys/systm.h>
42 #include <sys/archsystm.h>
43 #include <sys/machsystm.h>
44 #include <sys/mutex_impl.h>
45 #include <sys/stack.h>
46 #include <sys/promif.h>
47 #include <sys/x86_archext.h>
62 * Every CPU has xc_work_cnt, which indicates it has messages to process.
63 * This value is incremented as message traffic is initiated and decremented
64 * with every message that finishes all processing.
65 *
66 * The code needs no mfence or other membar_*() calls. The uses of
67 * atomic_cas_ptr(), atomic_cas_32() and atomic_dec_32() for the message
68 * passing are implemented with LOCK prefix instructions which are
69 * equivalent to mfence.
70 *
71 * One interesting aspect of this implementation is that it allows 2 or more
72 * CPUs to initiate cross calls to intersecting sets of CPUs at the same time.
73 * The cross call processing by the CPUs will happen in any order with only
74 * a guarantee, for xc_call() and xc_sync(), that an initiator won't return
75 * from cross calls before all slaves have invoked the function.
76 *
77 * The reason for this asynchronous approach is to allow for fast global
78 * TLB shootdowns. If all CPUs, say N, tried to do a global TLB invalidation
79 * on a different Virtual Address at the same time, the old code required
80 * N squared IPIs. With this method, depending on timing, it could happen
81 * with just N IPIs.
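 * (A purely illustrative example, not taken from measurements: with N = 64
 * CPUs that is on the order of 4000 IPIs under the old scheme versus as
 * few as 64 here.)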
82 */
83
84 /*
85 * The default is to not enable collecting counts of IPI information, since
86 * the updating of shared cachelines could cause excess bus traffic.
87 */
88 uint_t xc_collect_enable = 0;
89 uint64_t xc_total_cnt = 0; /* total #IPIs sent for cross calls */
90 uint64_t xc_multi_cnt = 0; /* # times we piggy backed on another IPI */
91
92 /*
93 * Values for message states. Here are the normal transitions. A transition
94 * of "->" happens in the slave cpu and "=>" happens in the master cpu as
95 * the messages are passed back and forth.
96 *
97 * FREE => ASYNC -> DONE => FREE
98 * FREE => CALL -> DONE => FREE
99 * FREE => SYNC -> WAITING => RELEASED -> DONE => FREE
100 *
101 * The interesting one above is ASYNC. You might ask, why not go directly
102 * to FREE, instead of DONE? If it did that, it might be possible to exhaust
103 * the master's xc_free list if a master can generate ASYNC messages faster
104 * than the slave can process them. That could be handled with more complicated
105 * logic. However, since nothing important uses ASYNC, I've not bothered.
106 */
107 #define XC_MSG_FREE (0) /* msg in xc_free queue */
108 #define XC_MSG_ASYNC (1) /* msg in slave xc_msgbox */
109 #define XC_MSG_CALL (2) /* msg in slave xc_msgbox */
110 #define XC_MSG_SYNC (3) /* msg in slave xc_msgbox */
111 #define XC_MSG_WAITING (4) /* msg in master xc_msgbox or xc_waiters */
112 #define XC_MSG_RELEASED (5) /* msg in slave xc_msgbox */
113 #define XC_MSG_DONE (6) /* msg in master xc_msgbox */
114
115 /*
116 * We allow for one high priority message at a time to happen in the system.
117 * This is used for panic, kmdb, etc., so no locking is done.
118 */
119 static volatile cpuset_t xc_priority_set_store;
120 static volatile ulong_t *xc_priority_set = CPUSET2BV(xc_priority_set_store);
121 static xc_data_t xc_priority_data;
122
123 /*
124 * Wrappers to avoid C compiler warnings due to volatile. The atomic bit
125 * operations don't accept volatile bit vectors - which is a bit silly.
126 */
127 #define XC_BT_SET(vector, b) BT_ATOMIC_SET((ulong_t *)(vector), (b))
128 #define XC_BT_CLEAR(vector, b) BT_ATOMIC_CLEAR((ulong_t *)(vector), (b))
129
130 /*
131 * Decrement a CPU's work count
132 */
133 static void
134 xc_decrement(struct machcpu *mcpu)
135 {
136 atomic_dec_32(&mcpu->xc_work_cnt);
137 }
138
139 /*
140 * Increment a CPU's work count and return the old value
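 * (Returning the old value presumably lets callers tell that work was
 * already pending on the target CPU and piggy back on an IPI that is
 * already on the way, cf. xc_multi_cnt.)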
141 */
142 static int
143 xc_increment(struct machcpu *mcpu)
144 {
145 int old;
146 do {
147 old = mcpu->xc_work_cnt;
148 } while (atomic_cas_32(&mcpu->xc_work_cnt, old, old + 1) != old);
149 return (old);
150 }
176 * Extract a message from a queue. The extraction is atomic only
177 * when just one thread does extractions from the queue.
178 * If the queue is empty, NULL is returned.
179 */
180 static xc_msg_t *
181 xc_extract(xc_msg_t **queue)
182 {
183 xc_msg_t *old_head;
184
185 do {
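/*
 * Read the head through a volatile pointer so that it is refetched
 * from memory on every retry of this loop.
 */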
186 old_head = (xc_msg_t *)*(volatile xc_msg_t **)queue;
187 if (old_head == NULL)
188 return (old_head);
189 } while (atomic_cas_ptr(queue, old_head, old_head->xc_next) !=
190 old_head);
191 old_head->xc_next = NULL;
192 return (old_head);
193 }
194
195 /*
196 * Initialize the machcpu fields used for cross calls
197 */
198 static uint_t xc_initialized = 0;
199
200 void
201 xc_init_cpu(struct cpu *cpup)
202 {
203 xc_msg_t *msg;
204 int c;
205
206 /*
207 * Allocate message buffers for the new CPU.
208 */
209 for (c = 0; c < max_ncpus; ++c) {
210 if (plat_dr_support_cpu()) {
211 /*
212 * Allocate a message buffer for every CPU possible
213 * in the system, including our own, and add them to our xc
214 * message queue.
215 */
311 uint_t
312 xc_serv(caddr_t arg1, caddr_t arg2)
313 {
314 struct machcpu *mcpup = &(CPU->cpu_m);
315 xc_msg_t *msg;
316 xc_data_t *data;
317 xc_msg_t *xc_waiters = NULL;
318 uint32_t num_waiting = 0;
319 xc_func_t func;
320 xc_arg_t a1;
321 xc_arg_t a2;
322 xc_arg_t a3;
323 uint_t rc = DDI_INTR_UNCLAIMED;
324
325 while (mcpup->xc_work_cnt != 0) {
326 rc = DDI_INTR_CLAIMED;
327
328 /*
329 * We may have to wait for a message to arrive.
330 */
331 for (msg = NULL; msg == NULL;
332 msg = xc_extract(&mcpup->xc_msgbox)) {
333
334 /*
335 * Always check for and handle a priority message.
336 */
337 if (BT_TEST(xc_priority_set, CPU->cpu_id)) {
338 func = xc_priority_data.xc_func;
339 a1 = xc_priority_data.xc_a1;
340 a2 = xc_priority_data.xc_a2;
341 a3 = xc_priority_data.xc_a3;
342 XC_BT_CLEAR(xc_priority_set, CPU->cpu_id);
343 xc_decrement(mcpup);
344 func(a1, a2, a3);
345 if (mcpup->xc_work_cnt == 0)
346 return (rc);
347 }
348
349 /*
350 * wait for a message to arrive
351 */
352 SMT_PAUSE();
353 }
354
355
356 /*
357 * process the message
358 */
359 switch (msg->xc_command) {
360
361 /*
362 * ASYNC gives back the message immediately, then we do the
426
427 /*
428 * DONE means a slave has completely finished up.
429 * Once we collect all the DONE messages, we'll exit
430 * processing too.
431 */
432 case XC_MSG_DONE:
433 msg->xc_command = XC_MSG_FREE;
434 xc_insert(&mcpup->xc_free, msg);
435 xc_decrement(mcpup);
436 break;
437
438 case XC_MSG_FREE:
439 panic("free message 0x%p in msgbox", (void *)msg);
440 break;
441
442 default:
443 panic("bad message 0x%p in msgbox", (void *)msg);
444 break;
445 }
446 }
447 return (rc);
448 }
449
450 /*
451 * Initiate cross call processing.
452 */
453 static void
454 xc_common(
455 xc_func_t func,
456 xc_arg_t arg1,
457 xc_arg_t arg2,
458 xc_arg_t arg3,
459 ulong_t *set,
460 uint_t command)
461 {
462 int c;
463 struct cpu *cpup;
464 xc_msg_t *msg;
465 xc_data_t *data;
564 if (cpup == NULL || !(cpup->cpu_flags & CPU_READY))
565 continue;
566
567 /*
568 * The value of 40000 here is from old kernel code. It
569 * really should be changed to some time based value, since
570 * under a hypervisor, there's no guarantee a remote CPU
571 * is even scheduled.
572 */
573 for (i = 0; BT_TEST(xc_priority_set, c) && i < 40000; ++i)
574 SMT_PAUSE();
575
576 /*
577 * Some CPU did not respond to a previous priority request. It's
578 * probably deadlocked with interrupts blocked or some such
579 * problem. We'll just erase the previous request - which was
580 * most likely a kmdb_enter that has already expired - and plow
581 * ahead.
582 */
583 if (BT_TEST(xc_priority_set, c)) {
584 XC_BT_CLEAR(xc_priority_set, c);
585 if (cpup->cpu_m.xc_work_cnt > 0)
586 xc_decrement(&cpup->cpu_m);
587 }
588 }
589
590 /*
591 * fill in cross call data
592 */
593 xc_priority_data.xc_func = func;
594 xc_priority_data.xc_a1 = arg1;
595 xc_priority_data.xc_a2 = arg2;
596 xc_priority_data.xc_a3 = arg3;
597
598 /*
599 * Post messages to all CPUs involved that are CPU_READY
600 * We'll always IPI, plus bang on the xc_msgbox for i86_mwait()
601 */
602 for (c = 0; c < max_ncpus; ++c) {
603 if (!BT_TEST(set, c))
604 continue;
605 cpup = cpu[c];
606 if (cpup == NULL || !(cpup->cpu_flags & CPU_READY) ||
607 cpup == CPU)
608 continue;
609 (void) xc_increment(&cpup->cpu_m);
610 XC_BT_SET(xc_priority_set, c);
611 send_dirint(c, XC_HI_PIL);
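/*
 * These compare-and-swaps only store back the value that is already
 * there; the point is presumably the locked write cycle they generate
 * on the xc_msgbox cacheline, which should be enough to wake a CPU
 * idling in i86_mwait() on that line.
 */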
612 for (i = 0; i < 10; ++i) {
613 (void) atomic_cas_ptr(&cpup->cpu_m.xc_msgbox,
614 cpup->cpu_m.xc_msgbox, cpup->cpu_m.xc_msgbox);
615 }
616 }
617 }
618
619 /*
620 * Do cross call to all other CPUs with absolutely no waiting or handshaking.
621 * This should only be used for extraordinary operations, like panic(), which
622 * need to work, in some fashion, in a not completely functional system.
623 * All other uses that want minimal waiting should use xc_call_nowait().
624 */
625 void
626 xc_priority(
627 xc_arg_t arg1,
628 xc_arg_t arg2,
629 xc_arg_t arg3,
630 ulong_t *set,
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25 /*
26 * Copyright (c) 2010, Intel Corporation.
27 * All rights reserved.
28 * Copyright 2018 Joyent, Inc.
29 */
30
31 #include <sys/types.h>
32 #include <sys/param.h>
33 #include <sys/t_lock.h>
34 #include <sys/thread.h>
35 #include <sys/cpuvar.h>
36 #include <sys/x_call.h>
37 #include <sys/xc_levels.h>
38 #include <sys/cpu.h>
39 #include <sys/psw.h>
40 #include <sys/sunddi.h>
41 #include <sys/debug.h>
42 #include <sys/systm.h>
43 #include <sys/archsystm.h>
44 #include <sys/machsystm.h>
45 #include <sys/mutex_impl.h>
46 #include <sys/stack.h>
47 #include <sys/promif.h>
48 #include <sys/x86_archext.h>
63 * Every CPU has xc_work_cnt, which indicates it has messages to process.
64 * This value is incremented as message traffic is initiated and decremented
65 * with every message that finishes all processing.
66 *
67 * The code needs no mfence or other membar_*() calls. The uses of
68 * atomic_cas_ptr(), atomic_cas_32() and atomic_dec_32() for the message
69 * passing are implemented with LOCK prefix instructions which are
70 * equivalent to mfence.
71 *
72 * One interesting aspect of this implementation is that it allows 2 or more
73 * CPUs to initiate cross calls to intersecting sets of CPUs at the same time.
74 * The cross call processing by the CPUs will happen in any order with only
75 * a guarantee, for xc_call() and xc_sync(), that an initiator won't return
76 * from cross calls before all slaves have invoked the function.
77 *
78 * The reason for this asynchronous approach is to allow for fast global
79 * TLB shootdowns. If all CPUs, say N, tried to do a global TLB invalidation
80 * on a different Virtual Address at the same time, the old code required
81 * N squared IPIs. With this method, depending on timing, it could happen
82 * with just N IPIs.
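 * (A purely illustrative example, not taken from measurements: with N = 64
 * CPUs that is on the order of 4000 IPIs under the old scheme versus as
 * few as 64 here.)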
83 *
84 * Here are the normal transitions for XC_MSG_* values in ->xc_command. A
85 * transition of "->" happens in the slave cpu and "=>" happens in the master
86 * cpu as the messages are passed back and forth.
87 *
88 * FREE => ASYNC -> DONE => FREE
89 * FREE => CALL -> DONE => FREE
90 * FREE => SYNC -> WAITING => RELEASED -> DONE => FREE
91 *
92 * The interesting one above is ASYNC. You might ask, why not go directly
93 * to FREE, instead of DONE? If it did that, it might be possible to exhaust
94 * the master's xc_free list if a master can generate ASYNC messages faster
95 * than the slave can process them. That could be handled with more complicated
96 * logic. However, since nothing important uses ASYNC, I've not bothered.
97 */
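
/*
 * Illustrative sketch only (not part of this file; the handler name and
 * arguments are hypothetical): a synchronous cross call to every CPU
 * might be issued roughly like this, assuming a handler that matches
 * the xc_func_t signature.
 *
 *	static int
 *	my_handler(xc_arg_t a1, xc_arg_t a2, xc_arg_t a3)
 *	{
 *		... per-CPU work ...
 *		return (0);
 *	}
 *
 *	cpuset_t set;
 *
 *	CPUSET_ALL(set);
 *	xc_sync(0, 0, 0, CPUSET2BV(set), my_handler);
 */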
98
99 /*
100 * The default is to not enable collecting counts of IPI information, since
101 * the updating of shared cachelines could cause excess bus traffic.
102 */
103 uint_t xc_collect_enable = 0;
104 uint64_t xc_total_cnt = 0; /* total #IPIs sent for cross calls */
105 uint64_t xc_multi_cnt = 0; /* # times we piggy backed on another IPI */
106
107 /*
108 * We allow for one high priority message at a time to happen in the system.
109 * This is used for panic, kmdb, etc., so no locking is done.
110 */
111 static volatile cpuset_t xc_priority_set_store;
112 static volatile ulong_t *xc_priority_set = CPUSET2BV(xc_priority_set_store);
113 static xc_data_t xc_priority_data;
114
115 /*
116 * Decrement a CPU's work count
117 */
118 static void
119 xc_decrement(struct machcpu *mcpu)
120 {
121 atomic_dec_32(&mcpu->xc_work_cnt);
122 }
123
124 /*
125 * Increment a CPU's work count and return the old value
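 * (Returning the old value presumably lets callers tell that work was
 * already pending on the target CPU and piggy back on an IPI that is
 * already on the way, cf. xc_multi_cnt.)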
126 */
127 static int
128 xc_increment(struct machcpu *mcpu)
129 {
130 int old;
131 do {
132 old = mcpu->xc_work_cnt;
133 } while (atomic_cas_32(&mcpu->xc_work_cnt, old, old + 1) != old);
134 return (old);
135 }
161 * Extract a message from a queue. The extraction is atomic only
162 * when just one thread does extractions from the queue.
163 * If the queue is empty, NULL is returned.
164 */
165 static xc_msg_t *
166 xc_extract(xc_msg_t **queue)
167 {
168 xc_msg_t *old_head;
169
170 do {
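/*
 * Read the head through a volatile pointer so that it is refetched
 * from memory on every retry of this loop.
 */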
171 old_head = (xc_msg_t *)*(volatile xc_msg_t **)queue;
172 if (old_head == NULL)
173 return (old_head);
174 } while (atomic_cas_ptr(queue, old_head, old_head->xc_next) !=
175 old_head);
176 old_head->xc_next = NULL;
177 return (old_head);
178 }
179
180 /*
181 * Extract the next message from the CPU's queue, and place the message in
182 * .xc_curmsg. The latter is solely to make debugging (and ::xcall) more
183 * useful.
184 */
185 static xc_msg_t *
186 xc_get(void)
187 {
188 struct machcpu *mcpup = &CPU->cpu_m;
189 xc_msg_t *msg = xc_extract(&mcpup->xc_msgbox);
190 mcpup->xc_curmsg = msg;
191 return (msg);
192 }
193
194 /*
195 * Initialize the machcpu fields used for cross calls
196 */
197 static uint_t xc_initialized = 0;
198
199 void
200 xc_init_cpu(struct cpu *cpup)
201 {
202 xc_msg_t *msg;
203 int c;
204
205 /*
206 * Allocate message buffers for the new CPU.
207 */
208 for (c = 0; c < max_ncpus; ++c) {
209 if (plat_dr_support_cpu()) {
210 /*
211 * Allocate a message buffer for every CPU possible
212 * in the system, including our own, and add them to our xc
213 * message queue.
214 */
310 uint_t
311 xc_serv(caddr_t arg1, caddr_t arg2)
312 {
313 struct machcpu *mcpup = &(CPU->cpu_m);
314 xc_msg_t *msg;
315 xc_data_t *data;
316 xc_msg_t *xc_waiters = NULL;
317 uint32_t num_waiting = 0;
318 xc_func_t func;
319 xc_arg_t a1;
320 xc_arg_t a2;
321 xc_arg_t a3;
322 uint_t rc = DDI_INTR_UNCLAIMED;
323
324 while (mcpup->xc_work_cnt != 0) {
325 rc = DDI_INTR_CLAIMED;
326
327 /*
328 * We may have to wait for a message to arrive.
329 */
330 for (msg = NULL; msg == NULL; msg = xc_get()) {
331
332 /*
333 * Always check for and handle a priority message.
334 */
335 if (BT_TEST(xc_priority_set, CPU->cpu_id)) {
336 func = xc_priority_data.xc_func;
337 a1 = xc_priority_data.xc_a1;
338 a2 = xc_priority_data.xc_a2;
339 a3 = xc_priority_data.xc_a3;
340 BT_ATOMIC_CLEAR(xc_priority_set, CPU->cpu_id);
341 xc_decrement(mcpup);
342 func(a1, a2, a3);
343 if (mcpup->xc_work_cnt == 0)
344 return (rc);
345 }
346
347 /*
348 * wait for a message to arrive
349 */
350 SMT_PAUSE();
351 }
352
353
354 /*
355 * process the message
356 */
357 switch (msg->xc_command) {
358
359 /*
360 * ASYNC gives back the message immediately, then we do the
424
425 /*
426 * DONE means a slave has completely finished up.
427 * Once we collect all the DONE messages, we'll exit
428 * processing too.
429 */
430 case XC_MSG_DONE:
431 msg->xc_command = XC_MSG_FREE;
432 xc_insert(&mcpup->xc_free, msg);
433 xc_decrement(mcpup);
434 break;
435
436 case XC_MSG_FREE:
437 panic("free message 0x%p in msgbox", (void *)msg);
438 break;
439
440 default:
441 panic("bad message 0x%p in msgbox", (void *)msg);
442 break;
443 }
444
445 CPU->cpu_m.xc_curmsg = NULL;
446 }
447 return (rc);
448 }
449
450 /*
451 * Initiate cross call processing.
452 */
453 static void
454 xc_common(
455 xc_func_t func,
456 xc_arg_t arg1,
457 xc_arg_t arg2,
458 xc_arg_t arg3,
459 ulong_t *set,
460 uint_t command)
461 {
462 int c;
463 struct cpu *cpup;
464 xc_msg_t *msg;
465 xc_data_t *data;
564 if (cpup == NULL || !(cpup->cpu_flags & CPU_READY))
565 continue;
566
567 /*
568 * The value of 40000 here is from old kernel code. It
569 * really should be changed to some time based value, since
570 * under a hypervisor, there's no guarantee a remote CPU
571 * is even scheduled.
572 */
573 for (i = 0; BT_TEST(xc_priority_set, c) && i < 40000; ++i)
574 SMT_PAUSE();
575
576 /*
577 * Some CPU did not respond to a previous priority request. It's
578 * probably deadlocked with interrupts blocked or some such
579 * problem. We'll just erase the previous request - which was
580 * most likely a kmdb_enter that has already expired - and plow
581 * ahead.
582 */
583 if (BT_TEST(xc_priority_set, c)) {
584 BT_ATOMIC_CLEAR(xc_priority_set, c);
585 if (cpup->cpu_m.xc_work_cnt > 0)
586 xc_decrement(&cpup->cpu_m);
587 }
588 }
589
590 /*
591 * fill in cross call data
592 */
593 xc_priority_data.xc_func = func;
594 xc_priority_data.xc_a1 = arg1;
595 xc_priority_data.xc_a2 = arg2;
596 xc_priority_data.xc_a3 = arg3;
597
598 /*
599 * Post messages to all CPUs involved that are CPU_READY
600 * We'll always IPI, plus bang on the xc_msgbox for i86_mwait()
601 */
602 for (c = 0; c < max_ncpus; ++c) {
603 if (!BT_TEST(set, c))
604 continue;
605 cpup = cpu[c];
606 if (cpup == NULL || !(cpup->cpu_flags & CPU_READY) ||
607 cpup == CPU)
608 continue;
609 (void) xc_increment(&cpup->cpu_m);
610 BT_ATOMIC_SET(xc_priority_set, c);
611 send_dirint(c, XC_HI_PIL);
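/*
 * These compare-and-swaps only store back the value that is already
 * there; the point is presumably the locked write cycle they generate
 * on the xc_msgbox cacheline, which should be enough to wake a CPU
 * idling in i86_mwait() on that line.
 */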
612 for (i = 0; i < 10; ++i) {
613 (void) atomic_cas_ptr(&cpup->cpu_m.xc_msgbox,
614 cpup->cpu_m.xc_msgbox, cpup->cpu_m.xc_msgbox);
615 }
616 }
617 }
618
619 /*
620 * Do cross call to all other CPUs with absolutely no waiting or handshaking.
621 * This should only be used for extraordinary operations, like panic(), which
622 * need to work, in some fashion, in a not completely functional system.
623 * All other uses that want minimal waiting should use xc_call_nowait().
624 */
625 void
626 xc_priority(
627 xc_arg_t arg1,
628 xc_arg_t arg2,
629 xc_arg_t arg3,
630 ulong_t *set,