Print this page
9736 kmdb tortures via single-step miscellaneous trap
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/cmd/mdb/common/kmdb/kaif_start.c
+++ new/usr/src/cmd/mdb/common/kmdb/kaif_start.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
↓ open down ↓ |
13 lines elided |
↑ open up ↑ |
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 + * Copyright 2018 Joyent, Inc.
24 25 */
25 26
26 -#pragma ident "%Z%%M% %I% %E% SMI"
27 -
28 27 /*
29 28 * The main CPU-control loops, used to control masters and slaves.
30 29 */
31 30
32 31 #include <sys/types.h>
33 32
34 33 #include <kmdb/kaif.h>
35 34 #include <kmdb/kaif_start.h>
36 35 #include <kmdb/kmdb_asmutil.h>
37 36 #include <kmdb/kmdb_dpi_impl.h>
38 37 #include <kmdb/kmdb_kdi.h>
39 38
40 39 #define KAIF_SLAVE_CMD_SPIN 0
41 40 #define KAIF_SLAVE_CMD_SWITCH 1
42 41 #define KAIF_SLAVE_CMD_RESUME 2
43 42 #define KAIF_SLAVE_CMD_FLUSH 3
44 43 #define KAIF_SLAVE_CMD_REBOOT 4
45 44 #if defined(__sparc)
46 45 #define KAIF_SLAVE_CMD_ACK 5
47 46 #endif
48 47
49 48
50 49 /*
51 50 * Used to synchronize attempts to set kaif_master_cpuid. kaif_master_cpuid may
52 51 * be read without kaif_master_lock, and may be written by the current master
53 52 * CPU.
54 53 */
55 54 int kaif_master_cpuid = KAIF_MASTER_CPUID_UNSET;
56 55 static uintptr_t kaif_master_lock = 0;
57 56
58 57 /*
59 58 * Used to ensure that all CPUs leave the debugger together. kaif_loop_lock must
60 59 * be held to write kaif_looping, but need not be held to read it.
61 60 */
62 61 static volatile uint_t kaif_looping;
63 62 static uintptr_t kaif_loop_lock;
64 63
65 64 static volatile int kaif_slave_cmd;
66 65 static volatile int kaif_slave_tgt; /* target cpuid for CMD_SWITCH */
67 66
68 67 static void
69 68 kaif_lock_enter(uintptr_t *lock)
70 69 {
71 70 while (cas(lock, 0, 1) != 0)
72 71 continue;
73 72 membar_producer();
74 73 }
75 74
76 75 static void
77 76 kaif_lock_exit(uintptr_t *lock)
78 77 {
79 78 *lock = 0;
80 79 membar_producer();
81 80 }
82 81
83 82 static void
84 83 kaif_start_slaves(int cmd)
85 84 {
86 85 kaif_slave_cmd = cmd;
87 86 kmdb_kdi_start_slaves();
88 87 }
89 88
/*
 * Main loop for the master CPU.  Installs the debugger's trap table,
 * then repeatedly fields commands from the DPI layer (::switch, resume,
 * cache flush, reboot) and directs the slave CPUs accordingly.  Returns
 * a KAIF_CPU_CMD_* value telling kaif_main_loop() what this CPU should
 * do next (SWITCH means this CPU is demoted to slave).
 */
static int
kaif_master_loop(kaif_cpusave_t *cpusave)
{
	int notflushed, i;

#if defined(__sparc)
	kaif_prom_rearm();
#endif
	kaif_trap_set_debugger();

	/*
	 * If we re-entered due to a ::switch, we need to tell the slave CPUs
	 * to sleep again.
	 */
	kmdb_kdi_stop_slaves(cpusave->krs_cpu_id, 0);

master_loop:
	switch (kmdb_dpi_reenter()) {
	case KMDB_DPI_CMD_SWITCH_CPU:
		/*
		 * We assume that the target CPU is a valid slave.  There's no
		 * easy way to complain here, so we'll assume that the caller
		 * has done the proper checking.
		 */
		if (kmdb_dpi_switch_target == cpusave->krs_cpu_id)
			break;

		/* Hand the mastership to the target; we become a slave. */
		kaif_slave_tgt = kaif_master_cpuid = kmdb_dpi_switch_target;
		cpusave->krs_cpu_state = KAIF_CPU_STATE_SLAVE;
		membar_producer();

		/*
		 * Switch back to the saved trap table before we switch CPUs --
		 * we need to make sure that only one CPU is on the debugger's
		 * table at a time.
		 */
		kaif_trap_set_saved(cpusave);

		kaif_start_slaves(KAIF_SLAVE_CMD_SWITCH);

		/* The new master is now awake */
		return (KAIF_CPU_CMD_SWITCH);

	case KMDB_DPI_CMD_RESUME_ALL:
	case KMDB_DPI_CMD_RESUME_UNLOAD:
		/*
		 * Resume everyone, clean up for next entry.
		 */
		kaif_master_cpuid = KAIF_MASTER_CPUID_UNSET;
		membar_producer();
		kaif_start_slaves(KAIF_SLAVE_CMD_RESUME);

		if (kmdb_dpi_work_required())
			kmdb_dpi_wrintr_fire();

		kaif_trap_set_saved(cpusave);

		return (KAIF_CPU_CMD_RESUME);

	case KMDB_DPI_CMD_RESUME_MASTER:
		/*
		 * Single-CPU resume, which is performed on the debugger's
		 * trap table (so no need to switch back).
		 */
		return (KAIF_CPU_CMD_RESUME_MASTER);

	case KMDB_DPI_CMD_FLUSH_CACHES:
		kaif_start_slaves(KAIF_SLAVE_CMD_FLUSH);

		/*
		 * Wait for the other cpus to finish flushing their caches.
		 * Each slave sets krs_cpu_flushed in kaif_slave_loop() once
		 * its kmdb_kdi_flush_caches() call is done.
		 */
		do {
			notflushed = 0;
			for (i = 0; i < kaif_ncpusave; i++) {
				kaif_cpusave_t *save = &kaif_cpusave[i];

				if (save->krs_cpu_state ==
				    KAIF_CPU_STATE_SLAVE &&
				    !save->krs_cpu_flushed) {
					notflushed++;
					break;
				}
			}
		} while (notflushed > 0);

		kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN;
		break;

#if defined(__i386) || defined(__amd64)
	case KMDB_DPI_CMD_REBOOT:
		/*
		 * Reboot must be initiated by CPU 0.  I could ask why, but I'm
		 * afraid that I don't want to know the answer.
		 */
		if (cpusave->krs_cpu_id == 0)
			kmdb_kdi_reboot();

		kaif_start_slaves(KAIF_SLAVE_CMD_REBOOT);

		/*
		 * Spin forever, waiting for CPU 0 (apparently a slave) to
		 * reboot the system.
		 */
		for (;;)
			continue;

		/*NOTREACHED*/
		break;
#endif
	}

	goto master_loop;
}
204 203
/*
 * Main loop for slave CPUs: wait in kmdb_kdi_slave_wait(), polling
 * kaif_slave_cmd for orders from the master.  Returns a KAIF_CPU_CMD_*
 * value -- SWITCH if this slave has been promoted to master via
 * ::switch, RESUME when the master releases everyone.
 */
static int
kaif_slave_loop(kaif_cpusave_t *cpusave)
{
	int slavecmd, rv;

#if defined(__sparc)
	/*
	 * If the user elects to drop to OBP from the debugger, some OBP
	 * implementations will cross-call the slaves.  We have to turn
	 * IE back on so we can receive the cross-calls.  If we don't,
	 * some OBP implementations will wait forever.
	 */
	interrupts_on();
#endif

	/* Wait for duty to call */
	for (;;) {
		slavecmd = kaif_slave_cmd;

		if (slavecmd == KAIF_SLAVE_CMD_SWITCH &&
		    kaif_slave_tgt == cpusave->krs_cpu_id) {
			/* We are the ::switch target - become the master. */
			kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN;
			cpusave->krs_cpu_state = KAIF_CPU_STATE_MASTER;
			rv = KAIF_CPU_CMD_SWITCH;
			break;

		} else if (slavecmd == KAIF_SLAVE_CMD_FLUSH) {
			/* Flush, record completion, and keep spinning. */
			kmdb_kdi_flush_caches();
			cpusave->krs_cpu_flushed = 1;
			continue;

#if defined(__i386) || defined(__amd64)
		} else if (slavecmd == KAIF_SLAVE_CMD_REBOOT &&
		    cpusave->krs_cpu_id == 0) {
			/* Reboot must be initiated by CPU 0. */
			rv = 0;
			kmdb_kdi_reboot();
			break;
#endif

		} else if (slavecmd == KAIF_SLAVE_CMD_RESUME) {
			rv = KAIF_CPU_CMD_RESUME;
			break;
#if defined(__sparc)
		} else if (slavecmd == KAIF_SLAVE_CMD_ACK) {
			/* Tell kaif_slave_loop_barrier() we're here. */
			cpusave->krs_cpu_acked = 1;
		} else if (cpusave->krs_cpu_acked &&
		    slavecmd == KAIF_SLAVE_CMD_SPIN) {
			/* Barrier is done; clear the ack for next time. */
			cpusave->krs_cpu_acked = 0;
#endif
		}

		kmdb_kdi_slave_wait();
	}

#if defined(__sparc)
	interrupts_off();
#endif

	return (rv);
}
265 264
/*
 * Elect a master CPU for this debugger entry.  The first CPU to take
 * kaif_master_lock while kaif_master_cpuid is unset becomes the master
 * and stops the remaining CPUs; later arrivals mark themselves slaves.
 */
static void
kaif_select_master(kaif_cpusave_t *cpusave)
{
	kaif_lock_enter(&kaif_master_lock);

	if (kaif_master_cpuid == KAIF_MASTER_CPUID_UNSET) {
		/* This is the master. */
		kaif_master_cpuid = cpusave->krs_cpu_id;
		cpusave->krs_cpu_state = KAIF_CPU_STATE_MASTER;
		kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN;

		/* Publish the stores above before stopping the slaves. */
		membar_producer();

		kmdb_kdi_stop_slaves(cpusave->krs_cpu_id, 1);
	} else {
		/* The master was already chosen - go be a slave */
		cpusave->krs_cpu_state = KAIF_CPU_STATE_SLAVE;
		membar_producer();
	}

	kaif_lock_exit(&kaif_master_lock);
}
288 287
/*
 * Per-CPU debugger entry point.  Elects a master if one has not been
 * chosen, routes this CPU to the master or slave loop (re-entering on
 * ::switch), and holds all CPUs at an exit barrier until everyone has
 * left their loops.  Returns this CPU's final KAIF_CPU_CMD_*
 * disposition to the caller.
 */
int
kaif_main_loop(kaif_cpusave_t *cpusave)
{
	int cmd;

	if (kaif_master_cpuid == KAIF_MASTER_CPUID_UNSET) {

		/*
		 * Special case: Unload requested before first debugger entry.
		 * Don't stop the world, as there's nothing to clean up that
		 * can't be handled by the running kernel.
		 */
		if (!kmdb_dpi_resume_requested &&
		    kmdb_kdi_get_unload_request()) {
			cpusave->krs_cpu_state = KAIF_CPU_STATE_NONE;
			return (KAIF_CPU_CMD_RESUME);
		}

		/*
		 * We're a slave with no master, so just resume.  This can
		 * happen if, prior to this, two CPUs both raced through
		 * kdi_cmnint() - for example, a breakpoint on a frequently
		 * called function.  The loser will be redirected to the slave
		 * loop; note that the event itself is lost at this point.
		 *
		 * The winner will then cross-call that slave, but it won't
		 * actually be received until the slave returns to the kernel
		 * and enables interrupts.  We'll then come back in via
		 * kdi_slave_entry() and hit this path.
		 */
		if (cpusave->krs_cpu_state == KAIF_CPU_STATE_SLAVE) {
			cpusave->krs_cpu_state = KAIF_CPU_STATE_NONE;
			return (KAIF_CPU_CMD_RESUME);
		}

		kaif_select_master(cpusave);

#ifdef __sparc
		if (kaif_master_cpuid == cpusave->krs_cpu_id) {
			/*
			 * Everyone has arrived, so we can disarm the post-PROM
			 * entry point.
			 */
			*kaif_promexitarmp = 0;
			membar_producer();
		}
#endif
	} else if (kaif_master_cpuid == cpusave->krs_cpu_id) {
		cpusave->krs_cpu_state = KAIF_CPU_STATE_MASTER;
	} else {
		cpusave->krs_cpu_state = KAIF_CPU_STATE_SLAVE;
	}

	cpusave->krs_cpu_flushed = 0;

	/* Count ourselves into the set of CPUs in the debugger loops. */
	kaif_lock_enter(&kaif_loop_lock);
	kaif_looping++;
	kaif_lock_exit(&kaif_loop_lock);

	/*
	 * We know who the master and slaves are, so now they can go off
	 * to their respective loops.
	 */
	do {
		if (kaif_master_cpuid == cpusave->krs_cpu_id)
			cmd = kaif_master_loop(cpusave);
		else
			cmd = kaif_slave_loop(cpusave);
	} while (cmd == KAIF_CPU_CMD_SWITCH);

	kaif_lock_enter(&kaif_loop_lock);
	kaif_looping--;
	kaif_lock_exit(&kaif_loop_lock);

	cpusave->krs_cpu_state = KAIF_CPU_STATE_NONE;

	if (cmd == KAIF_CPU_CMD_RESUME) {
		/*
		 * By this point, the master has directed the slaves to resume,
		 * and everyone is making their way to this point.  We're going
		 * to block here until all CPUs leave the master and slave
		 * loops.  When all have arrived, we'll turn them all loose.
		 * This barrier is required for two reasons:
		 *
		 * 1. There exists a race condition whereby a CPU could reenter
		 *    the debugger while another CPU is still in the slave loop
		 *    from this debugger entry.  This usually happens when the
		 *    current master releases the slaves, and makes it back to
		 *    the world before the slaves notice the release.  The
		 *    former master then triggers a debugger entry, and attempts
		 *    to stop the slaves for this entry before they've even
		 *    resumed from the last one.  When the slaves arrive here,
		 *    they'll have re-disabled interrupts, and will thus ignore
		 *    cross-calls until they finish resuming.
		 *
		 * 2. At the time of this writing, there exists a SPARC bug that
		 *    causes an apparently unsolicited interrupt vector trap
		 *    from OBP to one of the slaves.  This wouldn't normally be
		 *    a problem but for the fact that the cross-called CPU
		 *    encounters some sort of failure while in OBP.  OBP
		 *    recovers by executing the debugger-hook word, which sends
		 *    the slave back into the debugger, triggering a debugger
		 *    fault.  This problem seems to only happen during resume,
		 *    the result being that all CPUs save for the cross-called
		 *    one make it back into the world, while the cross-called
		 *    one is stuck at the debugger fault prompt.  Leave the
		 *    world in that state too long, and you'll get a mondo
		 *    timeout panic.  If we hold everyone here, we can give the
		 *    the user a chance to trigger a panic for further analysis.
		 *    To trigger the bug, "pool_unlock:b :c" and "while : ; do
		 *    psrset -p ; done".
		 *
		 * When the second item is fixed, the barrier can move into
		 * kaif_select_master(), immediately prior to the setting of
		 * kaif_master_cpuid.
		 */
		while (kaif_looping != 0)
			continue;
	}

	return (cmd);
}
393 410
394 411
395 412 #if defined(__sparc)
396 413
397 414 static int slave_loop_barrier_failures = 0; /* for debug */
398 415
/*
 * There exists a race condition, observed on some platforms, in which
 * the kmdb master CPU exits to OBP via prom_enter_mon (e.g. the "$q"
 * command) and then later re-enters kmdb (typing "go") while the
 * slaves are still proceeding from the OBP idle loop back to the kmdb
 * slave loop.  The problem arises when the master CPU, now back in
 * kmdb, proceeds to re-enter OBP (e.g. doing a prom_read() from the
 * kmdb main loop) while the slaves are still trying to get out of (the
 * previous trip in) OBP into the safety of the kmdb slave loop.  This
 * routine forces the slaves to explicitly acknowledge that they are
 * back in the slave loop.  The master CPU can call this routine to
 * ensure that all slave CPUs are back in the slave loop before
 * proceeding.
 */
/*
 * Establish a barrier with the slave CPUs (see the block comment
 * above): broadcast CMD_ACK and wait, with a bounded number of delays,
 * for every slave to set krs_cpu_acked in kaif_slave_loop().
 */
void
kaif_slave_loop_barrier(void)
{
	extern void kdi_usecwait(clock_t);
	int i;
	int not_acked;
	int timeout_count = 0;

	kaif_start_slaves(KAIF_SLAVE_CMD_ACK);

	/*
	 * Wait for slave cpus to explicitly acknowledge
	 * that they are spinning in the slave loop.
	 */
	do {
		not_acked = 0;
		for (i = 0; i < kaif_ncpusave; i++) {
			kaif_cpusave_t *save = &kaif_cpusave[i];

			if (save->krs_cpu_state ==
			    KAIF_CPU_STATE_SLAVE &&
			    !save->krs_cpu_acked) {
				not_acked++;
				break;
			}
		}

		if (not_acked == 0)
			break;

		/*
		 * Play it safe and do a timeout delay.
		 * We will do at most kaif_ncpusave delays before
		 * bailing out of this barrier.
		 */
		kdi_usecwait(200);

	} while (++timeout_count < kaif_ncpusave);

	if (not_acked > 0)
		/*
		 * We cannot establish a barrier with all of the slave
		 * cpus coming back from OBP.  Record this fact for
		 * future debugging.
		 */
		slave_loop_barrier_failures++;

	/* Revert to SPIN so the slaves clear their acks for next time. */
	kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN;
}
463 480 #endif
↓ open down ↓ |
150 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX