9736 kmdb tortures via single-step miscellaneous trap Reviewed by: Robert Mustacchi <rm@joyent.com> Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 * Copyright 2018 Joyent, Inc. 25 */ 26 27 /* 28 * The main CPU-control loops, used to control masters and slaves. 29 */ 30 31 #include <sys/types.h> 32 33 #include <kmdb/kaif.h> 34 #include <kmdb/kaif_start.h> 35 #include <kmdb/kmdb_asmutil.h> 36 #include <kmdb/kmdb_dpi_impl.h> 37 #include <kmdb/kmdb_kdi.h> 38 39 #define KAIF_SLAVE_CMD_SPIN 0 40 #define KAIF_SLAVE_CMD_SWITCH 1 41 #define KAIF_SLAVE_CMD_RESUME 2 42 #define KAIF_SLAVE_CMD_FLUSH 3 43 #define KAIF_SLAVE_CMD_REBOOT 4 44 #if defined(__sparc) 45 #define KAIF_SLAVE_CMD_ACK 5 46 #endif 47 48 49 /* 50 * Used to synchronize attempts to set kaif_master_cpuid. kaif_master_cpuid may 51 * be read without kaif_master_lock, and may be written by the current master 52 * CPU. 53 */ 54 int kaif_master_cpuid = KAIF_MASTER_CPUID_UNSET; 55 static uintptr_t kaif_master_lock = 0; 56 57 /* 58 * Used to ensure that all CPUs leave the debugger together. kaif_loop_lock must 59 * be held to write kaif_looping, but need not be held to read it. 60 */ 61 static volatile uint_t kaif_looping; 62 static uintptr_t kaif_loop_lock; 63 64 static volatile int kaif_slave_cmd; 65 static volatile int kaif_slave_tgt; /* target cpuid for CMD_SWITCH */ 66 67 static void 68 kaif_lock_enter(uintptr_t *lock) 69 { 70 while (cas(lock, 0, 1) != 0) 71 continue; 72 membar_producer(); 73 } 74 75 static void 76 kaif_lock_exit(uintptr_t *lock) 77 { 78 *lock = 0; 79 membar_producer(); 80 } 81 82 static void 83 kaif_start_slaves(int cmd) 84 { 85 kaif_slave_cmd = cmd; 86 kmdb_kdi_start_slaves(); 87 } 88 89 static int 90 kaif_master_loop(kaif_cpusave_t *cpusave) 91 { 92 int notflushed, i; 93 94 #if defined(__sparc) 95 kaif_prom_rearm(); 96 #endif 97 kaif_trap_set_debugger(); 98 99 /* 100 * If we re-entered due to a ::switch, we need to tell the slave CPUs 101 * to sleep again. 102 */ 103 kmdb_kdi_stop_slaves(cpusave->krs_cpu_id, 0); 104 105 master_loop: 106 switch (kmdb_dpi_reenter()) { 107 case KMDB_DPI_CMD_SWITCH_CPU: 108 /* 109 * We assume that the target CPU is a valid slave. There's no 110 * easy way to complain here, so we'll assume that the caller 111 * has done the proper checking. 112 */ 113 if (kmdb_dpi_switch_target == cpusave->krs_cpu_id) 114 break; 115 116 kaif_slave_tgt = kaif_master_cpuid = kmdb_dpi_switch_target; 117 cpusave->krs_cpu_state = KAIF_CPU_STATE_SLAVE; 118 membar_producer(); 119 120 /* 121 * Switch back to the saved trap table before we switch CPUs -- 122 * we need to make sure that only one CPU is on the debugger's 123 * table at a time. 124 */ 125 kaif_trap_set_saved(cpusave); 126 127 kaif_start_slaves(KAIF_SLAVE_CMD_SWITCH); 128 129 /* The new master is now awake */ 130 return (KAIF_CPU_CMD_SWITCH); 131 132 case KMDB_DPI_CMD_RESUME_ALL: 133 case KMDB_DPI_CMD_RESUME_UNLOAD: 134 /* 135 * Resume everyone, clean up for next entry. 136 */ 137 kaif_master_cpuid = KAIF_MASTER_CPUID_UNSET; 138 membar_producer(); 139 kaif_start_slaves(KAIF_SLAVE_CMD_RESUME); 140 141 if (kmdb_dpi_work_required()) 142 kmdb_dpi_wrintr_fire(); 143 144 kaif_trap_set_saved(cpusave); 145 146 return (KAIF_CPU_CMD_RESUME); 147 148 case KMDB_DPI_CMD_RESUME_MASTER: 149 /* 150 * Single-CPU resume, which is performed on the debugger's 151 * trap table (so no need to switch back). 152 */ 153 return (KAIF_CPU_CMD_RESUME_MASTER); 154 155 case KMDB_DPI_CMD_FLUSH_CACHES: 156 kaif_start_slaves(KAIF_SLAVE_CMD_FLUSH); 157 158 /* 159 * Wait for the other cpus to finish flushing their caches. 160 */ 161 do { 162 notflushed = 0; 163 for (i = 0; i < kaif_ncpusave; i++) { 164 kaif_cpusave_t *save = &kaif_cpusave[i]; 165 166 if (save->krs_cpu_state == 167 KAIF_CPU_STATE_SLAVE && 168 !save->krs_cpu_flushed) { 169 notflushed++; 170 break; 171 } 172 } 173 } while (notflushed > 0); 174 175 kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN; 176 break; 177 178 #if defined(__i386) || defined(__amd64) 179 case KMDB_DPI_CMD_REBOOT: 180 /* 181 * Reboot must be initiated by CPU 0. I could ask why, but I'm 182 * afraid that I don't want to know the answer. 183 */ 184 if (cpusave->krs_cpu_id == 0) 185 kmdb_kdi_reboot(); 186 187 kaif_start_slaves(KAIF_SLAVE_CMD_REBOOT); 188 189 /* 190 * Spin forever, waiting for CPU 0 (apparently a slave) to 191 * reboot the system. 192 */ 193 for (;;) 194 continue; 195 196 /*NOTREACHED*/ 197 break; 198 #endif 199 } 200 201 goto master_loop; 202 } 203 204 static int 205 kaif_slave_loop(kaif_cpusave_t *cpusave) 206 { 207 int slavecmd, rv; 208 209 #if defined(__sparc) 210 /* 211 * If the user elects to drop to OBP from the debugger, some OBP 212 * implementations will cross-call the slaves. We have to turn 213 * IE back on so we can receive the cross-calls. If we don't, 214 * some OBP implementations will wait forever. 215 */ 216 interrupts_on(); 217 #endif 218 219 /* Wait for duty to call */ 220 for (;;) { 221 slavecmd = kaif_slave_cmd; 222 223 if (slavecmd == KAIF_SLAVE_CMD_SWITCH && 224 kaif_slave_tgt == cpusave->krs_cpu_id) { 225 kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN; 226 cpusave->krs_cpu_state = KAIF_CPU_STATE_MASTER; 227 rv = KAIF_CPU_CMD_SWITCH; 228 break; 229 230 } else if (slavecmd == KAIF_SLAVE_CMD_FLUSH) { 231 kmdb_kdi_flush_caches(); 232 cpusave->krs_cpu_flushed = 1; 233 continue; 234 235 #if defined(__i386) || defined(__amd64) 236 } else if (slavecmd == KAIF_SLAVE_CMD_REBOOT && 237 cpusave->krs_cpu_id == 0) { 238 rv = 0; 239 kmdb_kdi_reboot(); 240 break; 241 #endif 242 243 } else if (slavecmd == KAIF_SLAVE_CMD_RESUME) { 244 rv = KAIF_CPU_CMD_RESUME; 245 break; 246 #if defined(__sparc) 247 } else if (slavecmd == KAIF_SLAVE_CMD_ACK) { 248 cpusave->krs_cpu_acked = 1; 249 } else if (cpusave->krs_cpu_acked && 250 slavecmd == KAIF_SLAVE_CMD_SPIN) { 251 cpusave->krs_cpu_acked = 0; 252 #endif 253 } 254 255 kmdb_kdi_slave_wait(); 256 } 257 258 #if defined(__sparc) 259 interrupts_off(); 260 #endif 261 262 return (rv); 263 } 264 265 static void 266 kaif_select_master(kaif_cpusave_t *cpusave) 267 { 268 kaif_lock_enter(&kaif_master_lock); 269 270 if (kaif_master_cpuid == KAIF_MASTER_CPUID_UNSET) { 271 /* This is the master. */ 272 kaif_master_cpuid = cpusave->krs_cpu_id; 273 cpusave->krs_cpu_state = KAIF_CPU_STATE_MASTER; 274 kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN; 275 276 membar_producer(); 277 278 kmdb_kdi_stop_slaves(cpusave->krs_cpu_id, 1); 279 } else { 280 /* The master was already chosen - go be a slave */ 281 cpusave->krs_cpu_state = KAIF_CPU_STATE_SLAVE; 282 membar_producer(); 283 } 284 285 kaif_lock_exit(&kaif_master_lock); 286 } 287 288 int 289 kaif_main_loop(kaif_cpusave_t *cpusave) 290 { 291 int cmd; 292 293 if (kaif_master_cpuid == KAIF_MASTER_CPUID_UNSET) { 294 295 /* 296 * Special case: Unload requested before first debugger entry. 297 * Don't stop the world, as there's nothing to clean up that 298 * can't be handled by the running kernel. 299 */ 300 if (!kmdb_dpi_resume_requested && 301 kmdb_kdi_get_unload_request()) { 302 cpusave->krs_cpu_state = KAIF_CPU_STATE_NONE; 303 return (KAIF_CPU_CMD_RESUME); 304 } 305 306 /* 307 * We're a slave with no master, so just resume. This can 308 * happen if, prior to this, two CPUs both raced through 309 * kdi_cmnint() - for example, a breakpoint on a frequently 310 * called function. The loser will be redirected to the slave 311 * loop; note that the event itself is lost at this point. 312 * 313 * The winner will then cross-call that slave, but it won't 314 * actually be received until the slave returns to the kernel 315 * and enables interrupts. We'll then come back in via 316 * kdi_slave_entry() and hit this path. 317 */ 318 if (cpusave->krs_cpu_state == KAIF_CPU_STATE_SLAVE) { 319 cpusave->krs_cpu_state = KAIF_CPU_STATE_NONE; 320 return (KAIF_CPU_CMD_RESUME); 321 } 322 323 kaif_select_master(cpusave); 324 325 #ifdef __sparc 326 if (kaif_master_cpuid == cpusave->krs_cpu_id) { 327 /* 328 * Everyone has arrived, so we can disarm the post-PROM 329 * entry point. 330 */ 331 *kaif_promexitarmp = 0; 332 membar_producer(); 333 } 334 #endif 335 } else if (kaif_master_cpuid == cpusave->krs_cpu_id) { 336 cpusave->krs_cpu_state = KAIF_CPU_STATE_MASTER; 337 } else { 338 cpusave->krs_cpu_state = KAIF_CPU_STATE_SLAVE; 339 } 340 341 cpusave->krs_cpu_flushed = 0; 342 343 kaif_lock_enter(&kaif_loop_lock); 344 kaif_looping++; 345 kaif_lock_exit(&kaif_loop_lock); 346 347 /* 348 * We know who the master and slaves are, so now they can go off 349 * to their respective loops. 350 */ 351 do { 352 if (kaif_master_cpuid == cpusave->krs_cpu_id) 353 cmd = kaif_master_loop(cpusave); 354 else 355 cmd = kaif_slave_loop(cpusave); 356 } while (cmd == KAIF_CPU_CMD_SWITCH); 357 358 kaif_lock_enter(&kaif_loop_lock); 359 kaif_looping--; 360 kaif_lock_exit(&kaif_loop_lock); 361 362 cpusave->krs_cpu_state = KAIF_CPU_STATE_NONE; 363 364 if (cmd == KAIF_CPU_CMD_RESUME) { 365 /* 366 * By this point, the master has directed the slaves to resume, 367 * and everyone is making their way to this point. We're going 368 * to block here until all CPUs leave the master and slave 369 * loops. When all have arrived, we'll turn them all loose. 370 * This barrier is required for two reasons: 371 * 372 * 1. There exists a race condition whereby a CPU could reenter 373 * the debugger while another CPU is still in the slave loop 374 * from this debugger entry. This usually happens when the 375 * current master releases the slaves, and makes it back to 376 * the world before the slaves notice the release. The 377 * former master then triggers a debugger entry, and attempts 378 * to stop the slaves for this entry before they've even 379 * resumed from the last one. When the slaves arrive here, 380 * they'll have re-disabled interrupts, and will thus ignore 381 * cross-calls until they finish resuming. 382 * 383 * 2. At the time of this writing, there exists a SPARC bug that 384 * causes an apparently unsolicited interrupt vector trap 385 * from OBP to one of the slaves. This wouldn't normally be 386 * a problem but for the fact that the cross-called CPU 387 * encounters some sort of failure while in OBP. OBP 388 * recovers by executing the debugger-hook word, which sends 389 * the slave back into the debugger, triggering a debugger 390 * fault. This problem seems to only happen during resume, 391 * the result being that all CPUs save for the cross-called 392 * one make it back into the world, while the cross-called 393 * one is stuck at the debugger fault prompt. Leave the 394 * world in that state too long, and you'll get a mondo 395 * timeout panic. If we hold everyone here, we can give the 396 * the user a chance to trigger a panic for further analysis. 397 * To trigger the bug, "pool_unlock:b :c" and "while : ; do 398 * psrset -p ; done". 399 * 400 * When the second item is fixed, the barrier can move into 401 * kaif_select_master(), immediately prior to the setting of 402 * kaif_master_cpuid. 403 */ 404 while (kaif_looping != 0) 405 continue; 406 } 407 408 return (cmd); 409 } 410 411 412 #if defined(__sparc) 413 414 static int slave_loop_barrier_failures = 0; /* for debug */ 415 416 /* 417 * There exist a race condition observed by some 418 * platforms where the kmdb master cpu exits to OBP via 419 * prom_enter_mon (e.g. "$q" command) and then later re-enter 420 * kmdb (typing "go") while the slaves are still proceeding 421 * from the OBP idle-loop back to the kmdb slave loop. The 422 * problem arises when the master cpu now back in kmdb proceed 423 * to re-enter OBP (e.g. doing a prom_read() from the kmdb main 424 * loop) while the slaves are still trying to get out of (the 425 * previous trip in) OBP into the safety of the kmdb slave loop. 426 * This routine forces the slaves to explicitly acknowledge 427 * that they are back in the slave loop. The master cpu can 428 * call this routine to ensure that all slave cpus are back 429 * in the slave loop before proceeding. 430 */ 431 void 432 kaif_slave_loop_barrier(void) 433 { 434 extern void kdi_usecwait(clock_t); 435 int i; 436 int not_acked; 437 int timeout_count = 0; 438 439 kaif_start_slaves(KAIF_SLAVE_CMD_ACK); 440 441 /* 442 * Wait for slave cpus to explicitly acknowledge 443 * that they are spinning in the slave loop. 444 */ 445 do { 446 not_acked = 0; 447 for (i = 0; i < kaif_ncpusave; i++) { 448 kaif_cpusave_t *save = &kaif_cpusave[i]; 449 450 if (save->krs_cpu_state == 451 KAIF_CPU_STATE_SLAVE && 452 !save->krs_cpu_acked) { 453 not_acked++; 454 break; 455 } 456 } 457 458 if (not_acked == 0) 459 break; 460 461 /* 462 * Play it safe and do a timeout delay. 463 * We will do at most kaif_ncpusave delays before 464 * bailing out of this barrier. 465 */ 466 kdi_usecwait(200); 467 468 } while (++timeout_count < kaif_ncpusave); 469 470 if (not_acked > 0) 471 /* 472 * we cannot establish a barrier with all 473 * the slave cpus coming back from OBP 474 * Record this fact for future debugging 475 */ 476 slave_loop_barrier_failures++; 477 478 kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN; 479 } 480 #endif --- EOF ---