9736 kmdb tortures via single-step miscellaneous trap Reviewed by: Robert Mustacchi <rm@joyent.com> Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * The main CPU-control loops, used to control masters and slaves. 30 */ 31 32 #include <sys/types.h> 33 34 #include <kmdb/kaif.h> 35 #include <kmdb/kaif_start.h> 36 #include <kmdb/kmdb_asmutil.h> 37 #include <kmdb/kmdb_dpi_impl.h> 38 #include <kmdb/kmdb_kdi.h> 39 40 #define KAIF_SLAVE_CMD_SPIN 0 41 #define KAIF_SLAVE_CMD_SWITCH 1 42 #define KAIF_SLAVE_CMD_RESUME 2 43 #define KAIF_SLAVE_CMD_FLUSH 3 44 #define KAIF_SLAVE_CMD_REBOOT 4 45 #if defined(__sparc) 46 #define KAIF_SLAVE_CMD_ACK 5 47 #endif 48 49 50 /* 51 * Used to synchronize attempts to set kaif_master_cpuid. kaif_master_cpuid may 52 * be read without kaif_master_lock, and may be written by the current master 53 * CPU. 54 */ 55 int kaif_master_cpuid = KAIF_MASTER_CPUID_UNSET; 56 static uintptr_t kaif_master_lock = 0; 57 58 /* 59 * Used to ensure that all CPUs leave the debugger together. kaif_loop_lock must 60 * be held to write kaif_looping, but need not be held to read it. 61 */ 62 static volatile uint_t kaif_looping; 63 static uintptr_t kaif_loop_lock; 64 65 static volatile int kaif_slave_cmd; 66 static volatile int kaif_slave_tgt; /* target cpuid for CMD_SWITCH */ 67 68 static void 69 kaif_lock_enter(uintptr_t *lock) 70 { 71 while (cas(lock, 0, 1) != 0) 72 continue; 73 membar_producer(); 74 } 75 76 static void 77 kaif_lock_exit(uintptr_t *lock) 78 { 79 *lock = 0; 80 membar_producer(); 81 } 82 83 static void 84 kaif_start_slaves(int cmd) 85 { 86 kaif_slave_cmd = cmd; 87 kmdb_kdi_start_slaves(); 88 } 89 90 static int 91 kaif_master_loop(kaif_cpusave_t *cpusave) 92 { 93 int notflushed, i; 94 95 #if defined(__sparc) 96 kaif_prom_rearm(); 97 #endif 98 kaif_trap_set_debugger(); 99 100 /* 101 * If we re-entered due to a ::switch, we need to tell the slave CPUs 102 * to sleep again. 103 */ 104 kmdb_kdi_stop_slaves(cpusave->krs_cpu_id, 0); 105 106 master_loop: 107 switch (kmdb_dpi_reenter()) { 108 case KMDB_DPI_CMD_SWITCH_CPU: 109 /* 110 * We assume that the target CPU is a valid slave. There's no 111 * easy way to complain here, so we'll assume that the caller 112 * has done the proper checking. 113 */ 114 if (kmdb_dpi_switch_target == cpusave->krs_cpu_id) 115 break; 116 117 kaif_slave_tgt = kaif_master_cpuid = kmdb_dpi_switch_target; 118 cpusave->krs_cpu_state = KAIF_CPU_STATE_SLAVE; 119 membar_producer(); 120 121 /* 122 * Switch back to the saved trap table before we switch CPUs -- 123 * we need to make sure that only one CPU is on the debugger's 124 * table at a time. 125 */ 126 kaif_trap_set_saved(cpusave); 127 128 kaif_start_slaves(KAIF_SLAVE_CMD_SWITCH); 129 130 /* The new master is now awake */ 131 return (KAIF_CPU_CMD_SWITCH); 132 133 case KMDB_DPI_CMD_RESUME_ALL: 134 case KMDB_DPI_CMD_RESUME_UNLOAD: 135 /* 136 * Resume everyone, clean up for next entry. 137 */ 138 kaif_master_cpuid = KAIF_MASTER_CPUID_UNSET; 139 membar_producer(); 140 kaif_start_slaves(KAIF_SLAVE_CMD_RESUME); 141 142 if (kmdb_dpi_work_required()) 143 kmdb_dpi_wrintr_fire(); 144 145 kaif_trap_set_saved(cpusave); 146 147 return (KAIF_CPU_CMD_RESUME); 148 149 case KMDB_DPI_CMD_RESUME_MASTER: 150 /* 151 * Single-CPU resume, which is performed on the debugger's 152 * trap table (so no need to switch back). 153 */ 154 return (KAIF_CPU_CMD_RESUME_MASTER); 155 156 case KMDB_DPI_CMD_FLUSH_CACHES: 157 kaif_start_slaves(KAIF_SLAVE_CMD_FLUSH); 158 159 /* 160 * Wait for the other cpus to finish flushing their caches. 161 */ 162 do { 163 notflushed = 0; 164 for (i = 0; i < kaif_ncpusave; i++) { 165 kaif_cpusave_t *save = &kaif_cpusave[i]; 166 167 if (save->krs_cpu_state == 168 KAIF_CPU_STATE_SLAVE && 169 !save->krs_cpu_flushed) { 170 notflushed++; 171 break; 172 } 173 } 174 } while (notflushed > 0); 175 176 kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN; 177 break; 178 179 #if defined(__i386) || defined(__amd64) 180 case KMDB_DPI_CMD_REBOOT: 181 /* 182 * Reboot must be initiated by CPU 0. I could ask why, but I'm 183 * afraid that I don't want to know the answer. 184 */ 185 if (cpusave->krs_cpu_id == 0) 186 kmdb_kdi_reboot(); 187 188 kaif_start_slaves(KAIF_SLAVE_CMD_REBOOT); 189 190 /* 191 * Spin forever, waiting for CPU 0 (apparently a slave) to 192 * reboot the system. 193 */ 194 for (;;) 195 continue; 196 197 /*NOTREACHED*/ 198 break; 199 #endif 200 } 201 202 goto master_loop; 203 } 204 205 static int 206 kaif_slave_loop(kaif_cpusave_t *cpusave) 207 { 208 int slavecmd, rv; 209 210 #if defined(__sparc) 211 /* 212 * If the user elects to drop to OBP from the debugger, some OBP 213 * implementations will cross-call the slaves. We have to turn 214 * IE back on so we can receive the cross-calls. If we don't, 215 * some OBP implementations will wait forever. 216 */ 217 interrupts_on(); 218 #endif 219 220 /* Wait for duty to call */ 221 for (;;) { 222 slavecmd = kaif_slave_cmd; 223 224 if (slavecmd == KAIF_SLAVE_CMD_SWITCH && 225 kaif_slave_tgt == cpusave->krs_cpu_id) { 226 kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN; 227 cpusave->krs_cpu_state = KAIF_CPU_STATE_MASTER; 228 rv = KAIF_CPU_CMD_SWITCH; 229 break; 230 231 } else if (slavecmd == KAIF_SLAVE_CMD_FLUSH) { 232 kmdb_kdi_flush_caches(); 233 cpusave->krs_cpu_flushed = 1; 234 continue; 235 236 #if defined(__i386) || defined(__amd64) 237 } else if (slavecmd == KAIF_SLAVE_CMD_REBOOT && 238 cpusave->krs_cpu_id == 0) { 239 rv = 0; 240 kmdb_kdi_reboot(); 241 break; 242 #endif 243 244 } else if (slavecmd == KAIF_SLAVE_CMD_RESUME) { 245 rv = KAIF_CPU_CMD_RESUME; 246 break; 247 #if defined(__sparc) 248 } else if (slavecmd == KAIF_SLAVE_CMD_ACK) { 249 cpusave->krs_cpu_acked = 1; 250 } else if (cpusave->krs_cpu_acked && 251 slavecmd == KAIF_SLAVE_CMD_SPIN) { 252 cpusave->krs_cpu_acked = 0; 253 #endif 254 } 255 256 kmdb_kdi_slave_wait(); 257 } 258 259 #if defined(__sparc) 260 interrupts_off(); 261 #endif 262 263 return (rv); 264 } 265 266 static void 267 kaif_select_master(kaif_cpusave_t *cpusave) 268 { 269 kaif_lock_enter(&kaif_master_lock); 270 271 if (kaif_master_cpuid == KAIF_MASTER_CPUID_UNSET) { 272 /* This is the master. */ 273 kaif_master_cpuid = cpusave->krs_cpu_id; 274 cpusave->krs_cpu_state = KAIF_CPU_STATE_MASTER; 275 kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN; 276 277 membar_producer(); 278 279 kmdb_kdi_stop_slaves(cpusave->krs_cpu_id, 1); 280 } else { 281 /* The master was already chosen - go be a slave */ 282 cpusave->krs_cpu_state = KAIF_CPU_STATE_SLAVE; 283 membar_producer(); 284 } 285 286 kaif_lock_exit(&kaif_master_lock); 287 } 288 289 int 290 kaif_main_loop(kaif_cpusave_t *cpusave) 291 { 292 int cmd; 293 294 if (kaif_master_cpuid == KAIF_MASTER_CPUID_UNSET) { 295 if (!kmdb_dpi_resume_requested && 296 kmdb_kdi_get_unload_request()) { 297 /* 298 * Special case: Unload requested before first debugger 299 * entry. Don't stop the world, as there's nothing to 300 * clean up that can't be handled by the running kernel. 301 */ 302 cpusave->krs_cpu_state = KAIF_CPU_STATE_NONE; 303 return (KAIF_CPU_CMD_RESUME); 304 } 305 306 kaif_select_master(cpusave); 307 308 #ifdef __sparc 309 if (kaif_master_cpuid == cpusave->krs_cpu_id) { 310 /* 311 * Everyone has arrived, so we can disarm the post-PROM 312 * entry point. 313 */ 314 *kaif_promexitarmp = 0; 315 membar_producer(); 316 } 317 #endif 318 } else if (kaif_master_cpuid == cpusave->krs_cpu_id) { 319 cpusave->krs_cpu_state = KAIF_CPU_STATE_MASTER; 320 } else { 321 cpusave->krs_cpu_state = KAIF_CPU_STATE_SLAVE; 322 } 323 324 cpusave->krs_cpu_flushed = 0; 325 326 kaif_lock_enter(&kaif_loop_lock); 327 kaif_looping++; 328 kaif_lock_exit(&kaif_loop_lock); 329 330 /* 331 * We know who the master and slaves are, so now they can go off 332 * to their respective loops. 333 */ 334 do { 335 if (kaif_master_cpuid == cpusave->krs_cpu_id) 336 cmd = kaif_master_loop(cpusave); 337 else 338 cmd = kaif_slave_loop(cpusave); 339 } while (cmd == KAIF_CPU_CMD_SWITCH); 340 341 kaif_lock_enter(&kaif_loop_lock); 342 kaif_looping--; 343 kaif_lock_exit(&kaif_loop_lock); 344 345 cpusave->krs_cpu_state = KAIF_CPU_STATE_NONE; 346 347 if (cmd == KAIF_CPU_CMD_RESUME) { 348 /* 349 * By this point, the master has directed the slaves to resume, 350 * and everyone is making their way to this point. We're going 351 * to block here until all CPUs leave the master and slave 352 * loops. When all have arrived, we'll turn them all loose. 353 * This barrier is required for two reasons: 354 * 355 * 1. There exists a race condition whereby a CPU could reenter 356 * the debugger while another CPU is still in the slave loop 357 * from this debugger entry. This usually happens when the 358 * current master releases the slaves, and makes it back to 359 * the world before the slaves notice the release. The 360 * former master then triggers a debugger entry, and attempts 361 * to stop the slaves for this entry before they've even 362 * resumed from the last one. When the slaves arrive here, 363 * they'll have re-disabled interrupts, and will thus ignore 364 * cross-calls until they finish resuming. 365 * 366 * 2. At the time of this writing, there exists a SPARC bug that 367 * causes an apparently unsolicited interrupt vector trap 368 * from OBP to one of the slaves. This wouldn't normally be 369 * a problem but for the fact that the cross-called CPU 370 * encounters some sort of failure while in OBP. OBP 371 * recovers by executing the debugger-hook word, which sends 372 * the slave back into the debugger, triggering a debugger 373 * fault. This problem seems to only happen during resume, 374 * the result being that all CPUs save for the cross-called 375 * one make it back into the world, while the cross-called 376 * one is stuck at the debugger fault prompt. Leave the 377 * world in that state too long, and you'll get a mondo 378 * timeout panic. If we hold everyone here, we can give the 379 * the user a chance to trigger a panic for further analysis. 380 * To trigger the bug, "pool_unlock:b :c" and "while : ; do 381 * psrset -p ; done". 382 * 383 * When the second item is fixed, the barrier can move into 384 * kaif_select_master(), immediately prior to the setting of 385 * kaif_master_cpuid. 386 */ 387 while (kaif_looping != 0) 388 continue; 389 } 390 391 return (cmd); 392 } 393 394 395 #if defined(__sparc) 396 397 static int slave_loop_barrier_failures = 0; /* for debug */ 398 399 /* 400 * There exist a race condition observed by some 401 * platforms where the kmdb master cpu exits to OBP via 402 * prom_enter_mon (e.g. "$q" command) and then later re-enter 403 * kmdb (typing "go") while the slaves are still proceeding 404 * from the OBP idle-loop back to the kmdb slave loop. The 405 * problem arises when the master cpu now back in kmdb proceed 406 * to re-enter OBP (e.g. doing a prom_read() from the kmdb main 407 * loop) while the slaves are still trying to get out of (the 408 * previous trip in) OBP into the safety of the kmdb slave loop. 409 * This routine forces the slaves to explicitly acknowledge 410 * that they are back in the slave loop. The master cpu can 411 * call this routine to ensure that all slave cpus are back 412 * in the slave loop before proceeding. 413 */ 414 void 415 kaif_slave_loop_barrier(void) 416 { 417 extern void kdi_usecwait(clock_t); 418 int i; 419 int not_acked; 420 int timeout_count = 0; 421 422 kaif_start_slaves(KAIF_SLAVE_CMD_ACK); 423 424 /* 425 * Wait for slave cpus to explicitly acknowledge 426 * that they are spinning in the slave loop. 427 */ 428 do { 429 not_acked = 0; 430 for (i = 0; i < kaif_ncpusave; i++) { 431 kaif_cpusave_t *save = &kaif_cpusave[i]; 432 433 if (save->krs_cpu_state == 434 KAIF_CPU_STATE_SLAVE && 435 !save->krs_cpu_acked) { 436 not_acked++; 437 break; 438 } 439 } 440 441 if (not_acked == 0) 442 break; 443 444 /* 445 * Play it safe and do a timeout delay. 446 * We will do at most kaif_ncpusave delays before 447 * bailing out of this barrier. 448 */ 449 kdi_usecwait(200); 450 451 } while (++timeout_count < kaif_ncpusave); 452 453 if (not_acked > 0) 454 /* 455 * we cannot establish a barrier with all 456 * the slave cpus coming back from OBP 457 * Record this fact for future debugging 458 */ 459 slave_loop_barrier_failures++; 460 461 kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN; 462 } 463 #endif --- EOF ---