9736 kmdb tortures via single-step miscellaneous trap
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.

  24  */
  25 
  26 #pragma ident   "%Z%%M% %I%     %E% SMI"
  27 
  28 /*
  29  * The main CPU-control loops, used to control masters and slaves.
  30  */
  31 
  32 #include <sys/types.h>
  33 
  34 #include <kmdb/kaif.h>
  35 #include <kmdb/kaif_start.h>
  36 #include <kmdb/kmdb_asmutil.h>
  37 #include <kmdb/kmdb_dpi_impl.h>
  38 #include <kmdb/kmdb_kdi.h>
  39 
     /*
      * Values stored in kaif_slave_cmd.  kaif_slave_loop() polls that variable
      * and reacts to each value as follows:
      *
      *   SPIN    nothing to do; keep waiting
      *   SWITCH  the CPU whose id matches kaif_slave_tgt leaves the slave loop
      *           and becomes the master
      *   RESUME  leave the slave loop, returning KAIF_CPU_CMD_RESUME
      *   FLUSH   call kmdb_kdi_flush_caches() and set krs_cpu_flushed
      *   REBOOT  (x86 only) CPU 0 calls kmdb_kdi_reboot()
      *   ACK     (SPARC only) set krs_cpu_acked
      */
  40 #define KAIF_SLAVE_CMD_SPIN     0
  41 #define KAIF_SLAVE_CMD_SWITCH   1
  42 #define KAIF_SLAVE_CMD_RESUME   2
  43 #define KAIF_SLAVE_CMD_FLUSH    3
  44 #define KAIF_SLAVE_CMD_REBOOT   4
  45 #if defined(__sparc)
  46 #define KAIF_SLAVE_CMD_ACK      5
  47 #endif
  48 
  49 
  50 /*
  51  * Used to synchronize attempts to set kaif_master_cpuid.  kaif_master_cpuid may
  52  * be read without kaif_master_lock, and may be written by the current master
  53  * CPU.
  54  */
  55 int kaif_master_cpuid = KAIF_MASTER_CPUID_UNSET;
  56 static uintptr_t kaif_master_lock = 0;
  57 
  58 /*
  59  * Used to ensure that all CPUs leave the debugger together. kaif_loop_lock must
  60  * be held to write kaif_looping, but need not be held to read it.
  61  */
  62 static volatile uint_t kaif_looping;
  63 static uintptr_t kaif_loop_lock;
  64 
     /*
      * kaif_slave_cmd holds the KAIF_SLAVE_CMD_* value that kaif_slave_loop()
      * polls.  kaif_slave_tgt is only examined by the slaves when the command
      * is KAIF_SLAVE_CMD_SWITCH.
      */
  65 static volatile int kaif_slave_cmd;
  66 static volatile int kaif_slave_tgt;     /* target cpuid for CMD_SWITCH */
  67 
     /*
      * Acquire a spin lock: busy-wait until cas(lock, 0, 1) returns 0, then
      * issue membar_producer().  Paired with kaif_lock_exit(); in this file
      * it guards updates to kaif_master_cpuid and kaif_looping.
      *
      * Presumably cas() returns the value it found in *lock, so a return of 0
      * means this CPU performed the 0 -> 1 transition -- TODO confirm against
      * the cas() implementation.
      */
  68 static void
  69 kaif_lock_enter(uintptr_t *lock)
  70 {
  71         while (cas(lock, 0, 1) != 0)
  72                 continue;
  73         membar_producer();
  74 }
  75 
     /*
      * Release a lock taken with kaif_lock_enter() by storing 0 to it, then
      * issue membar_producer().
      *
      * NOTE(review): the barrier is issued after the releasing store rather
      * than before it.  Confirm that stores made while the lock was held
      * cannot become visible after the release on the supported platforms.
      */
  76 static void
  77 kaif_lock_exit(uintptr_t *lock)
  78 {
  79         *lock = 0;
  80         membar_producer();
  81 }
  82 
     /*
      * Post `cmd' (one of the KAIF_SLAVE_CMD_* values) in kaif_slave_cmd and
      * then call kmdb_kdi_start_slaves().
      *
      * The command is stored before the call.  Presumably
      * kmdb_kdi_start_slaves() releases slaves parked in kmdb_kdi_slave_wait()
      * so that they re-read kaif_slave_cmd -- verify against the KDI.
      */
  83 static void
  84 kaif_start_slaves(int cmd)
  85 {
  86         kaif_slave_cmd = cmd;
  87         kmdb_kdi_start_slaves();
  88 }
  89 
     /*
      * Command loop run by the CPU that is currently the debugger master.
      *
      * Installs the debugger's trap table, re-stops the slaves, and then
      * repeatedly calls kmdb_dpi_reenter(), dispatching on the command it
      * returns.  Commands that `break' out of the switch (a switch to the
      * current CPU, a cache flush) and commands with no case fall through to
      * the `goto' and re-enter the debugger.
      *
      * Returns:
      *   KAIF_CPU_CMD_SWITCH         mastership was handed to
      *                               kmdb_dpi_switch_target
      *   KAIF_CPU_CMD_RESUME         RESUME_ALL or RESUME_UNLOAD was requested
      *   KAIF_CPU_CMD_RESUME_MASTER  only this CPU is to resume
      *
      * On x86, KMDB_DPI_CMD_REBOOT never returns from this function.
      */
  90 static int
  91 kaif_master_loop(kaif_cpusave_t *cpusave)
  92 {
  93         int notflushed, i;
  94 
  95 #if defined(__sparc)
  96         kaif_prom_rearm();
  97 #endif
  98         kaif_trap_set_debugger();
  99 
 100         /*
 101          * If we re-entered due to a ::switch, we need to tell the slave CPUs
 102          * to sleep again.
 103          */
 104         kmdb_kdi_stop_slaves(cpusave->krs_cpu_id, 0);
 105 
 106 master_loop:
 107         switch (kmdb_dpi_reenter()) {
 108         case KMDB_DPI_CMD_SWITCH_CPU:
 109                 /*
 110                  * We assume that the target CPU is a valid slave.  There's no
 111                  * easy way to complain here, so we'll assume that the caller
 112                  * has done the proper checking.
 113                  */
                     /* Switching to ourselves is a no-op; stay master. */
 114                 if (kmdb_dpi_switch_target == cpusave->krs_cpu_id)
 115                         break;
 116 
                     /*
                      * Record the new master and demote ourselves before the
                      * slaves are told about the switch.
                      */
 117                 kaif_slave_tgt = kaif_master_cpuid = kmdb_dpi_switch_target;
 118                 cpusave->krs_cpu_state = KAIF_CPU_STATE_SLAVE;
 119                 membar_producer();
 120 
 121                 /*
 122                  * Switch back to the saved trap table before we switch CPUs --
 123                  * we need to make sure that only one CPU is on the debugger's
 124                  * table at a time.
 125                  */
 126                 kaif_trap_set_saved(cpusave);
 127 
 128                 kaif_start_slaves(KAIF_SLAVE_CMD_SWITCH);
 129 
 130                 /* The new master is now awake */
 131                 return (KAIF_CPU_CMD_SWITCH);
 132 
 133         case KMDB_DPI_CMD_RESUME_ALL:
 134         case KMDB_DPI_CMD_RESUME_UNLOAD:
 135                 /*
 136                  * Resume everyone, clean up for next entry.
 137                  */
 138                 kaif_master_cpuid = KAIF_MASTER_CPUID_UNSET;
 139                 membar_producer();
 140                 kaif_start_slaves(KAIF_SLAVE_CMD_RESUME);
 141 
 142                 if (kmdb_dpi_work_required())
 143                         kmdb_dpi_wrintr_fire();
 144 
 145                 kaif_trap_set_saved(cpusave);
 146 
 147                 return (KAIF_CPU_CMD_RESUME);
 148 
 149         case KMDB_DPI_CMD_RESUME_MASTER:
 150                 /*
 151                  * Single-CPU resume, which is performed on the debugger's
 152                  * trap table (so no need to switch back).
 153                  */
 154                 return (KAIF_CPU_CMD_RESUME_MASTER);
 155 
 156         case KMDB_DPI_CMD_FLUSH_CACHES:
 157                 kaif_start_slaves(KAIF_SLAVE_CMD_FLUSH);
 158 
 159                 /*
 160                  * Wait for the other cpus to finish flushing their caches.
 161                  */
                     /*
                      * Each pass scans the save areas and stops at the first
                      * slave that has not set krs_cpu_flushed; the scan is
                      * repeated until no such slave is found.
                      */
 162                 do {
 163                         notflushed = 0;
 164                         for (i = 0; i < kaif_ncpusave; i++) {
 165                                 kaif_cpusave_t *save = &kaif_cpusave[i];
 166 
 167                                 if (save->krs_cpu_state ==
 168                                     KAIF_CPU_STATE_SLAVE &&
 169                                     !save->krs_cpu_flushed) {
 170                                         notflushed++;
 171                                         break;
 172                                 }
 173                         }
 174                 } while (notflushed > 0);
 175 
                     /* All slaves have flushed; put them back to idle. */
 176                 kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN;
 177                 break;
 178 
 179 #if defined(__i386) || defined(__amd64)
 180         case KMDB_DPI_CMD_REBOOT:
 181                 /*
 182                  * Reboot must be initiated by CPU 0.  I could ask why, but I'm
 183                  * afraid that I don't want to know the answer.
 184                  */
 185                 if (cpusave->krs_cpu_id == 0)
 186                         kmdb_kdi_reboot();
 187 
 188                 kaif_start_slaves(KAIF_SLAVE_CMD_REBOOT);
 189 
 190                 /*
 191                  * Spin forever, waiting for CPU 0 (apparently a slave) to
 192                  * reboot the system.
 193                  */
 194                 for (;;)
 195                         continue;
 196 
 197                 /*NOTREACHED*/
 198                 break;
 199 #endif
 200         }
 201 
             /* Nothing above returned, so ask the debugger for the next command. */
 202         goto master_loop;
 203 }
 204 
     /*
      * Polling loop run by every CPU that is not the debugger master.
      *
      * Each pass reads kaif_slave_cmd once and acts on it; when the command
      * does not end the loop, the CPU calls kmdb_kdi_slave_wait() before
      * polling again (except after a flush, see below).
      *
      * Returns:
      *   KAIF_CPU_CMD_SWITCH  this CPU was named in kaif_slave_tgt and is now
      *                        the master
      *   KAIF_CPU_CMD_RESUME  the master requested a resume
      *   0                    x86 only: CPU 0 was asked to reboot and
      *                        kmdb_kdi_reboot() returned
      *
      * On SPARC, interrupts are enabled for the duration of the loop and
      * disabled again before returning.
      */
 205 static int
 206 kaif_slave_loop(kaif_cpusave_t *cpusave)
 207 {
 208         int slavecmd, rv;
 209 
 210 #if defined(__sparc)
 211         /*
 212          * If the user elects to drop to OBP from the debugger, some OBP
 213          * implementations will cross-call the slaves.  We have to turn
 214          * IE back on so we can receive the cross-calls.  If we don't,
 215          * some OBP implementations will wait forever.
 216          */
 217         interrupts_on();
 218 #endif
 219 
 220         /* Wait for duty to call */
 221         for (;;) {
                     /* Read the volatile command once so all tests see one value. */
 222                 slavecmd = kaif_slave_cmd;
 223 
 224                 if (slavecmd == KAIF_SLAVE_CMD_SWITCH &&
 225                     kaif_slave_tgt == cpusave->krs_cpu_id) {
                             /*
                              * We are the switch target: reset the command to
                              * SPIN and take over as master.
                              */
 226                         kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN;
 227                         cpusave->krs_cpu_state = KAIF_CPU_STATE_MASTER;
 228                         rv = KAIF_CPU_CMD_SWITCH;
 229                         break;
 230 
 231                 } else if (slavecmd == KAIF_SLAVE_CMD_FLUSH) {
                             /*
                              * The `continue' skips kmdb_kdi_slave_wait(), so
                              * the command is re-read immediately and the flush
                              * is repeated for as long as it remains FLUSH.
                              */
 232                         kmdb_kdi_flush_caches();
 233                         cpusave->krs_cpu_flushed = 1;
 234                         continue;
 235 
 236 #if defined(__i386) || defined(__amd64)
 237                 } else if (slavecmd == KAIF_SLAVE_CMD_REBOOT &&
 238                     cpusave->krs_cpu_id == 0) {
 239                         rv = 0;
 240                         kmdb_kdi_reboot();
 241                         break;
 242 #endif
 243 
 244                 } else if (slavecmd == KAIF_SLAVE_CMD_RESUME) {
 245                         rv = KAIF_CPU_CMD_RESUME;
 246                         break;
 247 #if defined(__sparc)
 248                 } else if (slavecmd == KAIF_SLAVE_CMD_ACK) {
                             /* Tell kaif_slave_loop_barrier() that we are here. */
 249                         cpusave->krs_cpu_acked = 1;
 250                 } else if (cpusave->krs_cpu_acked &&
 251                         slavecmd == KAIF_SLAVE_CMD_SPIN) {
                             /* The command is back to SPIN; clear our ack flag. */
 252                         cpusave->krs_cpu_acked = 0;
 253 #endif
 254                 }
 255 
 256                 kmdb_kdi_slave_wait();
 257         }
 258 
 259 #if defined(__sparc)
 260         interrupts_off();
 261 #endif
 262 
 263         return (rv);
 264 }
 265 
     /*
      * Decide, under kaif_master_lock, whether the calling CPU becomes the
      * master or a slave.
      *
      * If kaif_master_cpuid is still unset, the caller claims it, marks itself
      * KAIF_CPU_STATE_MASTER, resets kaif_slave_cmd to KAIF_SLAVE_CMD_SPIN and
      * calls kmdb_kdi_stop_slaves() while still holding the lock.  Otherwise
      * the caller marks itself KAIF_CPU_STATE_SLAVE.  The result is reported
      * through cpusave->krs_cpu_state and kaif_master_cpuid.
      */
 266 static void
 267 kaif_select_master(kaif_cpusave_t *cpusave)
 268 {
 269         kaif_lock_enter(&kaif_master_lock);
 270 
 271         if (kaif_master_cpuid == KAIF_MASTER_CPUID_UNSET) {
 272                 /* This is the master. */
 273                 kaif_master_cpuid = cpusave->krs_cpu_id;
 274                 cpusave->krs_cpu_state = KAIF_CPU_STATE_MASTER;
 275                 kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN;
 276 
                     /* Issue the barrier before the slaves are stopped. */
 277                 membar_producer();
 278 
 279                 kmdb_kdi_stop_slaves(cpusave->krs_cpu_id, 1);
 280         } else {
 281                 /* The master was already chosen - go be a slave */
 282                 cpusave->krs_cpu_state = KAIF_CPU_STATE_SLAVE;
 283                 membar_producer();
 284         }
 285 
 286         kaif_lock_exit(&kaif_master_lock);
 287 }
 288 
     /*
      * Entry point into the CPU-control loops for a CPU entering the debugger.
      *
      * Determines whether this CPU is the master or a slave, counts it in
      * kaif_looping, and runs kaif_master_loop() or kaif_slave_loop() until one
      * of them returns something other than KAIF_CPU_CMD_SWITCH.  For
      * KAIF_CPU_CMD_RESUME the CPU then waits until kaif_looping has dropped to
      * zero, so that all CPUs leave together.
      *
      * Returns the final KAIF_CPU_CMD_* value.  If an unload was requested
      * before any master was chosen, returns KAIF_CPU_CMD_RESUME immediately
      * without entering either loop.
      */
 289 int
 290 kaif_main_loop(kaif_cpusave_t *cpusave)
 291 {
 292         int cmd;
 293 
             /*
              * This first test is made without kaif_master_lock;
              * kaif_select_master() repeats it with the lock held.
              */
 294         if (kaif_master_cpuid == KAIF_MASTER_CPUID_UNSET) {






 295                 if (!kmdb_dpi_resume_requested &&
 296                     kmdb_kdi_get_unload_request()) {




 297                         /*
 298                          * Special case: Unload requested before first debugger
 299                          * entry.  Don't stop the world, as there's nothing to
 300                          * clean up that can't be handled by the running kernel.







 301                          */

 302                         cpusave->krs_cpu_state = KAIF_CPU_STATE_NONE;
 303                         return (KAIF_CPU_CMD_RESUME);
 304                 }
 305 
 306                 kaif_select_master(cpusave);
 307 
 308 #ifdef __sparc
 309                 if (kaif_master_cpuid == cpusave->krs_cpu_id) {
 310                         /*
 311                          * Everyone has arrived, so we can disarm the post-PROM
 312                          * entry point.
 313                          */
 314                         *kaif_promexitarmp = 0;
 315                         membar_producer();
 316                 }
 317 #endif
 318         } else if (kaif_master_cpuid == cpusave->krs_cpu_id) {
 319                 cpusave->krs_cpu_state = KAIF_CPU_STATE_MASTER;
 320         } else {
 321                 cpusave->krs_cpu_state = KAIF_CPU_STATE_SLAVE;
 322         }
 323 
 324         cpusave->krs_cpu_flushed = 0;
 325 
             /*
              * Count this CPU as being inside the loops; the resume barrier
              * below waits for the count to return to zero.
              */
 326         kaif_lock_enter(&kaif_loop_lock);
 327         kaif_looping++;
 328         kaif_lock_exit(&kaif_loop_lock);
 329 
 330         /*
 331          * We know who the master and slaves are, so now they can go off
 332          * to their respective loops.
 333          */
             /*
              * A KAIF_CPU_CMD_SWITCH result means the roles changed, so the
              * role is re-evaluated and the matching loop is entered again.
              */
 334         do {
 335                 if (kaif_master_cpuid == cpusave->krs_cpu_id)
 336                         cmd = kaif_master_loop(cpusave);
 337                 else
 338                         cmd = kaif_slave_loop(cpusave);
 339         } while (cmd == KAIF_CPU_CMD_SWITCH);
 340 
 341         kaif_lock_enter(&kaif_loop_lock);
 342         kaif_looping--;
 343         kaif_lock_exit(&kaif_loop_lock);
 344 
 345         cpusave->krs_cpu_state = KAIF_CPU_STATE_NONE;
 346 
 347         if (cmd == KAIF_CPU_CMD_RESUME) {
 348                 /*
 349                  * By this point, the master has directed the slaves to resume,
 350                  * and everyone is making their way to this point.  We're going
 351                  * to block here until all CPUs leave the master and slave
 352                  * loops.  When all have arrived, we'll turn them all loose.
 353                  * This barrier is required for two reasons:
 354                  *
 355                  * 1. There exists a race condition whereby a CPU could reenter
 356                  *    the debugger while another CPU is still in the slave loop
 357                  *    from this debugger entry.  This usually happens when the
 358                  *    current master releases the slaves, and makes it back to
 359                  *    the world before the slaves notice the release.  The
 360                  *    former master then triggers a debugger entry, and attempts
 361                  *    to stop the slaves for this entry before they've even
 362                  *    resumed from the last one.  When the slaves arrive here,
 363                  *    they'll have re-disabled interrupts, and will thus ignore
 364                  *    cross-calls until they finish resuming.
 365                  *
 366                  * 2. At the time of this writing, there exists a SPARC bug that
 367                  *    causes an apparently unsolicited interrupt vector trap
 368                  *    from OBP to one of the slaves.  This wouldn't normally be
 369                  *    a problem but for the fact that the cross-called CPU
 370                  *    encounters some sort of failure while in OBP.  OBP
 371                  *    recovers by executing the debugger-hook word, which sends
 372                  *    the slave back into the debugger, triggering a debugger
 373                  *    fault.  This problem seems to only happen during resume,
 374                  *    the result being that all CPUs save for the cross-called
 375                  *    one make it back into the world, while the cross-called
 376                  *    one is stuck at the debugger fault prompt.  Leave the
 377                  *    world in that state too long, and you'll get a mondo
 378                  *    timeout panic.  If we hold everyone here, we can give the
 379                  *    user a chance to trigger a panic for further analysis.
 380                  *    To trigger the bug, "pool_unlock:b :c" and "while : ; do
 381                  *    psrset -p ; done".
 382                  *
 383                  * When the second item is fixed, the barrier can move into
 384                  * kaif_select_master(), immediately prior to the setting of
 385                  * kaif_master_cpuid.
 386                  */
 387                 while (kaif_looping != 0)
 388                         continue;
 389         }
 390 
 391         return (cmd);
 392 }
 393 
 394 
 395 #if defined(__sparc)
 396 
 397 static int slave_loop_barrier_failures = 0;     /* debug: count of kaif_slave_loop_barrier() calls that ended with a slave un-acked */
 398 
 399 /*
 400  * There exists a race condition observed by some
 401  * platforms where the kmdb master cpu exits to OBP via
 402  * prom_enter_mon (e.g. "$q" command) and then later re-enters
 403  * kmdb (typing "go") while the slaves are still proceeding
 404  * from the OBP idle-loop back to the kmdb slave loop. The
 405  * problem arises when the master cpu now back in kmdb proceeds
 406  * to re-enter OBP (e.g. doing a prom_read() from the kmdb main
 407  * loop) while the slaves are still trying to get out of (the
 408  * previous trip in) OBP into the safety of the kmdb slave loop.
 409  * This routine forces the slaves to explicitly acknowledge
 410  * that they are back in the slave loop. The master cpu can
 411  * call this routine to ensure that all slave cpus are back
 412  * in the slave loop before proceeding.
      *
      * The wait is bounded: the slaves are polled at most kaif_ncpusave
      * times, with a kdi_usecwait(200) delay after each unsuccessful poll.
      * If a slave still has not acknowledged, slave_loop_barrier_failures is
      * incremented and the routine returns anyway.  In either case
      * kaif_slave_cmd is reset to KAIF_SLAVE_CMD_SPIN before returning.
 413  */
 414 void
 415 kaif_slave_loop_barrier(void)
 416 {
 417         extern void kdi_usecwait(clock_t);
 418         int i;
 419         int not_acked;
 420         int timeout_count = 0;
 421 
 422         kaif_start_slaves(KAIF_SLAVE_CMD_ACK);
 423 
 424         /*
 425          * Wait for slave cpus to explicitly acknowledge
 426          * that they are spinning in the slave loop.
 427          */
 428         do {
                     /* Stop scanning at the first slave that has not acked. */
 429                 not_acked = 0;
 430                 for (i = 0; i < kaif_ncpusave; i++) {
 431                         kaif_cpusave_t *save = &kaif_cpusave[i];
 432 
 433                         if (save->krs_cpu_state ==
 434                             KAIF_CPU_STATE_SLAVE &&
 435                             !save->krs_cpu_acked) {
 436                                 not_acked++;
 437                                 break;
 438                         }
 439                 }
 440 
 441                 if (not_acked == 0)
 442                         break;
 443 
 444                 /*
 445                  * Play it safe and do a timeout delay.
 446                  * We will do at most kaif_ncpusave delays before
 447                  * bailing out of this barrier.
 448                  */
 449                 kdi_usecwait(200);
 450 
 451         } while (++timeout_count < kaif_ncpusave);
 452 
 453         if (not_acked > 0)
 454                 /*
 455                  * we cannot establish a barrier with all
 456                  * the slave cpus coming back from OBP
 457                  * Record this fact for future debugging
 458                  */
 459                 slave_loop_barrier_failures++;
 460 
             /* Return the slaves to idle. */
 461         kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN;
 462 }
 463 #endif
--- EOF ---