9736 kmdb tortures via single-step miscellaneous trap
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  * Copyright 2018 Joyent, Inc.
  25  */
  26 


  27 /*
  28  * The main CPU-control loops, used to control masters and slaves.
  29  */
  30 
  31 #include <sys/types.h>
  32 
  33 #include <kmdb/kaif.h>
  34 #include <kmdb/kaif_start.h>
  35 #include <kmdb/kmdb_asmutil.h>
  36 #include <kmdb/kmdb_dpi_impl.h>
  37 #include <kmdb/kmdb_kdi.h>
  38 
  39 #define KAIF_SLAVE_CMD_SPIN     0
  40 #define KAIF_SLAVE_CMD_SWITCH   1
  41 #define KAIF_SLAVE_CMD_RESUME   2
  42 #define KAIF_SLAVE_CMD_FLUSH    3
  43 #define KAIF_SLAVE_CMD_REBOOT   4
  44 #if defined(__sparc)
  45 #define KAIF_SLAVE_CMD_ACK      5
  46 #endif
  47 
  48 
  49 /*
  50  * Used to synchronize attempts to set kaif_master_cpuid.  kaif_master_cpuid may
  51  * be read without kaif_master_lock, and may be written by the current master
  52  * CPU.
  53  */
  54 int kaif_master_cpuid = KAIF_MASTER_CPUID_UNSET;
  55 static uintptr_t kaif_master_lock = 0;
  56 
  57 /*
  58  * Used to ensure that all CPUs leave the debugger together. kaif_loop_lock must
  59  * be held to write kaif_looping, but need not be held to read it.
  60  */
  61 static volatile uint_t kaif_looping;
  62 static uintptr_t kaif_loop_lock;
  63 
  64 static volatile int kaif_slave_cmd;
  65 static volatile int kaif_slave_tgt;     /* target cpuid for CMD_SWITCH */
  66 
  67 static void
  68 kaif_lock_enter(uintptr_t *lock)
  69 {
  70         while (cas(lock, 0, 1) != 0)
  71                 continue;
  72         membar_producer();
  73 }
  74 
  75 static void
  76 kaif_lock_exit(uintptr_t *lock)
  77 {
  78         *lock = 0;
  79         membar_producer();
  80 }
  81 
  82 static void
  83 kaif_start_slaves(int cmd)
  84 {
  85         kaif_slave_cmd = cmd;
  86         kmdb_kdi_start_slaves();
  87 }
  88 
  89 static int
  90 kaif_master_loop(kaif_cpusave_t *cpusave)
  91 {
  92         int notflushed, i;
  93 
  94 #if defined(__sparc)
  95         kaif_prom_rearm();
  96 #endif
  97         kaif_trap_set_debugger();
  98 
  99         /*
 100          * If we re-entered due to a ::switch, we need to tell the slave CPUs
 101          * to sleep again.
 102          */
 103         kmdb_kdi_stop_slaves(cpusave->krs_cpu_id, 0);
 104 
 105 master_loop:
 106         switch (kmdb_dpi_reenter()) {
 107         case KMDB_DPI_CMD_SWITCH_CPU:
 108                 /*
 109                  * We assume that the target CPU is a valid slave.  There's no
 110                  * easy way to complain here, so we'll assume that the caller
 111                  * has done the proper checking.
 112                  */
 113                 if (kmdb_dpi_switch_target == cpusave->krs_cpu_id)
 114                         break;
 115 
 116                 kaif_slave_tgt = kaif_master_cpuid = kmdb_dpi_switch_target;
 117                 cpusave->krs_cpu_state = KAIF_CPU_STATE_SLAVE;
 118                 membar_producer();
 119 
 120                 /*
 121                  * Switch back to the saved trap table before we switch CPUs --
 122                  * we need to make sure that only one CPU is on the debugger's
 123                  * table at a time.
 124                  */
 125                 kaif_trap_set_saved(cpusave);
 126 
 127                 kaif_start_slaves(KAIF_SLAVE_CMD_SWITCH);
 128 
 129                 /* The new master is now awake */
 130                 return (KAIF_CPU_CMD_SWITCH);
 131 
 132         case KMDB_DPI_CMD_RESUME_ALL:
 133         case KMDB_DPI_CMD_RESUME_UNLOAD:
 134                 /*
 135                  * Resume everyone, clean up for next entry.
 136                  */
 137                 kaif_master_cpuid = KAIF_MASTER_CPUID_UNSET;
 138                 membar_producer();
 139                 kaif_start_slaves(KAIF_SLAVE_CMD_RESUME);
 140 
 141                 if (kmdb_dpi_work_required())
 142                         kmdb_dpi_wrintr_fire();
 143 
 144                 kaif_trap_set_saved(cpusave);
 145 
 146                 return (KAIF_CPU_CMD_RESUME);
 147 
 148         case KMDB_DPI_CMD_RESUME_MASTER:
 149                 /*
 150                  * Single-CPU resume, which is performed on the debugger's
 151                  * trap table (so no need to switch back).
 152                  */
 153                 return (KAIF_CPU_CMD_RESUME_MASTER);
 154 
 155         case KMDB_DPI_CMD_FLUSH_CACHES:
 156                 kaif_start_slaves(KAIF_SLAVE_CMD_FLUSH);
 157 
 158                 /*
 159                  * Wait for the other cpus to finish flushing their caches.
 160                  */
 161                 do {
 162                         notflushed = 0;
 163                         for (i = 0; i < kaif_ncpusave; i++) {
 164                                 kaif_cpusave_t *save = &kaif_cpusave[i];
 165 
 166                                 if (save->krs_cpu_state ==
 167                                     KAIF_CPU_STATE_SLAVE &&
 168                                     !save->krs_cpu_flushed) {
 169                                         notflushed++;
 170                                         break;
 171                                 }
 172                         }
 173                 } while (notflushed > 0);
 174 
 175                 kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN;
 176                 break;
 177 
 178 #if defined(__i386) || defined(__amd64)
 179         case KMDB_DPI_CMD_REBOOT:
 180                 /*
 181                  * Reboot must be initiated by CPU 0.  I could ask why, but I'm
 182                  * afraid that I don't want to know the answer.
 183                  */
 184                 if (cpusave->krs_cpu_id == 0)
 185                         kmdb_kdi_reboot();
 186 
 187                 kaif_start_slaves(KAIF_SLAVE_CMD_REBOOT);
 188 
 189                 /*
 190                  * Spin forever, waiting for CPU 0 (apparently a slave) to
 191                  * reboot the system.
 192                  */
 193                 for (;;)
 194                         continue;
 195 
 196                 /*NOTREACHED*/
 197                 break;
 198 #endif
 199         }
 200 
 201         goto master_loop;
 202 }
 203 
 204 static int
 205 kaif_slave_loop(kaif_cpusave_t *cpusave)
 206 {
 207         int slavecmd, rv;
 208 
 209 #if defined(__sparc)
 210         /*
 211          * If the user elects to drop to OBP from the debugger, some OBP
 212          * implementations will cross-call the slaves.  We have to turn
 213          * IE back on so we can receive the cross-calls.  If we don't,
 214          * some OBP implementations will wait forever.
 215          */
 216         interrupts_on();
 217 #endif
 218 
 219         /* Wait for duty to call */
 220         for (;;) {
 221                 slavecmd = kaif_slave_cmd;
 222 
 223                 if (slavecmd == KAIF_SLAVE_CMD_SWITCH &&
 224                     kaif_slave_tgt == cpusave->krs_cpu_id) {
 225                         kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN;
 226                         cpusave->krs_cpu_state = KAIF_CPU_STATE_MASTER;
 227                         rv = KAIF_CPU_CMD_SWITCH;
 228                         break;
 229 
 230                 } else if (slavecmd == KAIF_SLAVE_CMD_FLUSH) {
 231                         kmdb_kdi_flush_caches();
 232                         cpusave->krs_cpu_flushed = 1;
 233                         continue;
 234 
 235 #if defined(__i386) || defined(__amd64)
 236                 } else if (slavecmd == KAIF_SLAVE_CMD_REBOOT &&
 237                     cpusave->krs_cpu_id == 0) {
 238                         rv = 0;
 239                         kmdb_kdi_reboot();
 240                         break;
 241 #endif
 242 
 243                 } else if (slavecmd == KAIF_SLAVE_CMD_RESUME) {
 244                         rv = KAIF_CPU_CMD_RESUME;
 245                         break;
 246 #if defined(__sparc)
 247                 } else if (slavecmd == KAIF_SLAVE_CMD_ACK) {
 248                         cpusave->krs_cpu_acked = 1;
 249                 } else if (cpusave->krs_cpu_acked &&
 250                     slavecmd == KAIF_SLAVE_CMD_SPIN) {
 251                         cpusave->krs_cpu_acked = 0;
 252 #endif
 253                 }
 254 
 255                 kmdb_kdi_slave_wait();
 256         }
 257 
 258 #if defined(__sparc)
 259         interrupts_off();
 260 #endif
 261 
 262         return (rv);
 263 }
 264 
 265 static void
 266 kaif_select_master(kaif_cpusave_t *cpusave)
 267 {
 268         kaif_lock_enter(&kaif_master_lock);
 269 
 270         if (kaif_master_cpuid == KAIF_MASTER_CPUID_UNSET) {
 271                 /* This is the master. */
 272                 kaif_master_cpuid = cpusave->krs_cpu_id;
 273                 cpusave->krs_cpu_state = KAIF_CPU_STATE_MASTER;
 274                 kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN;
 275 
 276                 membar_producer();
 277 
 278                 kmdb_kdi_stop_slaves(cpusave->krs_cpu_id, 1);
 279         } else {
 280                 /* The master was already chosen - go be a slave */
 281                 cpusave->krs_cpu_state = KAIF_CPU_STATE_SLAVE;
 282                 membar_producer();
 283         }
 284 
 285         kaif_lock_exit(&kaif_master_lock);
 286 }
 287 
 288 int
 289 kaif_main_loop(kaif_cpusave_t *cpusave)
 290 {
 291         int cmd;
 292 
 293         if (kaif_master_cpuid == KAIF_MASTER_CPUID_UNSET) {
 294 
 295                 /*
 296                  * Special case: Unload requested before first debugger entry.
 297                  * Don't stop the world, as there's nothing to clean up that
 298                  * can't be handled by the running kernel.
 299                  */
 300                 if (!kmdb_dpi_resume_requested &&
 301                     kmdb_kdi_get_unload_request()) {
 302                         cpusave->krs_cpu_state = KAIF_CPU_STATE_NONE;
 303                         return (KAIF_CPU_CMD_RESUME);
 304                 }
 305 
 306                 /*
 307                  * We're a slave with no master, so just resume.  This can
 308                  * happen if, prior to this, two CPUs both raced through
 309                  * kdi_cmnint() - for example, a breakpoint on a frequently
 310                  * called function.  The loser will be redirected to the slave
 311                  * loop; note that the event itself is lost at this point.
 312                  *
 313                  * The winner will then cross-call that slave, but it won't
 314                  * actually be received until the slave returns to the kernel
 315                  * and enables interrupts.  We'll then come back in via
 316                  * kdi_slave_entry() and hit this path.
 317                  */
 318                 if (cpusave->krs_cpu_state == KAIF_CPU_STATE_SLAVE) {
 319                         cpusave->krs_cpu_state = KAIF_CPU_STATE_NONE;
 320                         return (KAIF_CPU_CMD_RESUME);
 321                 }
 322 
 323                 kaif_select_master(cpusave);
 324 
 325 #ifdef __sparc
 326                 if (kaif_master_cpuid == cpusave->krs_cpu_id) {
 327                         /*
 328                          * Everyone has arrived, so we can disarm the post-PROM
 329                          * entry point.
 330                          */
 331                         *kaif_promexitarmp = 0;
 332                         membar_producer();
 333                 }
 334 #endif
 335         } else if (kaif_master_cpuid == cpusave->krs_cpu_id) {
 336                 cpusave->krs_cpu_state = KAIF_CPU_STATE_MASTER;
 337         } else {
 338                 cpusave->krs_cpu_state = KAIF_CPU_STATE_SLAVE;
 339         }
 340 
 341         cpusave->krs_cpu_flushed = 0;
 342 
 343         kaif_lock_enter(&kaif_loop_lock);
 344         kaif_looping++;
 345         kaif_lock_exit(&kaif_loop_lock);
 346 
 347         /*
 348          * We know who the master and slaves are, so now they can go off
 349          * to their respective loops.
 350          */
 351         do {
 352                 if (kaif_master_cpuid == cpusave->krs_cpu_id)
 353                         cmd = kaif_master_loop(cpusave);
 354                 else
 355                         cmd = kaif_slave_loop(cpusave);
 356         } while (cmd == KAIF_CPU_CMD_SWITCH);
 357 
 358         kaif_lock_enter(&kaif_loop_lock);
 359         kaif_looping--;
 360         kaif_lock_exit(&kaif_loop_lock);
 361 
 362         cpusave->krs_cpu_state = KAIF_CPU_STATE_NONE;
 363 
 364         if (cmd == KAIF_CPU_CMD_RESUME) {
 365                 /*
 366                  * By this point, the master has directed the slaves to resume,
 367                  * and everyone is making their way to this point.  We're going
 368                  * to block here until all CPUs leave the master and slave
 369                  * loops.  When all have arrived, we'll turn them all loose.
 370                  * This barrier is required for two reasons:
 371                  *
 372                  * 1. There exists a race condition whereby a CPU could reenter
 373                  *    the debugger while another CPU is still in the slave loop
 374                  *    from this debugger entry.  This usually happens when the
 375                  *    current master releases the slaves, and makes it back to
 376                  *    the world before the slaves notice the release.  The
 377                  *    former master then triggers a debugger entry, and attempts
 378                  *    to stop the slaves for this entry before they've even
 379                  *    resumed from the last one.  When the slaves arrive here,
 380                  *    they'll have re-disabled interrupts, and will thus ignore
 381                  *    cross-calls until they finish resuming.
 382                  *
 383                  * 2. At the time of this writing, there exists a SPARC bug that
 384                  *    causes an apparently unsolicited interrupt vector trap
 385                  *    from OBP to one of the slaves.  This wouldn't normally be
 386                  *    a problem but for the fact that the cross-called CPU
 387                  *    encounters some sort of failure while in OBP.  OBP
 388                  *    recovers by executing the debugger-hook word, which sends
 389                  *    the slave back into the debugger, triggering a debugger
 390                  *    fault.  This problem seems to only happen during resume,
 391                  *    the result being that all CPUs save for the cross-called
 392                  *    one make it back into the world, while the cross-called
 393                  *    one is stuck at the debugger fault prompt.  Leave the
 394                  *    world in that state too long, and you'll get a mondo
 395                  *    timeout panic.  If we hold everyone here, we can give the
 396                  *    the user a chance to trigger a panic for further analysis.
 397                  *    To trigger the bug, "pool_unlock:b :c" and "while : ; do
 398                  *    psrset -p ; done".
 399                  *
 400                  * When the second item is fixed, the barrier can move into
 401                  * kaif_select_master(), immediately prior to the setting of
 402                  * kaif_master_cpuid.
 403                  */
 404                 while (kaif_looping != 0)
 405                         continue;
 406         }
 407 
 408         return (cmd);
 409 }
 410 
 411 
 412 #if defined(__sparc)
 413 
 414 static int slave_loop_barrier_failures = 0;     /* for debug */
 415 
 416 /*
 417  * There exist a race condition observed by some
 418  * platforms where the kmdb master cpu exits to OBP via
 419  * prom_enter_mon (e.g. "$q" command) and then later re-enter
 420  * kmdb (typing "go") while the slaves are still proceeding
 421  * from the OBP idle-loop back to the kmdb slave loop. The
 422  * problem arises when the master cpu now back in kmdb proceed
 423  * to re-enter OBP (e.g. doing a prom_read() from the kmdb main
 424  * loop) while the slaves are still trying to get out of (the
 425  * previous trip in) OBP into the safety of the kmdb slave loop.
 426  * This routine forces the slaves to explicitly acknowledge
 427  * that they are back in the slave loop. The master cpu can
 428  * call this routine to ensure that all slave cpus are back
 429  * in the slave loop before proceeding.
 430  */
 431 void
 432 kaif_slave_loop_barrier(void)
 433 {
 434         extern void kdi_usecwait(clock_t);
 435         int i;
 436         int not_acked;
 437         int timeout_count = 0;
 438 
 439         kaif_start_slaves(KAIF_SLAVE_CMD_ACK);
 440 
 441         /*
 442          * Wait for slave cpus to explicitly acknowledge
 443          * that they are spinning in the slave loop.
 444          */
 445         do {
 446                 not_acked = 0;
 447                 for (i = 0; i < kaif_ncpusave; i++) {
 448                         kaif_cpusave_t *save = &kaif_cpusave[i];
 449 
 450                         if (save->krs_cpu_state ==
 451                             KAIF_CPU_STATE_SLAVE &&
 452                             !save->krs_cpu_acked) {
 453                                 not_acked++;
 454                                 break;
 455                         }
 456                 }
 457 
 458                 if (not_acked == 0)
 459                         break;
 460 
 461                 /*
 462                  * Play it safe and do a timeout delay.
 463                  * We will do at most kaif_ncpusave delays before
 464                  * bailing out of this barrier.
 465                  */
 466                 kdi_usecwait(200);
 467 
 468         } while (++timeout_count < kaif_ncpusave);
 469 
 470         if (not_acked > 0)
 471                 /*
 472                  * we cannot establish a barrier with all
 473                  * the slave cpus coming back from OBP
 474                  * Record this fact for future debugging
 475                  */
 476                 slave_loop_barrier_failures++;
 477 
 478         kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN;
 479 }
 480 #endif
--- EOF ---