1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 /*
  26  * Copyright 2019, Joyent, Inc.
  27  * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
  28  */
  29 
  30 /*
  31  * PSMI 1.1 extensions are supported only in 2.6 and later versions.
  32  * PSMI 1.2 extensions are supported only in 2.7 and later versions.
  33  * PSMI 1.3 and 1.4 extensions are supported in Solaris 10.
  34  * PSMI 1.5 extensions are supported in Solaris Nevada.
  35  * PSMI 1.6 extensions are supported in Solaris Nevada.
  36  * PSMI 1.7 extensions are supported in Solaris Nevada.
  37  */
  38 #define PSMI_1_7
  39 
  40 #include <sys/processor.h>
  41 #include <sys/time.h>
  42 #include <sys/psm.h>
  43 #include <sys/smp_impldefs.h>
  44 #include <sys/cram.h>
  45 #include <sys/acpi/acpi.h>
  46 #include <sys/acpica.h>
  47 #include <sys/psm_common.h>
  48 #include <sys/apic.h>
  49 #include <sys/pit.h>
  50 #include <sys/ddi.h>
  51 #include <sys/sunddi.h>
  52 #include <sys/ddi_impldefs.h>
  53 #include <sys/pci.h>
  54 #include <sys/promif.h>
  55 #include <sys/x86_archext.h>
  56 #include <sys/cpc_impl.h>
  57 #include <sys/uadmin.h>
  58 #include <sys/panic.h>
  59 #include <sys/debug.h>
  60 #include <sys/archsystm.h>
  61 #include <sys/trap.h>
  62 #include <sys/machsystm.h>
  63 #include <sys/sysmacros.h>
  64 #include <sys/cpuvar.h>
  65 #include <sys/rm_platter.h>
  66 #include <sys/privregs.h>
  67 #include <sys/note.h>
  68 #include <sys/pci_intr_lib.h>
  69 #include <sys/spl.h>
  70 #include <sys/clock.h>
  71 #include <sys/dditypes.h>
  72 #include <sys/sunddi.h>
  73 #include <sys/x_call.h>
  74 #include <sys/reboot.h>
  75 #include <sys/hpet.h>
  76 #include <sys/apic_common.h>
  77 #include <sys/apic_timer.h>
  78 
/*
 * Forward declarations of the file-local callbacks that are installed in the
 * apic_nointrmap_ops table further down.
 */
static void     apic_record_ioapic_rdt(void *intrmap_private,
                    ioapic_rdt_t *irdt);
static void     apic_record_msi(void *intrmap_private, msi_regs_t *mregs);

/*
 * Common routines between pcplusmp & apix (taken from apic.c).
 */

int     apic_clkinit(int);
hrtime_t apic_gethrtime(void);
void    apic_send_ipi(int, int);
void    apic_set_idlecpu(processorid_t);
void    apic_unset_idlecpu(processorid_t);
void    apic_shutdown(int, int);
void    apic_preshutdown(int, int);
processorid_t   apic_get_next_processorid(processorid_t);

/* Coarse counterpart of apic_gethrtime(); see apic_coarse_hrtime. */
hrtime_t apic_gettime();
  97 
/*
 * IOAPIC EOI handling method for the apix module.  Starts out as
 * APIC_MUL_IOAPIC_PCPLUSMP and is updated by apic_ioapic_method_probe().
 */
enum apic_ioapic_method_type apix_mul_ioapic_method = APIC_MUL_IOAPIC_PCPLUSMP;

/* Now the ones for Dynamic Interrupt distribution */
int     apic_enable_dynamic_migration = 0;

/*
 * maximum loop count when sending Start IPIs: bounds the wait for the
 * pending bit in apic_cpu_send_SIPI().
 */
int apic_sipi_max_loop_count = 0x1000;

/*
 * These variables are frequently accessed in apic_intr_enter(),
 * apic_intr_exit and apic_setspl, so group them together
 */
volatile uint32_t *apicadr =  NULL;     /* virtual addr of local APIC   */
int apic_setspl_delay = 1;              /* apic_setspl - delay enable   */
int apic_clkvect;

/* vector at which error interrupts come in */
int apic_errvect;
int apic_enable_error_intr = 1;
/*
 * Number of tenmicrosec() delays apic_error_intr() performs after printing
 * an error; that handler doubles it while it is below 500.
 */
int apic_error_display_delay = 100;

/* vector at which performance counter overflow interrupts come in */
int apic_cpcovf_vect;
int apic_enable_cpcovf_intr = 1;

/* vector at which CMCI interrupts come in */
int apic_cmci_vect;
extern void cmi_cmci_trap(void);

lock_t apic_mode_switch_lock;

/*
 * Vector sent by apic_send_pir_ipi() and reported by apic_get_pir_ipivect().
 */
int apic_pir_vect;
 130 
/*
 * Patchable global variables.
 */
int     apic_forceload = 0;

int     apic_coarse_hrtime = 1;         /* 0 - use accurate slow gethrtime() */

int     apic_flat_model = 0;            /* 0 - clustered. 1 - flat */
int     apic_panic_on_nmi = 0;          /* consulted by apic_nmi_intr() */
int     apic_panic_on_apic_error = 0;   /* consulted by apic_error_intr() */

int     apic_verbose = 0;       /* 0x1ff */

#ifdef DEBUG
int     apic_debug = 0;
int     apic_restrict_vector = 0;

int     apic_debug_msgbuf[APIC_DEBUG_MSGBUFSIZE];
int     apic_debug_msgbufindex = 0;

#endif /* DEBUG */

uint_t apic_nticks = 0;
uint_t apic_skipped_redistribute = 0;

/*
 * Time-keeping state shared by apic_gettime() and apic_gethrtime().
 * apic_hrtime_stamp is treated by both readers as "update in progress" while
 * it is odd; apic_nsec_since_boot is the base time they return or add to.
 */
uint_t last_count_read = 0;
lock_t  apic_gethrtime_lock;
volatile int    apic_hrtime_stamp = 0;
volatile hrtime_t apic_nsec_since_boot = 0;

/* Last value handed out by apic_gethrtime(); used to keep it monotonic. */
static  hrtime_t        apic_last_hrtime = 0;
int             apic_hrtime_error = 0;  /* computed time went backwards */
int             apic_remote_hrterr = 0; /* remote read came back invalid */
int             apic_num_nmis = 0;      /* NMIs handled by apic_nmi_intr() */
int             apic_apic_error = 0;    /* accumulated error status bits */
int             apic_num_apic_errors = 0;
int             apic_num_cksum_errors = 0;

/* Bitmask of APIC_ERR_* conditions seen so far (NMI, APIC error). */
int     apic_error = 0;

/* Set to 1 by apic_cpu_start() before it sends the startup IPIs. */
static  int     apic_cmos_ssb_set = 0;

/* use to make sure only one cpu handles the nmi */
lock_t  apic_nmi_lock;
/* use to make sure only one cpu handles the error interrupt */
lock_t  apic_error_lock;
 177 
/*
 * Control/data byte pairs making up two BMC requests, per the per-entry
 * annotations: a SET_WATCHDOG_TIMER request followed by a
 * RESET_WATCHDOG_TIMER request.
 * NOTE(review): the consumer of this table is not visible in this part of
 * the file -- presumably the shutdown path; confirm.
 */
static  struct {
        uchar_t cntl;
        uchar_t data;
} aspen_bmc[] = {
        { CC_SMS_WR_START,      0x18 },         /* NetFn/LUN */
        { CC_SMS_WR_NEXT,       0x24 },         /* Cmd SET_WATCHDOG_TIMER */
        { CC_SMS_WR_NEXT,       0x84 },         /* DataByte 1: SMS/OS no log */
        { CC_SMS_WR_NEXT,       0x2 },          /* DataByte 2: Power Down */
        { CC_SMS_WR_NEXT,       0x0 },          /* DataByte 3: no pre-timeout */
        { CC_SMS_WR_NEXT,       0x0 },          /* DataByte 4: timer expir. */
        { CC_SMS_WR_NEXT,       0xa },          /* DataByte 5: init countdown */
        { CC_SMS_WR_END,        0x0 },          /* DataByte 6: init countdown */

        { CC_SMS_WR_START,      0x18 },         /* NetFn/LUN */
        { CC_SMS_WR_END,        0x22 }          /* Cmd RESET_WATCHDOG_TIMER */
};
 194 
/*
 * Port/data pairs making up the same two BMC requests as aspen_bmc
 * (SET_WATCHDOG_TIMER, then RESET_WATCHDOG_TIMER), expressed as writes to
 * the SMS command and data registers.
 * NOTE(review): the consumer of this table is not visible in this part of
 * the file -- presumably the shutdown path; confirm.
 */
static  struct {
        int     port;
        uchar_t data;
} sitka_bmc[] = {
        { SMS_COMMAND_REGISTER, SMS_WRITE_START },
        { SMS_DATA_REGISTER,    0x18 },         /* NetFn/LUN */
        { SMS_DATA_REGISTER,    0x24 },         /* Cmd SET_WATCHDOG_TIMER */
        { SMS_DATA_REGISTER,    0x84 },         /* DataByte 1: SMS/OS no log */
        { SMS_DATA_REGISTER,    0x2 },          /* DataByte 2: Power Down */
        { SMS_DATA_REGISTER,    0x0 },          /* DataByte 3: no pre-timeout */
        { SMS_DATA_REGISTER,    0x0 },          /* DataByte 4: timer expir. */
        { SMS_DATA_REGISTER,    0xa },          /* DataByte 5: init countdown */
        { SMS_COMMAND_REGISTER, SMS_WRITE_END },
        { SMS_DATA_REGISTER,    0x0 },          /* DataByte 6: init countdown */

        { SMS_COMMAND_REGISTER, SMS_WRITE_START },
        { SMS_DATA_REGISTER,    0x18 },         /* NetFn/LUN */
        { SMS_COMMAND_REGISTER, SMS_WRITE_END },
        { SMS_DATA_REGISTER,    0x22 }          /* Cmd RESET_WATCHDOG_TIMER */
};
 215 
/* Patchable global variables. */
int             apic_kmdb_on_nmi = 0;           /* 0 - no, 1 - yes enter kmdb */
uint32_t        apic_divide_reg_init = 0;       /* 0 - divide by 2 */

/*
 * default apic ops without interrupt remapping.
 * The first five slots are return_instr cast to each slot's function type
 * (presumably a do-nothing stub -- confirm); the last two are the record
 * callbacks declared at the top of this file.
 */
static apic_intrmap_ops_t apic_nointrmap_ops = {
        (int (*)(int))return_instr,
        (void (*)(int))return_instr,
        (void (*)(void **, dev_info_t *, uint16_t, int, uchar_t))return_instr,
        (void (*)(void *, void *, uint16_t, int))return_instr,
        (void (*)(void **))return_instr,
        apic_record_ioapic_rdt,
        apic_record_msi,
};

/* Active interrupt-remapping ops; defaults to the no-remapping table. */
apic_intrmap_ops_t *apic_vt_ops = &apic_nointrmap_ops;
/* Per-CPU APIC information, indexed by processor id in this file. */
apic_cpus_info_t        *apic_cpus = NULL;
cpuset_t        apic_cpumask;
uint_t          apic_picinit_called;

/*
 * Flag to indicate that we need to shut down all processors.  When it is
 * set, apic_nmi_intr() only disables the local APIC and returns.
 */
static uint_t   apic_shutdown_processors;
 238 
 239 /*
 240  * Probe the ioapic method for apix module. Called in apic_probe_common()
 241  */
 242 int
 243 apic_ioapic_method_probe()
 244 {
 245         if (apix_enable == 0)
 246                 return (PSM_SUCCESS);
 247 
 248         /*
 249          * Set IOAPIC EOI handling method. The priority from low to high is:
 250          *      1. IOxAPIC: with EOI register
 251          *      2. IOMMU interrupt mapping
 252          *      3. Mask-Before-EOI method for systems without boot
 253          *      interrupt routing, such as systems with only one IOAPIC;
 254          *      NVIDIA CK8-04/MCP55 systems; systems with bridge solution
 255          *      which disables the boot interrupt routing already.
 256          *      4. Directed EOI
 257          */
 258         if (apic_io_ver[0] >= 0x20)
 259                 apix_mul_ioapic_method = APIC_MUL_IOAPIC_IOXAPIC;
 260         if ((apic_io_max == 1) || (apic_nvidia_io_max == apic_io_max))
 261                 apix_mul_ioapic_method = APIC_MUL_IOAPIC_MASK;
 262         if (apic_directed_EOI_supported())
 263                 apix_mul_ioapic_method = APIC_MUL_IOAPIC_DEOI;
 264 
 265         /* fall back to pcplusmp */
 266         if (apix_mul_ioapic_method == APIC_MUL_IOAPIC_PCPLUSMP) {
 267                 /* make sure apix is after pcplusmp in /etc/mach */
 268                 apix_enable = 0; /* go ahead with pcplusmp install next */
 269                 return (PSM_FAILURE);
 270         }
 271 
 272         return (PSM_SUCCESS);
 273 }
 274 
 275 /*
 276  * handler for APIC Error interrupt. Just print a warning and continue
 277  */
 278 int
 279 apic_error_intr()
 280 {
 281         uint_t  error0, error1, error;
 282         uint_t  i;
 283 
 284         /*
 285          * We need to write before read as per 7.4.17 of system prog manual.
 286          * We do both and or the results to be safe
 287          */
 288         error0 = apic_reg_ops->apic_read(APIC_ERROR_STATUS);
 289         apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0);
 290         error1 = apic_reg_ops->apic_read(APIC_ERROR_STATUS);
 291         error = error0 | error1;
 292 
 293         /*
 294          * Clear the APIC error status (do this on all cpus that enter here)
 295          * (two writes are required due to the semantics of accessing the
 296          * error status register.)
 297          */
 298         apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0);
 299         apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0);
 300 
 301         /*
 302          * Prevent more than 1 CPU from handling error interrupt causing
 303          * double printing (interleave of characters from multiple
 304          * CPU's when using prom_printf)
 305          */
 306         if (lock_try(&apic_error_lock) == 0)
 307                 return (error ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
 308         if (error) {
 309 #if     DEBUG
 310                 if (apic_debug)
 311                         debug_enter("pcplusmp: APIC Error interrupt received");
 312 #endif /* DEBUG */
 313                 if (apic_panic_on_apic_error)
 314                         cmn_err(CE_PANIC,
 315                             "APIC Error interrupt on CPU %d. Status = %x",
 316                             psm_get_cpu_id(), error);
 317                 else {
 318                         if ((error & ~APIC_CS_ERRORS) == 0) {
 319                                 /* cksum error only */
 320                                 apic_error |= APIC_ERR_APIC_ERROR;
 321                                 apic_apic_error |= error;
 322                                 apic_num_apic_errors++;
 323                                 apic_num_cksum_errors++;
 324                         } else {
 325                                 /*
 326                                  * prom_printf is the best shot we have of
 327                                  * something which is problem free from
 328                                  * high level/NMI type of interrupts
 329                                  */
 330                                 prom_printf("APIC Error interrupt on CPU %d. "
 331                                     "Status 0 = %x, Status 1 = %x\n",
 332                                     psm_get_cpu_id(), error0, error1);
 333                                 apic_error |= APIC_ERR_APIC_ERROR;
 334                                 apic_apic_error |= error;
 335                                 apic_num_apic_errors++;
 336                                 for (i = 0; i < apic_error_display_delay; i++) {
 337                                         tenmicrosec();
 338                                 }
 339                                 /*
 340                                  * provide more delay next time limited to
 341                                  * roughly 1 clock tick time
 342                                  */
 343                                 if (apic_error_display_delay < 500)
 344                                         apic_error_display_delay *= 2;
 345                         }
 346                 }
 347                 lock_clear(&apic_error_lock);
 348                 return (DDI_INTR_CLAIMED);
 349         } else {
 350                 lock_clear(&apic_error_lock);
 351                 return (DDI_INTR_UNCLAIMED);
 352         }
 353 }
 354 
 355 /*
 356  * Turn off the mask bit in the performance counter Local Vector Table entry.
 357  */
 358 void
 359 apic_cpcovf_mask_clear(void)
 360 {
 361         apic_reg_ops->apic_write(APIC_PCINT_VECT,
 362             (apic_reg_ops->apic_read(APIC_PCINT_VECT) & ~APIC_LVT_MASK));
 363 }
 364 
/*
 * Cross-call handler used by apic_cmci_setup(): programs the CMCI LVT entry
 * with apic_cmci_vect (mask bit not set).  The arguments are ignored and the
 * return value is always 0.
 */
/*ARGSUSED*/
static int
apic_cmci_enable(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3)
{
        apic_reg_ops->apic_write(APIC_CMCI_VECT, apic_cmci_vect);
        return (0);
}
 372 
/*
 * Cross-call handler used by apic_cmci_setup(): programs the CMCI LVT entry
 * with apic_cmci_vect and AV_MASK set, i.e. masked.  The arguments are
 * ignored and the return value is always 0.
 */
/*ARGSUSED*/
static int
apic_cmci_disable(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3)
{
        apic_reg_ops->apic_write(APIC_CMCI_VECT, apic_cmci_vect | AV_MASK);
        return (0);
}
 380 
 381 void
 382 apic_cmci_setup(processorid_t cpuid, boolean_t enable)
 383 {
 384         cpuset_t        cpu_set;
 385 
 386         CPUSET_ONLY(cpu_set, cpuid);
 387 
 388         if (enable) {
 389                 xc_call(NULL, NULL, NULL, CPUSET2BV(cpu_set),
 390                     (xc_func_t)apic_cmci_enable);
 391         } else {
 392                 xc_call(NULL, NULL, NULL, CPUSET2BV(cpu_set),
 393                     (xc_func_t)apic_cmci_disable);
 394         }
 395 }
 396 
 397 static void
 398 apic_disable_local_apic(void)
 399 {
 400         apic_reg_ops->apic_write_task_reg(APIC_MASK_ALL);
 401         apic_reg_ops->apic_write(APIC_LOCAL_TIMER, AV_MASK);
 402 
 403         /* local intr reg 0 */
 404         apic_reg_ops->apic_write(APIC_INT_VECT0, AV_MASK);
 405 
 406         /* disable NMI */
 407         apic_reg_ops->apic_write(APIC_INT_VECT1, AV_MASK);
 408 
 409         /* and error interrupt */
 410         apic_reg_ops->apic_write(APIC_ERR_VECT, AV_MASK);
 411 
 412         /* and perf counter intr */
 413         apic_reg_ops->apic_write(APIC_PCINT_VECT, AV_MASK);
 414 
 415         apic_reg_ops->apic_write(APIC_SPUR_INT_REG, APIC_SPUR_INTR);
 416 }
 417 
/*
 * Send the INIT / Startup IPI sequence to the CPU with index cpun in
 * apic_cpus.
 *
 * cpun  - index of the target CPU in apic_cpus.
 * start - when true, the CMOS shutdown status byte is programmed with
 *         BIOS_SHUTDOWN first.  apic_cpu_start() passes B_TRUE and
 *         apic_cpu_stop() passes B_FALSE.
 *
 * The sequence is: INIT assert, bounded wait for the pending bit to clear,
 * INIT de-assert, 20ms wait and, for integrated APICs only, two Startup
 * IPIs whose vector is derived from the page number of rm_platter_pa.
 */
static void
apic_cpu_send_SIPI(processorid_t cpun, boolean_t start)
{
        int             loop_count;
        uint32_t        vector;
        uint_t          apicid;
        ulong_t         iflag;

        apicid =  apic_cpus[cpun].aci_local_id;

        /*
         * Interrupts on current CPU will be disabled during the
         * steps in order to avoid unwanted side effects from
         * executing interrupt handlers on a problematic BIOS.
         */
        iflag = intr_clear();

        if (start) {
                outb(CMOS_ADDR, SSB);
                outb(CMOS_DATA, BIOS_SHUTDOWN);
        }

        /*
         * According to X2APIC specification in section '2.3.5.1' of
         * Interrupt Command Register Semantics, the semantics of
         * programming the Interrupt Command Register to dispatch an interrupt
         * is simplified. A single MSR write to the 64-bit ICR is required
         * for dispatching an interrupt. Specifically, with the 64-bit MSR
         * interface to ICR, system software is not required to check the
         * status of the delivery status bit prior to writing to the ICR
         * to send an IPI. With the removal of the Delivery Status bit,
         * system software no longer has a reason to read the ICR. It remains
         * readable only to aid in debugging.
         */
#ifdef  DEBUG
        APIC_AV_PENDING_SET();
#else
        if (apic_mode == LOCAL_APIC) {
                APIC_AV_PENDING_SET();
        }
#endif /* DEBUG */

        /* for integrated - make sure there is one INIT IPI in buffer */
        /* for external - it will wake up the cpu */
        apic_reg_ops->apic_write_int_cmd(apicid, AV_ASSERT | AV_RESET);

        /*
         * If only 1 CPU is installed, PENDING bit will not go low, so the
         * wait is bounded by apic_sipi_max_loop_count.  The pending bit is
         * only polled in LOCAL_APIC mode.
         */
        for (loop_count = apic_sipi_max_loop_count; loop_count; loop_count--) {
                if (apic_mode == LOCAL_APIC &&
                    apic_reg_ops->apic_read(APIC_INT_CMD1) & AV_PENDING)
                        apic_ret();
                else
                        break;
        }

        apic_reg_ops->apic_write_int_cmd(apicid, AV_DEASSERT | AV_RESET);
        drv_usecwait(20000);            /* 20 milli sec */

        if (apic_cpus[cpun].aci_local_ver >= APIC_INTEGRATED_VERS) {
                /* integrated apic */

                vector = (rm_platter_pa >> MMU_PAGESHIFT) &
                    (APIC_VECTOR_MASK | APIC_IPL_MASK);

                /* to offset the INIT IPI queue up in the buffer */
                apic_reg_ops->apic_write_int_cmd(apicid, vector | AV_STARTUP);
                drv_usecwait(200);              /* 200 micro sec */

                /*
                 * send the second SIPI (Startup IPI) as recommended by Intel
                 * software development manual.
                 */
                apic_reg_ops->apic_write_int_cmd(apicid, vector | AV_STARTUP);
                drv_usecwait(200);      /* 200 micro sec */
        }

        intr_restore(iflag);
}
 496 
 497 /*ARGSUSED1*/
 498 int
 499 apic_cpu_start(processorid_t cpun, caddr_t arg)
 500 {
 501         ASSERT(MUTEX_HELD(&cpu_lock));
 502 
 503         if (!apic_cpu_in_range(cpun)) {
 504                 return (EINVAL);
 505         }
 506 
 507         /*
 508          * Switch to apic_common_send_ipi for safety during starting other CPUs.
 509          */
 510         if (apic_mode == LOCAL_X2APIC) {
 511                 apic_switch_ipi_callback(B_TRUE);
 512         }
 513 
 514         apic_cmos_ssb_set = 1;
 515         apic_cpu_send_SIPI(cpun, B_TRUE);
 516 
 517         return (0);
 518 }
 519 
/*
 * Put CPU into halted state with interrupts disabled.
 *
 * The caller must hold cpu_lock, and the target CPU must already be offline,
 * quiesced and not enabled (all asserted).  The second argument is unused.
 *
 * Returns EINVAL if cpun is out of range, ENOTSUP if the CPU's local APIC
 * version is older than APIC_INTEGRATED_VERS, the error from xc_flush_cpu()
 * if flushing cross calls fails (CPU_READY is restored in that case), and 0
 * on success.
 */
/*ARGSUSED1*/
int
apic_cpu_stop(processorid_t cpun, caddr_t arg)
{
        int             rc;
        cpu_t           *cp;
        extern cpuset_t cpu_ready_set;
        extern void cpu_idle_intercept_cpu(cpu_t *cp);

        ASSERT(MUTEX_HELD(&cpu_lock));

        if (!apic_cpu_in_range(cpun)) {
                return (EINVAL);
        }
        if (apic_cpus[cpun].aci_local_ver < APIC_INTEGRATED_VERS) {
                return (ENOTSUP);
        }

        cp = cpu_get(cpun);
        ASSERT(cp != NULL);
        ASSERT((cp->cpu_flags & CPU_OFFLINE) != 0);
        ASSERT((cp->cpu_flags & CPU_QUIESCED) != 0);
        ASSERT((cp->cpu_flags & CPU_ENABLE) == 0);

        /* Clear CPU_READY flag to disable cross calls. */
        cp->cpu_flags &= ~CPU_READY;
        CPUSET_ATOMIC_DEL(cpu_ready_set, cpun);
        rc = xc_flush_cpu(cp);
        if (rc != 0) {
                /* Flush failed: undo the two steps above and give up. */
                CPUSET_ATOMIC_ADD(cpu_ready_set, cpun);
                cp->cpu_flags |= CPU_READY;
                return (rc);
        }

        /* Intercept target CPU at a safe point before powering it off. */
        cpu_idle_intercept_cpu(cp);

        /* Same IPI sequence as startup, but without touching the CMOS. */
        apic_cpu_send_SIPI(cpun, B_FALSE);
        cp->cpu_flags &= ~CPU_RUNNING;

        return (0);
}
 565 
 566 int
 567 apic_cpu_ops(psm_cpu_request_t *reqp)
 568 {
 569         if (reqp == NULL) {
 570                 return (EINVAL);
 571         }
 572 
 573         switch (reqp->pcr_cmd) {
 574         case PSM_CPU_ADD:
 575                 return (apic_cpu_add(reqp));
 576 
 577         case PSM_CPU_REMOVE:
 578                 return (apic_cpu_remove(reqp));
 579 
 580         case PSM_CPU_STOP:
 581                 return (apic_cpu_stop(reqp->req.cpu_stop.cpuid,
 582                     reqp->req.cpu_stop.ctx));
 583 
 584         default:
 585                 return (ENOTSUP);
 586         }
 587 }
 588 
/*
 * DEBUG-only tunables.
 * NOTE(review): nothing in this part of the file reads them; confirm their
 * consumers before changing the defaults.
 */
#ifdef  DEBUG
int     apic_break_on_cpu = 9;
int     apic_stretch_interrupts = 0;
int     apic_stretch_ISR = 1 << 3;        /* IPL of 3 matches nothing now */
#endif /* DEBUG */
 594 
 595 /*
 596  * generates an interprocessor interrupt to another CPU. Any changes made to
 597  * this routine must be accompanied by similar changes to
 598  * apic_common_send_ipi().
 599  */
 600 void
 601 apic_send_ipi(int cpun, int ipl)
 602 {
 603         int vector;
 604         ulong_t flag;
 605 
 606         vector = apic_resv_vector[ipl];
 607 
 608         ASSERT((vector >= APIC_BASE_VECT) && (vector <= APIC_SPUR_INTR));
 609 
 610         flag = intr_clear();
 611 
 612         APIC_AV_PENDING_SET();
 613 
 614         apic_reg_ops->apic_write_int_cmd(apic_cpus[cpun].aci_local_id,
 615             vector);
 616 
 617         intr_restore(flag);
 618 }
 619 
 620 void
 621 apic_send_pir_ipi(processorid_t cpun)
 622 {
 623         const int vector = apic_pir_vect;
 624         ulong_t flag;
 625 
 626         ASSERT((vector >= APIC_BASE_VECT) && (vector <= APIC_SPUR_INTR));
 627 
 628         flag = intr_clear();
 629 
 630         /* Self-IPI for inducing PIR makes no sense. */
 631         if ((cpun != psm_get_cpu_id())) {
 632                 APIC_AV_PENDING_SET();
 633                 apic_reg_ops->apic_write_int_cmd(apic_cpus[cpun].aci_local_id,
 634                     vector);
 635         }
 636 
 637         intr_restore(flag);
 638 }
 639 
/*
 * Return the vector that apic_send_pir_ipi() sends (apic_pir_vect).
 */
int
apic_get_pir_ipivect(void)
{
        return (apic_pir_vect);
}
 645 
/*
 * Intentionally empty: this implementation does nothing with its argument.
 */
/*ARGSUSED*/
void
apic_set_idlecpu(processorid_t cpun)
{
}
 651 
/*
 * Intentionally empty: this implementation does nothing with its argument.
 */
/*ARGSUSED*/
void
apic_unset_idlecpu(processorid_t cpun)
{
}
 657 
 658 
/*
 * Empty function.  It serves as the body of the busy-wait loops in this
 * file (see apic_gettime(), apic_gethrtime() and apic_cpu_send_SIPI()).
 */
void
apic_ret()
{
}
 663 
 664 /*
 665  * If apic_coarse_time == 1, then apic_gettime() is used instead of
 666  * apic_gethrtime().  This is used for performance instead of accuracy.
 667  */
 668 
 669 hrtime_t
 670 apic_gettime()
 671 {
 672         int old_hrtime_stamp;
 673         hrtime_t temp;
 674 
 675         /*
 676          * In one-shot mode, we do not keep time, so if anyone
 677          * calls psm_gettime() directly, we vector over to
 678          * gethrtime().
 679          * one-shot mode MUST NOT be enabled if this psm is the source of
 680          * hrtime.
 681          */
 682 
 683         if (apic_oneshot)
 684                 return (gethrtime());
 685 
 686 
 687 gettime_again:
 688         while ((old_hrtime_stamp = apic_hrtime_stamp) & 1)
 689                 apic_ret();
 690 
 691         temp = apic_nsec_since_boot;
 692 
 693         if (apic_hrtime_stamp != old_hrtime_stamp) {    /* got an interrupt */
 694                 goto gettime_again;
 695         }
 696         return (temp);
 697 }
 698 
/*
 * Here we return the number of nanoseconds since booting.  Note every
 * clock interrupt increments apic_nsec_since_boot by the appropriate
 * amount.
 *
 * The result is apic_nsec_since_boot plus the time elapsed in the current
 * timer period, computed from the current-count register of CPU 0's local
 * APIC.  Runs with interrupts disabled and apic_gethrtime_lock held.  The
 * value never goes backwards: if the computed time is lower than the last
 * one handed out, the last one is returned and apic_hrtime_error is bumped.
 */
hrtime_t
apic_gethrtime(void)
{
        int curr_timeval, countval, elapsed_ticks;
        int old_hrtime_stamp, status;
        hrtime_t temp;
        uint32_t cpun;
        ulong_t oflags;

        /*
         * In one-shot mode, we do not keep time, so if anyone
         * calls psm_gethrtime() directly, we vector over to
         * gethrtime().
         * one-shot mode MUST NOT be enabled if this psm is the source of
         * hrtime.
         */

        if (apic_oneshot)
                return (gethrtime());

        oflags = intr_clear();  /* prevent migration */

        /* Local APIC id of this CPU; shifted into place in LOCAL_APIC mode. */
        cpun = apic_reg_ops->apic_read(APIC_LID_REG);
        if (apic_mode == LOCAL_APIC)
                cpun >>= APIC_ID_BIT_OFFSET;

        lock_set(&apic_gethrtime_lock);

gethrtime_again:
        /* Wait until the stamp is even, i.e. no update is in progress. */
        while ((old_hrtime_stamp = apic_hrtime_stamp) & 1)
                apic_ret();

        /*
         * Check to see which CPU we are on.  Note the time is kept on
         * the local APIC of CPU 0.  If on CPU 0, simply read the current
         * counter.  If on another CPU, issue a remote read command to CPU 0.
         */
        if (cpun == apic_cpus[0].aci_local_id) {
                countval = apic_reg_ops->apic_read(APIC_CURR_COUNT);
        } else {
#ifdef  DEBUG
                APIC_AV_PENDING_SET();
#else
                if (apic_mode == LOCAL_APIC)
                        APIC_AV_PENDING_SET();
#endif /* DEBUG */

                apic_reg_ops->apic_write_int_cmd(
                    apic_cpus[0].aci_local_id, APIC_CURR_ADD | AV_REMOTE);

                /* Spin until the remote read is no longer pending. */
                while ((status = apic_reg_ops->apic_read(APIC_INT_CMD1))
                    & AV_READ_PENDING) {
                        apic_ret();
                }

                if (status & AV_REMOTE_STATUS)      /* 1 = valid */
                        countval = apic_reg_ops->apic_read(APIC_REMOTE_READ);
                else {  /* 0 = invalid */
                        apic_remote_hrterr++;
                        /*
                         * return last hrtime right now, will need more
                         * testing if change to retry
                         */
                        temp = apic_last_hrtime;

                        lock_clear(&apic_gethrtime_lock);

                        intr_restore(oflags);

                        return (temp);
                }
        }
        /*
         * A count above the previous reading is replaced by 0, so that a
         * full period is accounted as elapsed below; otherwise remember the
         * reading.  NOTE(review): presumably this covers the counter having
         * reloaded before the clock interrupt was processed -- confirm.
         */
        if (countval > last_count_read)
                countval = 0;
        else
                last_count_read = countval;

        elapsed_ticks = apic_hertz_count - countval;

        curr_timeval = APIC_TICKS_TO_NSECS(elapsed_ticks);
        temp = apic_nsec_since_boot + curr_timeval;

        if (apic_hrtime_stamp != old_hrtime_stamp) {    /* got an interrupt */
                /* we might have clobbered last_count_read. Restore it */
                last_count_read = apic_hertz_count;
                goto gethrtime_again;
        }

        if (temp < apic_last_hrtime) {
                /* return last hrtime if error occurs */
                apic_hrtime_error++;
                temp = apic_last_hrtime;
        }
        else
                apic_last_hrtime = temp;

        lock_clear(&apic_gethrtime_lock);
        intr_restore(oflags);

        return (temp);
}
 805 
/*
 * apic NMI handler
 *
 * While apic_shutdown_processors is set, an NMI only disables the local
 * APIC of the receiving CPU.  Otherwise APIC_ERR_NMI is recorded in
 * apic_error and one CPU at a time (the holder of apic_nmi_lock) counts the
 * NMI and then ignores it, panics or enters the debugger, depending on
 * nmi_action and the older apic_kmdb_on_nmi / apic_panic_on_nmi tunables.
 * Both arguments are unused.
 */
/*ARGSUSED*/
void
apic_nmi_intr(caddr_t arg, struct regs *rp)
{
        nmi_action_t action = nmi_action;

        if (apic_shutdown_processors) {
                apic_disable_local_apic();
                return;
        }

        apic_error |= APIC_ERR_NMI;

        /* Another CPU is already handling an NMI; do not report it twice. */
        if (!lock_try(&apic_nmi_lock))
                return;
        apic_num_nmis++;

        /*
         * "nmi_action" always over-rides the older way of doing this, unless we
         * can't actually drop into kmdb when requested.
         */
        if (action == NMI_ACTION_KMDB && !psm_debugger())
                action = NMI_ACTION_UNSET;

        /* Nothing usable requested: fall back to the older tunables. */
        if (action == NMI_ACTION_UNSET) {
                if (apic_kmdb_on_nmi && psm_debugger())
                        action = NMI_ACTION_KMDB;
                else if (apic_panic_on_nmi)
                        action = NMI_ACTION_PANIC;
                else
                        action = NMI_ACTION_IGNORE;
        }

        switch (action) {
        case NMI_ACTION_IGNORE:
                /*
                 * prom_printf is the best shot we have of something which is
                 * problem free from high level/NMI type of interrupts
                 */
                prom_printf("NMI received\n");
                break;

        case NMI_ACTION_PANIC:
                /* Keep panic from entering kmdb. */
                nopanicdebug = 1;
                panic("NMI received\n");
                break;

        case NMI_ACTION_KMDB:
        default:
                debug_enter("NMI received: entering kmdb\n");
                break;
        }

        lock_clear(&apic_nmi_lock);
}
 863 
 864 processorid_t
 865 apic_get_next_processorid(processorid_t cpu_id)
 866 {
 867 
 868         int i;
 869 
 870         if (cpu_id == -1)
 871                 return ((processorid_t)0);
 872 
 873         for (i = cpu_id + 1; i < NCPU; i++) {
 874                 if (apic_cpu_in_range(i))
 875                         return (i);
 876         }
 877 
 878         return ((processorid_t)-1);
 879 }
 880 
 881 int
 882 apic_cpu_add(psm_cpu_request_t *reqp)
 883 {
 884         int i, rv = 0;
 885         ulong_t iflag;
 886         boolean_t first = B_TRUE;
 887         uchar_t localver = 0;
 888         uint32_t localid, procid;
 889         processorid_t cpuid = (processorid_t)-1;
 890         mach_cpu_add_arg_t *ap;
 891 
 892         ASSERT(reqp != NULL);
 893         reqp->req.cpu_add.cpuid = (processorid_t)-1;
 894 
 895         /* Check whether CPU hotplug is supported. */
 896         if (!plat_dr_support_cpu() || apic_max_nproc == -1) {
 897                 return (ENOTSUP);
 898         }
 899 
 900         ap = (mach_cpu_add_arg_t *)reqp->req.cpu_add.argp;
 901         switch (ap->type) {
 902         case MACH_CPU_ARG_LOCAL_APIC:
 903                 localid = ap->arg.apic.apic_id;
 904                 procid = ap->arg.apic.proc_id;
 905                 if (localid >= 255 || procid > 255) {
 906                         cmn_err(CE_WARN,
 907                             "!apic: apicid(%u) or procid(%u) is invalid.",
 908                             localid, procid);
 909                         return (EINVAL);
 910                 }
 911                 break;
 912 
 913         case MACH_CPU_ARG_LOCAL_X2APIC:
 914                 localid = ap->arg.apic.apic_id;
 915                 procid = ap->arg.apic.proc_id;
 916                 if (localid >= UINT32_MAX) {
 917                         cmn_err(CE_WARN,
 918                             "!apic: x2apicid(%u) is invalid.", localid);
 919                         return (EINVAL);
 920                 } else if (localid >= 255 && apic_mode == LOCAL_APIC) {
 921                         cmn_err(CE_WARN, "!apic: system is in APIC mode, "
 922                             "can't support x2APIC processor.");
 923                         return (ENOTSUP);
 924                 }
 925                 break;
 926 
 927         default:
 928                 cmn_err(CE_WARN,
 929                     "!apic: unknown argument type %d to apic_cpu_add().",
 930                     ap->type);
 931                 return (EINVAL);
 932         }
 933 
 934         /* Use apic_ioapic_lock to sync with apic_get_next_bind_cpu. */
 935         iflag = intr_clear();
 936         lock_set(&apic_ioapic_lock);
 937 
 938         /* Check whether local APIC id already exists. */
 939         for (i = 0; i < apic_nproc; i++) {
 940                 if (!CPU_IN_SET(apic_cpumask, i))
 941                         continue;
 942                 if (apic_cpus[i].aci_local_id == localid) {
 943                         lock_clear(&apic_ioapic_lock);
 944                         intr_restore(iflag);
 945                         cmn_err(CE_WARN,
 946                             "!apic: local apic id %u already exists.",
 947                             localid);
 948                         return (EEXIST);
 949                 } else if (apic_cpus[i].aci_processor_id == procid) {
 950                         lock_clear(&apic_ioapic_lock);
 951                         intr_restore(iflag);
 952                         cmn_err(CE_WARN,
 953                             "!apic: processor id %u already exists.",
 954                             (int)procid);
 955                         return (EEXIST);
 956                 }
 957 
 958                 /*
 959                  * There's no local APIC version number available in MADT table,
 960                  * so assume that all CPUs are homogeneous and use local APIC
 961                  * version number of the first existing CPU.
 962                  */
 963                 if (first) {
 964                         first = B_FALSE;
 965                         localver = apic_cpus[i].aci_local_ver;
 966                 }
 967         }
 968         ASSERT(first == B_FALSE);
 969 
 970         /*
 971          * Try to assign the same cpuid if APIC id exists in the dirty cache.
 972          */
 973         for (i = 0; i < apic_max_nproc; i++) {
 974                 if (CPU_IN_SET(apic_cpumask, i)) {
 975                         ASSERT((apic_cpus[i].aci_status & APIC_CPU_FREE) == 0);
 976                         continue;
 977                 }
 978                 ASSERT(apic_cpus[i].aci_status & APIC_CPU_FREE);
 979                 if ((apic_cpus[i].aci_status & APIC_CPU_DIRTY) &&
 980                     apic_cpus[i].aci_local_id == localid &&
 981                     apic_cpus[i].aci_processor_id == procid) {
 982                         cpuid = i;
 983                         break;
 984                 }
 985         }
 986 
 987         /* Avoid the dirty cache and allocate fresh slot if possible. */
 988         if (cpuid == (processorid_t)-1) {
 989                 for (i = 0; i < apic_max_nproc; i++) {
 990                         if ((apic_cpus[i].aci_status & APIC_CPU_FREE) &&
 991                             (apic_cpus[i].aci_status & APIC_CPU_DIRTY) == 0) {
 992                                 cpuid = i;
 993                                 break;
 994                         }
 995                 }
 996         }
 997 
 998         /* Try to find any free slot as last resort. */
 999         if (cpuid == (processorid_t)-1) {
1000                 for (i = 0; i < apic_max_nproc; i++) {
1001                         if (apic_cpus[i].aci_status & APIC_CPU_FREE) {
1002                                 cpuid = i;
1003                                 break;
1004                         }
1005                 }
1006         }
1007 
1008         if (cpuid == (processorid_t)-1) {
1009                 lock_clear(&apic_ioapic_lock);
1010                 intr_restore(iflag);
1011                 cmn_err(CE_NOTE,
1012                     "!apic: failed to allocate cpu id for processor %u.",
1013                     procid);
1014                 rv = EAGAIN;
1015         } else if (ACPI_FAILURE(acpica_map_cpu(cpuid, procid))) {
1016                 lock_clear(&apic_ioapic_lock);
1017                 intr_restore(iflag);
1018                 cmn_err(CE_NOTE,
1019                     "!apic: failed to build mapping for processor %u.",
1020                     procid);
1021                 rv = EBUSY;
1022         } else {
1023                 ASSERT(cpuid >= 0 && cpuid < NCPU);
1024                 ASSERT(cpuid < apic_max_nproc && cpuid < max_ncpus);
1025                 bzero(&apic_cpus[cpuid], sizeof (apic_cpus[0]));
1026                 apic_cpus[cpuid].aci_processor_id = procid;
1027                 apic_cpus[cpuid].aci_local_id = localid;
1028                 apic_cpus[cpuid].aci_local_ver = localver;
1029                 CPUSET_ATOMIC_ADD(apic_cpumask, cpuid);
1030                 if (cpuid >= apic_nproc) {
1031                         apic_nproc = cpuid + 1;
1032                 }
1033                 lock_clear(&apic_ioapic_lock);
1034                 intr_restore(iflag);
1035                 reqp->req.cpu_add.cpuid = cpuid;
1036         }
1037 
1038         return (rv);
1039 }
1040 
1041 int
1042 apic_cpu_remove(psm_cpu_request_t *reqp)
1043 {
1044         int i;
1045         ulong_t iflag;
1046         processorid_t cpuid;
1047 
1048         /* Check whether CPU hotplug is supported. */
1049         if (!plat_dr_support_cpu() || apic_max_nproc == -1) {
1050                 return (ENOTSUP);
1051         }
1052 
1053         cpuid = reqp->req.cpu_remove.cpuid;
1054 
1055         /* Use apic_ioapic_lock to sync with apic_get_next_bind_cpu. */
1056         iflag = intr_clear();
1057         lock_set(&apic_ioapic_lock);
1058 
1059         if (!apic_cpu_in_range(cpuid)) {
1060                 lock_clear(&apic_ioapic_lock);
1061                 intr_restore(iflag);
1062                 cmn_err(CE_WARN,
1063                     "!apic: cpuid %d doesn't exist in apic_cpus array.",
1064                     cpuid);
1065                 return (ENODEV);
1066         }
1067         ASSERT((apic_cpus[cpuid].aci_status & APIC_CPU_FREE) == 0);
1068 
1069         if (ACPI_FAILURE(acpica_unmap_cpu(cpuid))) {
1070                 lock_clear(&apic_ioapic_lock);
1071                 intr_restore(iflag);
1072                 return (ENOENT);
1073         }
1074 
1075         if (cpuid == apic_nproc - 1) {
1076                 /*
1077                  * We are removing the highest numbered cpuid so we need to
1078                  * find the next highest cpuid as the new value for apic_nproc.
1079                  */
1080                 for (i = apic_nproc; i > 0; i--) {
1081                         if (CPU_IN_SET(apic_cpumask, i - 1)) {
1082                                 apic_nproc = i;
1083                                 break;
1084                         }
1085                 }
1086                 /* at least one CPU left */
1087                 ASSERT(i > 0);
1088         }
1089         CPUSET_ATOMIC_DEL(apic_cpumask, cpuid);
1090         /* mark slot as free and keep it in the dirty cache */
1091         apic_cpus[cpuid].aci_status = APIC_CPU_FREE | APIC_CPU_DIRTY;
1092 
1093         lock_clear(&apic_ioapic_lock);
1094         intr_restore(iflag);
1095 
1096         return (0);
1097 }
1098 
1099 /*
1100  * Return the number of ticks the APIC decrements in SF nanoseconds.
1101  * The fixed-frequency PIT (aka 8254) is used for the measurement.
1102  */
static uint64_t
apic_calibrate_impl()
{
        uint8_t         pit_tick_lo;
        uint16_t        pit_tick, target_pit_tick, pit_ticks_adj;
        uint32_t        pit_ticks;
        uint32_t        start_apic_tick, end_apic_tick, apic_ticks;
        ulong_t         iflag;

        /*
         * Program the timer divider and load the initial count with
         * APIC_MAXVAL; the current count is sampled twice further down.
         */
        apic_reg_ops->apic_write(APIC_DIVIDE_REG, apic_divide_reg_init);
        apic_reg_ops->apic_write(APIC_INIT_COUNT, APIC_MAXVAL);

        /* The whole measurement runs between intr_clear()/intr_restore(). */
        iflag = intr_clear();

        /*
         * Each PIT sample is two reads of PITCTR0_PORT: the first is used as
         * the low byte and the second as the high byte.  Keep sampling until
         * the counter is at least APIC_TIME_MIN and its low byte lies
         * strictly between APIC_LB_MIN and APIC_LB_MAX.
         */
        do {
                pit_tick_lo = inb(PITCTR0_PORT);
                pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo;
        } while (pit_tick < APIC_TIME_MIN ||
            pit_tick_lo <= APIC_LB_MIN || pit_tick_lo >= APIC_LB_MAX);

        /*
         * Wait for the PIT to decrement by 5 ticks to ensure
         * we didn't start in the middle of a tick.
         * Compare with 0x10 for the wrap around case.
         */
        target_pit_tick = pit_tick - 5;
        do {
                pit_tick_lo = inb(PITCTR0_PORT);
                pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo;
        } while (pit_tick > target_pit_tick || pit_tick_lo < 0x10);

        /* Start of the measured interval. */
        start_apic_tick = apic_reg_ops->apic_read(APIC_CURR_COUNT);

        /*
         * Wait for the PIT to decrement by APIC_TIME_COUNT ticks
         */
        target_pit_tick = pit_tick - APIC_TIME_COUNT;
        do {
                pit_tick_lo = inb(PITCTR0_PORT);
                pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo;
        } while (pit_tick > target_pit_tick || pit_tick_lo < 0x10);

        /* End of the measured interval. */
        end_apic_tick = apic_reg_ops->apic_read(APIC_CURR_COUNT);

        intr_restore(iflag);

        /* Number of APIC ticks that elapsed during the interval. */
        apic_ticks = start_apic_tick - end_apic_tick;

        /* The PIT might have decremented by more ticks than planned */
        pit_ticks_adj = target_pit_tick - pit_tick;
        /* total number of PIT ticks corresponding to apic_ticks */
        pit_ticks = APIC_TIME_COUNT + pit_ticks_adj;

        /*
         * Determine the number of nanoseconds per APIC clock tick
         * and then determine how many APIC ticks to interrupt at the
         * desired frequency
         * apic_ticks / (pitticks / PIT_HZ) = apic_ticks_per_s
         * (apic_ticks * PIT_HZ) / pitticks = apic_ticks_per_s
         * apic_ticks_per_ns = (apic_ticks * PIT_HZ) / (pitticks * 10^9)
         * apic_ticks_per_SFns =
         * (SF * apic_ticks * PIT_HZ) / (pitticks * 10^9)
         */
        return ((SF * apic_ticks * PIT_HZ) / ((uint64_t)pit_ticks * NANOSEC));
}
1168 
/*
 * It was found empirically that 5 measurements seem sufficient to give a good
 * accuracy. Most spurious measurements are higher than the target value thus
 * we eliminate up to 2/5 spurious measurements.
 */
#define APIC_CALIBRATE_MEASUREMENTS             5

/*
 * Tolerance, as a percentage of the median measurement, used by
 * apic_calibrate(): a warning is logged when either neighbour of the median
 * in the sorted measurements differs from it by more than this percentage.
 */
#define APIC_CALIBRATE_PERCENT_OFF_WARNING      10
1177 
1178 /*
1179  * Return the number of ticks the APIC decrements in SF nanoseconds.
1180  * Several measurements are taken to filter out outliers.
1181  */
1182 uint64_t
1183 apic_calibrate()
1184 {
1185         uint64_t        measurements[APIC_CALIBRATE_MEASUREMENTS];
1186         int             median_idx;
1187         uint64_t        median;
1188 
1189         /*
1190          * When running under a virtual machine, the emulated PIT and APIC
1191          * counters do not always return the right values and can roll over.
1192          * Those spurious measurements are relatively rare but could
1193          * significantly affect the calibration.
1194          * Therefore we take several measurements and then keep the median.
1195          * The median is preferred to the average here as we only want to
1196          * discard outliers.
1197          */
1198         for (int i = 0; i < APIC_CALIBRATE_MEASUREMENTS; i++)
1199                 measurements[i] = apic_calibrate_impl();
1200 
1201         /*
1202          * sort results and retrieve median.
1203          */
1204         for (int i = 0; i < APIC_CALIBRATE_MEASUREMENTS; i++) {
1205                 for (int j = i + 1; j < APIC_CALIBRATE_MEASUREMENTS; j++) {
1206                         if (measurements[j] < measurements[i]) {
1207                                 uint64_t tmp = measurements[i];
1208                                 measurements[i] = measurements[j];
1209                                 measurements[j] = tmp;
1210                         }
1211                 }
1212         }
1213         median_idx = APIC_CALIBRATE_MEASUREMENTS / 2;
1214         median = measurements[median_idx];
1215 
1216 #if (APIC_CALIBRATE_MEASUREMENTS >= 3)
1217         /*
1218          * Check that measurements are consistent. Post a warning
1219          * if the three middle values are not close to each other.
1220          */
1221         uint64_t delta_warn = median *
1222             APIC_CALIBRATE_PERCENT_OFF_WARNING / 100;
1223         if ((median - measurements[median_idx - 1]) > delta_warn ||
1224             (measurements[median_idx + 1] - median) > delta_warn) {
1225                 cmn_err(CE_WARN, "apic_calibrate measurements lack "
1226                     "precision: %llu, %llu, %llu.",
1227                     (u_longlong_t)measurements[median_idx - 1],
1228                     (u_longlong_t)median,
1229                     (u_longlong_t)measurements[median_idx + 1]);
1230         }
1231 #endif
1232 
1233         return (median);
1234 }
1235 
1236 /*
1237  * Initialise the APIC timer on the local APIC of CPU 0 to the desired
1238  * frequency.  Note at this stage in the boot sequence, the boot processor
1239  * is the only active processor.
1240  * hertz value of 0 indicates a one-shot mode request.  In this case
1241  * the function returns the resolution (in nanoseconds) for the hardware
1242  * timer interrupt.  If one-shot mode capability is not available,
1243  * the return value will be 0. apic_enable_oneshot is a global switch
1244  * for disabling the functionality.
1245  * A non-zero positive value for hertz indicates a periodic mode request.
1246  * In this case the hardware will be programmed to generate clock interrupts
1247  * at hertz frequency and returns the resolution of interrupts in
1248  * nanosecond.
1249  */
1250 
1251 int
1252 apic_clkinit(int hertz)
1253 {
1254         int             ret;
1255 
1256         apic_int_busy_mark = (apic_int_busy_mark *
1257             apic_sample_factor_redistribution) / 100;
1258         apic_int_free_mark = (apic_int_free_mark *
1259             apic_sample_factor_redistribution) / 100;
1260         apic_diff_for_redistribution = (apic_diff_for_redistribution *
1261             apic_sample_factor_redistribution) / 100;
1262 
1263         ret = apic_timer_init(hertz);
1264         return (ret);
1265 
1266 }
1267 
1268 /*
1269  * apic_preshutdown:
1270  * Called early in shutdown whilst we can still access filesystems to do
1271  * things like loading modules which will be required to complete shutdown
1272  * after filesystems are all unmounted.
1273  */
void
apic_preshutdown(int cmd, int fcn)
{
        /*
         * Nothing is done here beyond the APIC_VERBOSE_POWEROFF trace of the
         * arguments, the configured power-off method and the ACPI enable
         * flag.
         */
        APIC_VERBOSE_POWEROFF(("apic_preshutdown(%d,%d); m=%d a=%d\n",
            cmd, fcn, apic_poweroff_method, apic_enable_acpi));
}
1280 
/*
 * apic_shutdown:
 * Quiesce the interrupt hardware and, for a power-off request, try to turn
 * the machine off.
 *
 * The first part runs for every cmd: the other CPUs are sent an NMI with
 * apic_shutdown_processors set, the CMOS shutdown byte is cleared if it
 * had been set, I/O APIC redirection is disabled, the IMCR (if present) is
 * switched back to PIC mode and the local APIC is disabled.
 *
 * Only when cmd is A_SHUTDOWN does the function continue: ACPI is disabled
 * unless fcn is AD_POWEROFF or AD_FASTREBOOT, a reset IPI is sent to the
 * other CPUs for AD_FASTREBOOT, and for AD_POWEROFF the method selected by
 * apic_poweroff_method is attempted, followed by a seven second wait for
 * the power to actually go away.
 */
void
apic_shutdown(int cmd, int fcn)
{
        int restarts, attempts;
        int i;
        uchar_t byte;
        ulong_t iflag;

        hpet_acpi_fini();

        /* Send NMI to all CPUs except self to do per processor shutdown */
        iflag = intr_clear();
        /*
         * APIC_AV_PENDING_SET() is always used on DEBUG kernels, otherwise
         * only in local APIC mode (presumably it waits for a previously
         * issued IPI to be delivered -- confirm against the macro).
         */
#ifdef  DEBUG
        APIC_AV_PENDING_SET();
#else
        if (apic_mode == LOCAL_APIC)
                APIC_AV_PENDING_SET();
#endif /* DEBUG */
        /* Set the flag before the NMI goes out to the other CPUs. */
        apic_shutdown_processors = 1;
        apic_reg_ops->apic_write(APIC_INT_CMD1,
            AV_NMI | AV_LEVEL | AV_SH_ALL_EXCSELF);

        /* restore cmos shutdown byte before reboot */
        if (apic_cmos_ssb_set) {
                outb(CMOS_ADDR, SSB);
                outb(CMOS_DATA, 0);
        }

        ioapic_disable_redirection();

        /* disable apic mode if imcr present */
        if (apic_imcrp) {
                outb(APIC_IMCR_P1, (uchar_t)APIC_IMCR_SELECT);
                outb(APIC_IMCR_P2, (uchar_t)APIC_IMCR_PIC);
        }

        apic_disable_local_apic();

        intr_restore(iflag);

        /* remainder of function is for shutdown cases only */
        if (cmd != A_SHUTDOWN)
                return;

        /*
         * Switch system back into Legacy-Mode if using ACPI and
         * not powering-off.  Some BIOSes need to remain in ACPI-mode
         * for power-off to succeed (Dell Dimension 4600)
         * Do not disable ACPI while doing fastreboot
         */
        if (apic_enable_acpi && fcn != AD_POWEROFF && fcn != AD_FASTREBOOT)
                (void) AcpiDisable();

        /* For fast reboot, send a reset IPI to every CPU except this one. */
        if (fcn == AD_FASTREBOOT) {
                apic_reg_ops->apic_write(APIC_INT_CMD1,
                    AV_ASSERT | AV_RESET | AV_SH_ALL_EXCSELF);
        }

        /* remainder of function is for shutdown+poweroff case only */
        if (fcn != AD_POWEROFF)
                return;

        /*
         * There is no default case: any other apic_poweroff_method value
         * falls straight through to the wait at the end of the function.
         */
        switch (apic_poweroff_method) {
                case APIC_POWEROFF_VIA_RTC:

                        /* select the extended NVRAM bank in the RTC */
                        outb(CMOS_ADDR, RTC_REGA);
                        byte = inb(CMOS_DATA);
                        outb(CMOS_DATA, (byte | EXT_BANK));

                        outb(CMOS_ADDR, PFR_REG);

                        /* for Predator must toggle the PAB bit */
                        byte = inb(CMOS_DATA);

                        /*
                         * clear power active bar, wakeup alarm and
                         * kickstart
                         */
                        byte &= ~(PAB_CBIT | WF_FLAG | KS_FLAG);
                        outb(CMOS_DATA, byte);

                        /* delay before next write */
                        drv_usecwait(1000);

                        /* for S40 the following would suffice */
                        byte = inb(CMOS_DATA);

                        /* power active bar control bit */
                        byte |= PAB_CBIT;
                        outb(CMOS_DATA, byte);

                        break;

                case APIC_POWEROFF_VIA_ASPEN_BMC:
                        /*
                         * The whole command sequence is restarted whenever
                         * the BMC stays busy for too long; the attempt is
                         * abandoned once restarts reaches 3, i.e. after at
                         * most two passes.
                         */
                        restarts = 0;
restart_aspen_bmc:
                        if (++restarts == 3)
                                break;
                        attempts = 0;
                        /*
                         * Poll the busy bits, waiting 1ms after each busy
                         * reading; the fourth busy reading in a row
                         * triggers a restart.
                         */
                        do {
                                byte = inb(MISMIC_FLAG_REGISTER);
                                byte &= MISMIC_BUSY_MASK;
                                if (byte != 0) {
                                        drv_usecwait(1000);
                                        if (attempts >= 3)
                                                goto restart_aspen_bmc;
                                        ++attempts;
                                }
                        } while (byte != 0);
                        outb(MISMIC_CNTL_REGISTER, CC_SMS_GET_STATUS);
                        byte = inb(MISMIC_FLAG_REGISTER);
                        byte |= 0x1;
                        outb(MISMIC_FLAG_REGISTER, byte);
                        i = 0;
                        /* Send each control/data pair of the aspen_bmc table. */
                        for (; i < (sizeof (aspen_bmc)/sizeof (aspen_bmc[0]));
                            i++) {
                                attempts = 0;
                                do {
                                        byte = inb(MISMIC_FLAG_REGISTER);
                                        byte &= MISMIC_BUSY_MASK;
                                        if (byte != 0) {
                                                drv_usecwait(1000);
                                                if (attempts >= 3)
                                                        goto restart_aspen_bmc;
                                                ++attempts;
                                        }
                                } while (byte != 0);
                                outb(MISMIC_CNTL_REGISTER, aspen_bmc[i].cntl);
                                outb(MISMIC_DATA_REGISTER, aspen_bmc[i].data);
                                byte = inb(MISMIC_FLAG_REGISTER);
                                byte |= 0x1;
                                outb(MISMIC_FLAG_REGISTER, byte);
                        }
                        break;

                case APIC_POWEROFF_VIA_SITKA_BMC:
                        /* Same restart/attempt limits as the Aspen case. */
                        restarts = 0;
restart_sitka_bmc:
                        if (++restarts == 3)
                                break;
                        attempts = 0;
                        /* Wait until the SMS state is neither read nor write. */
                        do {
                                byte = inb(SMS_STATUS_REGISTER);
                                byte &= SMS_STATE_MASK;
                                if ((byte == SMS_READ_STATE) ||
                                    (byte == SMS_WRITE_STATE)) {
                                        drv_usecwait(1000);
                                        if (attempts >= 3)
                                                goto restart_sitka_bmc;
                                        ++attempts;
                                }
                        } while ((byte == SMS_READ_STATE) ||
                            (byte == SMS_WRITE_STATE));
                        outb(SMS_COMMAND_REGISTER, SMS_GET_STATUS);
                        i = 0;
                        /*
                         * Write each port/data pair of the sitka_bmc table,
                         * waiting for the SMS_IBF_MASK bits to clear before
                         * every write.
                         */
                        for (; i < (sizeof (sitka_bmc)/sizeof (sitka_bmc[0]));
                            i++) {
                                attempts = 0;
                                do {
                                        byte = inb(SMS_STATUS_REGISTER);
                                        byte &= SMS_IBF_MASK;
                                        if (byte != 0) {
                                                drv_usecwait(1000);
                                                if (attempts >= 3)
                                                        goto restart_sitka_bmc;
                                                ++attempts;
                                        }
                                } while (byte != 0);
                                outb(sitka_bmc[i].port, sitka_bmc[i].data);
                        }
                        break;

                case APIC_POWEROFF_NONE:

                        /*
                         * If no APIC direct method, we will try using ACPI.
                         * Return immediately when ACPI is not enabled or
                         * when acpi_poweroff() returns 1; otherwise fall
                         * through to the wait below.
                         */
                        if (apic_enable_acpi) {
                                if (acpi_poweroff() == 1)
                                        return;
                        } else
                                return;

                        break;
        }
        /*
         * Wait a limited time here for power to go off.
         * If the power does not go off, then there was a
         * problem and we should continue to the halt which
         * prints a message for the user to press a key to
         * reboot.
         */
        drv_usecwait(7000000); /* wait seven seconds */

}
1475 
/*
 * Cyclic identifier kept by the APIC code.  It is only defined here and is
 * not referenced in this part of the file -- presumably it is the handle of
 * a cyclic registered elsewhere in the module; TODO confirm against users.
 */
cyclic_id_t apic_cyclic_id;
1477 
1478 /*
1479  * The following functions are in the platform specific file so that they
1480  * can be different functions depending on whether we are running on
1481  * bare metal or a hypervisor.
1482  */
1483 
1484 /*
1485  * map an apic for memory-mapped access
1486  */
uint32_t *
mapin_apic(uint32_t addr, size_t len, int flags)
{
        /* psm_map_phys() does the mapping; hand its result back untyped. */
        void *mapped = (void *)psm_map_phys(addr, len, flags);

        return (mapped);
}
1492 
/*
 * map an I/O apic for memory-mapped access; identical to mapin_apic() here
 */
uint32_t *
mapin_ioapic(uint32_t addr, size_t len, int flags)
{
        uint32_t *mapped = mapin_apic(addr, len, flags);

        return (mapped);
}
1498 
1499 /*
1500  * unmap an apic
1501  */
void
mapout_apic(caddr_t addr, size_t len)
{
        /* Thin wrapper: the unmapping itself is done by psm_unmap_phys(). */
        psm_unmap_phys(addr, len);
}
1507 
/*
 * unmap an I/O apic; identical to mapout_apic() here
 */
void
mapout_ioapic(caddr_t addr, size_t len)
{
        mapout_apic(addr, len);
}
1513 
1514 uint32_t
1515 ioapic_read(int ioapic_ix, uint32_t reg)
1516 {
1517         volatile uint32_t *ioapic;
1518 
1519         ioapic = apicioadr[ioapic_ix];
1520         ioapic[APIC_IO_REG] = reg;
1521         return (ioapic[APIC_IO_DATA]);
1522 }
1523 
1524 void
1525 ioapic_write(int ioapic_ix, uint32_t reg, uint32_t value)
1526 {
1527         volatile uint32_t *ioapic;
1528 
1529         ioapic = apicioadr[ioapic_ix];
1530         ioapic[APIC_IO_REG] = reg;
1531         ioapic[APIC_IO_DATA] = value;
1532 }
1533 
1534 void
1535 ioapic_write_eoi(int ioapic_ix, uint32_t value)
1536 {
1537         volatile uint32_t *ioapic;
1538 
1539         ioapic = apicioadr[ioapic_ix];
1540         ioapic[APIC_IO_EOI] = value;
1541 }
1542 
1543 /*
1544  * Round-robin algorithm to find the next CPU with interrupts enabled.
1545  * It can't share the same static variable apic_next_bind_cpu with
1546  * apic_get_next_bind_cpu(), since that will cause all interrupts to be
1547  * bound to CPU1 at boot time.  During boot, only CPU0 is online with
1548  * interrupts enabled when apic_get_next_bind_cpu() and apic_find_cpu()
1549  * are called.  However, the pcplusmp driver assumes that there will be
1550  * boot_ncpus CPUs configured eventually so it tries to distribute all
1551  * interrupts among CPU0 - CPU[boot_ncpus - 1].  Thus to prevent all
 * interrupts being targeted at CPU1, we need to use a dedicated static
 * variable for apic_find_cpu() instead of sharing apic_next_bind_cpu.
1554  */
1555 
1556 processorid_t
1557 apic_find_cpu(int flag)
1558 {
1559         int i;
1560         static processorid_t acid = 0;
1561 
1562         /* Find the first CPU with the passed-in flag set */
1563         for (i = 0; i < apic_nproc; i++) {
1564                 if (++acid >= apic_nproc) {
1565                         acid = 0;
1566                 }
1567                 if (apic_cpu_in_range(acid) &&
1568                     (apic_cpus[acid].aci_status & flag)) {
1569                         break;
1570                 }
1571         }
1572 
1573         ASSERT((apic_cpus[acid].aci_status & flag) != 0);
1574         return (acid);
1575 }
1576 
1577 void
1578 apic_intrmap_init(int apic_mode)
1579 {
1580         int suppress_brdcst_eoi = 0;
1581 
1582         /*
1583          * Intel Software Developer's Manual 3A, 10.12.7:
1584          *
1585          * Routing of device interrupts to local APIC units operating in
1586          * x2APIC mode requires use of the interrupt-remapping architecture
1587          * specified in the Intel Virtualization Technology for Directed
1588          * I/O, Revision 1.3.  Because of this, BIOS must enumerate support
1589          * for and software must enable this interrupt remapping with
1590          * Extended Interrupt Mode Enabled before it enabling x2APIC mode in
1591          * the local APIC units.
1592          *
1593          *
1594          * In other words, to use the APIC in x2APIC mode, we need interrupt
1595          * remapping.  Since we don't start up the IOMMU by default, we
1596          * won't be able to do any interrupt remapping and therefore have to
1597          * use the APIC in traditional 'local APIC' mode with memory mapped
1598          * I/O.
1599          */
1600 
1601         if (psm_vt_ops != NULL) {
1602                 if (((apic_intrmap_ops_t *)psm_vt_ops)->
1603                     apic_intrmap_init(apic_mode) == DDI_SUCCESS) {
1604 
1605                         apic_vt_ops = psm_vt_ops;
1606 
1607                         /*
1608                          * We leverage the interrupt remapping engine to
1609                          * suppress broadcast EOI; thus we must send the
1610                          * directed EOI with the directed-EOI handler.
1611                          */
1612                         if (apic_directed_EOI_supported() == 0) {
1613                                 suppress_brdcst_eoi = 1;
1614                         }
1615 
1616                         apic_vt_ops->apic_intrmap_enable(suppress_brdcst_eoi);
1617 
1618                         if (apic_detect_x2apic()) {
1619                                 apic_enable_x2apic();
1620                         }
1621 
1622                         if (apic_directed_EOI_supported() == 0) {
1623                                 apic_set_directed_EOI_handler();
1624                         }
1625                 }
1626         }
1627 }
1628 
1629 /*ARGSUSED*/
1630 static void
1631 apic_record_ioapic_rdt(void *intrmap_private, ioapic_rdt_t *irdt)
1632 {
1633         irdt->ir_hi <<= APIC_ID_BIT_OFFSET;
1634 }
1635 
1636 /*ARGSUSED*/
1637 static void
1638 apic_record_msi(void *intrmap_private, msi_regs_t *mregs)
1639 {
1640         mregs->mr_addr = MSI_ADDR_HDR |
1641             (MSI_ADDR_RH_FIXED << MSI_ADDR_RH_SHIFT) |
1642             (MSI_ADDR_DM_PHYSICAL << MSI_ADDR_DM_SHIFT) |
1643             (mregs->mr_addr << MSI_ADDR_DEST_SHIFT);
1644         mregs->mr_data = (MSI_DATA_TM_EDGE << MSI_DATA_TM_SHIFT) |
1645             mregs->mr_data;
1646 }
1647 
1648 /*
1649  * Functions from apic_introp.c
1650  *
1651  * Those functions are used by apic_intr_ops().
1652  */
1653 
/*
 * MSI support flag:
 * reflects whether MSI is supported at APIC level
 * it can also be patched through /etc/system
 *
 *  0 = default value - don't know and need to call apic_check_msi_support()
 *      to find out then set it accordingly
 *  1 = supported
 * -1 = not supported
 */
int     apic_support_msi = 0;

/*
 * Multiple vector support for MSI-X.  Initialized to 1 here; presumably a
 * non-zero value means enabled -- confirm against the code that reads it.
 */
int     apic_msix_enable = 1;

/*
 * Multiple vector support for MSI.  Initialized to 1 here; presumably a
 * non-zero value means enabled -- confirm against the code that reads it.
 */
int     apic_multi_msi_enable = 1;
1671 
1672 /*
1673  * Check whether the system supports MSI.
1674  *
1675  * MSI is required for PCI-E and for PCI versions later than 2.2, so if we find
1676  * a PCI-E bus or we find a PCI bus whose version we know is >= 2.2, then we
1677  * return PSM_SUCCESS to indicate this system supports MSI.
1678  *
1679  * (Currently the only way we check whether a given PCI bus supports >= 2.2 is
1680  * by detecting if we are running inside the KVM hypervisor, which guarantees
1681  * this version number.)
1682  */
1683 int
1684 apic_check_msi_support()
1685 {
1686         dev_info_t *cdip;
1687         char dev_type[16];
1688         int dev_len;
1689         int hwenv = get_hwenv();
1690 
1691         DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support:\n"));
1692 
1693         /*
1694          * check whether the first level children of root_node have
1695          * PCI-E or PCI capability.
1696          */
1697         for (cdip = ddi_get_child(ddi_root_node()); cdip != NULL;
1698             cdip = ddi_get_next_sibling(cdip)) {
1699 
1700                 DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support: cdip: 0x%p,"
1701                     " driver: %s, binding: %s, nodename: %s\n", (void *)cdip,
1702                     ddi_driver_name(cdip), ddi_binding_name(cdip),
1703                     ddi_node_name(cdip)));
1704                 dev_len = sizeof (dev_type);
1705                 if (ddi_getlongprop_buf(DDI_DEV_T_ANY, cdip, DDI_PROP_DONTPASS,
1706                     "device_type", (caddr_t)dev_type, &dev_len)
1707                     != DDI_PROP_SUCCESS)
1708                         continue;
1709                 if (strcmp(dev_type, "pciex") == 0)
1710                         return (PSM_SUCCESS);
1711                 if (strcmp(dev_type, "pci") == 0 &&
1712                     (hwenv == HW_KVM || hwenv == HW_BHYVE))
1713                         return (PSM_SUCCESS);
1714         }
1715 
1716         /* MSI is not supported on this system */
1717         DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support: no 'pciex' "
1718             "device_type found\n"));
1719         return (PSM_FAILURE);
1720 }
1721 
1722 /*
1723  * apic_pci_msi_unconfigure:
1724  *
1725  * This and next two interfaces are copied from pci_intr_lib.c
1726  * Do ensure that these two files stay in sync.
1727  * These needed to be copied over here to avoid a deadlock situation on
1728  * certain mp systems that use MSI interrupts.
1729  *
1730  * IMPORTANT regards next three interfaces:
1731  * i) are called only for MSI/X interrupts.
1732  * ii) called with interrupts disabled, and must not block
1733  */
1734 void
1735 apic_pci_msi_unconfigure(dev_info_t *rdip, int type, int inum)
1736 {
1737         ushort_t                msi_ctrl;
1738         int                     cap_ptr = i_ddi_get_msi_msix_cap_ptr(rdip);
1739         ddi_acc_handle_t        handle = i_ddi_get_pci_config_handle(rdip);
1740 
1741         ASSERT((handle != NULL) && (cap_ptr != 0));
1742 
1743         if (type == DDI_INTR_TYPE_MSI) {
1744                 msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL);
1745                 msi_ctrl &= (~PCI_MSI_MME_MASK);
1746                 pci_config_put16(handle, cap_ptr + PCI_MSI_CTRL, msi_ctrl);
1747                 pci_config_put32(handle, cap_ptr + PCI_MSI_ADDR_OFFSET, 0);
1748 
1749                 if (msi_ctrl &  PCI_MSI_64BIT_MASK) {
1750                         pci_config_put16(handle,
1751                             cap_ptr + PCI_MSI_64BIT_DATA, 0);
1752                         pci_config_put32(handle,
1753                             cap_ptr + PCI_MSI_ADDR_OFFSET + 4, 0);
1754                 } else {
1755                         pci_config_put16(handle,
1756                             cap_ptr + PCI_MSI_32BIT_DATA, 0);
1757                 }
1758 
1759         } else if (type == DDI_INTR_TYPE_MSIX) {
1760                 uintptr_t       off;
1761                 uint32_t        mask;
1762                 ddi_intr_msix_t *msix_p = i_ddi_get_msix(rdip);
1763 
1764                 ASSERT(msix_p != NULL);
1765 
1766                 /* Offset into "inum"th entry in the MSI-X table & mask it */
1767                 off = (uintptr_t)msix_p->msix_tbl_addr + (inum *
1768                     PCI_MSIX_VECTOR_SIZE) + PCI_MSIX_VECTOR_CTRL_OFFSET;
1769 
1770                 mask = ddi_get32(msix_p->msix_tbl_hdl, (uint32_t *)off);
1771 
1772                 ddi_put32(msix_p->msix_tbl_hdl, (uint32_t *)off, (mask | 1));
1773 
1774                 /* Offset into the "inum"th entry in the MSI-X table */
1775                 off = (uintptr_t)msix_p->msix_tbl_addr +
1776                     (inum * PCI_MSIX_VECTOR_SIZE);
1777 
1778                 /* Reset the "data" and "addr" bits */
1779                 ddi_put32(msix_p->msix_tbl_hdl,
1780                     (uint32_t *)(off + PCI_MSIX_DATA_OFFSET), 0);
1781                 ddi_put64(msix_p->msix_tbl_hdl, (uint64_t *)off, 0);
1782         }
1783 }
1784 
1785 /*
1786  * apic_pci_msi_disable_mode:
1787  */
1788 void
1789 apic_pci_msi_disable_mode(dev_info_t *rdip, int type)
1790 {
1791         ushort_t                msi_ctrl;
1792         int                     cap_ptr = i_ddi_get_msi_msix_cap_ptr(rdip);
1793         ddi_acc_handle_t        handle = i_ddi_get_pci_config_handle(rdip);
1794 
1795         ASSERT((handle != NULL) && (cap_ptr != 0));
1796 
1797         if (type == DDI_INTR_TYPE_MSI) {
1798                 msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL);
1799                 if (!(msi_ctrl & PCI_MSI_ENABLE_BIT))
1800                         return;
1801 
1802                 msi_ctrl &= ~PCI_MSI_ENABLE_BIT;    /* MSI disable */
1803                 pci_config_put16(handle, cap_ptr + PCI_MSI_CTRL, msi_ctrl);
1804 
1805         } else if (type == DDI_INTR_TYPE_MSIX) {
1806                 msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSIX_CTRL);
1807                 if (msi_ctrl & PCI_MSIX_ENABLE_BIT) {
1808                         msi_ctrl &= ~PCI_MSIX_ENABLE_BIT;
1809                         pci_config_put16(handle, cap_ptr + PCI_MSIX_CTRL,
1810                             msi_ctrl);
1811                 }
1812         }
1813 }
1814 
1815 uint32_t
1816 apic_get_localapicid(uint32_t cpuid)
1817 {
1818         ASSERT(cpuid < apic_nproc && apic_cpus != NULL);
1819 
1820         return (apic_cpus[cpuid].aci_local_id);
1821 }
1822 
1823 uchar_t
1824 apic_get_ioapicid(uchar_t ioapicindex)
1825 {
1826         ASSERT(ioapicindex < MAX_IO_APIC);
1827 
1828         return (apic_io_id[ioapicindex]);
1829 }