/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * Copyright (c) 2010, Intel Corporation.
 * All rights reserved.
 */

/*
 * PSMI 1.1 extensions are supported only in 2.6 and later versions.
 * PSMI 1.2 extensions are supported only in 2.7 and later versions.
 * PSMI 1.3 and 1.4 extensions are supported in Solaris 10.
 * PSMI 1.5 extensions are supported in Solaris Nevada.
 * PSMI 1.6 extensions are supported in Solaris Nevada.
 * PSMI 1.7 extensions are supported in Solaris Nevada.
 */
#define PSMI_1_7

#include <sys/processor.h>
#include <sys/time.h>
#include <sys/psm.h>
#include <sys/smp_impldefs.h>
#include <sys/inttypes.h>
#include <sys/cram.h>
#include <acpica/include/acpi.h>
#include <sys/acpica.h>
#include <sys/psm_common.h>
#include <sys/apic.h>
#include <sys/apic_common.h>
#include <sys/pit.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ddi_impldefs.h>
#include <sys/pci.h>
#include <sys/promif.h>
#include <sys/x86_archext.h>
#include <sys/cpc_impl.h>
#include <sys/uadmin.h>
#include <sys/panic.h>
#include <sys/debug.h>
#include <sys/archsystm.h>
#include <sys/trap.h>
#include <sys/machsystm.h>
#include <sys/cpuvar.h>
#include <sys/rm_platter.h>
#include <sys/privregs.h>
#include <sys/cyclic.h>
#include <sys/note.h>
#include <sys/pci_intr_lib.h>
#include <sys/sunndi.h>
#include <sys/hpet.h>
#include <sys/clock.h>

/*
 * Part of mp_platform_common.c that's used only by pcplusmp & xpv_psm
 * but not apix.
 * These functions may be moved to xpv_psm later when apix and pcplusmp
 * are merged.
 */

/*
 *      Local Function Prototypes
 */
static void apic_mark_vector(uchar_t oldvector, uchar_t newvector);
static void apic_xlate_vector_free_timeout_handler(void *arg);
static int apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu,
    int new_bind_cpu, int apicindex, int intin_no, int which_irq,
    struct ioapic_reprogram_data *drep);
static int apic_setup_irq_table(dev_info_t *dip, int irqno,
    struct apic_io_intr *intrp, struct intrspec *ispec, iflag_t *intr_flagp,
    int type);
static void apic_try_deferred_reprogram(int ipl, int vect);
static void delete_defer_repro_ent(int which_irq);
static void apic_ioapic_wait_pending_clear(int ioapicindex,
    int intin_no);

extern int apic_acpi_translate_pci_irq(dev_info_t *dip, int busid, int devid,
    int ipin, int *pci_irqp, iflag_t *intr_flagp);
extern int apic_handle_pci_pci_bridge(dev_info_t *idip, int child_devno,
    int child_ipin, struct apic_io_intr **intrp);
extern uchar_t acpi_find_ioapic(int irq);
extern struct apic_io_intr *apic_find_io_intr_w_busid(int irqno, int busid);
extern int apic_find_bus_id(int bustype);
extern int apic_find_intin(uchar_t ioapic, uchar_t intin);
extern void apic_record_rdt_entry(apic_irq_t *irqptr, int irq);

extern  int apic_sci_vect;
extern  iflag_t apic_sci_flags;
/* ACPI HPET interrupt configuration; -1 if HPET not used */
extern  int apic_hpet_vect;
extern  iflag_t apic_hpet_flags;
extern  int     apic_intr_policy;
extern  char *psm_name;

/*
 * Maximum value of a uchar_t; UINT8_MAX comes from <sys/inttypes.h>.
 */
#define UCHAR_MAX       UINT8_MAX

/* Max wait time (in repetitions) for flags to clear in an RDT entry. */
extern int apic_max_reps_clear_pending;

/*
 * The irq # is implicit in the array index.  APIC_MAX_VECTOR + 1 is the
 * maximum # of IRQs as well; apic_reprogram_info is indexed by IRQ number,
 * NOT by vector number.
 */
struct ioapic_reprogram_data apic_reprogram_info[APIC_MAX_VECTOR+1];

extern  int     apic_int_busy_mark;
extern  int     apic_int_free_mark;
extern  int     apic_diff_for_redistribution;
extern  int     apic_sample_factor_redistribution;
extern  int     apic_redist_cpu_skip;
extern  int     apic_num_imbalance;
extern  int     apic_num_rebind;

/* timeout for xlate_vector, mark_vector */
int     apic_revector_timeout = 16 * 10000; /* 160 millisec */

extern int      apic_defconf;
extern int      apic_irq_translate;

extern int      apic_use_acpi_madt_only;        /* 1=ONLY use MADT from ACPI */

extern  uchar_t apic_io_vectbase[MAX_IO_APIC];

extern  boolean_t ioapic_mask_workaround[MAX_IO_APIC];

/*
 * First available slot to be used as IRQ index into the apic_irq_table
 * for those interrupts (like MSI/X) that don't have a physical IRQ.
 */
extern int apic_first_avail_irq;

/*
 * apic_defer_reprogram_lock ensures that only one processor is handling
 * deferred interrupt programming at *_intr_exit time.
 */
static  lock_t  apic_defer_reprogram_lock;

/*
 * The current number of deferred reprogrammings outstanding
 */
uint_t  apic_reprogram_outstanding = 0;

#ifdef DEBUG
/*
 * Counters that keep track of deferred reprogramming stats
 */
uint_t  apic_intr_deferrals = 0;
uint_t  apic_intr_deliver_timeouts = 0;
uint_t  apic_last_ditch_reprogram_failures = 0;
uint_t  apic_deferred_setup_failures = 0;
uint_t  apic_defer_repro_total_retries = 0;
uint_t  apic_defer_repro_successes = 0;
uint_t  apic_deferred_spurious_enters = 0;
#endif

extern  int     apic_io_max;
extern  struct apic_io_intr *apic_io_intrp;

uchar_t apic_vector_to_irq[APIC_MAX_VECTOR+1];

extern  uint32_t        eisa_level_intr_mask;
        /* At least MSB will be set if EISA bus */

extern  int     apic_pci_bus_total;
extern  uchar_t apic_single_pci_busid;

/*
 * Following declarations are for revectoring; used when ISRs at different
 * IPLs share an irq.
 */
static  lock_t  apic_revector_lock;
int     apic_revector_pending = 0;
static  uchar_t *apic_oldvec_to_newvec;
static  uchar_t *apic_newvec_to_oldvec;

/* ACPI Interrupt Source Override Structure ptr */
extern ACPI_MADT_INTERRUPT_OVERRIDE *acpi_isop;
extern int acpi_iso_cnt;

/*
 * Auto-configuration routines
 */

/*
 * Initialize the vector->ipl and ipl->pri arrays.  apic_level_intr and
 * apic_irq_table are also cleared, and each vector's irq mapping is set
 * to a value which cannot map to a real irq, to show that it is free.
 */
void
apic_init_common(void)
{
        int     i, j, indx;

        /*
         * Initialize apic_ipls from apic_vectortoipl.  This array is
         * used in apic_intr_enter to determine the IPL to use for the
         * corresponding vector.  On some systems, due to hardware errata
         * and interrupt sharing, the IPL may not correspond to the IPL listed
         * in apic_vectortoipl (see apic_addspl and apic_delspl).
         */
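        /*
         * For example, with APIC_VECTOR_PER_IPL == 16, apic_ipls[0x00]
         * through apic_ipls[0x0f] all receive apic_vectortoipl[0],
         * apic_ipls[0x10] through apic_ipls[0x1f] receive
         * apic_vectortoipl[1], and so on.
         */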
        for (i = 0; i < (APIC_AVAIL_VECTOR / APIC_VECTOR_PER_IPL); i++) {
                indx = i * APIC_VECTOR_PER_IPL;

                for (j = 0; j < APIC_VECTOR_PER_IPL; j++, indx++)
                        apic_ipls[indx] = apic_vectortoipl[i];
        }

        /* cpu 0 is always up (for now) */
        apic_cpus[0].aci_status = APIC_CPU_ONLINE | APIC_CPU_INTR_ENABLE;

        for (i = 0; i <= APIC_MAX_VECTOR; i++) {
                apic_level_intr[i] = 0;
                apic_irq_table[i] = NULL;
                apic_vector_to_irq[i] = APIC_RESV_IRQ;

                /*
                 * These *must* be initted to B_TRUE so that the deferred
                 * reprogramming logic sees the slot as idle.
                 */
                apic_reprogram_info[i].done = B_TRUE;
                apic_reprogram_info[i].irqp = NULL;
                apic_reprogram_info[i].tries = 0;
                apic_reprogram_info[i].bindcpu = 0;
        }

        /*
         * Allocate a dummy irq table entry for the reserved entry.
         * This takes care of the race between removing an irq and
         * clock detecting a CPU in that irq during interrupt load
         * sampling.
         */
        apic_irq_table[APIC_RESV_IRQ] =
            kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP);

        mutex_init(&airq_mutex, NULL, MUTEX_DEFAULT, NULL);
}

void
ioapic_init_intr(int mask_apic)
{
        int ioapic_ix;
        struct intrspec ispec;
        apic_irq_t *irqptr;
        int i, j;
        ulong_t iflag;

        LOCK_INIT_CLEAR(&apic_revector_lock);
        LOCK_INIT_CLEAR(&apic_defer_reprogram_lock);

        /* mask interrupt vectors */
        for (j = 0; j < apic_io_max && mask_apic; j++) {
                int intin_max;

                ioapic_ix = j;
                /* Bits 23-16 define the maximum redirection entries */
                intin_max = (ioapic_read(ioapic_ix, APIC_VERS_CMD) >> 16)
                    & 0xff;
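                /*
                 * e.g. a version register value of 0x00170011 yields
                 * intin_max == 0x17, i.e. redirection entries 0 through
                 * 0x17 (24 entries).
                 */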
                for (i = 0; i <= intin_max; i++)
                        ioapic_write(ioapic_ix, APIC_RDT_CMD + 2 * i, AV_MASK);
        }

        /*
         * Hack alert: deal with ACPI SCI interrupt chicken/egg here
         */
        if (apic_sci_vect > 0) {
                /*
                 * acpica has already done add_avintr(); we just need
                 * to finish the job by mimicking translate_irq().
                 *
                 * Fake up an intrspec and set up the tables.
                 */
                ispec.intrspec_vec = apic_sci_vect;
                ispec.intrspec_pri = SCI_IPL;

                if (apic_setup_irq_table(NULL, apic_sci_vect, NULL,
                    &ispec, &apic_sci_flags, DDI_INTR_TYPE_FIXED) < 0) {
                        cmn_err(CE_WARN, "!apic: SCI setup failed");
                        return;
                }
                irqptr = apic_irq_table[apic_sci_vect];

                iflag = intr_clear();
                lock_set(&apic_ioapic_lock);

                /* Program I/O APIC */
                (void) apic_setup_io_intr(irqptr, apic_sci_vect, B_FALSE);

                lock_clear(&apic_ioapic_lock);
                intr_restore(iflag);

                irqptr->airq_share++;
        }

        /*
         * Hack alert: deal with ACPI HPET interrupt chicken/egg here.
         */
        if (apic_hpet_vect > 0) {
                /*
                 * hpet has already done add_avintr(); we just need
                 * to finish the job by mimicking translate_irq().
                 *
                 * Fake up an intrspec and set up the tables.
                 */
                ispec.intrspec_vec = apic_hpet_vect;
                ispec.intrspec_pri = CBE_HIGH_PIL;

                if (apic_setup_irq_table(NULL, apic_hpet_vect, NULL,
                    &ispec, &apic_hpet_flags, DDI_INTR_TYPE_FIXED) < 0) {
                        cmn_err(CE_WARN, "!apic: HPET setup failed");
                        return;
                }
                irqptr = apic_irq_table[apic_hpet_vect];

                iflag = intr_clear();
                lock_set(&apic_ioapic_lock);

                /* Program I/O APIC */
                (void) apic_setup_io_intr(irqptr, apic_hpet_vect, B_FALSE);

                lock_clear(&apic_ioapic_lock);
                intr_restore(iflag);

                irqptr->airq_share++;
        }
}

/*
 * Add mask bits to disable the interrupt vector from firing at or above the
 * given IPL.  In addition, remove mask bits to enable interrupt vectors
 * below the given IPL.
 *
 * Both addspl and delspl are complicated by the fact that different
 * interrupts may share IRQs.  This can happen in two ways:
 * 1. The same H/W line is shared by more than 1 device
 * 1a. with interrupts at different IPLs
 * 1b. with interrupts at the same IPL
 * 2. We ran out of vectors at a given IPL and started sharing vectors.
 * 1b and 2 should be handled gracefully, except that some ISRs
 * will get called often when no interrupt is pending for the device.
 * For 1a, we handle it at the higher IPL.
 */
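/*
 * Note on shared irqs: the irqno cookie passed to addspl/delspl encodes
 * both the apic_irq_table index (extracted with IRQINDEX()) and the
 * sharer's share id (recombined with VIRTIRQ()); e.g. the sharer with
 * share id 1 on irq index 0x10 is addressed by the cookie
 * VIRTIRQ(0x10, 1), while IRQINDEX() of that cookie yields 0x10 again.
 */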
/*ARGSUSED*/
int
apic_addspl_common(int irqno, int ipl, int min_ipl, int max_ipl)
{
        uchar_t vector;
        ulong_t iflag;
        apic_irq_t *irqptr, *irqheadptr;
        int irqindex;

        ASSERT(max_ipl <= UCHAR_MAX);
        irqindex = IRQINDEX(irqno);

        if ((irqindex == -1) || (!apic_irq_table[irqindex]))
                return (PSM_FAILURE);

        mutex_enter(&airq_mutex);
        irqptr = irqheadptr = apic_irq_table[irqindex];

        DDI_INTR_IMPLDBG((CE_CONT, "apic_addspl: dip=0x%p type=%d irqno=0x%x "
            "vector=0x%x\n", (void *)irqptr->airq_dip,
            irqptr->airq_mps_intr_index, irqno, irqptr->airq_vector));

        while (irqptr) {
                if (VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno)
                        break;
                irqptr = irqptr->airq_next;
        }
        irqptr->airq_share++;

        mutex_exit(&airq_mutex);

        /* return if it is not hardware interrupt */
        if (irqptr->airq_mps_intr_index == RESERVE_INDEX)
                return (PSM_SUCCESS);

        /* Or if there are more interrupts at a higher IPL */
        if (ipl != max_ipl)
                return (PSM_SUCCESS);

        /*
         * if apic_picinit() has not been called yet, just return.
         * At the end of apic_picinit(), we will call setup_io_intr().
         */

        if (!apic_picinit_called)
                return (PSM_SUCCESS);

        /*
         * Upgrade the vector if max_ipl differs from the irq's current ipl.
         * If we cannot allocate a vector, return failure.
         */
        if (irqptr->airq_ipl != max_ipl &&
            !ioapic_mask_workaround[irqptr->airq_ioapicindex]) {

                vector = apic_allocate_vector(max_ipl, irqindex, 1);
                if (vector == 0) {
                        irqptr->airq_share--;
                        return (PSM_FAILURE);
                }
                irqptr = irqheadptr;
                apic_mark_vector(irqptr->airq_vector, vector);
                while (irqptr) {
                        irqptr->airq_vector = vector;
                        irqptr->airq_ipl = (uchar_t)max_ipl;
                        /*
                         * reprogram irq being added and every one else
                         * who is not in the UNINIT state
                         */
                        if ((VIRTIRQ(irqindex, irqptr->airq_share_id) ==
                            irqno) || (irqptr->airq_temp_cpu != IRQ_UNINIT)) {
                                apic_record_rdt_entry(irqptr, irqindex);

                                iflag = intr_clear();
                                lock_set(&apic_ioapic_lock);

                                (void) apic_setup_io_intr(irqptr, irqindex,
                                    B_FALSE);

                                lock_clear(&apic_ioapic_lock);
                                intr_restore(iflag);
                        }
                        irqptr = irqptr->airq_next;
                }
                return (PSM_SUCCESS);

        } else if (irqptr->airq_ipl != max_ipl &&
            ioapic_mask_workaround[irqptr->airq_ioapicindex]) {
                /*
                 * We cannot upgrade the vector, but we can change
                 * the IPL that this vector induces.
                 *
                 * Note that we subtract APIC_BASE_VECT from the vector
                 * here because this array is used in apic_intr_enter
                 * (no need to add APIC_BASE_VECT in that hot code
                 * path since we can do it in the rarely-executed path
                 * here).
                 */
                apic_ipls[irqptr->airq_vector - APIC_BASE_VECT] =
                    (uchar_t)max_ipl;

                irqptr = irqheadptr;
                while (irqptr) {
                        irqptr->airq_ipl = (uchar_t)max_ipl;
                        irqptr = irqptr->airq_next;
                }

                return (PSM_SUCCESS);
        }

        ASSERT(irqptr);

        iflag = intr_clear();
        lock_set(&apic_ioapic_lock);

        (void) apic_setup_io_intr(irqptr, irqindex, B_FALSE);

        lock_clear(&apic_ioapic_lock);
        intr_restore(iflag);

        return (PSM_SUCCESS);
}

/*
 * Recompute mask bits for the given interrupt vector.
 * If there is no interrupt servicing routine for this
 * vector, this function should disable the interrupt vector
 * from happening at all IPLs. If there are still
 * handlers using the given vector, this function should
 * disable the given vector from happening below the lowest
 * IPL of the remaining handlers.
 */
/*ARGSUSED*/
int
apic_delspl_common(int irqno, int ipl, int min_ipl, int max_ipl)
{
        uchar_t vector;
        uint32_t bind_cpu;
        int intin, irqindex;
        int ioapic_ix;
        apic_irq_t      *irqptr, *preirqptr, *irqheadptr, *irqp;
        ulong_t iflag;

        mutex_enter(&airq_mutex);
        irqindex = IRQINDEX(irqno);
        irqptr = preirqptr = irqheadptr = apic_irq_table[irqindex];

        DDI_INTR_IMPLDBG((CE_CONT, "apic_delspl: dip=0x%p type=%d irqno=0x%x "
            "vector=0x%x\n", (void *)irqptr->airq_dip,
            irqptr->airq_mps_intr_index, irqno, irqptr->airq_vector));

        while (irqptr) {
                if (VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno)
                        break;
                preirqptr = irqptr;
                irqptr = irqptr->airq_next;
        }
        ASSERT(irqptr);

        irqptr->airq_share--;

        mutex_exit(&airq_mutex);

        /*
         * If there are more interrupts at a higher IPL, we don't need
         * to disable anything.
         */
        if (ipl < max_ipl)
                return (PSM_SUCCESS);

        /* return if it is not hardware interrupt */
        if (irqptr->airq_mps_intr_index == RESERVE_INDEX)
                return (PSM_SUCCESS);

        if (!apic_picinit_called) {
                /*
                 * Clear the irq struct.  If two devices shared an interrupt
                 * line and one unloaded before picinit, we are hosed, but we
                 * hope the machine survives.
                 */
                irqptr->airq_mps_intr_index = FREE_INDEX;
                irqptr->airq_temp_cpu = IRQ_UNINIT;
                apic_free_vector(irqptr->airq_vector);
                return (PSM_SUCCESS);
        }
        /*
         * Downgrade vector to new max_ipl if needed. If we cannot allocate,
         * use old IPL. Not very elegant, but it should work.
         */
        if ((irqptr->airq_ipl != max_ipl) && (max_ipl != PSM_INVALID_IPL) &&
            !ioapic_mask_workaround[irqptr->airq_ioapicindex]) {
                if ((vector = apic_allocate_vector(max_ipl, irqno, 1)) != 0) {
                        apic_mark_vector(irqheadptr->airq_vector, vector);
                        irqp = irqheadptr;
                        while (irqp) {
                                irqp->airq_vector = vector;
                                irqp->airq_ipl = (uchar_t)max_ipl;
                                if (irqp->airq_temp_cpu != IRQ_UNINIT) {
                                        apic_record_rdt_entry(irqp, irqindex);

                                        iflag = intr_clear();
                                        lock_set(&apic_ioapic_lock);

                                        (void) apic_setup_io_intr(irqp,
                                            irqindex, B_FALSE);

                                        lock_clear(&apic_ioapic_lock);
                                        intr_restore(iflag);
                                }
                                irqp = irqp->airq_next;
                        }
                }

        } else if (irqptr->airq_ipl != max_ipl &&
            max_ipl != PSM_INVALID_IPL &&
            ioapic_mask_workaround[irqptr->airq_ioapicindex]) {

        /*
         * We cannot downgrade the IPL of the vector below the vector's
         * hardware priority. If we did, it would be possible for a
         * higher-priority hardware vector to interrupt a CPU running at an IPL
         * lower than the hardware priority of the interrupting vector (but
         * higher than the soft IPL of this IRQ). When this happens, we would
         * then try to drop the IPL BELOW what it was (effectively dropping
         * below base_spl) which would be potentially catastrophic.
         *
         * (e.g. Suppose the hardware vector associated with this IRQ is 0x40
         * (hardware IPL of 4).  Further assume that the old IPL of this IRQ
         * was 4, but the new IPL is 1.  If we forced vector 0x40 to result in
         * an IPL of 1, it would be possible for the processor to be executing
         * at IPL 3 and for an interrupt to come in on vector 0x40, interrupting
         * the currently-executing ISR.  When apic_intr_enter consults
         * apic_ipls[], it will return 1, bringing the IPL of the CPU down to 1
         * so even though the processor was running at IPL 3, an IPL 1
         * interrupt will have interrupted it, which must not happen.)
         *
         * Effectively, this means that the hardware priority corresponding to
         * the IRQ's IPL (in apic_ipls[]) cannot be lower than the vector's
         * hardware priority.
         *
         * (In the above example, then, after removal of the IPL 4 device's
         * interrupt handler, the new IPL will continue to be 4 because the
         * hardware priority that IPL 1 implies is lower than the hardware
         * priority of the vector used.)
         */
                /* apic_ipls is indexed by vector, starting at APIC_BASE_VECT */
                const int apic_ipls_index = irqptr->airq_vector -
                    APIC_BASE_VECT;
                const int vect_inherent_hwpri = irqptr->airq_vector >>
                    APIC_IPL_SHIFT;

                /*
                 * If there are still devices using this IRQ, determine the
                 * new ipl to use.
                 */
                if (irqptr->airq_share) {
                        int vect_desired_hwpri, hwpri;

                        ASSERT(max_ipl < MAXIPL);
                        vect_desired_hwpri = apic_ipltopri[max_ipl] >>
                            APIC_IPL_SHIFT;

                        /*
                         * If the desired IPL's hardware priority is lower
                         * than that of the vector, use the hardware priority
                         * of the vector to determine the new IPL.
                         */
                        hwpri = (vect_desired_hwpri < vect_inherent_hwpri) ?
                            vect_inherent_hwpri : vect_desired_hwpri;

                        /*
                         * Now, to get the right index for apic_vectortoipl,
                         * we need to subtract APIC_BASE_VECT from the
                         * hardware-vector-equivalent (in hwpri).  Since hwpri
                         * is already shifted, we shift APIC_BASE_VECT before
                         * doing the subtraction.
                         */
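                        /*
                         * For example, assuming APIC_BASE_VECT is 0x20 and
                         * APIC_VECTOR_PER_IPL is 16 (APIC_IPL_SHIFT == 4):
                         * a vector of 0x42 has vect_inherent_hwpri 4; if
                         * that wins, hwpri becomes 4 - (0x20 >> 4) == 2,
                         * the apic_vectortoipl index.
                         */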
                        hwpri -= (APIC_BASE_VECT >> APIC_IPL_SHIFT);

                        ASSERT(hwpri >= 0);
                        ASSERT(hwpri < MAXIPL);
                        max_ipl = apic_vectortoipl[hwpri];
                        apic_ipls[apic_ipls_index] = max_ipl;

                        irqp = irqheadptr;
                        while (irqp) {
                                irqp->airq_ipl = (uchar_t)max_ipl;
                                irqp = irqp->airq_next;
                        }
                } else {
                        /*
                         * No more devices on this IRQ, so reset this vector's
                         * element in apic_ipls to the original IPL for this
                         * vector
                         */
                        apic_ipls[apic_ipls_index] =
                            apic_vectortoipl[vect_inherent_hwpri];
                }
        }

        /*
         * If there are still active interrupts, we are done.
         */
        if (irqptr->airq_share)
                return (PSM_SUCCESS);

        iflag = intr_clear();
        lock_set(&apic_ioapic_lock);

        if (irqptr->airq_mps_intr_index == MSI_INDEX) {
                /*
                 * Disable the MSI vector
                 * Make sure we only disable on the last
                 * of the multi-MSI support
                 */
                if (i_ddi_intr_get_current_nenables(irqptr->airq_dip) == 1) {
                        apic_pci_msi_disable_mode(irqptr->airq_dip,
                            DDI_INTR_TYPE_MSI);
                }
        } else if (irqptr->airq_mps_intr_index == MSIX_INDEX) {
                /*
                 * Disable the MSI-X vector.
                 * Each MSI-X needs its mask and addr/data cleared.
                 */
                apic_pci_msi_unconfigure(irqptr->airq_dip, DDI_INTR_TYPE_MSIX,
                    irqptr->airq_origirq);
                /*
                 * Make sure we only disable on the last MSI-X
                 */
                if (i_ddi_intr_get_current_nenables(irqptr->airq_dip) == 1) {
                        apic_pci_msi_disable_mode(irqptr->airq_dip,
                            DDI_INTR_TYPE_MSIX);
                }
        } else {
                /*
                 * The assumption here is that this is safe, even for
                 * systems with IOAPICs that suffer from the hardware
                 * erratum because all devices have been quiesced before
                 * they unregister their interrupt handlers.  If that
                 * assumption turns out to be false, this mask operation
                 * can induce the same erratum result we're trying to
                 * avoid.
                 */
                ioapic_ix = irqptr->airq_ioapicindex;
                intin = irqptr->airq_intin_no;
                ioapic_write(ioapic_ix, APIC_RDT_CMD + 2 * intin, AV_MASK);
        }

        apic_vt_ops->apic_intrmap_free_entry(&irqptr->airq_intrmap_private);

        /*
         * This irq entry is the only one in the chain.
         */
        if (irqheadptr->airq_next == NULL) {
                ASSERT(irqheadptr == irqptr);
                bind_cpu = irqptr->airq_temp_cpu;
                if (((uint32_t)bind_cpu != IRQ_UNBOUND) &&
                    ((uint32_t)bind_cpu != IRQ_UNINIT)) {
                        ASSERT(apic_cpu_in_range(bind_cpu));
                        if (bind_cpu & IRQ_USER_BOUND) {
                                /* If hardbound, temp_cpu == cpu */
                                bind_cpu &= ~IRQ_USER_BOUND;
                                apic_cpus[bind_cpu].aci_bound--;
                        } else
                                apic_cpus[bind_cpu].aci_temp_bound--;
                }
                irqptr->airq_temp_cpu = IRQ_UNINIT;
                irqptr->airq_mps_intr_index = FREE_INDEX;
                lock_clear(&apic_ioapic_lock);
                intr_restore(iflag);
                apic_free_vector(irqptr->airq_vector);
                return (PSM_SUCCESS);
        }

        /*
         * If we get here, we are sharing the vector and there is more than
         * one active irq entry in the chain.
         */
        lock_clear(&apic_ioapic_lock);
        intr_restore(iflag);

        mutex_enter(&airq_mutex);
        /* Remove the irq entry from the chain */
        if (irqptr == irqheadptr) { /* The irq entry is at the head */
                apic_irq_table[irqindex] = irqptr->airq_next;
        } else {
                preirqptr->airq_next = irqptr->airq_next;
        }
        /* Free the irq entry */
        kmem_free(irqptr, sizeof (apic_irq_t));
        mutex_exit(&airq_mutex);

        return (PSM_SUCCESS);
}

/*
 * apic_introp_xlate() replaces apic_translate_irq() and is
 * called only from apic_intr_ops().  With the new ADII framework,
 * the priority can no longer be retrieved through i_ddi_get_intrspec().
 * It has to be passed in from the caller.
 *
 * Return value:
 *      Success: irqno for the given device
 *      Failure: -1
 */
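/*
 * Translation is attempted in this order: an existing apic_irq_table
 * entry for the same dip/irq, ACPI data (PCI interrupt routing and MADT
 * interrupt source overrides) when enabled, MPS I/O APIC entries matched
 * by bus id, and finally the MPS default configuration.
 */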
int
apic_introp_xlate(dev_info_t *dip, struct intrspec *ispec, int type)
{
        char dev_type[16];
        int dev_len, pci_irq, newirq, bustype, devid, busid, i;
        int irqno = ispec->intrspec_vec;
        ddi_acc_handle_t cfg_handle;
        uchar_t ipin;
        struct apic_io_intr *intrp;
        iflag_t intr_flag;
        ACPI_SUBTABLE_HEADER    *hp;
        ACPI_MADT_INTERRUPT_OVERRIDE *isop;
        apic_irq_t *airqp;
        int parent_is_pci_or_pciex = 0;
        int child_is_pciex = 0;

        DDI_INTR_IMPLDBG((CE_CONT, "apic_introp_xlate: dip=0x%p name=%s "
            "type=%d irqno=0x%x\n", (void *)dip, ddi_get_name(dip), type,
            irqno));

        dev_len = sizeof (dev_type);
        if (ddi_getlongprop_buf(DDI_DEV_T_ANY, ddi_get_parent(dip),
            DDI_PROP_DONTPASS, "device_type", (caddr_t)dev_type,
            &dev_len) == DDI_PROP_SUCCESS) {
                if ((strcmp(dev_type, "pci") == 0) ||
                    (strcmp(dev_type, "pciex") == 0))
                        parent_is_pci_or_pciex = 1;
        }

        if (ddi_getlongprop_buf(DDI_DEV_T_ANY, dip,
            DDI_PROP_DONTPASS, "compatible", (caddr_t)dev_type,
            &dev_len) == DDI_PROP_SUCCESS) {
                if (strstr(dev_type, "pciex"))
                        child_is_pciex = 1;
        }

        if (DDI_INTR_IS_MSI_OR_MSIX(type)) {
                if ((airqp = apic_find_irq(dip, ispec, type)) != NULL) {
                        airqp->airq_iflag.bustype =
                            child_is_pciex ? BUS_PCIE : BUS_PCI;
                        return (apic_vector_to_irq[airqp->airq_vector]);
                }
                return (apic_setup_irq_table(dip, irqno, NULL, ispec,
                    NULL, type));
        }

        bustype = 0;

        /* check if we have already translated this irq */
        mutex_enter(&airq_mutex);
        newirq = apic_min_device_irq;
        for (; newirq <= apic_max_device_irq; newirq++) {
                airqp = apic_irq_table[newirq];
                while (airqp) {
                        if ((airqp->airq_dip == dip) &&
                            (airqp->airq_origirq == irqno) &&
                            (airqp->airq_mps_intr_index != FREE_INDEX)) {

                                mutex_exit(&airq_mutex);
                                return (VIRTIRQ(newirq, airqp->airq_share_id));
                        }
                        airqp = airqp->airq_next;
                }
        }
        mutex_exit(&airq_mutex);

        if (apic_defconf)
                goto defconf;

        if ((dip == NULL) || (!apic_irq_translate && !apic_enable_acpi))
                goto nonpci;

        if (parent_is_pci_or_pciex) {
                /* pci device */
                if (acpica_get_bdf(dip, &busid, &devid, NULL) != 0)
                        goto nonpci;
                if (busid == 0 && apic_pci_bus_total == 1)
                        busid = (int)apic_single_pci_busid;

                if (pci_config_setup(dip, &cfg_handle) != DDI_SUCCESS)
                        return (-1);
                ipin = pci_config_get8(cfg_handle, PCI_CONF_IPIN) - PCI_INTA;
                pci_config_teardown(&cfg_handle);
                if (apic_enable_acpi && !apic_use_acpi_madt_only) {
                        if (apic_acpi_translate_pci_irq(dip, busid, devid,
                            ipin, &pci_irq, &intr_flag) != ACPI_PSM_SUCCESS)
                                return (-1);

                        intr_flag.bustype = child_is_pciex ? BUS_PCIE : BUS_PCI;
                        return (apic_setup_irq_table(dip, pci_irq, NULL, ispec,
                            &intr_flag, type));
                } else {
                        pci_irq = ((devid & 0x1f) << 2) | (ipin & 0x3);
                        if ((intrp = apic_find_io_intr_w_busid(pci_irq, busid))
                            == NULL) {
                                if ((pci_irq = apic_handle_pci_pci_bridge(dip,
                                    devid, ipin, &intrp)) == -1)
                                        return (-1);
                        }
                        return (apic_setup_irq_table(dip, pci_irq, intrp, ispec,
                            NULL, type));
                }
        } else if (strcmp(dev_type, "isa") == 0)
                bustype = BUS_ISA;
        else if (strcmp(dev_type, "eisa") == 0)
                bustype = BUS_EISA;

nonpci:
        if (apic_enable_acpi && !apic_use_acpi_madt_only) {
                /* search iso entries first */
                if (acpi_iso_cnt != 0) {
                        hp = (ACPI_SUBTABLE_HEADER *)acpi_isop;
                        i = 0;
                        while (i < acpi_iso_cnt) {
                                if (hp->Type ==
                                    ACPI_MADT_TYPE_INTERRUPT_OVERRIDE) {
                                        isop =
                                            (ACPI_MADT_INTERRUPT_OVERRIDE *) hp;
                                        if (isop->Bus == 0 &&
                                            isop->SourceIrq == irqno) {
                                                newirq = isop->GlobalIrq;
                                                intr_flag.intr_po =
                                                    isop->IntiFlags &
                                                    ACPI_MADT_POLARITY_MASK;
                                                intr_flag.intr_el =
                                                    (isop->IntiFlags &
                                                    ACPI_MADT_TRIGGER_MASK)
                                                    >> 2;
                                                intr_flag.bustype = BUS_ISA;

                                                return (apic_setup_irq_table(
                                                    dip, newirq, NULL, ispec,
                                                    &intr_flag, type));

                                        }
                                        i++;
                                }
                                hp = (ACPI_SUBTABLE_HEADER *)(((char *)hp) +
                                    hp->Length);
                        }
                }
                intr_flag.intr_po = INTR_PO_ACTIVE_HIGH;
                intr_flag.intr_el = INTR_EL_EDGE;
                intr_flag.bustype = BUS_ISA;
                return (apic_setup_irq_table(dip, irqno, NULL, ispec,
                    &intr_flag, type));
        } else {
                if (bustype == 0)       /* not initialized */
                        bustype = eisa_level_intr_mask ? BUS_EISA : BUS_ISA;
                for (i = 0; i < 2; i++) {
                        if (((busid = apic_find_bus_id(bustype)) != -1) &&
                            ((intrp = apic_find_io_intr_w_busid(irqno, busid))
                            != NULL)) {
                                if ((newirq = apic_setup_irq_table(dip, irqno,
                                    intrp, ispec, NULL, type)) != -1) {
                                        return (newirq);
                                }
                                goto defconf;
                        }
                        bustype = (bustype == BUS_EISA) ? BUS_ISA : BUS_EISA;
                }
        }

/* MPS default configuration */
defconf:
        newirq = apic_setup_irq_table(dip, irqno, NULL, ispec, NULL, type);
        if (newirq == -1)
                return (-1);
        ASSERT(IRQINDEX(newirq) == irqno);
        ASSERT(apic_irq_table[irqno]);
        return (newirq);
}

/*
 * Attempt to share vector with someone else
 */
static int
apic_share_vector(int irqno, iflag_t *intr_flagp, short intr_index, int ipl,
        uchar_t ioapicindex, uchar_t ipin, apic_irq_t **irqptrp)
{
#ifdef DEBUG
        apic_irq_t *tmpirqp = NULL;
#endif /* DEBUG */
        apic_irq_t *irqptr, dummyirq;
        int     newirq, chosen_irq = -1, share = 127;
        int     lowest, highest, i;
        uchar_t share_id;

        DDI_INTR_IMPLDBG((CE_CONT, "apic_share_vector: irqno=0x%x "
            "intr_index=0x%x ipl=0x%x\n", irqno, intr_index, ipl));

        highest = apic_ipltopri[ipl] + APIC_VECTOR_MASK;
        lowest = apic_ipltopri[ipl-1] + APIC_VECTOR_PER_IPL;

        if (highest < lowest) /* Both ipl and ipl-1 map to same pri */
                lowest -= APIC_VECTOR_PER_IPL;
        dummyirq.airq_mps_intr_index = intr_index;
        dummyirq.airq_ioapicindex = ioapicindex;
        dummyirq.airq_intin_no = ipin;
        if (intr_flagp)
                dummyirq.airq_iflag = *intr_flagp;
        apic_record_rdt_entry(&dummyirq, irqno);
        for (i = lowest; i <= highest; i++) {
                newirq = apic_vector_to_irq[i];
                if (newirq == APIC_RESV_IRQ)
                        continue;
                irqptr = apic_irq_table[newirq];

                if ((dummyirq.airq_rdt_entry & 0xFF00) !=
                    (irqptr->airq_rdt_entry & 0xFF00))
                        /* not compatible */
                        continue;

                if (irqptr->airq_share < share) {
                        share = irqptr->airq_share;
                        chosen_irq = newirq;
                }
        }
        if (chosen_irq != -1) {
                /*
                 * Assign a share id which is free or which is larger
                 * than the largest one.
                 */
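                /*
                 * e.g. if the chain already holds share ids 0 and 1, the
                 * new entry gets share id 2; a FREE_INDEX entry in the
                 * chain lets its old share id be recycled instead.
                 */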
                share_id = 1;
                mutex_enter(&airq_mutex);
                irqptr = apic_irq_table[chosen_irq];
                while (irqptr) {
                        if (irqptr->airq_mps_intr_index == FREE_INDEX) {
                                share_id = irqptr->airq_share_id;
                                break;
                        }
                        if (share_id <= irqptr->airq_share_id)
                                share_id = irqptr->airq_share_id + 1;
#ifdef DEBUG
                        tmpirqp = irqptr;
#endif /* DEBUG */
                        irqptr = irqptr->airq_next;
                }
                if (!irqptr) {
                        irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP);
                        irqptr->airq_temp_cpu = IRQ_UNINIT;
                        irqptr->airq_next =
                            apic_irq_table[chosen_irq]->airq_next;
                        apic_irq_table[chosen_irq]->airq_next = irqptr;
#ifdef  DEBUG
                        tmpirqp = apic_irq_table[chosen_irq];
#endif /* DEBUG */
                }
                irqptr->airq_mps_intr_index = intr_index;
                irqptr->airq_ioapicindex = ioapicindex;
                irqptr->airq_intin_no = ipin;
                if (intr_flagp)
                        irqptr->airq_iflag = *intr_flagp;
                irqptr->airq_vector = apic_irq_table[chosen_irq]->airq_vector;
                irqptr->airq_share_id = share_id;
                apic_record_rdt_entry(irqptr, irqno);
                *irqptrp = irqptr;
#ifdef  DEBUG
                /* shuffle the pointers to test apic_delspl path */
                if (tmpirqp) {
                        tmpirqp->airq_next = irqptr->airq_next;
                        irqptr->airq_next = apic_irq_table[chosen_irq];
                        apic_irq_table[chosen_irq] = irqptr;
                }
#endif /* DEBUG */
                mutex_exit(&airq_mutex);
                return (VIRTIRQ(chosen_irq, share_id));
        }
        return (-1);
}

/*
 * Allocate/Initialize the apic_irq_table[] entry for given irqno. If the entry
 * is used already, we will try to allocate a new irqno.
 *
 * Return value:
 *      Success: irqno
 *      Failure: -1
 */
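/*
 * Note that the irqno returned may differ from the one passed in: MSI/X
 * always allocates a fresh slot at or above apic_first_avail_irq, and a
 * fixed interrupt whose slot is already occupied is moved to a free slot
 * (with its vector remapped via apic_modify_vector()).
 */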
static int
apic_setup_irq_table(dev_info_t *dip, int irqno, struct apic_io_intr *intrp,
    struct intrspec *ispec, iflag_t *intr_flagp, int type)
{
        int origirq = ispec->intrspec_vec;
        uchar_t ipl = ispec->intrspec_pri;
        int     newirq, intr_index;
        uchar_t ipin, ioapic, ioapicindex, vector;
        apic_irq_t *irqptr;
        major_t major;
        dev_info_t      *sdip;

        DDI_INTR_IMPLDBG((CE_CONT, "apic_setup_irq_table: dip=0x%p type=%d "
            "irqno=0x%x origirq=0x%x\n", (void *)dip, type, irqno, origirq));

        ASSERT(ispec != NULL);

        major = (dip != NULL) ? ddi_driver_major(dip) : 0;

        if (DDI_INTR_IS_MSI_OR_MSIX(type)) {
                /* MSI/X interrupts need no I/O APIC setup */
                ioapicindex = 0xff;
                ioapic = 0xff;
                ipin = (uchar_t)0xff;
                intr_index = (type == DDI_INTR_TYPE_MSI) ? MSI_INDEX :
                    MSIX_INDEX;
                mutex_enter(&airq_mutex);
                if ((irqno = apic_allocate_irq(apic_first_avail_irq)) == -1) {
                        mutex_exit(&airq_mutex);
                        /* need an irq for MSI/X to index into autovect[] */
                        cmn_err(CE_WARN, "No interrupt irq: %s instance %d",
                            ddi_get_name(dip), ddi_get_instance(dip));
                        return (-1);
                }
                mutex_exit(&airq_mutex);

        } else if (intrp != NULL) {
                intr_index = (int)(intrp - apic_io_intrp);
                ioapic = intrp->intr_destid;
                ipin = intrp->intr_destintin;
                /* Find ioapicindex. If destid was ALL, we will exit with 0. */
                for (ioapicindex = apic_io_max - 1; ioapicindex; ioapicindex--)
                        if (apic_io_id[ioapicindex] == ioapic)
                                break;
                ASSERT((ioapic == apic_io_id[ioapicindex]) ||
                    (ioapic == INTR_ALL_APIC));

                /* check whether this intin# has been used by another irqno */
                if ((newirq = apic_find_intin(ioapicindex, ipin)) != -1) {
                        return (newirq);
                }

        } else if (intr_flagp != NULL) {
                /* ACPI case */
                intr_index = ACPI_INDEX;
                ioapicindex = acpi_find_ioapic(irqno);
                ASSERT(ioapicindex != 0xFF);
                ioapic = apic_io_id[ioapicindex];
                ipin = irqno - apic_io_vectbase[ioapicindex];
                if (apic_irq_table[irqno] &&
                    apic_irq_table[irqno]->airq_mps_intr_index == ACPI_INDEX) {
                        ASSERT(apic_irq_table[irqno]->airq_intin_no == ipin &&
                            apic_irq_table[irqno]->airq_ioapicindex ==
                            ioapicindex);
                        return (irqno);
                }

        } else {
                /* default configuration */
                ioapicindex = 0;
                ioapic = apic_io_id[ioapicindex];
                ipin = (uchar_t)irqno;
                intr_index = DEFAULT_INDEX;
        }

        if (ispec == NULL) {
                APIC_VERBOSE_IOAPIC((CE_WARN, "No intrspec for irqno = %x\n",
                    irqno));
        } else if ((vector = apic_allocate_vector(ipl, irqno, 0)) == 0) {
                if ((newirq = apic_share_vector(irqno, intr_flagp, intr_index,
                    ipl, ioapicindex, ipin, &irqptr)) != -1) {
                        irqptr->airq_ipl = ipl;
                        irqptr->airq_origirq = (uchar_t)origirq;
                        irqptr->airq_dip = dip;
                        irqptr->airq_major = major;
                        sdip = apic_irq_table[IRQINDEX(newirq)]->airq_dip;
                        /* sdip is NULL when sharing with the SCI; that's OK */
                        if (sdip == NULL) {
                                cmn_err(CE_WARN, "Sharing vectors: %s"
                                    " instance %d and SCI",
                                    ddi_get_name(dip), ddi_get_instance(dip));
                        } else {
                                cmn_err(CE_WARN, "Sharing vectors: %s"
                                    " instance %d and %s instance %d",
                                    ddi_get_name(sdip), ddi_get_instance(sdip),
                                    ddi_get_name(dip), ddi_get_instance(dip));
                        }
                        return (newirq);
                }
                /* try high priority allocation now that share has failed */
                if ((vector = apic_allocate_vector(ipl, irqno, 1)) == 0) {
                        cmn_err(CE_WARN, "No interrupt vector: %s instance %d",
                            ddi_get_name(dip), ddi_get_instance(dip));
                        return (-1);
                }
        }

        mutex_enter(&airq_mutex);
        if (apic_irq_table[irqno] == NULL) {
                irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP);
                irqptr->airq_temp_cpu = IRQ_UNINIT;
                apic_irq_table[irqno] = irqptr;
        } else {
                irqptr = apic_irq_table[irqno];
                if (irqptr->airq_mps_intr_index != FREE_INDEX) {
                        /*
                         * The slot is used by another irqno, so allocate
                         * a free irqno for this interrupt
                         */
                        newirq = apic_allocate_irq(apic_first_avail_irq);
                        if (newirq == -1) {
                                mutex_exit(&airq_mutex);
                                return (-1);
                        }
                        irqno = newirq;
                        irqptr = apic_irq_table[irqno];
                        if (irqptr == NULL) {
                                irqptr = kmem_zalloc(sizeof (apic_irq_t),
                                    KM_SLEEP);
                                irqptr->airq_temp_cpu = IRQ_UNINIT;
                                apic_irq_table[irqno] = irqptr;
                        }
                        vector = apic_modify_vector(vector, newirq);
                }
        }
        apic_max_device_irq = max(irqno, apic_max_device_irq);
        apic_min_device_irq = min(irqno, apic_min_device_irq);
        mutex_exit(&airq_mutex);
        irqptr->airq_ioapicindex = ioapicindex;
        irqptr->airq_intin_no = ipin;
        irqptr->airq_ipl = ipl;
        irqptr->airq_vector = vector;
        irqptr->airq_origirq = (uchar_t)origirq;
        irqptr->airq_share_id = 0;
        irqptr->airq_mps_intr_index = (short)intr_index;
        irqptr->airq_dip = dip;
        irqptr->airq_major = major;
        irqptr->airq_cpu = apic_bind_intr(dip, irqno, ioapic, ipin);
        if (intr_flagp)
                irqptr->airq_iflag = *intr_flagp;

        if (!DDI_INTR_IS_MSI_OR_MSIX(type)) {
                /* setup I/O APIC entry for non-MSI/X interrupts */
                apic_record_rdt_entry(irqptr, irqno);
        }
        return (irqno);
}

/*
 * return the cpu to which this intr should be bound.
 * Check properties or any other mechanism to see if user wants it
 * bound to a specific CPU. If so, return the cpu id with high bit set.
 * If not, use the policy to choose a cpu and return the id.
 */
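/*
 * e.g. a user-requested binding to cpu 2 is returned as
 * (2 | IRQ_USER_BOUND); callers strip IRQ_USER_BOUND to get the cpu id.
 */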
uint32_t
apic_bind_intr(dev_info_t *dip, int irq, uchar_t ioapicid, uchar_t intin)
{
        int     instance, instno, prop_len, bind_cpu, count;
        uint_t  i, rc;
        uint32_t cpu;
        major_t major;
        char    *name, *drv_name, *prop_val, *cptr;
        char    prop_name[32];
        ulong_t iflag;

        if (apic_intr_policy == INTR_LOWEST_PRIORITY)
                return (IRQ_UNBOUND);

        if (apic_nproc == 1)
                return (0);

        drv_name = NULL;
        rc = DDI_PROP_NOT_FOUND;
        major = (major_t)-1;
        if (dip != NULL) {
                name = ddi_get_name(dip);
                major = ddi_name_to_major(name);
                drv_name = ddi_major_to_name(major);
                instance = ddi_get_instance(dip);
                if (apic_intr_policy == INTR_ROUND_ROBIN_WITH_AFFINITY) {
                        i = apic_min_device_irq;
                        for (; i <= apic_max_device_irq; i++) {

                                if ((i == irq) || (apic_irq_table[i] == NULL) ||
                                    (apic_irq_table[i]->airq_mps_intr_index
                                    == FREE_INDEX))
                                        continue;

                                if ((apic_irq_table[i]->airq_major == major) &&
                                    (!(apic_irq_table[i]->airq_cpu &
                                    IRQ_USER_BOUND))) {

                                        cpu = apic_irq_table[i]->airq_cpu;

                                        cmn_err(CE_CONT,
                                            "!%s: %s (%s) instance #%d "
                                            "irq 0x%x vector 0x%x ioapic 0x%x "
                                            "intin 0x%x is bound to cpu %d\n",
                                            psm_name,
                                            name, drv_name, instance, irq,
                                            apic_irq_table[irq]->airq_vector,
                                            ioapicid, intin, cpu);
                                        return (cpu);
                                }
                        }
                }
                /*
                 * Search for the "drvname"_intpt_bind_cpus property first; the
                 * syntax of the property should be "a[,b,c,...]" where
                 * instance 0 binds to cpu a, instance 1 binds to cpu b,
                 * instance 2 binds to cpu c...
                 * ddi_getlongprop() will search /option first, then /.
                 * If "drvname"_intpt_bind_cpus doesn't exist, then look for
                 * the intpt_bind_cpus property.  The syntax is the same, and
                 * it applies to all devices if the "drvname"-specific
                 * property doesn't exist.
                 */
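                /*
                 * e.g. a value of "1,3" binds instance 0 to cpu 1,
                 * instance 1 to cpu 3, instance 2 back to cpu 1, and so
                 * on; the pattern is reused modulo the entry count (see
                 * the instno computation below).
                 */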
1277                 (void) strcpy(prop_name, drv_name);
1278                 (void) strcat(prop_name, "_intpt_bind_cpus");
1279                 rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0, prop_name,
1280                     (caddr_t)&prop_val, &prop_len);
1281                 if (rc != DDI_PROP_SUCCESS) {
1282                         rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0,
1283                             "intpt_bind_cpus", (caddr_t)&prop_val, &prop_len);
1284                 }
1285         }
1286         if (rc == DDI_PROP_SUCCESS) {
1287                 for (i = count = 0; i < (prop_len - 1); i++)
1288                         if (prop_val[i] == ',')
1289                                 count++;
1290                 if (prop_val[i-1] != ',')
1291                         count++;
1292                 /*
1293                  * If the binding instances defined in the property are
1294                  * not enough for this instance number, wrap around and
1295                  * reuse the pattern so that every requested instance
1296                  * still gets a binding.
1297                  */
1298                 instno = instance % count;
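                     /*
                      * E.g., with a property of "0,2" (count == 2), instance 5
                      * maps to entry 5 % 2 == 1 and is therefore bound to
                      * CPU 2.
                      */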
1299                 i = 0;
1300                 cptr = prop_val;
1301                 while (i < instno)
1302                         if (*cptr++ == ',')
1303                                 i++;
1304                 bind_cpu = stoi(&cptr);
1305                 /* if specific CPU is bogus, then default to next cpu */
1306                 if (!apic_cpu_in_range(bind_cpu)) {
1307                         cmn_err(CE_WARN, "%s: %s=%s: CPU %d not present",
1308                             psm_name, prop_name, prop_val, bind_cpu);
1309                         rc = DDI_PROP_NOT_FOUND;
1310                 } else {
1311                         /* indicate that we are bound at user request */
1312                         bind_cpu |= IRQ_USER_BOUND;
1313                 }
1314                 /*
                      * prop_val is referenced by the cmn_err() above, so
                      * free it only now.
                      */
                     kmem_free(prop_val, prop_len);
1315                 /*
1316                  * No need to check apic_cpus[].aci_status; if the specified
1317                  * CPU is not up yet, post_cpu_start will handle it.
1318                  */
1319         }
1320         if (rc != DDI_PROP_SUCCESS) {
1321                 iflag = intr_clear();
1322                 lock_set(&apic_ioapic_lock);
1323                 bind_cpu = apic_get_next_bind_cpu();
1324                 lock_clear(&apic_ioapic_lock);
1325                 intr_restore(iflag);
1326         }
1327 
1328         if (drv_name != NULL)
1329                 cmn_err(CE_CONT, "!%s: %s (%s) instance %d irq 0x%x "
1330                     "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n",
1331                     psm_name, name, drv_name, instance, irq,
1332                     apic_irq_table[irq]->airq_vector, ioapicid, intin,
1333                     bind_cpu & ~IRQ_USER_BOUND);
1334         else
1335                 cmn_err(CE_CONT, "!%s: irq 0x%x "
1336                     "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n",
1337                     psm_name, irq, apic_irq_table[irq]->airq_vector, ioapicid,
1338                     intin, bind_cpu & ~IRQ_USER_BOUND);
1339 
1340         return ((uint32_t)bind_cpu);
1341 }
1342 
1343 /*
1344  * Mark vector as being in the process of being deleted. Interrupts
1345  * may still come in on some CPU. The moment an interrupt comes in with
1346  * the new vector, we know we can free the old one. Called only from
1347  * addspl and delspl with interrupts disabled. Because an interrupt
1348  * can be shared, and no interrupt from either device may come in for
1349  * a while, we also use a timeout mechanism, which we arbitrarily set
1350  * to apic_revector_timeout microseconds.
1351  */
1352 static void
1353 apic_mark_vector(uchar_t oldvector, uchar_t newvector)
1354 {
1355         ulong_t iflag;
1356 
1357         iflag = intr_clear();
1358         lock_set(&apic_revector_lock);
1359         if (!apic_oldvec_to_newvec) {
1360                 apic_oldvec_to_newvec =
1361                     kmem_zalloc(sizeof (newvector) * APIC_MAX_VECTOR * 2,
1362                     KM_NOSLEEP);
1363 
1364                 if (!apic_oldvec_to_newvec) {
1365                         /*
1366                          * This failure is not catastrophic.
1367                          * But, the oldvec will never be freed.
1368                          */
1369                         apic_error |= APIC_ERR_MARK_VECTOR_FAIL;
1370                         lock_clear(&apic_revector_lock);
1371                         intr_restore(iflag);
1372                         return;
1373                 }
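                     /*
                      * The single allocation holds two maps of
                      * APIC_MAX_VECTOR entries each: the first half maps
                      * old vector -> new vector; the second half maps
                      * new vector -> old vector.
                      */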
1374                 apic_newvec_to_oldvec = &apic_oldvec_to_newvec[APIC_MAX_VECTOR];
1375         }
1376 
1377         /* See if we already did this for drivers which do double addintrs */
1378         if (apic_oldvec_to_newvec[oldvector] != newvector) {
1379                 apic_oldvec_to_newvec[oldvector] = newvector;
1380                 apic_newvec_to_oldvec[newvector] = oldvector;
1381                 apic_revector_pending++;
1382         }
1383         lock_clear(&apic_revector_lock);
1384         intr_restore(iflag);
1385         (void) timeout(apic_xlate_vector_free_timeout_handler,
1386             (void *)(uintptr_t)oldvector, drv_usectohz(apic_revector_timeout));
1387 }
1388 
1389 /*
1390  * apic_xlate_vector is called from intr_enter if apic_revector_pending is
1391  * set.  It translates the vector if needed and marks the old vector free.
1392  */
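     /*
      * Example (derived from the recursion below): if V1 was revectored
      * to V2 and later V2 to V3, a late interrupt arriving on V1 is
      * translated to V2 and then, via the recursive call at the bottom,
      * to V3; V1 and V2 are freed along the way.
      */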
1393 uchar_t
1394 apic_xlate_vector(uchar_t vector)
1395 {
1396         uchar_t newvector, oldvector = 0;
1397 
1398         lock_set(&apic_revector_lock);
1399         /* Do we really need to do this? */
1400         if (!apic_revector_pending) {
1401                 lock_clear(&apic_revector_lock);
1402                 return (vector);
1403         }
1404         if ((newvector = apic_oldvec_to_newvec[vector]) != 0)
1405                 oldvector = vector;
1406         else {
1407                 /*
1408                  * The incoming vector is new.  See if a stale entry
1409                  * remains for it.
1410                  */
1411                 if ((oldvector = apic_newvec_to_oldvec[vector]) != 0)
1412                         newvector = vector;
1413         }
1414 
1415         if (oldvector) {
1416                 apic_revector_pending--;
1417                 apic_oldvec_to_newvec[oldvector] = 0;
1418                 apic_newvec_to_oldvec[newvector] = 0;
1419                 apic_free_vector(oldvector);
1420                 lock_clear(&apic_revector_lock);
1421                 /* There could have been more than one reprogramming! */
1422                 return (apic_xlate_vector(newvector));
1423         }
1424         lock_clear(&apic_revector_lock);
1425         return (vector);
1426 }
1427 
1428 void
1429 apic_xlate_vector_free_timeout_handler(void *arg)
1430 {
1431         ulong_t iflag;
1432         uchar_t oldvector, newvector;
1433 
1434         oldvector = (uchar_t)(uintptr_t)arg;
1435         iflag = intr_clear();
1436         lock_set(&apic_revector_lock);
1437         if ((newvector = apic_oldvec_to_newvec[oldvector]) != 0) {
1438                 apic_free_vector(oldvector);
1439                 apic_oldvec_to_newvec[oldvector] = 0;
1440                 apic_newvec_to_oldvec[newvector] = 0;
1441                 apic_revector_pending--;
1442         }
1443 
1444         lock_clear(&apic_revector_lock);
1445         intr_restore(iflag);
1446 }
1447 
1448 /*
1449  * Bind interrupt corresponding to irq_ptr to bind_cpu.
1450  * Must be called with interrupts disabled and apic_ioapic_lock held
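      * Returns 0 on success (including when the actual reprogramming is
      * deferred), and non-zero if the target CPU is not accepting
      * interrupts.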
1451  */
1452 int
1453 apic_rebind(apic_irq_t *irq_ptr, int bind_cpu,
1454     struct ioapic_reprogram_data *drep)
1455 {
1456         int                     ioapicindex, intin_no;
1457         uint32_t                airq_temp_cpu;
1458         apic_cpus_info_t        *cpu_infop;
1459         uint32_t                rdt_entry;
1460         int                     which_irq;
1461         ioapic_rdt_t            irdt;
1462 
1463         which_irq = apic_vector_to_irq[irq_ptr->airq_vector];
1464 
1465         intin_no = irq_ptr->airq_intin_no;
1466         ioapicindex = irq_ptr->airq_ioapicindex;
1467         airq_temp_cpu = irq_ptr->airq_temp_cpu;
1468         if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu != IRQ_UNBOUND) {
1469                 if (airq_temp_cpu & IRQ_USER_BOUND)
1470                         /* Mask off high bit so it can be used as array index */
1471                         airq_temp_cpu &= ~IRQ_USER_BOUND;
1472 
1473                 ASSERT(apic_cpu_in_range(airq_temp_cpu));
1474         }
1475 
1476         /*
1477          * Can't bind to a CPU that's not accepting interrupts:
1478          */
1479         cpu_infop = &apic_cpus[bind_cpu & ~IRQ_USER_BOUND];
1480         if (!(cpu_infop->aci_status & APIC_CPU_INTR_ENABLE))
1481                 return (1);
1482 
1483         /*
1484          * If we are about to change the interrupt vector for this interrupt,
1485          * and this interrupt is level-triggered, attached to an IOAPIC,
1486          * has been delivered to a CPU and that CPU has not handled it
1487          * yet, we cannot reprogram the IOAPIC now.
1488          */
1489         if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) {
1490 
1491                 rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex,
1492                     intin_no);
1493 
1494                 if ((irq_ptr->airq_vector != RDT_VECTOR(rdt_entry)) &&
1495                     apic_check_stuck_interrupt(irq_ptr, airq_temp_cpu,
1496                     bind_cpu, ioapicindex, intin_no, which_irq, drep) != 0) {
1497 
1498                         return (0);
1499                 }
1500 
1501                 /*
1502                  * NOTE: We do not unmask the RDT here, as an interrupt MAY
1503                  * still come in before we have a chance to reprogram it below.
1504                  * The reprogramming below will simultaneously change and
1505                  * unmask the RDT entry.
1506                  */
1507 
1508                 if ((uint32_t)bind_cpu == IRQ_UNBOUND) {
1509                         irdt.ir_lo =  AV_LDEST | AV_LOPRI |
1510                             irq_ptr->airq_rdt_entry;
1511 
1512                         irdt.ir_hi = AV_TOALL >> APIC_ID_BIT_OFFSET;
1513 
1514                         apic_vt_ops->apic_intrmap_alloc_entry(
1515                             &irq_ptr->airq_intrmap_private, NULL,
1516                             DDI_INTR_TYPE_FIXED, 1, ioapicindex);
1517                         apic_vt_ops->apic_intrmap_map_entry(
1518                             irq_ptr->airq_intrmap_private, (void *)&irdt,
1519                             DDI_INTR_TYPE_FIXED, 1);
1520                         apic_vt_ops->apic_intrmap_record_rdt(
1521                             irq_ptr->airq_intrmap_private, &irdt);
1522 
1523                         /* Write the RDT entry -- no specific CPU binding */
1524                         WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no,
1525                             irdt.ir_hi | AV_TOALL);
1526 
1527                         if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu !=
1528                             IRQ_UNBOUND)
1529                                 apic_cpus[airq_temp_cpu].aci_temp_bound--;
1530 
1531                         /*
1532                          * Write the vector, trigger, and polarity portion of
1533                          * the RDT
1534                          */
1535                         WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin_no,
1536                             irdt.ir_lo);
1537 
1538                         irq_ptr->airq_temp_cpu = IRQ_UNBOUND;
1539                         return (0);
1540                 }
1541         }
1542 
1543         if (bind_cpu & IRQ_USER_BOUND) {
1544                 cpu_infop->aci_bound++;
1545         } else {
1546                 cpu_infop->aci_temp_bound++;
1547         }
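             /*
              * aci_bound counts interrupts that were explicitly bound here
              * by the user; aci_temp_bound counts bindings the kernel is
              * free to move during interrupt redistribution.
              */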
1548         ASSERT(apic_cpu_in_range(bind_cpu));
1549 
1550         if ((airq_temp_cpu != IRQ_UNBOUND) && (airq_temp_cpu != IRQ_UNINIT)) {
1551                 apic_cpus[airq_temp_cpu].aci_temp_bound--;
1552         }
1553         if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) {
1554 
1555                 irdt.ir_lo = AV_PDEST | AV_FIXED | irq_ptr->airq_rdt_entry;
1556                 irdt.ir_hi = cpu_infop->aci_local_id;
1557 
1558                 apic_vt_ops->apic_intrmap_alloc_entry(
1559                     &irq_ptr->airq_intrmap_private, NULL, DDI_INTR_TYPE_FIXED,
1560                     1, ioapicindex);
1561                 apic_vt_ops->apic_intrmap_map_entry(
1562                     irq_ptr->airq_intrmap_private,
1563                     (void *)&irdt, DDI_INTR_TYPE_FIXED, 1);
1564                 apic_vt_ops->apic_intrmap_record_rdt(
1565                     irq_ptr->airq_intrmap_private, &irdt);
1566 
1567                 /* Write the RDT entry -- bind to a specific CPU: */
1568                 WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no,
1569                     irdt.ir_hi);
1570 
1571                 /* Write the vector, trigger, and polarity portion of the RDT */
1572                 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin_no,
1573                     irdt.ir_lo);
1574 
1575         } else {
1576                 int type = (irq_ptr->airq_mps_intr_index == MSI_INDEX) ?
1577                     DDI_INTR_TYPE_MSI : DDI_INTR_TYPE_MSIX;
1578                 if (type == DDI_INTR_TYPE_MSI) {
1579                         if (irq_ptr->airq_ioapicindex ==
1580                             irq_ptr->airq_origirq) {
1581                                 /* first one */
1582                                 DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call "
1583                                     "apic_pci_msi_enable_vector\n"));
1584                                 apic_pci_msi_enable_vector(irq_ptr,
1585                                     type, which_irq, irq_ptr->airq_vector,
1586                                     irq_ptr->airq_intin_no,
1587                                     cpu_infop->aci_local_id);
1588                         }
1589                         if ((irq_ptr->airq_ioapicindex +
1590                             irq_ptr->airq_intin_no - 1) ==
1591                             irq_ptr->airq_origirq) { /* last one */
1592                                 DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call "
1593                                     "apic_pci_msi_enable_mode\n"));
1594                                 apic_pci_msi_enable_mode(irq_ptr->airq_dip,
1595                                     type, which_irq);
1596                         }
1597                 } else { /* MSI-X */
1598                         apic_pci_msi_enable_vector(irq_ptr, type,
1599                             irq_ptr->airq_origirq, irq_ptr->airq_vector, 1,
1600                             cpu_infop->aci_local_id);
1601                         apic_pci_msi_enable_mode(irq_ptr->airq_dip, type,
1602                             irq_ptr->airq_origirq);
1603                 }
1604         }
1605         irq_ptr->airq_temp_cpu = (uint32_t)bind_cpu;
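             /* Let the redistributor consider this CPU again. */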
1606         apic_redist_cpu_skip &= ~(1 << (bind_cpu & ~IRQ_USER_BOUND));
1607         return (0);
1608 }
1609 
1610 static void
1611 apic_last_ditch_clear_remote_irr(int ioapic_ix, int intin_no)
1612 {
1613         if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no)
1614             & AV_REMOTE_IRR) != 0) {
1615                 /*
1616                  * Trying to clear the bit through normal
1617                  * channels has failed.  So as a last-ditch
1618                  * effort, try to set the trigger mode to
1619                  * edge, then to level.  This has been
1620                  * observed to work on many systems.
1621                  */
1622                 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1623                     intin_no,
1624                     READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1625                     intin_no) & ~AV_LEVEL);
1626 
1627                 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1628                     intin_no,
1629                     READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1630                     intin_no) | AV_LEVEL);
1631 
1632                 /*
1633                  * If the bit's STILL set, this interrupt may
1634                  * be hosed.
1635                  */
1636                 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1637                     intin_no) & AV_REMOTE_IRR) != 0) {
1638 
1639                         prom_printf("%s: Remote IRR still "
1640                             "not clear for IOAPIC %d intin %d.\n"
1641                             "\tInterrupts to this pin may cease "
1642                             "functioning.\n", psm_name, ioapic_ix,
1643                             intin_no);
1644 #ifdef DEBUG
1645                         apic_last_ditch_reprogram_failures++;
1646 #endif
1647                 }
1648         }
1649 }
1650 
1651 /*
1652  * This function is protected by apic_ioapic_lock coupled with the
1653  * fact that interrupts are disabled.
1654  */
1655 static void
1656 delete_defer_repro_ent(int which_irq)
1657 {
1658         ASSERT(which_irq >= 0);
1659         ASSERT(which_irq <= 255);
1660         ASSERT(LOCK_HELD(&apic_ioapic_lock));
1661 
1662         if (apic_reprogram_info[which_irq].done)
1663                 return;
1664 
1665         apic_reprogram_info[which_irq].done = B_TRUE;
1666 
1667 #ifdef DEBUG
1668         apic_defer_repro_total_retries +=
1669             apic_reprogram_info[which_irq].tries;
1670 
1671         apic_defer_repro_successes++;
1672 #endif
1673 
1674         if (--apic_reprogram_outstanding == 0) {
1675 
1676                 setlvlx = psm_intr_exit_fn();
1677         }
1678 }
1679 
1681 /*
1682  * Interrupts must be disabled during this function to prevent
1683  * self-deadlock.  Interrupts are disabled because this function
1684  * is called from apic_check_stuck_interrupt(), which is called
1685  * from apic_rebind(), which requires its caller to disable interrupts.
1686  */
1687 static void
1688 add_defer_repro_ent(apic_irq_t *irq_ptr, int which_irq, int new_bind_cpu)
1689 {
1690         ASSERT(which_irq >= 0);
1691         ASSERT(which_irq <= 255);
1692         ASSERT(!interrupts_enabled());
1693 
1694         /*
1695          * On the off-chance that there's already a deferred
1696          * reprogramming on this irq, check, and if so, just update the
1697          * CPU and irq pointer to which the interrupt is targeted, then return.
1698          */
1699         if (!apic_reprogram_info[which_irq].done) {
1700                 apic_reprogram_info[which_irq].bindcpu = new_bind_cpu;
1701                 apic_reprogram_info[which_irq].irqp = irq_ptr;
1702                 return;
1703         }
1704 
1705         apic_reprogram_info[which_irq].irqp = irq_ptr;
1706         apic_reprogram_info[which_irq].bindcpu = new_bind_cpu;
1707         apic_reprogram_info[which_irq].tries = 0;
1708         /*
1709          * This must be the last field set, since we're not
1710          * grabbing any locks: apic_try_deferred_reprogram() will
1711          * make its decision about using this entry based solely
1712          * on whether 'done' is false.
1713          */
1714         apic_reprogram_info[which_irq].done = B_FALSE;
1715 
1716         /*
1717          * If there were previously no deferred reprogrammings, change
1718          * setlvlx to call apic_try_deferred_reprogram()
1719          */
1720         if (++apic_reprogram_outstanding == 1) {
1721 
1722                 setlvlx = apic_try_deferred_reprogram;
1723         }
1724 }
1725 
1726 static void
1727 apic_try_deferred_reprogram(int prev_ipl, int irq)
1728 {
1729         int reproirq;
1730         ulong_t iflag;
1731         struct ioapic_reprogram_data *drep;
1732 
1733         (*psm_intr_exit_fn())(prev_ipl, irq);
1734 
1735         if (!lock_try(&apic_defer_reprogram_lock)) {
1736                 return;
1737         }
1738 
1739         /*
1740          * Acquire the apic_ioapic_lock so that any other operations that
1741          * may affect the apic_reprogram_info state are serialized.
1742          * It's still possible for the last deferred reprogramming to clear
1743          * between the time we entered this function and the time we get to
1744          * the for loop below.  In that case, *setlvlx will have been set
1745          * back to *_intr_exit and drep will be NULL. (There's no way to
1746          * stop that from happening -- we would need to grab a lock before
1747          * calling *setlvlx, which is neither realistic nor prudent).
1748          */
1749         iflag = intr_clear();
1750         lock_set(&apic_ioapic_lock);
1751 
1752         /*
1753          * For each deferred RDT entry, try to reprogram it now.  Note that
1754          * there is no lock acquisition to read apic_reprogram_info because
1755          * '.done' is set only after the other fields in the structure are set.
1756          */
1757 
1758         drep = NULL;
1759         for (reproirq = 0; reproirq <= APIC_MAX_VECTOR; reproirq++) {
1760                 if (apic_reprogram_info[reproirq].done == B_FALSE) {
1761                         drep = &apic_reprogram_info[reproirq];
1762                         break;
1763                 }
1764         }
1765 
1766         /*
1767          * Either we found a deferred action to perform, or
1768          * we entered this function spuriously, after *setlvlx
1769          * was restored to point to *_intr_exit.  Any other
1770          * permutation is invalid.
1771          */
1772         ASSERT(drep != NULL || *setlvlx == psm_intr_exit_fn());
1773 
1774         /*
1775          * Though we can't really do anything about errors
1776          * at this point, keep track of them for reporting.
1777          * Note that it is very possible for apic_setup_io_intr
1778          * to re-register this very timeout if the Remote IRR bit
1779          * has not yet cleared.
1780          */
1781 
1782 #ifdef DEBUG
1783         if (drep != NULL) {
1784                 if (apic_setup_io_intr(drep, reproirq, B_TRUE) != 0) {
1785                         apic_deferred_setup_failures++;
1786                 }
1787         } else {
1788                 apic_deferred_spurious_enters++;
1789         }
1790 #else
1791         if (drep != NULL)
1792                 (void) apic_setup_io_intr(drep, reproirq, B_TRUE);
1793 #endif
1794 
1795         lock_clear(&apic_ioapic_lock);
1796         intr_restore(iflag);
1797 
1798         lock_clear(&apic_defer_reprogram_lock);
1799 }
1800 
1801 static void
1802 apic_ioapic_wait_pending_clear(int ioapic_ix, int intin_no)
1803 {
1804         int waited;
1805 
1806         /*
1807          * Wait for the delivery pending bit to clear.
1808          */
1809         if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) &
1810             (AV_LEVEL|AV_PENDING)) == (AV_LEVEL|AV_PENDING)) {
1811 
1812                 /*
1813                  * If we're still waiting on the delivery of this interrupt,
1814                  * continue to wait here until it is delivered (this should be
1815                  * a very small amount of time, but include a timeout just in
1816                  * case).
1817                  */
1818                 for (waited = 0; waited < apic_max_reps_clear_pending;
1819                     waited++) {
1820                         if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1821                             intin_no) & AV_PENDING) == 0) {
1822                                 break;
1823                         }
1824                 }
1825         }
1826 }
1827 
1829 /*
1830  * Checks to see if the IOAPIC interrupt entry specified has its Remote IRR
1831  * bit set.  Calls functions that modify the function that setlvlx points to,
1832  * so that the reprogramming can be retried very shortly.
1833  *
1834  * This function will mask the RDT entry if the interrupt is level-triggered.
1835  * (The caller is responsible for unmasking the RDT entry.)
1836  *
1837  * Returns non-zero if the caller should defer IOAPIC reprogramming.
1838  */
1839 static int
1840 apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu,
1841     int new_bind_cpu, int ioapic_ix, int intin_no, int which_irq,
1842     struct ioapic_reprogram_data *drep)
1843 {
1844         int32_t                 rdt_entry;
1845         int                     waited;
1846         int                     reps = 0;
1847 
1848         /*
1849          * Wait for the delivery pending bit to clear.
1850          */
1851         do {
1852                 ++reps;
1853 
1854                 apic_ioapic_wait_pending_clear(ioapic_ix, intin_no);
1855 
1856                 /*
1857                  * Mask the RDT entry, but only if it's a level-triggered
1858                  * interrupt
1859                  */
1860                 rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1861                     intin_no);
1862                 if ((rdt_entry & (AV_LEVEL|AV_MASK)) == AV_LEVEL) {
1863 
1864                         /* Mask it */
1865                         WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no,
1866                             AV_MASK | rdt_entry);
1867                 }
1868 
1869                 if ((rdt_entry & AV_LEVEL) == AV_LEVEL) {
1870                         /*
1871                          * If there was a race and an interrupt was injected
1872                          * just before we masked, check for that case here.
1873                          * Then, unmask the RDT entry and try again.  If we're
1874                          * on our last try, don't unmask (because we want the
1875                          * RDT entry to remain masked for the rest of the
1876                          * function).
1877                          */
1878                         rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1879                             intin_no);
1880                         if ((rdt_entry & AV_PENDING) &&
1881                             (reps < apic_max_reps_clear_pending)) {
1882                                 /* Unmask it */
1883                                 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1884                                     intin_no, rdt_entry & ~AV_MASK);
1885                         }
1886                 }
1887 
1888         } while ((rdt_entry & AV_PENDING) &&
1889             (reps < apic_max_reps_clear_pending));
1890 
1891 #ifdef DEBUG
1892         if (rdt_entry & AV_PENDING)
1893                 apic_intr_deliver_timeouts++;
1894 #endif
1895 
1896         /*
1897          * If the remote IRR bit is set, then the interrupt has been sent
1898          * to a CPU for processing.  We have no choice but to wait for
1899          * that CPU to process the interrupt, at which point the remote IRR
1900          * bit will be cleared.
1901          */
1902         if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) &
1903             (AV_LEVEL|AV_REMOTE_IRR)) == (AV_LEVEL|AV_REMOTE_IRR)) {
1904 
1905                 /*
1906                  * If the CPU that this RDT is bound to is NOT the current
1907                  * CPU, wait until that CPU handles the interrupt and ACKs
1908                  * it.  If this interrupt is not bound to any CPU (that is,
1909                  * if it's bound to the logical destination of "anyone"), it
1910                  * may have been delivered to the current CPU so handle that
1911                  * case by deferring the reprogramming (below).
1912                  */
1913                 if ((old_bind_cpu != IRQ_UNBOUND) &&
1914                     (old_bind_cpu != IRQ_UNINIT) &&
1915                     (old_bind_cpu != psm_get_cpu_id())) {
1916                         for (waited = 0; waited < apic_max_reps_clear_pending;
1917                             waited++) {
1918                                 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1919                                     intin_no) & AV_REMOTE_IRR) == 0) {
1920 
1921                                         delete_defer_repro_ent(which_irq);
1922 
1923                                         /* Remote IRR has cleared! */
1924                                         return (0);
1925                                 }
1926                         }
1927                 }
1928 
1929                 /*
1930                  * If we waited and the Remote IRR bit is still not cleared,
1931                  * AND if we've retried APIC_REPROGRAM_MAX_TRIES times for
1932                  * this interrupt, try the last-ditch workaround:
1933                  */
1934                 if (drep && drep->tries >= APIC_REPROGRAM_MAX_TRIES) {
1935 
1936                         apic_last_ditch_clear_remote_irr(ioapic_ix, intin_no);
1937 
1938                         /* Mark this one as reprogrammed: */
1939                         delete_defer_repro_ent(which_irq);
1940 
1941                         return (0);
1942                 } else {
1943 #ifdef DEBUG
1944                         apic_intr_deferrals++;
1945 #endif
1946 
1947                         /*
1948                          * If waiting for the Remote IRR bit (above) didn't
1949                          * allow it to clear, defer the reprogramming.
1950                          * Add a new deferred-programming entry if the
1951                          * caller passed a NULL one (and update the existing one
1952                          * in case anything changed).
1953                          */
1954                         add_defer_repro_ent(irq_ptr, which_irq, new_bind_cpu);
1955                         if (drep)
1956                                 drep->tries++;
1957 
1958                         /* Inform caller to defer IOAPIC programming: */
1959                         return (1);
1960                 }
1961 
1962         }
1963 
1964         /* Remote IRR is clear */
1965         delete_defer_repro_ent(which_irq);
1966 
1967         return (0);
1968 }
1969 
1970 /*
1971  * Called to migrate all interrupts at an irq to another cpu.
1972  * Must be called with interrupts disabled and apic_ioapic_lock held
1973  */
1974 int
1975 apic_rebind_all(apic_irq_t *irq_ptr, int bind_cpu)
1976 {
1977         apic_irq_t      *irqptr = irq_ptr;
1978         int             retval = 0;
1979 
1980         while (irqptr) {
1981                 if (irqptr->airq_temp_cpu != IRQ_UNINIT)
1982                         retval |= apic_rebind(irqptr, bind_cpu, NULL);
1983                 irqptr = irqptr->airq_next;
1984         }
1985 
1986         return (retval);
1987 }
1988 
1989 /*
1990  * apic_intr_redistribute does all the messy computations for identifying
1991  * which interrupt to move to which CPU. Currently we do just one interrupt
1992  * at a time. This reduces the time we spend doing all this within the
1993  * clock interrupt. If it were done from idle, we could do more than one.
1994  * First we find the most busy and the most free CPU (time in ISR only),
1995  * skipping those CPUs that have been identified as ineligible (cpu_skip).
1996  * Then we look for an IRQ whose load is closest to the difference between
1997  * the most busy CPU and the average ISR load. We try to find one whose
1998  * load is less than that difference. If none exists, we choose one larger
1999  * than the difference, provided it does not make the most idle CPU worse
2000  * off than the most busy one. In the end, we clear all the CPU busy
2001  * fields; for IRQs, they are cleared as they are scanned.
2002  */
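     /*
      * A sketch with invented numbers: suppose three eligible CPUs show
      * ISR loads of 80, 40 and 20.  The average is 46 (integer division),
      * so the busiest CPU exceeds it by 80 - 46 = 34.  We first look for
      * an IRQ on the busiest CPU whose own busy count is just under 34;
      * failing that, we accept a larger one as long as the most free
      * CPU's new load (20 + the IRQ's load) stays below the busiest
      * CPU's 80.
      */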
2003 void
2004 apic_intr_redistribute(void)
2005 {
2006         int busiest_cpu, most_free_cpu;
2007         int cpu_free, cpu_busy, max_busy, min_busy;
2008         int min_free, diff;
2009         int average_busy, cpus_online;
2010         int i, busy;
2011         ulong_t iflag;
2012         apic_cpus_info_t *cpu_infop;
2013         apic_irq_t *min_busy_irq = NULL;
2014         apic_irq_t *max_busy_irq = NULL;
2015 
2016         busiest_cpu = most_free_cpu = -1;
2017         cpu_free = cpu_busy = max_busy = average_busy = 0;
2018         min_free = apic_sample_factor_redistribution;
2019         cpus_online = 0;
2020         /*
2021          * Below we will check CPU_INTR_ENABLE, bound, temp_bound and
2022          * temp_cpu without holding ioapic_lock. That is OK as we are just
2023          * doing statistical sampling anyway and any inaccuracy now will
2024          * get corrected next time around. The call to rebind, which
2025          * actually changes things, will make sure we are consistent.
2026          */
2027         for (i = 0; i < apic_nproc; i++) {
2028                 if (apic_cpu_in_range(i) &&
2029                     !(apic_redist_cpu_skip & (1 << i)) &&
2030                     (apic_cpus[i].aci_status & APIC_CPU_INTR_ENABLE)) {
2031 
2032                         cpu_infop = &apic_cpus[i];
2033                         /*
2034                          * If no unbound interrupts or only 1 total on this
2035                          * CPU, skip
2036                          */
2037                         if (!cpu_infop->aci_temp_bound ||
2038                             (cpu_infop->aci_bound + cpu_infop->aci_temp_bound)
2039                             == 1) {
2040                                 apic_redist_cpu_skip |= 1 << i;
2041                                 continue;
2042                         }
2043 
2044                         busy = cpu_infop->aci_busy;
2045                         average_busy += busy;
2046                         cpus_online++;
2047                         if (max_busy < busy) {
2048                                 max_busy = busy;
2049                                 busiest_cpu = i;
2050                         }
2051                         if (min_free > busy) {
2052                                 min_free = busy;
2053                                 most_free_cpu = i;
2054                         }
2055                         if (busy > apic_int_busy_mark) {
2056                                 cpu_busy |= 1 << i;
2057                         } else {
2058                                 if (busy < apic_int_free_mark)
2059                                         cpu_free |= 1 << i;
2060                         }
2061                 }
2062         }
2063         if ((cpu_busy && cpu_free) ||
2064             (max_busy >= (min_free + apic_diff_for_redistribution))) {
2065 
2066                 apic_num_imbalance++;
2067 #ifdef  DEBUG
2068                 if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
2069                         prom_printf(
2070                             "redistribute busy=%x free=%x max=%x min=%x",
2071                             cpu_busy, cpu_free, max_busy, min_free);
2072                 }
2073 #endif /* DEBUG */
2074 
2076                 average_busy /= cpus_online;
2077 
2078                 diff = max_busy - average_busy;
2079                 min_busy = max_busy; /* start with the max possible value */
2080                 max_busy = 0;
2081                 min_busy_irq = max_busy_irq = NULL;
2082                 i = apic_min_device_irq;
2083                 for (; i <= apic_max_device_irq; i++) {
2084                         apic_irq_t *irq_ptr;
2085                         /* Change to a linked list per CPU? */
2086                         if ((irq_ptr = apic_irq_table[i]) == NULL)
2087                                 continue;
2088                         /* Check for irq_busy & decide which one to move */
2089                         /* Also zero them for next round */
2090                         if ((irq_ptr->airq_temp_cpu == busiest_cpu) &&
2091                             irq_ptr->airq_busy) {
2092                                 if (irq_ptr->airq_busy < diff) {
2093                                         /*
2094                                          * Check for least busy CPU,
2095                                          * best fit or what?
2096                                          */
2097                                         if (max_busy < irq_ptr->airq_busy) {
2098                                                 /*
2099                                                  * Most busy within the
2100                                                  * required differential
2101                                                  */
2102                                                 max_busy = irq_ptr->airq_busy;
2103                                                 max_busy_irq = irq_ptr;
2104                                         }
2105                                 } else {
2106                                         if (min_busy > irq_ptr->airq_busy) {
2107                                                 /*
2108                                                  * least busy, but more than
2109                                                  * the reqd diff
2110                                                  */
2111                                                 if (min_busy <
2112                                                     (diff + average_busy -
2113                                                     min_free)) {
2114                                                         /*
2115                                                          * Making sure new cpu
2116                                                          * will not end up
2117                                                          * worse
2118                                                          */
2119                                                         min_busy =
2120                                                             irq_ptr->airq_busy;
2121 
2122                                                         min_busy_irq = irq_ptr;
2123                                                 }
2124                                         }
2125                                 }
2126                         }
2127                         irq_ptr->airq_busy = 0;
2128                 }
2129 
2130                 if (max_busy_irq != NULL) {
2131 #ifdef  DEBUG
2132                         if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
2133                                 prom_printf("rebinding %x to %x",
2134                                     max_busy_irq->airq_vector, most_free_cpu);
2135                         }
2136 #endif /* DEBUG */
2137                         iflag = intr_clear();
2138                         if (lock_try(&apic_ioapic_lock)) {
2139                                 if (apic_rebind_all(max_busy_irq,
2140                                     most_free_cpu) == 0) {
2141                                         /* Make change permanent */
2142                                         max_busy_irq->airq_cpu =
2143                                             (uint32_t)most_free_cpu;
2144                                 }
2145                                 lock_clear(&apic_ioapic_lock);
2146                         }
2147                         intr_restore(iflag);
2148 
2149                 } else if (min_busy_irq != NULL) {
2150 #ifdef  DEBUG
2151                         if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
2152                                 prom_printf("rebinding %x to %x",
2153                                     min_busy_irq->airq_vector, most_free_cpu);
2154                         }
2155 #endif /* DEBUG */
2156 
2157                         iflag = intr_clear();
2158                         if (lock_try(&apic_ioapic_lock)) {
2159                                 if (apic_rebind_all(min_busy_irq,
2160                                     most_free_cpu) == 0) {
2161                                         /* Make change permanent */
2162                                         min_busy_irq->airq_cpu =
2163                                             (uint32_t)most_free_cpu;
2164                                 }
2165                                 lock_clear(&apic_ioapic_lock);
2166                         }
2167                         intr_restore(iflag);
2168 
2169                 } else {
2170                         if (cpu_busy != (1 << busiest_cpu)) {
2171                                 apic_redist_cpu_skip |= 1 << busiest_cpu;
2172                                 /*
2173                                  * We leave cpu_skip set so that next time we
2174                                  * can choose another cpu
2175                                  */
2176                         }
2177                 }
2178                 apic_num_rebind++;
2179         } else {
2180                 /*
2181                  * Found nothing. It could be that we skipped over valid
2182                  * CPUs or that we have balanced everything. If we had a
2183                  * variable ticks_for_redistribution, it could be increased
2184                  * here. apic_int_busy, int_free etc. would also need to
2185                  * change.
2186                  */
2187                 if (apic_redist_cpu_skip)
2188                         apic_redist_cpu_skip = 0;
2189         }
2190         for (i = 0; i < apic_nproc; i++) {
2191                 if (apic_cpu_in_range(i)) {
2192                         apic_cpus[i].aci_busy = 0;
2193                 }
2194         }
2195 }
2196 
2197 void
2198 apic_cleanup_busy(void)
2199 {
2200         int i;
2201         apic_irq_t *irq_ptr;
2202 
2203         for (i = 0; i < apic_nproc; i++) {
2204                 if (apic_cpu_in_range(i)) {
2205                         apic_cpus[i].aci_busy = 0;
2206                 }
2207         }
2208 
2209         for (i = apic_min_device_irq; i <= apic_max_device_irq; i++) {
2210                 if ((irq_ptr = apic_irq_table[i]) != NULL)
2211                         irq_ptr->airq_busy = 0;
2212         }
2213 }