1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 /*
  25  * Copyright (c) 2010, Intel Corporation.
  26  * All rights reserved.
  27  * Copyright 2016 PALO, Richard.
  28  */
  29 
  30 /*
  31  * PSMI 1.1 extensions are supported only in 2.6 and later versions.
  32  * PSMI 1.2 extensions are supported only in 2.7 and later versions.
  33  * PSMI 1.3 and 1.4 extensions are supported in Solaris 10.
  34  * PSMI 1.5 extensions are supported in Solaris Nevada.
  35  * PSMI 1.6 extensions are supported in Solaris Nevada.
  36  * PSMI 1.7 extensions are supported in Solaris Nevada.
  37  */
  38 #define PSMI_1_7
  39 
  40 #include <sys/processor.h>
  41 #include <sys/time.h>
  42 #include <sys/psm.h>
  43 #include <sys/smp_impldefs.h>
  44 #include <sys/cram.h>
  45 #include <sys/acpi/acpi.h>
  46 #include <sys/acpica.h>
  47 #include <sys/psm_common.h>
  48 #include <sys/apic.h>
  49 #include <sys/apic_common.h>
  50 #include <sys/pit.h>
  51 #include <sys/ddi.h>
  52 #include <sys/sunddi.h>
  53 #include <sys/ddi_impldefs.h>
  54 #include <sys/pci.h>
  55 #include <sys/promif.h>
  56 #include <sys/x86_archext.h>
  57 #include <sys/cpc_impl.h>
  58 #include <sys/uadmin.h>
  59 #include <sys/panic.h>
  60 #include <sys/debug.h>
  61 #include <sys/archsystm.h>
  62 #include <sys/trap.h>
  63 #include <sys/machsystm.h>
  64 #include <sys/cpuvar.h>
  65 #include <sys/rm_platter.h>
  66 #include <sys/privregs.h>
  67 #include <sys/cyclic.h>
  68 #include <sys/note.h>
  69 #include <sys/pci_intr_lib.h>
  70 #include <sys/sunndi.h>
  71 
  72 
  73 /*
  74  *      Local Function Prototypes
  75  */
  76 static void apic_mark_vector(uchar_t oldvector, uchar_t newvector);
  77 static void apic_xlate_vector_free_timeout_handler(void *arg);
  78 static int apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu,
  79     int new_bind_cpu, int apicindex, int intin_no, int which_irq,
  80     struct ioapic_reprogram_data *drep);
  81 static int apic_setup_irq_table(dev_info_t *dip, int irqno,
  82     struct apic_io_intr *intrp, struct intrspec *ispec, iflag_t *intr_flagp,
  83     int type);
  84 static void apic_try_deferred_reprogram(int ipl, int vect);
  85 static void delete_defer_repro_ent(int which_irq);
  86 static void apic_ioapic_wait_pending_clear(int ioapicindex,
  87     int intin_no);
  88 
  89 extern int apic_acpi_translate_pci_irq(dev_info_t *dip, int busid, int devid,
  90     int ipin, int *pci_irqp, iflag_t *intr_flagp);
  91 extern int apic_handle_pci_pci_bridge(dev_info_t *idip, int child_devno,
  92     int child_ipin, struct apic_io_intr **intrp);
  93 extern uchar_t acpi_find_ioapic(int irq);
  94 extern struct apic_io_intr *apic_find_io_intr_w_busid(int irqno, int busid);
  95 extern int apic_find_bus_id(int bustype);
  96 extern int apic_find_intin(uchar_t ioapic, uchar_t intin);
  97 extern void apic_record_rdt_entry(apic_irq_t *irqptr, int irq);
  98 
  99 extern  int apic_sci_vect;
 100 extern  iflag_t apic_sci_flags;
 101 extern  int     apic_intr_policy;
 102 extern  char *psm_name;
 103 
 104 /*
 105  * number of bits per byte, from <sys/param.h>
 106  */
 107 #define UCHAR_MAX       ((1 << NBBY) - 1)
 108 
 109 /* Max wait time (in repetitions) for flags to clear in an RDT entry. */
 110 extern int apic_max_reps_clear_pending;
 111 
 112 /* The irq # is implicit in the array index: */
 113 struct ioapic_reprogram_data apic_reprogram_info[APIC_MAX_VECTOR+1];
 114 /*
 115  * APIC_MAX_VECTOR + 1 is the maximum # of IRQs as well. ioapic_reprogram_info
 116  * is indexed by IRQ number, NOT by vector number.
 117  */
 118 
 119 extern  int     apic_int_busy_mark;
 120 extern  int     apic_int_free_mark;
 121 extern  int     apic_diff_for_redistribution;
 122 extern  int     apic_sample_factor_redistribution;
 123 extern  int     apic_redist_cpu_skip;
 124 extern  int     apic_num_imbalance;
 125 extern  int     apic_num_rebind;
 126 
 127 /* timeout for xlate_vector, mark_vector */
 128 int     apic_revector_timeout = 16 * 10000; /* 160 millisec */
 129 
 130 extern int      apic_defconf;
 131 extern int      apic_irq_translate;
 132 
 133 extern int      apic_use_acpi_madt_only;        /* 1=ONLY use MADT from ACPI */
 134 
 135 extern  uchar_t apic_io_vectbase[MAX_IO_APIC];
 136 
 137 extern  boolean_t ioapic_mask_workaround[MAX_IO_APIC];
 138 
 139 /*
 140  * First available slot to be used as IRQ index into the apic_irq_table
 141  * for those interrupts (like MSI/X) that don't have a physical IRQ.
 142  */
 143 extern int apic_first_avail_irq;
 144 
 145 /*
 146  * apic_defer_reprogram_lock ensures that only one processor is handling
 147  * deferred interrupt programming at *_intr_exit time.
 148  */
 149 static  lock_t  apic_defer_reprogram_lock;
 150 
 151 /*
 152  * The current number of deferred reprogrammings outstanding
 153  */
 154 uint_t  apic_reprogram_outstanding = 0;
 155 
 156 #ifdef DEBUG
 157 /*
 158  * Counters that keep track of deferred reprogramming stats
 159  */
 160 uint_t  apic_intr_deferrals = 0;
 161 uint_t  apic_intr_deliver_timeouts = 0;
 162 uint_t  apic_last_ditch_reprogram_failures = 0;
 163 uint_t  apic_deferred_setup_failures = 0;
 164 uint_t  apic_defer_repro_total_retries = 0;
 165 uint_t  apic_defer_repro_successes = 0;
 166 uint_t  apic_deferred_spurious_enters = 0;
 167 #endif
 168 
 169 extern  int     apic_io_max;
 170 extern  struct apic_io_intr *apic_io_intrp;
 171 
 172 uchar_t apic_vector_to_irq[APIC_MAX_VECTOR+1];
 173 
 174 extern  uint32_t        eisa_level_intr_mask;
 175         /* At least MSB will be set if EISA bus */
 176 
 177 extern  int     apic_pci_bus_total;
 178 extern  uchar_t apic_single_pci_busid;
 179 
 180 /*
 181  * Following declarations are for revectoring; used when ISRs at different
 182  * IPLs share an irq.
 183  */
 184 static  lock_t  apic_revector_lock;
 185 int     apic_revector_pending = 0;
 186 static  uchar_t *apic_oldvec_to_newvec;
 187 static  uchar_t *apic_newvec_to_oldvec;
 188 
 189 /* ACPI Interrupt Source Override Structure ptr */
 190 ACPI_MADT_INTERRUPT_OVERRIDE *acpi_isop;
 191 extern  int acpi_iso_cnt;
 192 
 193 /*
 194  * Auto-configuration routines
 195  */
 196 
 197 /*
 198  * Initialise vector->ipl and ipl->pri arrays. level_intr and irqtable
 199  * are also set to NULL. vector->irq is set to a value which cannot map
 200  * to a real irq to show that it is free.
 201  */
 202 void
 203 apic_init_common(void)
 204 {
 205         int     i, j, indx;
 206         int     *iptr;
 207 
 208         /*
 209          * Initialize apic_ipls from apic_vectortoipl.  This array is
 210          * used in apic_intr_enter to determine the IPL to use for the
 211          * corresponding vector.  On some systems, due to hardware errata
 212          * and interrupt sharing, the IPL may not correspond to the IPL listed
 213          * in apic_vectortoipl (see apic_addspl and apic_delspl).
 214          */
 215         for (i = 0; i < (APIC_AVAIL_VECTOR / APIC_VECTOR_PER_IPL); i++) {
 216                 indx = i * APIC_VECTOR_PER_IPL;
 217 
 218                 for (j = 0; j < APIC_VECTOR_PER_IPL; j++, indx++)
 219                         apic_ipls[indx] = apic_vectortoipl[i];
 220         }
 221 
 222         /* cpu 0 is always up (for now) */
 223         apic_cpus[0].aci_status = APIC_CPU_ONLINE | APIC_CPU_INTR_ENABLE;
 224 
 225         iptr = (int *)&apic_irq_table[0];
 226         for (i = 0; i <= APIC_MAX_VECTOR; i++) {
 227                 apic_level_intr[i] = 0;
 228                 *iptr++ = 0;
 229                 apic_vector_to_irq[i] = APIC_RESV_IRQ;
 230 
 231                 /* These *must* be initted to B_TRUE! */
 232                 apic_reprogram_info[i].done = B_TRUE;
 233                 apic_reprogram_info[i].irqp = NULL;
 234                 apic_reprogram_info[i].tries = 0;
 235                 apic_reprogram_info[i].bindcpu = 0;
 236         }
 237 
 238         /*
 239          * Allocate a dummy irq table entry for the reserved entry.
 240          * This takes care of the race between removing an irq and
 241          * clock detecting a CPU in that irq during interrupt load
 242          * sampling.
 243          */
 244         apic_irq_table[APIC_RESV_IRQ] =
 245             kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP);
 246 
 247         mutex_init(&airq_mutex, NULL, MUTEX_DEFAULT, NULL);
 248 }
 249 
 250 void
 251 ioapic_init_intr(int mask_apic)
 252 {
 253         int ioapic_ix;
 254         struct intrspec ispec;
 255         apic_irq_t *irqptr;
 256         int i, j;
 257         ulong_t iflag;
 258 
 259         LOCK_INIT_CLEAR(&apic_revector_lock);
 260         LOCK_INIT_CLEAR(&apic_defer_reprogram_lock);
 261 
 262         /* mask interrupt vectors */
 263         for (j = 0; j < apic_io_max && mask_apic; j++) {
 264                 int intin_max;
 265 
 266                 ioapic_ix = j;
 267                 /* Bits 23-16 define the maximum redirection entries */
 268                 intin_max = (ioapic_read(ioapic_ix, APIC_VERS_CMD) >> 16)
 269                     & 0xff;
 270                 for (i = 0; i <= intin_max; i++)
 271                         ioapic_write(ioapic_ix, APIC_RDT_CMD + 2 * i, AV_MASK);
 272         }
 273 
 274         /*
 275          * Hack alert: deal with ACPI SCI interrupt chicken/egg here
 276          */
 277         if (apic_sci_vect > 0) {
 278                 /*
 279                  * acpica has already done add_avintr(); we just
 280                  * to finish the job by mimicing translate_irq()
 281                  *
 282                  * Fake up an intrspec and setup the tables
 283                  */
 284                 ispec.intrspec_vec = apic_sci_vect;
 285                 ispec.intrspec_pri = SCI_IPL;
 286 
 287                 if (apic_setup_irq_table(NULL, apic_sci_vect, NULL,
 288                     &ispec, &apic_sci_flags, DDI_INTR_TYPE_FIXED) < 0) {
 289                         cmn_err(CE_WARN, "!apic: SCI setup failed");
 290                         return;
 291                 }
 292                 irqptr = apic_irq_table[apic_sci_vect];
 293 
 294                 iflag = intr_clear();
 295                 lock_set(&apic_ioapic_lock);
 296 
 297                 /* Program I/O APIC */
 298                 (void) apic_setup_io_intr(irqptr, apic_sci_vect, B_FALSE);
 299 
 300                 lock_clear(&apic_ioapic_lock);
 301                 intr_restore(iflag);
 302 
 303                 irqptr->airq_share++;
 304         }
 305 }
 306 
 307 /*
 308  * Add mask bits to disable interrupt vector from happening
 309  * at or above IPL. In addition, it should remove mask bits
 310  * to enable interrupt vectors below the given IPL.
 311  *
 312  * Both add and delspl are complicated by the fact that different interrupts
 313  * may share IRQs. This can happen in two ways.
 314  * 1. The same H/W line is shared by more than 1 device
 315  * 1a. with interrupts at different IPLs
 316  * 1b. with interrupts at same IPL
 317  * 2. We ran out of vectors at a given IPL and started sharing vectors.
 318  * 1b and 2 should be handled gracefully, except for the fact some ISRs
 319  * will get called often when no interrupt is pending for the device.
 320  * For 1a, we handle it at the higher IPL.
 321  */
 322 /*ARGSUSED*/
 323 int
 324 apic_addspl_common(int irqno, int ipl, int min_ipl, int max_ipl)
 325 {
 326         uchar_t vector;
 327         ulong_t iflag;
 328         apic_irq_t *irqptr, *irqheadptr;
 329         int irqindex;
 330 
 331         ASSERT(max_ipl <= UCHAR_MAX);
 332         irqindex = IRQINDEX(irqno);
 333 
 334         if ((irqindex == -1) || (!apic_irq_table[irqindex]))
 335                 return (PSM_FAILURE);
 336 
 337         mutex_enter(&airq_mutex);
 338         irqptr = irqheadptr = apic_irq_table[irqindex];
 339 
 340         DDI_INTR_IMPLDBG((CE_CONT, "apic_addspl: dip=0x%p type=%d irqno=0x%x "
 341             "vector=0x%x\n", (void *)irqptr->airq_dip,
 342             irqptr->airq_mps_intr_index, irqno, irqptr->airq_vector));
 343 
 344         while (irqptr) {
 345                 if (VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno)
 346                         break;
 347                 irqptr = irqptr->airq_next;
 348         }
 349         irqptr->airq_share++;
 350 
 351         mutex_exit(&airq_mutex);
 352 
 353         /* return if it is not hardware interrupt */
 354         if (irqptr->airq_mps_intr_index == RESERVE_INDEX)
 355                 return (PSM_SUCCESS);
 356 
 357         /* Or if there are more interupts at a higher IPL */
 358         if (ipl != max_ipl)
 359                 return (PSM_SUCCESS);
 360 
 361         /*
 362          * if apic_picinit() has not been called yet, just return.
 363          * At the end of apic_picinit(), we will call setup_io_intr().
 364          */
 365 
 366         if (!apic_picinit_called)
 367                 return (PSM_SUCCESS);
 368 
 369         /*
 370          * Upgrade vector if max_ipl is not earlier ipl. If we cannot allocate,
 371          * return failure.
 372          */
 373         if (irqptr->airq_ipl != max_ipl &&
 374             !ioapic_mask_workaround[irqptr->airq_ioapicindex]) {
 375 
 376                 vector = apic_allocate_vector(max_ipl, irqindex, 1);
 377                 if (vector == 0) {
 378                         irqptr->airq_share--;
 379                         return (PSM_FAILURE);
 380                 }
 381                 irqptr = irqheadptr;
 382                 apic_mark_vector(irqptr->airq_vector, vector);
 383                 while (irqptr) {
 384                         irqptr->airq_vector = vector;
 385                         irqptr->airq_ipl = (uchar_t)max_ipl;
 386                         /*
 387                          * reprogram irq being added and every one else
 388                          * who is not in the UNINIT state
 389                          */
 390                         if ((VIRTIRQ(irqindex, irqptr->airq_share_id) ==
 391                             irqno) || (irqptr->airq_temp_cpu != IRQ_UNINIT)) {
 392                                 apic_record_rdt_entry(irqptr, irqindex);
 393 
 394                                 iflag = intr_clear();
 395                                 lock_set(&apic_ioapic_lock);
 396 
 397                                 (void) apic_setup_io_intr(irqptr, irqindex,
 398                                     B_FALSE);
 399 
 400                                 lock_clear(&apic_ioapic_lock);
 401                                 intr_restore(iflag);
 402                         }
 403                         irqptr = irqptr->airq_next;
 404                 }
 405                 return (PSM_SUCCESS);
 406 
 407         } else if (irqptr->airq_ipl != max_ipl &&
 408             ioapic_mask_workaround[irqptr->airq_ioapicindex]) {
 409                 /*
 410                  * We cannot upgrade the vector, but we can change
 411                  * the IPL that this vector induces.
 412                  *
 413                  * Note that we subtract APIC_BASE_VECT from the vector
 414                  * here because this array is used in apic_intr_enter
 415                  * (no need to add APIC_BASE_VECT in that hot code
 416                  * path since we can do it in the rarely-executed path
 417                  * here).
 418                  */
 419                 apic_ipls[irqptr->airq_vector - APIC_BASE_VECT] =
 420                     (uchar_t)max_ipl;
 421 
 422                 irqptr = irqheadptr;
 423                 while (irqptr) {
 424                         irqptr->airq_ipl = (uchar_t)max_ipl;
 425                         irqptr = irqptr->airq_next;
 426                 }
 427 
 428                 return (PSM_SUCCESS);
 429         }
 430 
 431         ASSERT(irqptr);
 432 
 433         iflag = intr_clear();
 434         lock_set(&apic_ioapic_lock);
 435 
 436         (void) apic_setup_io_intr(irqptr, irqindex, B_FALSE);
 437 
 438         lock_clear(&apic_ioapic_lock);
 439         intr_restore(iflag);
 440 
 441         return (PSM_SUCCESS);
 442 }
 443 
 444 /*
 445  * Recompute mask bits for the given interrupt vector.
 446  * If there is no interrupt servicing routine for this
 447  * vector, this function should disable interrupt vector
 448  * from happening at all IPLs. If there are still
 449  * handlers using the given vector, this function should
 450  * disable the given vector from happening below the lowest
 451  * IPL of the remaining hadlers.
 452  */
 453 /*ARGSUSED*/
 454 int
 455 apic_delspl_common(int irqno, int ipl, int min_ipl, int max_ipl)
 456 {
 457         uchar_t vector;
 458         uint32_t bind_cpu;
 459         int intin, irqindex;
 460         int ioapic_ix;
 461         apic_irq_t      *irqptr, *preirqptr, *irqheadptr, *irqp;
 462         ulong_t iflag;
 463 
 464         mutex_enter(&airq_mutex);
 465         irqindex = IRQINDEX(irqno);
 466         irqptr = preirqptr = irqheadptr = apic_irq_table[irqindex];
 467 
 468         DDI_INTR_IMPLDBG((CE_CONT, "apic_delspl: dip=0x%p type=%d irqno=0x%x "
 469             "vector=0x%x\n", (void *)irqptr->airq_dip,
 470             irqptr->airq_mps_intr_index, irqno, irqptr->airq_vector));
 471 
 472         while (irqptr) {
 473                 if (VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno)
 474                         break;
 475                 preirqptr = irqptr;
 476                 irqptr = irqptr->airq_next;
 477         }
 478         ASSERT(irqptr);
 479 
 480         irqptr->airq_share--;
 481 
 482         mutex_exit(&airq_mutex);
 483 
 484         /*
 485          * If there are more interrupts at a higher IPL, we don't need
 486          * to disable anything.
 487          */
 488         if (ipl < max_ipl)
 489                 return (PSM_SUCCESS);
 490 
 491         /* return if it is not hardware interrupt */
 492         if (irqptr->airq_mps_intr_index == RESERVE_INDEX)
 493                 return (PSM_SUCCESS);
 494 
 495         if (!apic_picinit_called) {
 496                 /*
 497                  * Clear irq_struct. If two devices shared an intpt
 498                  * line & 1 unloaded before picinit, we are hosed. But, then
 499                  * we hope the machine survive.
 500                  */
 501                 irqptr->airq_mps_intr_index = FREE_INDEX;
 502                 irqptr->airq_temp_cpu = IRQ_UNINIT;
 503                 apic_free_vector(irqptr->airq_vector);
 504                 return (PSM_SUCCESS);
 505         }
 506         /*
 507          * Downgrade vector to new max_ipl if needed. If we cannot allocate,
 508          * use old IPL. Not very elegant, but it should work.
 509          */
 510         if ((irqptr->airq_ipl != max_ipl) && (max_ipl != PSM_INVALID_IPL) &&
 511             !ioapic_mask_workaround[irqptr->airq_ioapicindex]) {
 512                 apic_irq_t      *irqp;
 513                 if (vector = apic_allocate_vector(max_ipl, irqno, 1)) {
 514                         apic_mark_vector(irqheadptr->airq_vector, vector);
 515                         irqp = irqheadptr;
 516                         while (irqp) {
 517                                 irqp->airq_vector = vector;
 518                                 irqp->airq_ipl = (uchar_t)max_ipl;
 519                                 if (irqp->airq_temp_cpu != IRQ_UNINIT) {
 520                                         apic_record_rdt_entry(irqp, irqindex);
 521 
 522                                         iflag = intr_clear();
 523                                         lock_set(&apic_ioapic_lock);
 524 
 525                                         (void) apic_setup_io_intr(irqp,
 526                                             irqindex, B_FALSE);
 527 
 528                                         lock_clear(&apic_ioapic_lock);
 529                                         intr_restore(iflag);
 530                                 }
 531                                 irqp = irqp->airq_next;
 532                         }
 533                 }
 534 
 535         } else if (irqptr->airq_ipl != max_ipl &&
 536             max_ipl != PSM_INVALID_IPL &&
 537             ioapic_mask_workaround[irqptr->airq_ioapicindex]) {
 538 
 539         /*
 540          * We cannot downgrade the IPL of the vector below the vector's
 541          * hardware priority. If we did, it would be possible for a
 542          * higher-priority hardware vector to interrupt a CPU running at an IPL
 543          * lower than the hardware priority of the interrupting vector (but
 544          * higher than the soft IPL of this IRQ). When this happens, we would
 545          * then try to drop the IPL BELOW what it was (effectively dropping
 546          * below base_spl) which would be potentially catastrophic.
 547          *
 548          * (e.g. Suppose the hardware vector associated with this IRQ is 0x40
 549          * (hardware IPL of 4).  Further assume that the old IPL of this IRQ
 550          * was 4, but the new IPL is 1.  If we forced vector 0x40 to result in
 551          * an IPL of 1, it would be possible for the processor to be executing
 552          * at IPL 3 and for an interrupt to come in on vector 0x40, interrupting
 553          * the currently-executing ISR.  When apic_intr_enter consults
 554          * apic_irqs[], it will return 1, bringing the IPL of the CPU down to 1
 555          * so even though the processor was running at IPL 4, an IPL 1
 556          * interrupt will have interrupted it, which must not happen)).
 557          *
 558          * Effectively, this means that the hardware priority corresponding to
 559          * the IRQ's IPL (in apic_ipls[]) cannot be lower than the vector's
 560          * hardware priority.
 561          *
 562          * (In the above example, then, after removal of the IPL 4 device's
 563          * interrupt handler, the new IPL will continue to be 4 because the
 564          * hardware priority that IPL 1 implies is lower than the hardware
 565          * priority of the vector used.)
 566          */
 567                 /* apic_ipls is indexed by vector, starting at APIC_BASE_VECT */
 568                 const int apic_ipls_index = irqptr->airq_vector -
 569                     APIC_BASE_VECT;
 570                 const int vect_inherent_hwpri = irqptr->airq_vector >>
 571                     APIC_IPL_SHIFT;
 572 
 573                 /*
 574                  * If there are still devices using this IRQ, determine the
 575                  * new ipl to use.
 576                  */
 577                 if (irqptr->airq_share) {
 578                         int vect_desired_hwpri, hwpri;
 579 
 580                         ASSERT(max_ipl < MAXIPL);
 581                         vect_desired_hwpri = apic_ipltopri[max_ipl] >>
 582                             APIC_IPL_SHIFT;
 583 
 584                         /*
 585                          * If the desired IPL's hardware priority is lower
 586                          * than that of the vector, use the hardware priority
 587                          * of the vector to determine the new IPL.
 588                          */
 589                         hwpri = (vect_desired_hwpri < vect_inherent_hwpri) ?
 590                             vect_inherent_hwpri : vect_desired_hwpri;
 591 
 592                         /*
 593                          * Now, to get the right index for apic_vectortoipl,
 594                          * we need to subtract APIC_BASE_VECT from the
 595                          * hardware-vector-equivalent (in hwpri).  Since hwpri
 596                          * is already shifted, we shift APIC_BASE_VECT before
 597                          * doing the subtraction.
 598                          */
 599                         hwpri -= (APIC_BASE_VECT >> APIC_IPL_SHIFT);
 600 
 601                         ASSERT(hwpri >= 0);
 602                         ASSERT(hwpri < MAXIPL);
 603                         max_ipl = apic_vectortoipl[hwpri];
 604                         apic_ipls[apic_ipls_index] = max_ipl;
 605 
 606                         irqp = irqheadptr;
 607                         while (irqp) {
 608                                 irqp->airq_ipl = (uchar_t)max_ipl;
 609                                 irqp = irqp->airq_next;
 610                         }
 611                 } else {
 612                         /*
 613                          * No more devices on this IRQ, so reset this vector's
 614                          * element in apic_ipls to the original IPL for this
 615                          * vector
 616                          */
 617                         apic_ipls[apic_ipls_index] =
 618                             apic_vectortoipl[vect_inherent_hwpri];
 619                 }
 620         }
 621 
 622         /*
 623          * If there are still active interrupts, we are done.
 624          */
 625         if (irqptr->airq_share)
 626                 return (PSM_SUCCESS);
 627 
 628         iflag = intr_clear();
 629         lock_set(&apic_ioapic_lock);
 630 
 631         if (irqptr->airq_mps_intr_index == MSI_INDEX) {
 632                 /*
 633                  * Disable the MSI vector
 634                  * Make sure we only disable on the last
 635                  * of the multi-MSI support
 636                  */
 637                 if (i_ddi_intr_get_current_nenables(irqptr->airq_dip) == 1) {
 638                         apic_pci_msi_disable_mode(irqptr->airq_dip,
 639                             DDI_INTR_TYPE_MSI);
 640                 }
 641         } else if (irqptr->airq_mps_intr_index == MSIX_INDEX) {
 642                 /*
 643                  * Disable the MSI-X vector
 644                  * needs to clear its mask and addr/data for each MSI-X
 645                  */
 646                 apic_pci_msi_unconfigure(irqptr->airq_dip, DDI_INTR_TYPE_MSIX,
 647                     irqptr->airq_origirq);
 648                 /*
 649                  * Make sure we only disable on the last MSI-X
 650                  */
 651                 if (i_ddi_intr_get_current_nenables(irqptr->airq_dip) == 1) {
 652                         apic_pci_msi_disable_mode(irqptr->airq_dip,
 653                             DDI_INTR_TYPE_MSIX);
 654                 }
 655         } else {
 656                 /*
 657                  * The assumption here is that this is safe, even for
 658                  * systems with IOAPICs that suffer from the hardware
 659                  * erratum because all devices have been quiesced before
 660                  * they unregister their interrupt handlers.  If that
 661                  * assumption turns out to be false, this mask operation
 662                  * can induce the same erratum result we're trying to
 663                  * avoid.
 664                  */
 665                 ioapic_ix = irqptr->airq_ioapicindex;
 666                 intin = irqptr->airq_intin_no;
 667                 ioapic_write(ioapic_ix, APIC_RDT_CMD + 2 * intin, AV_MASK);
 668         }
 669 
 670         /*
 671          * This irq entry is the only one in the chain.
 672          */
 673         if (irqheadptr->airq_next == NULL) {
 674                 ASSERT(irqheadptr == irqptr);
 675                 bind_cpu = irqptr->airq_temp_cpu;
 676                 if (((uint32_t)bind_cpu != IRQ_UNBOUND) &&
 677                     ((uint32_t)bind_cpu != IRQ_UNINIT)) {
 678                         ASSERT(apic_cpu_in_range(bind_cpu));
 679                         if (bind_cpu & IRQ_USER_BOUND) {
 680                                 /* If hardbound, temp_cpu == cpu */
 681                                 bind_cpu &= ~IRQ_USER_BOUND;
 682                                 apic_cpus[bind_cpu].aci_bound--;
 683                         } else
 684                                 apic_cpus[bind_cpu].aci_temp_bound--;
 685                 }
 686                 irqptr->airq_temp_cpu = IRQ_UNINIT;
 687                 irqptr->airq_mps_intr_index = FREE_INDEX;
 688                 lock_clear(&apic_ioapic_lock);
 689                 intr_restore(iflag);
 690                 apic_free_vector(irqptr->airq_vector);
 691                 return (PSM_SUCCESS);
 692         }
 693 
 694         /*
 695          * If we get here, we are sharing the vector and there are more than
 696          * one active irq entries in the chain.
 697          */
 698         lock_clear(&apic_ioapic_lock);
 699         intr_restore(iflag);
 700 
 701         mutex_enter(&airq_mutex);
 702         /* Remove the irq entry from the chain */
 703         if (irqptr == irqheadptr) { /* The irq entry is at the head */
 704                 apic_irq_table[irqindex] = irqptr->airq_next;
 705         } else {
 706                 preirqptr->airq_next = irqptr->airq_next;
 707         }
 708         /* Free the irq entry */
 709         kmem_free(irqptr, sizeof (apic_irq_t));
 710         mutex_exit(&airq_mutex);
 711 
 712         return (PSM_SUCCESS);
 713 }
 714 
 715 /*
 716  * apic_introp_xlate() replaces apic_translate_irq() and is
 717  * called only from apic_intr_ops().  With the new ADII framework,
 718  * the priority can no longer be retrieved through i_ddi_get_intrspec().
 719  * It has to be passed in from the caller.
 720  *
 721  * Return value:
 722  *      Success: irqno for the given device
 723  *      Failure: -1
 724  */
 725 int
 726 apic_introp_xlate(dev_info_t *dip, struct intrspec *ispec, int type)
 727 {
 728         char dev_type[16];
 729         int dev_len, pci_irq, newirq, bustype, devid, busid, i;
 730         int irqno = ispec->intrspec_vec;
 731         ddi_acc_handle_t cfg_handle;
 732         uchar_t ipin;
 733         struct apic_io_intr *intrp;
 734         iflag_t intr_flag;
 735         ACPI_SUBTABLE_HEADER    *hp;
 736         ACPI_MADT_INTERRUPT_OVERRIDE *isop;
 737         apic_irq_t *airqp;
 738         int parent_is_pci_or_pciex = 0;
 739         int child_is_pciex = 0;
 740 
 741         DDI_INTR_IMPLDBG((CE_CONT, "apic_introp_xlate: dip=0x%p name=%s "
 742             "type=%d irqno=0x%x\n", (void *)dip, ddi_get_name(dip), type,
 743             irqno));
 744 
 745         dev_len = sizeof (dev_type);
 746         if (ddi_getlongprop_buf(DDI_DEV_T_ANY, ddi_get_parent(dip),
 747             DDI_PROP_DONTPASS, "device_type", (caddr_t)dev_type,
 748             &dev_len) == DDI_PROP_SUCCESS) {
 749                 if ((strcmp(dev_type, "pci") == 0) ||
 750                     (strcmp(dev_type, "pciex") == 0))
 751                         parent_is_pci_or_pciex = 1;
 752         }
 753 
 754         if (ddi_getlongprop_buf(DDI_DEV_T_ANY, dip,
 755             DDI_PROP_DONTPASS, "compatible", (caddr_t)dev_type,
 756             &dev_len) == DDI_PROP_SUCCESS) {
 757                 if (strstr(dev_type, "pciex"))
 758                         child_is_pciex = 1;
 759         }
 760 
 761         if (DDI_INTR_IS_MSI_OR_MSIX(type)) {
 762                 if ((airqp = apic_find_irq(dip, ispec, type)) != NULL) {
 763                         airqp->airq_iflag.bustype =
 764                             child_is_pciex ? BUS_PCIE : BUS_PCI;
 765                         return (apic_vector_to_irq[airqp->airq_vector]);
 766                 }
 767                 return (apic_setup_irq_table(dip, irqno, NULL, ispec,
 768                     NULL, type));
 769         }
 770 
 771         bustype = 0;
 772 
 773         /* check if we have already translated this irq */
 774         mutex_enter(&airq_mutex);
 775         newirq = apic_min_device_irq;
 776         for (; newirq <= apic_max_device_irq; newirq++) {
 777                 airqp = apic_irq_table[newirq];
 778                 while (airqp) {
 779                         if ((airqp->airq_dip == dip) &&
 780                             (airqp->airq_origirq == irqno) &&
 781                             (airqp->airq_mps_intr_index != FREE_INDEX)) {
 782 
 783                                 mutex_exit(&airq_mutex);
 784                                 return (VIRTIRQ(newirq, airqp->airq_share_id));
 785                         }
 786                         airqp = airqp->airq_next;
 787                 }
 788         }
 789         mutex_exit(&airq_mutex);
 790 
 791         if (apic_defconf)
 792                 goto defconf;
 793 
 794         if ((dip == NULL) || (!apic_irq_translate && !apic_enable_acpi))
 795                 goto nonpci;
 796 
 797         if (parent_is_pci_or_pciex) {
 798                 /* pci device */
 799                 if (acpica_get_bdf(dip, &busid, &devid, NULL) != 0)
 800                         goto nonpci;
 801                 if (busid == 0 && apic_pci_bus_total == 1)
 802                         busid = (int)apic_single_pci_busid;
 803 
 804                 if (pci_config_setup(dip, &cfg_handle) != DDI_SUCCESS)
 805                         return (-1);
 806                 ipin = pci_config_get8(cfg_handle, PCI_CONF_IPIN) - PCI_INTA;
 807                 pci_config_teardown(&cfg_handle);
 808                 if (apic_enable_acpi && !apic_use_acpi_madt_only) {
 809                         if (apic_acpi_translate_pci_irq(dip, busid, devid,
 810                             ipin, &pci_irq, &intr_flag) != ACPI_PSM_SUCCESS)
 811                                 return (-1);
 812 
 813                         intr_flag.bustype = child_is_pciex ? BUS_PCIE : BUS_PCI;
 814                         return (apic_setup_irq_table(dip, pci_irq, NULL, ispec,
 815                             &intr_flag, type));
 816                 } else {
 817                         pci_irq = ((devid & 0x1f) << 2) | (ipin & 0x3);
 818                         if ((intrp = apic_find_io_intr_w_busid(pci_irq, busid))
 819                             == NULL) {
 820                                 if ((pci_irq = apic_handle_pci_pci_bridge(dip,
 821                                     devid, ipin, &intrp)) == -1)
 822                                         return (-1);
 823                         }
 824                         return (apic_setup_irq_table(dip, pci_irq, intrp, ispec,
 825                             NULL, type));
 826                 }
 827         } else if (strcmp(dev_type, "isa") == 0)
 828                 bustype = BUS_ISA;
 829         else if (strcmp(dev_type, "eisa") == 0)
 830                 bustype = BUS_EISA;
 831 
 832 nonpci:
 833         if (apic_enable_acpi && !apic_use_acpi_madt_only) {
 834                 /* search iso entries first */
 835                 if (acpi_iso_cnt != 0) {
 836                         hp = (ACPI_SUBTABLE_HEADER *)acpi_isop;
 837                         i = 0;
 838                         while (i < acpi_iso_cnt) {
 839                                 if (hp->Type ==
 840                                     ACPI_MADT_TYPE_INTERRUPT_OVERRIDE) {
 841                                         isop =
 842                                             (ACPI_MADT_INTERRUPT_OVERRIDE *) hp;
 843                                         if (isop->Bus == 0 &&
 844                                             isop->SourceIrq == irqno) {
 845                                                 newirq = isop->GlobalIrq;
 846                                                 intr_flag.intr_po =
 847                                                     isop->IntiFlags &
 848                                                     ACPI_MADT_POLARITY_MASK;
 849                                                 intr_flag.intr_el =
 850                                                     (isop->IntiFlags &
 851                                                     ACPI_MADT_TRIGGER_MASK)
 852                                                     >> 2;
 853                                                 intr_flag.bustype = BUS_ISA;
 854 
 855                                                 return (apic_setup_irq_table(
 856                                                     dip, newirq, NULL, ispec,
 857                                                     &intr_flag, type));
 858 
 859                                         }
 860                                         i++;
 861                                 }
 862                                 hp = (ACPI_SUBTABLE_HEADER *)(((char *)hp) +
 863                                     hp->Length);
 864                         }
 865                 }
 866                 intr_flag.intr_po = INTR_PO_ACTIVE_HIGH;
 867                 intr_flag.intr_el = INTR_EL_EDGE;
 868                 intr_flag.bustype = BUS_ISA;
 869                 return (apic_setup_irq_table(dip, irqno, NULL, ispec,
 870                     &intr_flag, type));
 871         } else {
 872                 if (bustype == 0)       /* not initialized */
 873                         bustype = eisa_level_intr_mask ? BUS_EISA : BUS_ISA;
 874                 for (i = 0; i < 2; i++) {
 875                         if (((busid = apic_find_bus_id(bustype)) != -1) &&
 876                             ((intrp = apic_find_io_intr_w_busid(irqno, busid))
 877                             != NULL)) {
 878                                 if ((newirq = apic_setup_irq_table(dip, irqno,
 879                                     intrp, ispec, NULL, type)) != -1) {
 880                                         return (newirq);
 881                                 }
 882                                 goto defconf;
 883                         }
 884                         bustype = (bustype == BUS_EISA) ? BUS_ISA : BUS_EISA;
 885                 }
 886         }
 887 
 888 /* MPS default configuration */
 889 defconf:
 890         newirq = apic_setup_irq_table(dip, irqno, NULL, ispec, NULL, type);
 891         if (newirq == -1)
 892                 return (-1);
 893         ASSERT(IRQINDEX(newirq) == irqno);
 894         ASSERT(apic_irq_table[irqno]);
 895         return (newirq);
 896 }
 897 
 898 /*
 899  * Attempt to share vector with someone else
 900  */
 901 static int
 902 apic_share_vector(int irqno, iflag_t *intr_flagp, short intr_index, int ipl,
 903     uchar_t ioapicindex, uchar_t ipin, apic_irq_t **irqptrp)
 904 {
 905 #ifdef DEBUG
 906         apic_irq_t *tmpirqp = NULL;
 907 #endif /* DEBUG */
 908         apic_irq_t *irqptr, dummyirq;
 909         int     newirq, chosen_irq = -1, share = 127;
 910         int     lowest, highest, i;
 911         uchar_t share_id;
 912 
 913         DDI_INTR_IMPLDBG((CE_CONT, "apic_share_vector: irqno=0x%x "
 914             "intr_index=0x%x ipl=0x%x\n", irqno, intr_index, ipl));
 915 
 916         highest = apic_ipltopri[ipl] + APIC_VECTOR_MASK;
 917         lowest = apic_ipltopri[ipl-1] + APIC_VECTOR_PER_IPL;
 918 
 919         if (highest < lowest) /* Both ipl and ipl-1 map to same pri */
 920                 lowest -= APIC_VECTOR_PER_IPL;
 921         dummyirq.airq_mps_intr_index = intr_index;
 922         dummyirq.airq_ioapicindex = ioapicindex;
 923         dummyirq.airq_intin_no = ipin;
 924         if (intr_flagp)
 925                 dummyirq.airq_iflag = *intr_flagp;
 926         apic_record_rdt_entry(&dummyirq, irqno);
 927         for (i = lowest; i <= highest; i++) {
 928                 newirq = apic_vector_to_irq[i];
 929                 if (newirq == APIC_RESV_IRQ)
 930                         continue;
 931                 irqptr = apic_irq_table[newirq];
 932 
 933                 if ((dummyirq.airq_rdt_entry & 0xFF00) !=
 934                     (irqptr->airq_rdt_entry & 0xFF00))
 935                         /* not compatible */
 936                         continue;
 937 
 938                 if (irqptr->airq_share < share) {
 939                         share = irqptr->airq_share;
 940                         chosen_irq = newirq;
 941                 }
 942         }
 943         if (chosen_irq != -1) {
 944                 /*
 945                  * Assign a share id which is free or which is larger
 946                  * than the largest one.
 947                  */
 948                 share_id = 1;
 949                 mutex_enter(&airq_mutex);
 950                 irqptr = apic_irq_table[chosen_irq];
 951                 while (irqptr) {
 952                         if (irqptr->airq_mps_intr_index == FREE_INDEX) {
 953                                 share_id = irqptr->airq_share_id;
 954                                 break;
 955                         }
 956                         if (share_id <= irqptr->airq_share_id)
 957                                 share_id = irqptr->airq_share_id + 1;
 958 #ifdef DEBUG
 959                         tmpirqp = irqptr;
 960 #endif /* DEBUG */
 961                         irqptr = irqptr->airq_next;
 962                 }
 963                 if (!irqptr) {
 964                         irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP);
 965                         irqptr->airq_temp_cpu = IRQ_UNINIT;
 966                         irqptr->airq_next =
 967                             apic_irq_table[chosen_irq]->airq_next;
 968                         apic_irq_table[chosen_irq]->airq_next = irqptr;
 969 #ifdef  DEBUG
 970                         tmpirqp = apic_irq_table[chosen_irq];
 971 #endif /* DEBUG */
 972                 }
 973                 irqptr->airq_mps_intr_index = intr_index;
 974                 irqptr->airq_ioapicindex = ioapicindex;
 975                 irqptr->airq_intin_no = ipin;
 976                 if (intr_flagp)
 977                         irqptr->airq_iflag = *intr_flagp;
 978                 irqptr->airq_vector = apic_irq_table[chosen_irq]->airq_vector;
 979                 irqptr->airq_share_id = share_id;
 980                 apic_record_rdt_entry(irqptr, irqno);
 981                 *irqptrp = irqptr;
 982 #ifdef  DEBUG
 983                 /* shuffle the pointers to test apic_delspl path */
 984                 if (tmpirqp) {
 985                         tmpirqp->airq_next = irqptr->airq_next;
 986                         irqptr->airq_next = apic_irq_table[chosen_irq];
 987                         apic_irq_table[chosen_irq] = irqptr;
 988                 }
 989 #endif /* DEBUG */
 990                 mutex_exit(&airq_mutex);
 991                 return (VIRTIRQ(chosen_irq, share_id));
 992         }
 993         return (-1);
 994 }
 995 
 996 /*
 997  * Allocate/Initialize the apic_irq_table[] entry for given irqno. If the entry
 998  * is used already, we will try to allocate a new irqno.
 999  *
1000  * Return value:
1001  *      Success: irqno
1002  *      Failure: -1
1003  */
1004 static int
1005 apic_setup_irq_table(dev_info_t *dip, int irqno, struct apic_io_intr *intrp,
1006     struct intrspec *ispec, iflag_t *intr_flagp, int type)
1007 {
1008         int origirq = ispec->intrspec_vec;
1009         uchar_t ipl = ispec->intrspec_pri;
1010         int     newirq, intr_index;
1011         uchar_t ipin, ioapic, ioapicindex, vector;
1012         apic_irq_t *irqptr;
1013         major_t major;
1014         dev_info_t      *sdip;
1015 
1016         DDI_INTR_IMPLDBG((CE_CONT, "apic_setup_irq_table: dip=0x%p type=%d "
1017             "irqno=0x%x origirq=0x%x\n", (void *)dip, type, irqno, origirq));
1018 
1019         ASSERT(ispec != NULL);
1020 
1021         major =  (dip != NULL) ? ddi_driver_major(dip) : 0;
1022 
1023         if (DDI_INTR_IS_MSI_OR_MSIX(type)) {
1024                 /* MSI/X doesn't need to setup ioapic stuffs */
1025                 ioapicindex = 0xff;
1026                 ioapic = 0xff;
1027                 ipin = (uchar_t)0xff;
1028                 intr_index = (type == DDI_INTR_TYPE_MSI) ? MSI_INDEX :
1029                     MSIX_INDEX;
1030                 mutex_enter(&airq_mutex);
1031                 if ((irqno = apic_allocate_irq(apic_first_avail_irq)) == -1) {
1032                         mutex_exit(&airq_mutex);
1033                         /* need an irq for MSI/X to index into autovect[] */
1034                         cmn_err(CE_WARN, "No interrupt irq: %s instance %d",
1035                             ddi_get_name(dip), ddi_get_instance(dip));
1036                         return (-1);
1037                 }
1038                 mutex_exit(&airq_mutex);
1039 
1040         } else if (intrp != NULL) {
1041                 intr_index = (int)(intrp - apic_io_intrp);
1042                 ioapic = intrp->intr_destid;
1043                 ipin = intrp->intr_destintin;
1044                 /* Find ioapicindex. If destid was ALL, we will exit with 0. */
1045                 for (ioapicindex = apic_io_max - 1; ioapicindex; ioapicindex--)
1046                         if (apic_io_id[ioapicindex] == ioapic)
1047                                 break;
1048                 ASSERT((ioapic == apic_io_id[ioapicindex]) ||
1049                     (ioapic == INTR_ALL_APIC));
1050 
1051                 /* check whether this intin# has been used by another irqno */
1052                 if ((newirq = apic_find_intin(ioapicindex, ipin)) != -1) {
1053                         return (newirq);
1054                 }
1055 
1056         } else if (intr_flagp != NULL) {
1057                 /* ACPI case */
1058                 intr_index = ACPI_INDEX;
1059                 ioapicindex = acpi_find_ioapic(irqno);
1060                 ASSERT(ioapicindex != 0xFF);
1061                 ioapic = apic_io_id[ioapicindex];
1062                 ipin = irqno - apic_io_vectbase[ioapicindex];
1063                 if (apic_irq_table[irqno] &&
1064                     apic_irq_table[irqno]->airq_mps_intr_index == ACPI_INDEX) {
1065                         ASSERT(apic_irq_table[irqno]->airq_intin_no == ipin &&
1066                             apic_irq_table[irqno]->airq_ioapicindex ==
1067                             ioapicindex);
1068                         return (irqno);
1069                 }
1070 
1071         } else {
1072                 /* default configuration */
1073                 ioapicindex = 0;
1074                 ioapic = apic_io_id[ioapicindex];
1075                 ipin = (uchar_t)irqno;
1076                 intr_index = DEFAULT_INDEX;
1077         }
1078 
1079         if (ispec == NULL) {
1080                 APIC_VERBOSE_IOAPIC((CE_WARN, "No intrspec for irqno = %x\n",
1081                     irqno));
1082         } else if ((vector = apic_allocate_vector(ipl, irqno, 0)) == 0) {
1083                 if ((newirq = apic_share_vector(irqno, intr_flagp, intr_index,
1084                     ipl, ioapicindex, ipin, &irqptr)) != -1) {
1085                         irqptr->airq_ipl = ipl;
1086                         irqptr->airq_origirq = (uchar_t)origirq;
1087                         irqptr->airq_dip = dip;
1088                         irqptr->airq_major = major;
1089                         sdip = apic_irq_table[IRQINDEX(newirq)]->airq_dip;
1090                         /* This is OK to do really */
1091                         if (sdip == NULL) {
1092                                 cmn_err(CE_WARN, "Sharing vectors: %s"
1093                                     " instance %d and SCI",
1094                                     ddi_get_name(dip), ddi_get_instance(dip));
1095                         } else {
1096                                 cmn_err(CE_WARN, "Sharing vectors: %s"
1097                                     " instance %d and %s instance %d",
1098                                     ddi_get_name(sdip), ddi_get_instance(sdip),
1099                                     ddi_get_name(dip), ddi_get_instance(dip));
1100                         }
1101                         return (newirq);
1102                 }
1103                 /* try high priority allocation now  that share has failed */
1104                 if ((vector = apic_allocate_vector(ipl, irqno, 1)) == 0) {
1105                         cmn_err(CE_WARN, "No interrupt vector: %s instance %d",
1106                             ddi_get_name(dip), ddi_get_instance(dip));
1107                         return (-1);
1108                 }
1109         }
1110 
1111         mutex_enter(&airq_mutex);
1112         if (apic_irq_table[irqno] == NULL) {
1113                 irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP);
1114                 irqptr->airq_temp_cpu = IRQ_UNINIT;
1115                 apic_irq_table[irqno] = irqptr;
1116         } else {
1117                 irqptr = apic_irq_table[irqno];
1118                 if (irqptr->airq_mps_intr_index != FREE_INDEX) {
1119                         /*
1120                          * The slot is used by another irqno, so allocate
1121                          * a free irqno for this interrupt
1122                          */
1123                         newirq = apic_allocate_irq(apic_first_avail_irq);
1124                         if (newirq == -1) {
1125                                 mutex_exit(&airq_mutex);
1126                                 return (-1);
1127                         }
1128                         irqno = newirq;
1129                         irqptr = apic_irq_table[irqno];
1130                         if (irqptr == NULL) {
1131                                 irqptr = kmem_zalloc(sizeof (apic_irq_t),
1132                                     KM_SLEEP);
1133                                 irqptr->airq_temp_cpu = IRQ_UNINIT;
1134                                 apic_irq_table[irqno] = irqptr;
1135                         }
1136                         vector = apic_modify_vector(vector, newirq);
1137                 }
1138         }
1139         apic_max_device_irq = max(irqno, apic_max_device_irq);
1140         apic_min_device_irq = min(irqno, apic_min_device_irq);
1141         mutex_exit(&airq_mutex);
1142         irqptr->airq_ioapicindex = ioapicindex;
1143         irqptr->airq_intin_no = ipin;
1144         irqptr->airq_ipl = ipl;
1145         irqptr->airq_vector = vector;
1146         irqptr->airq_origirq = (uchar_t)origirq;
1147         irqptr->airq_share_id = 0;
1148         irqptr->airq_mps_intr_index = (short)intr_index;
1149         irqptr->airq_dip = dip;
1150         irqptr->airq_major = major;
1151         irqptr->airq_cpu = apic_bind_intr(dip, irqno, ioapic, ipin);
1152         if (intr_flagp)
1153                 irqptr->airq_iflag = *intr_flagp;
1154 
1155         if (!DDI_INTR_IS_MSI_OR_MSIX(type)) {
1156                 /* setup I/O APIC entry for non-MSI/X interrupts */
1157                 apic_record_rdt_entry(irqptr, irqno);
1158         }
1159         return (irqno);
1160 }
1161 
1162 /*
1163  * return the cpu to which this intr should be bound.
1164  * Check properties or any other mechanism to see if user wants it
1165  * bound to a specific CPU. If so, return the cpu id with high bit set.
1166  * If not, use the policy to choose a cpu and return the id.
1167  */
1168 uint32_t
1169 apic_bind_intr(dev_info_t *dip, int irq, uchar_t ioapicid, uchar_t intin)
1170 {
1171         int     instance, instno, prop_len, bind_cpu, count;
1172         uint_t  i, rc;
1173         uint32_t cpu;
1174         major_t major;
1175         char    *name, *drv_name, *prop_val, *cptr;
1176         char    prop_name[32];
1177         ulong_t iflag;
1178 
1179 
1180         if (apic_intr_policy == INTR_LOWEST_PRIORITY)
1181                 return (IRQ_UNBOUND);
1182 
1183         if (apic_nproc == 1)
1184                 return (0);
1185 
1186         drv_name = NULL;
1187         rc = DDI_PROP_NOT_FOUND;
1188         major = (major_t)-1;
1189         if (dip != NULL) {
1190                 name = ddi_get_name(dip);
1191                 major = ddi_name_to_major(name);
1192                 drv_name = ddi_major_to_name(major);
1193                 instance = ddi_get_instance(dip);
1194                 if (apic_intr_policy == INTR_ROUND_ROBIN_WITH_AFFINITY) {
1195                         i = apic_min_device_irq;
1196                         for (; i <= apic_max_device_irq; i++) {
1197 
1198                                 if ((i == irq) || (apic_irq_table[i] == NULL) ||
1199                                     (apic_irq_table[i]->airq_mps_intr_index
1200                                     == FREE_INDEX))
1201                                         continue;
1202 
1203                                 if ((apic_irq_table[i]->airq_major == major) &&
1204                                     (!(apic_irq_table[i]->airq_cpu &
1205                                     IRQ_USER_BOUND))) {
1206 
1207                                         cpu = apic_irq_table[i]->airq_cpu;
1208 
1209                                         cmn_err(CE_CONT,
1210                                             "!%s: %s (%s) instance #%d "
1211                                             "irq 0x%x vector 0x%x ioapic 0x%x "
1212                                             "intin 0x%x is bound to cpu %d\n",
1213                                             psm_name,
1214                                             name, drv_name, instance, irq,
1215                                             apic_irq_table[irq]->airq_vector,
1216                                             ioapicid, intin, cpu);
1217                                         return (cpu);
1218                                 }
1219                         }
1220                 }
1221                 /*
1222                  * search for "drvname"_intpt_bind_cpus property first, the
1223                  * syntax of the property should be "a[,b,c,...]" where
1224                  * instance 0 binds to cpu a, instance 1 binds to cpu b,
1225                  * instance 3 binds to cpu c...
1226                  * ddi_getlongprop() will search /option first, then /
1227                  * if "drvname"_intpt_bind_cpus doesn't exist, then find
1228                  * intpt_bind_cpus property.  The syntax is the same, and
1229                  * it applies to all the devices if its "drvname" specific
1230                  * property doesn't exist
1231                  */
1232                 (void) strcpy(prop_name, drv_name);
1233                 (void) strcat(prop_name, "_intpt_bind_cpus");
1234                 rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0, prop_name,
1235                     (caddr_t)&prop_val, &prop_len);
1236                 if (rc != DDI_PROP_SUCCESS) {
1237                         rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0,
1238                             "intpt_bind_cpus", (caddr_t)&prop_val, &prop_len);
1239                 }
1240         }
1241         if (rc == DDI_PROP_SUCCESS) {
1242                 for (i = count = 0; i < (prop_len - 1); i++)
1243                         if (prop_val[i] == ',')
1244                                 count++;
1245                 if (prop_val[i-1] != ',')
1246                         count++;
1247                 /*
1248                  * if somehow the binding instances defined in the
1249                  * property are not enough for this instno., then
1250                  * reuse the pattern for the next instance until
1251                  * it reaches the requested instno
1252                  */
1253                 instno = instance % count;
1254                 i = 0;
1255                 cptr = prop_val;
1256                 while (i < instno)
1257                         if (*cptr++ == ',')
1258                                 i++;
1259                 bind_cpu = stoi(&cptr);
1260                 kmem_free(prop_val, prop_len);
1261                 /* if specific CPU is bogus, then default to next cpu */
1262                 if (!apic_cpu_in_range(bind_cpu)) {
1263                         cmn_err(CE_WARN, "%s: %s=%s: CPU %d not present",
1264                             psm_name, prop_name, prop_val, bind_cpu);
1265                         rc = DDI_PROP_NOT_FOUND;
1266                 } else {
1267                         /* indicate that we are bound at user request */
1268                         bind_cpu |= IRQ_USER_BOUND;
1269                 }
1270                 /*
1271                  * no need to check apic_cpus[].aci_status, if specific CPU is
1272                  * not up, then post_cpu_start will handle it.
1273                  */
1274         }
1275         if (rc != DDI_PROP_SUCCESS) {
1276                 iflag = intr_clear();
1277                 lock_set(&apic_ioapic_lock);
1278                 bind_cpu = apic_get_next_bind_cpu();
1279                 lock_clear(&apic_ioapic_lock);
1280                 intr_restore(iflag);
1281         }
1282 
1283         if (drv_name != NULL)
1284                 cmn_err(CE_CONT, "!%s: %s (%s) instance %d irq 0x%x "
1285                     "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n",
1286                     psm_name, name, drv_name, instance, irq,
1287                     apic_irq_table[irq]->airq_vector, ioapicid, intin,
1288                     bind_cpu & ~IRQ_USER_BOUND);
1289         else
1290                 cmn_err(CE_CONT, "!%s: irq 0x%x "
1291                     "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n",
1292                     psm_name, irq, apic_irq_table[irq]->airq_vector, ioapicid,
1293                     intin, bind_cpu & ~IRQ_USER_BOUND);
1294 
1295         return ((uint32_t)bind_cpu);
1296 }
1297 
1298 /*
1299  * Mark vector as being in the process of being deleted. Interrupts
1300  * may still come in on some CPU. The moment an interrupt comes with
1301  * the new vector, we know we can free the old one. Called only from
1302  * addspl and delspl with interrupts disabled. Because an interrupt
1303  * can be shared, but no interrupt from either device may come in,
1304  * we also use a timeout mechanism, which we arbitrarily set to
1305  * apic_revector_timeout microseconds.
1306  */
1307 static void
1308 apic_mark_vector(uchar_t oldvector, uchar_t newvector)
1309 {
1310         ulong_t iflag;
1311 
1312         iflag = intr_clear();
1313         lock_set(&apic_revector_lock);
1314         if (!apic_oldvec_to_newvec) {
1315                 apic_oldvec_to_newvec =
1316                     kmem_zalloc(sizeof (newvector) * APIC_MAX_VECTOR * 2,
1317                     KM_NOSLEEP);
1318 
1319                 if (!apic_oldvec_to_newvec) {
1320                         /*
1321                          * This failure is not catastrophic.
1322                          * But, the oldvec will never be freed.
1323                          */
1324                         apic_error |= APIC_ERR_MARK_VECTOR_FAIL;
1325                         lock_clear(&apic_revector_lock);
1326                         intr_restore(iflag);
1327                         return;
1328                 }
1329                 apic_newvec_to_oldvec = &apic_oldvec_to_newvec[APIC_MAX_VECTOR];
1330         }
1331 
1332         /* See if we already did this for drivers which do double addintrs */
1333         if (apic_oldvec_to_newvec[oldvector] != newvector) {
1334                 apic_oldvec_to_newvec[oldvector] = newvector;
1335                 apic_newvec_to_oldvec[newvector] = oldvector;
1336                 apic_revector_pending++;
1337         }
1338         lock_clear(&apic_revector_lock);
1339         intr_restore(iflag);
1340         (void) timeout(apic_xlate_vector_free_timeout_handler,
1341             (void *)(uintptr_t)oldvector, drv_usectohz(apic_revector_timeout));
1342 }
1343 
1344 /*
1345  * xlate_vector is called from intr_enter if revector_pending is set.
1346  * It will xlate it if needed and mark the old vector as free.
1347  */
1348 uchar_t
1349 apic_xlate_vector(uchar_t vector)
1350 {
1351         uchar_t newvector, oldvector = 0;
1352 
1353         lock_set(&apic_revector_lock);
1354         /* Do we really need to do this ? */
1355         if (!apic_revector_pending) {
1356                 lock_clear(&apic_revector_lock);
1357                 return (vector);
1358         }
1359         if ((newvector = apic_oldvec_to_newvec[vector]) != 0)
1360                 oldvector = vector;
1361         else {
1362                 /*
1363                  * The incoming vector is new . See if a stale entry is
1364                  * remaining
1365                  */
1366                 if ((oldvector = apic_newvec_to_oldvec[vector]) != 0)
1367                         newvector = vector;
1368         }
1369 
1370         if (oldvector) {
1371                 apic_revector_pending--;
1372                 apic_oldvec_to_newvec[oldvector] = 0;
1373                 apic_newvec_to_oldvec[newvector] = 0;
1374                 apic_free_vector(oldvector);
1375                 lock_clear(&apic_revector_lock);
1376                 /* There could have been more than one reprogramming! */
1377                 return (apic_xlate_vector(newvector));
1378         }
1379         lock_clear(&apic_revector_lock);
1380         return (vector);
1381 }
1382 
1383 void
1384 apic_xlate_vector_free_timeout_handler(void *arg)
1385 {
1386         ulong_t iflag;
1387         uchar_t oldvector, newvector;
1388 
1389         oldvector = (uchar_t)(uintptr_t)arg;
1390         iflag = intr_clear();
1391         lock_set(&apic_revector_lock);
1392         if ((newvector = apic_oldvec_to_newvec[oldvector]) != 0) {
1393                 apic_free_vector(oldvector);
1394                 apic_oldvec_to_newvec[oldvector] = 0;
1395                 apic_newvec_to_oldvec[newvector] = 0;
1396                 apic_revector_pending--;
1397         }
1398 
1399         lock_clear(&apic_revector_lock);
1400         intr_restore(iflag);
1401 }
1402 
1403 /*
1404  * Bind interrupt corresponding to irq_ptr to bind_cpu.
1405  * Must be called with interrupts disabled and apic_ioapic_lock held
1406  */
1407 int
1408 apic_rebind(apic_irq_t *irq_ptr, int bind_cpu,
1409     struct ioapic_reprogram_data *drep)
1410 {
1411         int                     ioapicindex, intin_no;
1412         uint32_t                airq_temp_cpu;
1413         apic_cpus_info_t        *cpu_infop;
1414         uint32_t                rdt_entry;
1415         int                     which_irq;
1416         ioapic_rdt_t            irdt;
1417 
1418         which_irq = apic_vector_to_irq[irq_ptr->airq_vector];
1419 
1420         intin_no = irq_ptr->airq_intin_no;
1421         ioapicindex = irq_ptr->airq_ioapicindex;
1422         airq_temp_cpu = irq_ptr->airq_temp_cpu;
1423         if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu != IRQ_UNBOUND) {
1424                 if (airq_temp_cpu & IRQ_USER_BOUND)
1425                         /* Mask off high bit so it can be used as array index */
1426                         airq_temp_cpu &= ~IRQ_USER_BOUND;
1427 
1428                 ASSERT(apic_cpu_in_range(airq_temp_cpu));
1429         }
1430 
1431         /*
1432          * Can't bind to a CPU that's not accepting interrupts:
1433          */
1434         cpu_infop = &apic_cpus[bind_cpu & ~IRQ_USER_BOUND];
1435         if (!(cpu_infop->aci_status & APIC_CPU_INTR_ENABLE))
1436                 return (1);
1437 
1438         /*
1439          * If we are about to change the interrupt vector for this interrupt,
1440          * and this interrupt is level-triggered, attached to an IOAPIC,
1441          * has been delivered to a CPU and that CPU has not handled it
1442          * yet, we cannot reprogram the IOAPIC now.
1443          */
1444         if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) {
1445 
1446                 rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex,
1447                     intin_no);
1448 
1449                 if ((irq_ptr->airq_vector != RDT_VECTOR(rdt_entry)) &&
1450                     apic_check_stuck_interrupt(irq_ptr, airq_temp_cpu,
1451                     bind_cpu, ioapicindex, intin_no, which_irq, drep) != 0) {
1452 
1453                         return (0);
1454                 }
1455 
1456                 /*
1457                  * NOTE: We do not unmask the RDT here, as an interrupt MAY
1458                  * still come in before we have a chance to reprogram it below.
1459                  * The reprogramming below will simultaneously change and
1460                  * unmask the RDT entry.
1461                  */
1462 
1463                 if ((uint32_t)bind_cpu == IRQ_UNBOUND) {
1464                         irdt.ir_lo =  AV_LDEST | AV_LOPRI |
1465                             irq_ptr->airq_rdt_entry;
1466 
1467                         WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no,
1468                             AV_TOALL);
1469 
1470                         if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu !=
1471                             IRQ_UNBOUND)
1472                                 apic_cpus[airq_temp_cpu].aci_temp_bound--;
1473 
1474                         /*
1475                          * Write the vector, trigger, and polarity portion of
1476                          * the RDT
1477                          */
1478                         WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin_no,
1479                             irdt.ir_lo);
1480 
1481                         irq_ptr->airq_temp_cpu = IRQ_UNBOUND;
1482                         return (0);
1483                 }
1484         }
1485 
1486         if (bind_cpu & IRQ_USER_BOUND) {
1487                 cpu_infop->aci_bound++;
1488         } else {
1489                 cpu_infop->aci_temp_bound++;
1490         }
1491         ASSERT(apic_cpu_in_range(bind_cpu));
1492 
1493         if ((airq_temp_cpu != IRQ_UNBOUND) && (airq_temp_cpu != IRQ_UNINIT)) {
1494                 apic_cpus[airq_temp_cpu].aci_temp_bound--;
1495         }
1496         if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) {
1497 
1498                 irdt.ir_lo = AV_PDEST | AV_FIXED | irq_ptr->airq_rdt_entry;
1499                 irdt.ir_hi = cpu_infop->aci_local_id;
1500 
1501                 /* Write the RDT entry -- bind to a specific CPU: */
1502                 WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no,
1503                     irdt.ir_hi << APIC_ID_BIT_OFFSET);
1504 
1505                 /* Write the vector, trigger, and polarity portion of the RDT */
1506                 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin_no,
1507                     irdt.ir_lo);
1508 
1509         } else {
1510                 int type = (irq_ptr->airq_mps_intr_index == MSI_INDEX) ?
1511                     DDI_INTR_TYPE_MSI : DDI_INTR_TYPE_MSIX;
1512                 if (type == DDI_INTR_TYPE_MSI) {
1513                         if (irq_ptr->airq_ioapicindex ==
1514                             irq_ptr->airq_origirq) {
1515                                 /* first one */
1516                                 DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call "
1517                                     "apic_pci_msi_enable_vector\n"));
1518                                 apic_pci_msi_enable_vector(irq_ptr,
1519                                     type, which_irq, irq_ptr->airq_vector,
1520                                     irq_ptr->airq_intin_no,
1521                                     cpu_infop->aci_local_id);
1522                         }
1523                         if ((irq_ptr->airq_ioapicindex +
1524                             irq_ptr->airq_intin_no - 1) ==
1525                             irq_ptr->airq_origirq) { /* last one */
1526                                 DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call "
1527                                     "apic_pci_msi_enable_mode\n"));
1528                                 apic_pci_msi_enable_mode(irq_ptr->airq_dip,
1529                                     type, which_irq);
1530                         }
1531                 } else { /* MSI-X */
1532                         apic_pci_msi_enable_vector(irq_ptr, type,
1533                             irq_ptr->airq_origirq, irq_ptr->airq_vector, 1,
1534                             cpu_infop->aci_local_id);
1535                         apic_pci_msi_enable_mode(irq_ptr->airq_dip, type,
1536                             irq_ptr->airq_origirq);
1537                 }
1538         }
1539         irq_ptr->airq_temp_cpu = (uint32_t)bind_cpu;
1540         apic_redist_cpu_skip &= ~(1 << (bind_cpu & ~IRQ_USER_BOUND));
1541         return (0);
1542 }
1543 
1544 static void
1545 apic_last_ditch_clear_remote_irr(int ioapic_ix, int intin_no)
1546 {
1547         if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no)
1548             & AV_REMOTE_IRR) != 0) {
1549                 /*
1550                  * Trying to clear the bit through normal
1551                  * channels has failed.  So as a last-ditch
1552                  * effort, try to set the trigger mode to
1553                  * edge, then to level.  This has been
1554                  * observed to work on many systems.
1555                  */
1556                 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1557                     intin_no,
1558                     READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1559                     intin_no) & ~AV_LEVEL);
1560 
1561                 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1562                     intin_no,
1563                     READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1564                     intin_no) | AV_LEVEL);
1565 
1566                 /*
1567                  * If the bit's STILL set, this interrupt may
1568                  * be hosed.
1569                  */
1570                 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1571                     intin_no) & AV_REMOTE_IRR) != 0) {
1572 
1573                         prom_printf("%s: Remote IRR still "
1574                             "not clear for IOAPIC %d intin %d.\n"
1575                             "\tInterrupts to this pin may cease "
1576                             "functioning.\n", psm_name, ioapic_ix,
1577                             intin_no);
1578 #ifdef DEBUG
1579                         apic_last_ditch_reprogram_failures++;
1580 #endif
1581                 }
1582         }
1583 }
1584 
1585 /*
1586  * This function is protected by apic_ioapic_lock coupled with the
1587  * fact that interrupts are disabled.
1588  */
1589 static void
1590 delete_defer_repro_ent(int which_irq)
1591 {
1592         ASSERT(which_irq >= 0);
1593         ASSERT(which_irq <= 255);
1594         ASSERT(LOCK_HELD(&apic_ioapic_lock));
1595 
1596         if (apic_reprogram_info[which_irq].done)
1597                 return;
1598 
1599         apic_reprogram_info[which_irq].done = B_TRUE;
1600 
1601 #ifdef DEBUG
1602         apic_defer_repro_total_retries +=
1603             apic_reprogram_info[which_irq].tries;
1604 
1605         apic_defer_repro_successes++;
1606 #endif
1607 
1608         if (--apic_reprogram_outstanding == 0) {
1609 
1610                 setlvlx = psm_intr_exit_fn();
1611         }
1612 }
1613 
1614 
1615 /*
1616  * Interrupts must be disabled during this function to prevent
1617  * self-deadlock.  Interrupts are disabled because this function
1618  * is called from apic_check_stuck_interrupt(), which is called
1619  * from apic_rebind(), which requires its caller to disable interrupts.
1620  */
1621 static void
1622 add_defer_repro_ent(apic_irq_t *irq_ptr, int which_irq, int new_bind_cpu)
1623 {
1624         ASSERT(which_irq >= 0);
1625         ASSERT(which_irq <= 255);
1626         ASSERT(!interrupts_enabled());
1627 
1628         /*
1629          * On the off-chance that there's already a deferred
1630          * reprogramming on this irq, check, and if so, just update the
1631          * CPU and irq pointer to which the interrupt is targeted, then return.
1632          */
1633         if (!apic_reprogram_info[which_irq].done) {
1634                 apic_reprogram_info[which_irq].bindcpu = new_bind_cpu;
1635                 apic_reprogram_info[which_irq].irqp = irq_ptr;
1636                 return;
1637         }
1638 
1639         apic_reprogram_info[which_irq].irqp = irq_ptr;
1640         apic_reprogram_info[which_irq].bindcpu = new_bind_cpu;
1641         apic_reprogram_info[which_irq].tries = 0;
1642         /*
1643          * This must be the last thing set, since we're not
1644          * grabbing any locks, apic_try_deferred_reprogram() will
1645          * make its decision about using this entry iff done
1646          * is false.
1647          */
1648         apic_reprogram_info[which_irq].done = B_FALSE;
1649 
1650         /*
1651          * If there were previously no deferred reprogrammings, change
1652          * setlvlx to call apic_try_deferred_reprogram()
1653          */
1654         if (++apic_reprogram_outstanding == 1) {
1655 
1656                 setlvlx = apic_try_deferred_reprogram;
1657         }
1658 }
1659 
1660 static void
1661 apic_try_deferred_reprogram(int prev_ipl, int irq)
1662 {
1663         int reproirq;
1664         ulong_t iflag;
1665         struct ioapic_reprogram_data *drep;
1666 
1667         (*psm_intr_exit_fn())(prev_ipl, irq);
1668 
1669         if (!lock_try(&apic_defer_reprogram_lock)) {
1670                 return;
1671         }
1672 
1673         /*
1674          * Acquire the apic_ioapic_lock so that any other operations that
1675          * may affect the apic_reprogram_info state are serialized.
1676          * It's still possible for the last deferred reprogramming to clear
1677          * between the time we entered this function and the time we get to
1678          * the for loop below.  In that case, *setlvlx will have been set
1679          * back to *_intr_exit and drep will be NULL. (There's no way to
1680          * stop that from happening -- we would need to grab a lock before
1681          * calling *setlvlx, which is neither realistic nor prudent).
1682          */
1683         iflag = intr_clear();
1684         lock_set(&apic_ioapic_lock);
1685 
1686         /*
1687          * For each deferred RDT entry, try to reprogram it now.  Note that
1688          * there is no lock acquisition to read apic_reprogram_info because
1689          * '.done' is set only after the other fields in the structure are set.
1690          */
1691 
1692         drep = NULL;
1693         for (reproirq = 0; reproirq <= APIC_MAX_VECTOR; reproirq++) {
1694                 if (apic_reprogram_info[reproirq].done == B_FALSE) {
1695                         drep = &apic_reprogram_info[reproirq];
1696                         break;
1697                 }
1698         }
1699 
1700         /*
1701          * Either we found a deferred action to perform, or
1702          * we entered this function spuriously, after *setlvlx
1703          * was restored to point to *_intr_exit.  Any other
1704          * permutation is invalid.
1705          */
1706         ASSERT(drep != NULL || *setlvlx == psm_intr_exit_fn());
1707 
1708         /*
1709          * Though we can't really do anything about errors
1710          * at this point, keep track of them for reporting.
1711          * Note that it is very possible for apic_setup_io_intr
1712          * to re-register this very timeout if the Remote IRR bit
1713          * has not yet cleared.
1714          */
1715 
1716 #ifdef DEBUG
1717         if (drep != NULL) {
1718                 if (apic_setup_io_intr(drep, reproirq, B_TRUE) != 0) {
1719                         apic_deferred_setup_failures++;
1720                 }
1721         } else {
1722                 apic_deferred_spurious_enters++;
1723         }
1724 #else
1725         if (drep != NULL)
1726                 (void) apic_setup_io_intr(drep, reproirq, B_TRUE);
1727 #endif
1728 
1729         lock_clear(&apic_ioapic_lock);
1730         intr_restore(iflag);
1731 
1732         lock_clear(&apic_defer_reprogram_lock);
1733 }
1734 
1735 static void
1736 apic_ioapic_wait_pending_clear(int ioapic_ix, int intin_no)
1737 {
1738         int waited;
1739 
1740         /*
1741          * Wait for the delivery pending bit to clear.
1742          */
1743         if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) &
1744             (AV_LEVEL|AV_PENDING)) == (AV_LEVEL|AV_PENDING)) {
1745 
1746                 /*
1747                  * If we're still waiting on the delivery of this interrupt,
1748                  * continue to wait here until it is delivered (this should be
1749                  * a very small amount of time, but include a timeout just in
1750                  * case).
1751                  */
1752                 for (waited = 0; waited < apic_max_reps_clear_pending;
1753                     waited++) {
1754                         if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1755                             intin_no) & AV_PENDING) == 0) {
1756                                 break;
1757                         }
1758                 }
1759         }
1760 }
1761 
1762 
1763 /*
1764  * Checks to see if the IOAPIC interrupt entry specified has its Remote IRR
1765  * bit set.  Calls functions that modify the function that setlvlx points to,
1766  * so that the reprogramming can be retried very shortly.
1767  *
1768  * This function will mask the RDT entry if the interrupt is level-triggered.
1769  * (The caller is responsible for unmasking the RDT entry.)
1770  *
1771  * Returns non-zero if the caller should defer IOAPIC reprogramming.
1772  */
1773 static int
1774 apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu,
1775     int new_bind_cpu, int ioapic_ix, int intin_no, int which_irq,
1776     struct ioapic_reprogram_data *drep)
1777 {
1778         int32_t                 rdt_entry;
1779         int                     waited;
1780         int                     reps = 0;
1781 
1782         /*
1783          * Wait for the delivery pending bit to clear.
1784          */
1785         do {
1786                 ++reps;
1787 
1788                 apic_ioapic_wait_pending_clear(ioapic_ix, intin_no);
1789 
1790                 /*
1791                  * Mask the RDT entry, but only if it's a level-triggered
1792                  * interrupt
1793                  */
1794                 rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1795                     intin_no);
1796                 if ((rdt_entry & (AV_LEVEL|AV_MASK)) == AV_LEVEL) {
1797 
1798                         /* Mask it */
1799                         WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no,
1800                             AV_MASK | rdt_entry);
1801                 }
1802 
1803                 if ((rdt_entry & AV_LEVEL) == AV_LEVEL) {
1804                         /*
1805                          * If there was a race and an interrupt was injected
1806                          * just before we masked, check for that case here.
1807                          * Then, unmask the RDT entry and try again.  If we're
1808                          * on our last try, don't unmask (because we want the
1809                          * RDT entry to remain masked for the rest of the
1810                          * function).
1811                          */
1812                         rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1813                             intin_no);
1814                         if ((rdt_entry & AV_PENDING) &&
1815                             (reps < apic_max_reps_clear_pending)) {
1816                                 /* Unmask it */
1817                                 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1818                                     intin_no, rdt_entry & ~AV_MASK);
1819                         }
1820                 }
1821 
1822         } while ((rdt_entry & AV_PENDING) &&
1823             (reps < apic_max_reps_clear_pending));
1824 
1825 #ifdef DEBUG
1826                 if (rdt_entry & AV_PENDING)
1827                         apic_intr_deliver_timeouts++;
1828 #endif
1829 
1830         /*
1831          * If the remote IRR bit is set, then the interrupt has been sent
1832          * to a CPU for processing.  We have no choice but to wait for
1833          * that CPU to process the interrupt, at which point the remote IRR
1834          * bit will be cleared.
1835          */
1836         if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) &
1837             (AV_LEVEL|AV_REMOTE_IRR)) == (AV_LEVEL|AV_REMOTE_IRR)) {
1838 
1839                 /*
1840                  * If the CPU that this RDT is bound to is NOT the current
1841                  * CPU, wait until that CPU handles the interrupt and ACKs
1842                  * it.  If this interrupt is not bound to any CPU (that is,
1843                  * if it's bound to the logical destination of "anyone"), it
1844                  * may have been delivered to the current CPU so handle that
1845                  * case by deferring the reprogramming (below).
1846                  */
1847                 if ((old_bind_cpu != IRQ_UNBOUND) &&
1848                     (old_bind_cpu != IRQ_UNINIT) &&
1849                     (old_bind_cpu != psm_get_cpu_id())) {
1850                         for (waited = 0; waited < apic_max_reps_clear_pending;
1851                             waited++) {
1852                                 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1853                                     intin_no) & AV_REMOTE_IRR) == 0) {
1854 
1855                                         delete_defer_repro_ent(which_irq);
1856 
1857                                         /* Remote IRR has cleared! */
1858                                         return (0);
1859                                 }
1860                         }
1861                 }
1862 
1863                 /*
1864                  * If we waited and the Remote IRR bit is still not cleared,
1865                  * AND if we've invoked the timeout APIC_REPROGRAM_MAX_TIMEOUTS
1866                  * times for this interrupt, try the last-ditch workaround:
1867                  */
1868                 if (drep && drep->tries >= APIC_REPROGRAM_MAX_TRIES) {
1869 
1870                         apic_last_ditch_clear_remote_irr(ioapic_ix, intin_no);
1871 
1872                         /* Mark this one as reprogrammed: */
1873                         delete_defer_repro_ent(which_irq);
1874 
1875                         return (0);
1876                 } else {
1877 #ifdef DEBUG
1878                         apic_intr_deferrals++;
1879 #endif
1880 
1881                         /*
1882                          * If waiting for the Remote IRR bit (above) didn't
1883                          * allow it to clear, defer the reprogramming.
1884                          * Add a new deferred-programming entry if the
1885                          * caller passed a NULL one (and update the existing one
1886                          * in case anything changed).
1887                          */
1888                         add_defer_repro_ent(irq_ptr, which_irq, new_bind_cpu);
1889                         if (drep)
1890                                 drep->tries++;
1891 
1892                         /* Inform caller to defer IOAPIC programming: */
1893                         return (1);
1894                 }
1895 
1896         }
1897 
1898         /* Remote IRR is clear */
1899         delete_defer_repro_ent(which_irq);
1900 
1901         return (0);
1902 }
1903 
1904 /*
1905  * Called to migrate all interrupts at an irq to another cpu.
1906  * Must be called with interrupts disabled and apic_ioapic_lock held
1907  */
1908 int
1909 apic_rebind_all(apic_irq_t *irq_ptr, int bind_cpu)
1910 {
1911         apic_irq_t      *irqptr = irq_ptr;
1912         int             retval = 0;
1913 
1914         while (irqptr) {
1915                 if (irqptr->airq_temp_cpu != IRQ_UNINIT)
1916                         retval |= apic_rebind(irqptr, bind_cpu, NULL);
1917                 irqptr = irqptr->airq_next;
1918         }
1919 
1920         return (retval);
1921 }
1922 
1923 /*
1924  * apic_intr_redistribute does all the messy computations for identifying
1925  * which interrupt to move to which CPU. Currently we do just one interrupt
1926  * at a time. This reduces the time we spent doing all this within clock
1927  * interrupt. When it is done in idle, we could do more than 1.
1928  * First we find the most busy and the most free CPU (time in ISR only)
1929  * skipping those CPUs that has been identified as being ineligible (cpu_skip)
1930  * Then we look for IRQs which are closest to the difference between the
1931  * most busy CPU and the average ISR load. We try to find one whose load
1932  * is less than difference.If none exists, then we chose one larger than the
1933  * difference, provided it does not make the most idle CPU worse than the
1934  * most busy one. In the end, we clear all the busy fields for CPUs. For
1935  * IRQs, they are cleared as they are scanned.
1936  */
1937 void
1938 apic_intr_redistribute(void)
1939 {
1940         int busiest_cpu, most_free_cpu;
1941         int cpu_free, cpu_busy, max_busy, min_busy;
1942         int min_free, diff;
1943         int average_busy, cpus_online;
1944         int i, busy;
1945         ulong_t iflag;
1946         apic_cpus_info_t *cpu_infop;
1947         apic_irq_t *min_busy_irq = NULL;
1948         apic_irq_t *max_busy_irq = NULL;
1949 
1950         busiest_cpu = most_free_cpu = -1;
1951         cpu_free = cpu_busy = max_busy = average_busy = 0;
1952         min_free = apic_sample_factor_redistribution;
1953         cpus_online = 0;
1954         /*
1955          * Below we will check for CPU_INTR_ENABLE, bound, temp_bound, temp_cpu
1956          * without ioapic_lock. That is OK as we are just doing statistical
1957          * sampling anyway and any inaccuracy now will get corrected next time
1958          * The call to rebind which actually changes things will make sure
1959          * we are consistent.
1960          */
1961         for (i = 0; i < apic_nproc; i++) {
1962                 if (apic_cpu_in_range(i) &&
1963                     !(apic_redist_cpu_skip & (1 << i)) &&
1964                     (apic_cpus[i].aci_status & APIC_CPU_INTR_ENABLE)) {
1965 
1966                         cpu_infop = &apic_cpus[i];
1967                         /*
1968                          * If no unbound interrupts or only 1 total on this
1969                          * CPU, skip
1970                          */
1971                         if (!cpu_infop->aci_temp_bound ||
1972                             (cpu_infop->aci_bound + cpu_infop->aci_temp_bound)
1973                             == 1) {
1974                                 apic_redist_cpu_skip |= 1 << i;
1975                                 continue;
1976                         }
1977 
1978                         busy = cpu_infop->aci_busy;
1979                         average_busy += busy;
1980                         cpus_online++;
1981                         if (max_busy < busy) {
1982                                 max_busy = busy;
1983                                 busiest_cpu = i;
1984                         }
1985                         if (min_free > busy) {
1986                                 min_free = busy;
1987                                 most_free_cpu = i;
1988                         }
1989                         if (busy > apic_int_busy_mark) {
1990                                 cpu_busy |= 1 << i;
1991                         } else {
1992                                 if (busy < apic_int_free_mark)
1993                                         cpu_free |= 1 << i;
1994                         }
1995                 }
1996         }
1997         if ((cpu_busy && cpu_free) ||
1998             (max_busy >= (min_free + apic_diff_for_redistribution))) {
1999 
2000                 apic_num_imbalance++;
2001 #ifdef  DEBUG
2002                 if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
2003                         prom_printf(
2004                             "redistribute busy=%x free=%x max=%x min=%x",
2005                             cpu_busy, cpu_free, max_busy, min_free);
2006                 }
2007 #endif /* DEBUG */
2008 
2009 
2010                 average_busy /= cpus_online;
2011 
2012                 diff = max_busy - average_busy;
2013                 min_busy = max_busy; /* start with the max possible value */
2014                 max_busy = 0;
2015                 min_busy_irq = max_busy_irq = NULL;
2016                 i = apic_min_device_irq;
2017                 for (; i <= apic_max_device_irq; i++) {
2018                         apic_irq_t *irq_ptr;
2019                         /* Change to linked list per CPU ? */
2020                         if ((irq_ptr = apic_irq_table[i]) == NULL)
2021                                 continue;
2022                         /* Check for irq_busy & decide which one to move */
2023                         /* Also zero them for next round */
2024                         if ((irq_ptr->airq_temp_cpu == busiest_cpu) &&
2025                             irq_ptr->airq_busy) {
2026                                 if (irq_ptr->airq_busy < diff) {
2027                                         /*
2028                                          * Check for least busy CPU,
2029                                          * best fit or what ?
2030                                          */
2031                                         if (max_busy < irq_ptr->airq_busy) {
2032                                                 /*
2033                                                  * Most busy within the
2034                                                  * required differential
2035                                                  */
2036                                                 max_busy = irq_ptr->airq_busy;
2037                                                 max_busy_irq = irq_ptr;
2038                                         }
2039                                 } else {
2040                                         if (min_busy > irq_ptr->airq_busy) {
2041                                                 /*
2042                                                  * least busy, but more than
2043                                                  * the reqd diff
2044                                                  */
2045                                                 if (min_busy <
2046                                                     (diff + average_busy -
2047                                                     min_free)) {
2048                                                         /*
2049                                                          * Making sure new cpu
2050                                                          * will not end up
2051                                                          * worse
2052                                                          */
2053                                                         min_busy =
2054                                                             irq_ptr->airq_busy;
2055 
2056                                                         min_busy_irq = irq_ptr;
2057                                                 }
2058                                         }
2059                                 }
2060                         }
2061                         irq_ptr->airq_busy = 0;
2062                 }
2063 
2064                 if (max_busy_irq != NULL) {
2065 #ifdef  DEBUG
2066                         if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
2067                                 prom_printf("rebinding %x to %x",
2068                                     max_busy_irq->airq_vector, most_free_cpu);
2069                         }
2070 #endif /* DEBUG */
2071                         iflag = intr_clear();
2072                         if (lock_try(&apic_ioapic_lock)) {
2073                                 if (apic_rebind_all(max_busy_irq,
2074                                     most_free_cpu) == 0) {
2075                                         /* Make change permenant */
2076                                         max_busy_irq->airq_cpu =
2077                                             (uint32_t)most_free_cpu;
2078                                 }
2079                                 lock_clear(&apic_ioapic_lock);
2080                         }
2081                         intr_restore(iflag);
2082 
2083                 } else if (min_busy_irq != NULL) {
2084 #ifdef  DEBUG
2085                         if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
2086                                 prom_printf("rebinding %x to %x",
2087                                     min_busy_irq->airq_vector, most_free_cpu);
2088                         }
2089 #endif /* DEBUG */
2090 
2091                         iflag = intr_clear();
2092                         if (lock_try(&apic_ioapic_lock)) {
2093                                 if (apic_rebind_all(min_busy_irq,
2094                                     most_free_cpu) == 0) {
2095                                         /* Make change permenant */
2096                                         min_busy_irq->airq_cpu =
2097                                             (uint32_t)most_free_cpu;
2098                                 }
2099                                 lock_clear(&apic_ioapic_lock);
2100                         }
2101                         intr_restore(iflag);
2102 
2103                 } else {
2104                         if (cpu_busy != (1 << busiest_cpu)) {
2105                                 apic_redist_cpu_skip |= 1 << busiest_cpu;
2106                                 /*
2107                                  * We leave cpu_skip set so that next time we
2108                                  * can choose another cpu
2109                                  */
2110                         }
2111                 }
2112                 apic_num_rebind++;
2113         } else {
2114                 /*
2115                  * found nothing. Could be that we skipped over valid CPUs
2116                  * or we have balanced everything. If we had a variable
2117                  * ticks_for_redistribution, it could be increased here.
2118                  * apic_int_busy, int_free etc would also need to be
2119                  * changed.
2120                  */
2121                 if (apic_redist_cpu_skip)
2122                         apic_redist_cpu_skip = 0;
2123         }
2124         for (i = 0; i < apic_nproc; i++) {
2125                 if (apic_cpu_in_range(i)) {
2126                         apic_cpus[i].aci_busy = 0;
2127                 }
2128         }
2129 }
2130 
2131 void
2132 apic_cleanup_busy(void)
2133 {
2134         int i;
2135         apic_irq_t *irq_ptr;
2136 
2137         for (i = 0; i < apic_nproc; i++) {
2138                 if (apic_cpu_in_range(i)) {
2139                         apic_cpus[i].aci_busy = 0;
2140                 }
2141         }
2142 
2143         for (i = apic_min_device_irq; i <= apic_max_device_irq; i++) {
2144                 if ((irq_ptr = apic_irq_table[i]) != NULL)
2145                         irq_ptr->airq_busy = 0;
2146         }
2147 }
2148 
2149 int
2150 apic_ioapic_method_probe()
2151 {
2152         return (PSM_SUCCESS);
2153 }