1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24 /*
25 * Copyright (c) 2010, Intel Corporation.
26 * All rights reserved.
27 * Copyright 2016 PALO, Richard.
28 */
29
30 /*
31 * PSMI 1.1 extensions are supported only in 2.6 and later versions.
32 * PSMI 1.2 extensions are supported only in 2.7 and later versions.
33 * PSMI 1.3 and 1.4 extensions are supported in Solaris 10.
34 * PSMI 1.5 extensions are supported in Solaris Nevada.
35 * PSMI 1.6 extensions are supported in Solaris Nevada.
36 * PSMI 1.7 extensions are supported in Solaris Nevada.
37 */
38 #define PSMI_1_7
39
40 #include <sys/processor.h>
41 #include <sys/time.h>
42 #include <sys/psm.h>
43 #include <sys/smp_impldefs.h>
44 #include <sys/cram.h>
45 #include <sys/acpi/acpi.h>
46 #include <sys/acpica.h>
47 #include <sys/psm_common.h>
48 #include <sys/apic.h>
49 #include <sys/apic_common.h>
50 #include <sys/pit.h>
51 #include <sys/ddi.h>
52 #include <sys/sunddi.h>
53 #include <sys/ddi_impldefs.h>
54 #include <sys/pci.h>
55 #include <sys/promif.h>
56 #include <sys/x86_archext.h>
57 #include <sys/cpc_impl.h>
58 #include <sys/uadmin.h>
59 #include <sys/panic.h>
60 #include <sys/debug.h>
61 #include <sys/archsystm.h>
62 #include <sys/trap.h>
63 #include <sys/machsystm.h>
64 #include <sys/cpuvar.h>
65 #include <sys/rm_platter.h>
66 #include <sys/privregs.h>
67 #include <sys/cyclic.h>
68 #include <sys/note.h>
69 #include <sys/pci_intr_lib.h>
70 #include <sys/sunndi.h>
71
72
73 /*
74 * Local Function Prototypes
75 */
76 static void apic_mark_vector(uchar_t oldvector, uchar_t newvector);
77 static void apic_xlate_vector_free_timeout_handler(void *arg);
78 static int apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu,
79 int new_bind_cpu, int apicindex, int intin_no, int which_irq,
80 struct ioapic_reprogram_data *drep);
81 static int apic_setup_irq_table(dev_info_t *dip, int irqno,
82 struct apic_io_intr *intrp, struct intrspec *ispec, iflag_t *intr_flagp,
83 int type);
84 static void apic_try_deferred_reprogram(int ipl, int vect);
85 static void delete_defer_repro_ent(int which_irq);
86 static void apic_ioapic_wait_pending_clear(int ioapicindex,
87 int intin_no);
88
89 extern int apic_acpi_translate_pci_irq(dev_info_t *dip, int busid, int devid,
90 int ipin, int *pci_irqp, iflag_t *intr_flagp);
91 extern int apic_handle_pci_pci_bridge(dev_info_t *idip, int child_devno,
92 int child_ipin, struct apic_io_intr **intrp);
93 extern uchar_t acpi_find_ioapic(int irq);
94 extern struct apic_io_intr *apic_find_io_intr_w_busid(int irqno, int busid);
95 extern int apic_find_bus_id(int bustype);
96 extern int apic_find_intin(uchar_t ioapic, uchar_t intin);
97 extern void apic_record_rdt_entry(apic_irq_t *irqptr, int irq);
98
99 extern int apic_sci_vect;
100 extern iflag_t apic_sci_flags;
101 extern int apic_intr_policy;
102 extern char *psm_name;
103
104 /*
105 * Maximum value of an unsigned char; NBBY (bits per byte) comes from
106 * <sys/param.h>.
107 #define UCHAR_MAX ((1 << NBBY) - 1)
108
109 /* Max wait time (in repetitions) for flags to clear in an RDT entry. */
110 extern int apic_max_reps_clear_pending;
111
112 /*
113 * The IRQ number is implicit in the array index; APIC_MAX_VECTOR + 1 is
114 * the maximum number of IRQs as well. apic_reprogram_info is indexed by
115 * IRQ number, NOT by vector number.
116 */
117 struct ioapic_reprogram_data apic_reprogram_info[APIC_MAX_VECTOR+1];
118
119 extern int apic_int_busy_mark;
120 extern int apic_int_free_mark;
121 extern int apic_diff_for_redistribution;
122 extern int apic_sample_factor_redistribution;
123 extern int apic_redist_cpu_skip;
124 extern int apic_num_imbalance;
125 extern int apic_num_rebind;
126
127 /* timeout for xlate_vector, mark_vector */
128 int apic_revector_timeout = 16 * 10000; /* 160 millisec */
129
130 extern int apic_defconf;
131 extern int apic_irq_translate;
132
133 extern int apic_use_acpi_madt_only; /* 1=ONLY use MADT from ACPI */
134
135 extern uchar_t apic_io_vectbase[MAX_IO_APIC];
136
137 extern boolean_t ioapic_mask_workaround[MAX_IO_APIC];
138
139 /*
140 * First available slot to be used as IRQ index into the apic_irq_table
141 * for those interrupts (like MSI/X) that don't have a physical IRQ.
142 */
143 extern int apic_first_avail_irq;
144
145 /*
146 * apic_defer_reprogram_lock ensures that only one processor is handling
147 * deferred interrupt programming at *_intr_exit time.
148 */
149 static lock_t apic_defer_reprogram_lock;
150
151 /*
152 * The current number of deferred reprogrammings outstanding
153 */
154 uint_t apic_reprogram_outstanding = 0;
155
156 #ifdef DEBUG
157 /*
158 * Counters that keep track of deferred reprogramming stats
159 */
160 uint_t apic_intr_deferrals = 0;
161 uint_t apic_intr_deliver_timeouts = 0;
162 uint_t apic_last_ditch_reprogram_failures = 0;
163 uint_t apic_deferred_setup_failures = 0;
164 uint_t apic_defer_repro_total_retries = 0;
165 uint_t apic_defer_repro_successes = 0;
166 uint_t apic_deferred_spurious_enters = 0;
167 #endif
168
169 extern int apic_io_max;
170 extern struct apic_io_intr *apic_io_intrp;
171
172 uchar_t apic_vector_to_irq[APIC_MAX_VECTOR+1];
173
174 extern uint32_t eisa_level_intr_mask;
175 /* At least MSB will be set if EISA bus */
176
177 extern int apic_pci_bus_total;
178 extern uchar_t apic_single_pci_busid;
179
180 /*
181 * Following declarations are for revectoring; used when ISRs at different
182 * IPLs share an irq.
183 */
184 static lock_t apic_revector_lock;
185 int apic_revector_pending = 0;
186 static uchar_t *apic_oldvec_to_newvec;
187 static uchar_t *apic_newvec_to_oldvec;
188
189 /* ACPI Interrupt Source Override Structure ptr */
190 ACPI_MADT_INTERRUPT_OVERRIDE *acpi_isop;
191 extern int acpi_iso_cnt;
192
193 /*
194 * Auto-configuration routines
195 */
196
197 /*
198 * Initialise the vector->ipl and ipl->pri arrays. level_intr and the
199 * irq table are also cleared. Each vector->irq entry is set to a value
200 * (APIC_RESV_IRQ) which cannot map to a real irq, to show that it is free.
201 */
202 void
203 apic_init_common(void)
204 {
205 int i, j, indx;
206 int *iptr;
207
208 /*
209 * Initialize apic_ipls from apic_vectortoipl. This array is
210 * used in apic_intr_enter to determine the IPL to use for the
211 * corresponding vector. On some systems, due to hardware errata
212 * and interrupt sharing, the IPL may not correspond to the IPL listed
213 * in apic_vectortoipl (see apic_addspl and apic_delspl).
214 */
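/*
 * apic_ipls is indexed by (vector - APIC_BASE_VECT); each block of
 * APIC_VECTOR_PER_IPL consecutive entries gets the IPL that
 * apic_vectortoipl specifies for that block.
 */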
215 for (i = 0; i < (APIC_AVAIL_VECTOR / APIC_VECTOR_PER_IPL); i++) {
216 indx = i * APIC_VECTOR_PER_IPL;
217
218 for (j = 0; j < APIC_VECTOR_PER_IPL; j++, indx++)
219 apic_ipls[indx] = apic_vectortoipl[i];
220 }
221
222 /* cpu 0 is always up (for now) */
223 apic_cpus[0].aci_status = APIC_CPU_ONLINE | APIC_CPU_INTR_ENABLE;
224
225 iptr = (int *)&apic_irq_table[0];
226 for (i = 0; i <= APIC_MAX_VECTOR; i++) {
227 apic_level_intr[i] = 0;
228 *iptr++ = 0;
229 apic_vector_to_irq[i] = APIC_RESV_IRQ;
230
231 /* These *must* be initted to B_TRUE! */
232 apic_reprogram_info[i].done = B_TRUE;
233 apic_reprogram_info[i].irqp = NULL;
234 apic_reprogram_info[i].tries = 0;
235 apic_reprogram_info[i].bindcpu = 0;
236 }
237
238 /*
239 * Allocate a dummy irq table entry for the reserved entry.
240 * This takes care of the race between removing an irq and
241 * clock detecting a CPU in that irq during interrupt load
242 * sampling.
243 */
244 apic_irq_table[APIC_RESV_IRQ] =
245 kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP);
246
247 mutex_init(&airq_mutex, NULL, MUTEX_DEFAULT, NULL);
248 }
249
250 void
251 ioapic_init_intr(int mask_apic)
252 {
253 int ioapic_ix;
254 struct intrspec ispec;
255 apic_irq_t *irqptr;
256 int i, j;
257 ulong_t iflag;
258
259 LOCK_INIT_CLEAR(&apic_revector_lock);
260 LOCK_INIT_CLEAR(&apic_defer_reprogram_lock);
261
262 /* mask interrupt vectors */
263 for (j = 0; j < apic_io_max && mask_apic; j++) {
264 int intin_max;
265
266 ioapic_ix = j;
267 /* Bits 23-16 define the maximum redirection entries */
268 intin_max = (ioapic_read(ioapic_ix, APIC_VERS_CMD) >> 16)
269 & 0xff;
270 for (i = 0; i <= intin_max; i++)
271 ioapic_write(ioapic_ix, APIC_RDT_CMD + 2 * i, AV_MASK);
272 }
273
274 /*
275 * Hack alert: deal with ACPI SCI interrupt chicken/egg here
276 */
277 if (apic_sci_vect > 0) {
278 /*
279 * acpica has already done add_avintr(); we just need
280 * to finish the job by mimicking translate_irq().
281 *
282 * Fake up an intrspec and set up the tables.
283 */
284 ispec.intrspec_vec = apic_sci_vect;
285 ispec.intrspec_pri = SCI_IPL;
286
287 if (apic_setup_irq_table(NULL, apic_sci_vect, NULL,
288 &ispec, &apic_sci_flags, DDI_INTR_TYPE_FIXED) < 0) {
289 cmn_err(CE_WARN, "!apic: SCI setup failed");
290 return;
291 }
292 irqptr = apic_irq_table[apic_sci_vect];
293
294 iflag = intr_clear();
295 lock_set(&apic_ioapic_lock);
296
297 /* Program I/O APIC */
298 (void) apic_setup_io_intr(irqptr, apic_sci_vect, B_FALSE);
299
300 lock_clear(&apic_ioapic_lock);
301 intr_restore(iflag);
302
303 irqptr->airq_share++;
304 }
305 }
306
307 /*
308 * Add mask bits to prevent the interrupt vector from firing at or
309 * above the given IPL. In addition, remove mask bits to enable
310 * interrupt vectors below the given IPL.
311 *
312 * Both addspl and delspl are complicated by the fact that different
313 * interrupts may share IRQs. This can happen in two ways.
314 * 1. The same H/W line is shared by more than one device
315 * 1a. with interrupts at different IPLs
316 * 1b. with interrupts at the same IPL
317 * 2. We ran out of vectors at a given IPL and started sharing vectors.
318 * 1b and 2 should be handled gracefully, except for the fact that some
319 * ISRs will get called often when no interrupt is pending for the device.
320 * For 1a, we handle it at the higher IPL.
321 */
322 /*ARGSUSED*/
323 int
324 apic_addspl_common(int irqno, int ipl, int min_ipl, int max_ipl)
325 {
326 uchar_t vector;
327 ulong_t iflag;
328 apic_irq_t *irqptr, *irqheadptr;
329 int irqindex;
330
331 ASSERT(max_ipl <= UCHAR_MAX);
332 irqindex = IRQINDEX(irqno);
333
334 if ((irqindex == -1) || (!apic_irq_table[irqindex]))
335 return (PSM_FAILURE);
336
337 mutex_enter(&airq_mutex);
338 irqptr = irqheadptr = apic_irq_table[irqindex];
339
340 DDI_INTR_IMPLDBG((CE_CONT, "apic_addspl: dip=0x%p type=%d irqno=0x%x "
341 "vector=0x%x\n", (void *)irqptr->airq_dip,
342 irqptr->airq_mps_intr_index, irqno, irqptr->airq_vector));
343
344 while (irqptr) {
345 if (VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno)
346 break;
347 irqptr = irqptr->airq_next;
348 }
349 irqptr->airq_share++;
350
351 mutex_exit(&airq_mutex);
352
353 /* return if it is not a hardware interrupt */
354 if (irqptr->airq_mps_intr_index == RESERVE_INDEX)
355 return (PSM_SUCCESS);
356
357 /* Or if there are more interrupts at a higher IPL */
358 if (ipl != max_ipl)
359 return (PSM_SUCCESS);
360
361 /*
362 * if apic_picinit() has not been called yet, just return.
363 * At the end of apic_picinit(), we will call setup_io_intr().
364 */
365
366 if (!apic_picinit_called)
367 return (PSM_SUCCESS);
368
369 /*
370 * Upgrade the vector if max_ipl differs from the IRQ's current IPL. If we
371 * cannot allocate a new vector, return failure.
372 */
373 if (irqptr->airq_ipl != max_ipl &&
374 !ioapic_mask_workaround[irqptr->airq_ioapicindex]) {
375
376 vector = apic_allocate_vector(max_ipl, irqindex, 1);
377 if (vector == 0) {
378 irqptr->airq_share--;
379 return (PSM_FAILURE);
380 }
381 irqptr = irqheadptr;
382 apic_mark_vector(irqptr->airq_vector, vector);
383 while (irqptr) {
384 irqptr->airq_vector = vector;
385 irqptr->airq_ipl = (uchar_t)max_ipl;
386 /*
387 * reprogram the irq being added and every other one
388 * that is not in the UNINIT state
389 */
390 if ((VIRTIRQ(irqindex, irqptr->airq_share_id) ==
391 irqno) || (irqptr->airq_temp_cpu != IRQ_UNINIT)) {
392 apic_record_rdt_entry(irqptr, irqindex);
393
394 iflag = intr_clear();
395 lock_set(&apic_ioapic_lock);
396
397 (void) apic_setup_io_intr(irqptr, irqindex,
398 B_FALSE);
399
400 lock_clear(&apic_ioapic_lock);
401 intr_restore(iflag);
402 }
403 irqptr = irqptr->airq_next;
404 }
405 return (PSM_SUCCESS);
406
407 } else if (irqptr->airq_ipl != max_ipl &&
408 ioapic_mask_workaround[irqptr->airq_ioapicindex]) {
409 /*
410 * We cannot upgrade the vector, but we can change
411 * the IPL that this vector induces.
412 *
413 * Note that we subtract APIC_BASE_VECT from the vector
414 * here because this array is used in apic_intr_enter
415 * (no need to add APIC_BASE_VECT in that hot code
416 * path since we can do it in the rarely-executed path
417 * here).
418 */
419 apic_ipls[irqptr->airq_vector - APIC_BASE_VECT] =
420 (uchar_t)max_ipl;
421
422 irqptr = irqheadptr;
423 while (irqptr) {
424 irqptr->airq_ipl = (uchar_t)max_ipl;
425 irqptr = irqptr->airq_next;
426 }
427
428 return (PSM_SUCCESS);
429 }
430
431 ASSERT(irqptr);
432
433 iflag = intr_clear();
434 lock_set(&apic_ioapic_lock);
435
436 (void) apic_setup_io_intr(irqptr, irqindex, B_FALSE);
437
438 lock_clear(&apic_ioapic_lock);
439 intr_restore(iflag);
440
441 return (PSM_SUCCESS);
442 }
443
444 /*
445 * Recompute mask bits for the given interrupt vector.
446 * If there is no interrupt servicing routine for this
447 * vector, this function should disable the vector from
448 * happening at all IPLs. If there are still handlers
449 * using the given vector, this function should disable
450 * the given vector from happening below the lowest IPL
451 * of the remaining handlers.
452 */
453 /*ARGSUSED*/
454 int
455 apic_delspl_common(int irqno, int ipl, int min_ipl, int max_ipl)
456 {
457 uchar_t vector;
458 uint32_t bind_cpu;
459 int intin, irqindex;
460 int ioapic_ix;
461 apic_irq_t *irqptr, *preirqptr, *irqheadptr, *irqp;
462 ulong_t iflag;
463
464 mutex_enter(&airq_mutex);
465 irqindex = IRQINDEX(irqno);
466 irqptr = preirqptr = irqheadptr = apic_irq_table[irqindex];
467
468 DDI_INTR_IMPLDBG((CE_CONT, "apic_delspl: dip=0x%p type=%d irqno=0x%x "
469 "vector=0x%x\n", (void *)irqptr->airq_dip,
470 irqptr->airq_mps_intr_index, irqno, irqptr->airq_vector));
471
472 while (irqptr) {
473 if (VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno)
474 break;
475 preirqptr = irqptr;
476 irqptr = irqptr->airq_next;
477 }
478 ASSERT(irqptr);
479
480 irqptr->airq_share--;
481
482 mutex_exit(&airq_mutex);
483
484 /*
485 * If there are more interrupts at a higher IPL, we don't need
486 * to disable anything.
487 */
488 if (ipl < max_ipl)
489 return (PSM_SUCCESS);
490
491 /* return if it is not a hardware interrupt */
492 if (irqptr->airq_mps_intr_index == RESERVE_INDEX)
493 return (PSM_SUCCESS);
494
495 if (!apic_picinit_called) {
496 /*
497 * Clear the irq struct. If two devices shared an interrupt
498 * line and one unloaded before picinit, we are hosed. But
499 * then we hope the machine survives.
500 */
501 irqptr->airq_mps_intr_index = FREE_INDEX;
502 irqptr->airq_temp_cpu = IRQ_UNINIT;
503 apic_free_vector(irqptr->airq_vector);
504 return (PSM_SUCCESS);
505 }
506 /*
507 * Downgrade vector to new max_ipl if needed. If we cannot allocate,
508 * use old IPL. Not very elegant, but it should work.
509 */
510 if ((irqptr->airq_ipl != max_ipl) && (max_ipl != PSM_INVALID_IPL) &&
511 !ioapic_mask_workaround[irqptr->airq_ioapicindex]) {
512 apic_irq_t *irqp;
513 if ((vector = apic_allocate_vector(max_ipl, irqno, 1)) != 0) {
514 apic_mark_vector(irqheadptr->airq_vector, vector);
515 irqp = irqheadptr;
516 while (irqp) {
517 irqp->airq_vector = vector;
518 irqp->airq_ipl = (uchar_t)max_ipl;
519 if (irqp->airq_temp_cpu != IRQ_UNINIT) {
520 apic_record_rdt_entry(irqp, irqindex);
521
522 iflag = intr_clear();
523 lock_set(&apic_ioapic_lock);
524
525 (void) apic_setup_io_intr(irqp,
526 irqindex, B_FALSE);
527
528 lock_clear(&apic_ioapic_lock);
529 intr_restore(iflag);
530 }
531 irqp = irqp->airq_next;
532 }
533 }
534
535 } else if (irqptr->airq_ipl != max_ipl &&
536 max_ipl != PSM_INVALID_IPL &&
537 ioapic_mask_workaround[irqptr->airq_ioapicindex]) {
538
539 /*
540 * We cannot downgrade the IPL of the vector below the vector's
541 * hardware priority. If we did, it would be possible for a
542 * higher-priority hardware vector to interrupt a CPU running at an IPL
543 * lower than the hardware priority of the interrupting vector (but
544 * higher than the soft IPL of this IRQ). When this happens, we would
545 * then try to drop the IPL BELOW what it was (effectively dropping
546 * below base_spl) which would be potentially catastrophic.
547 *
548 * (e.g. Suppose the hardware vector associated with this IRQ is 0x40
549 * (hardware IPL of 4). Further assume that the old IPL of this IRQ
550 * was 4, but the new IPL is 1. If we forced vector 0x40 to result in
551 * an IPL of 1, it would be possible for the processor to be executing
552 * at IPL 3 and for an interrupt to come in on vector 0x40, interrupting
553 * the currently-executing ISR. When apic_intr_enter consults
554 * apic_ipls[], it will return 1, bringing the IPL of the CPU down to 1,
555 * so even though the processor was running at IPL 4, an IPL 1
556 * interrupt will have interrupted it, which must not happen.)
557 *
558 * Effectively, this means that the hardware priority corresponding to
559 * the IRQ's IPL (in apic_ipls[]) cannot be lower than the vector's
560 * hardware priority.
561 *
562 * (In the above example, then, after removal of the IPL 4 device's
563 * interrupt handler, the new IPL will continue to be 4 because the
564 * hardware priority that IPL 1 implies is lower than the hardware
565 * priority of the vector used.)
566 */
567 /* apic_ipls is indexed by vector, starting at APIC_BASE_VECT */
568 const int apic_ipls_index = irqptr->airq_vector -
569 APIC_BASE_VECT;
570 const int vect_inherent_hwpri = irqptr->airq_vector >>
571 APIC_IPL_SHIFT;
572
573 /*
574 * If there are still devices using this IRQ, determine the
575 * new ipl to use.
576 */
577 if (irqptr->airq_share) {
578 int vect_desired_hwpri, hwpri;
579
580 ASSERT(max_ipl < MAXIPL);
581 vect_desired_hwpri = apic_ipltopri[max_ipl] >>
582 APIC_IPL_SHIFT;
583
584 /*
585 * If the desired IPL's hardware priority is lower
586 * than that of the vector, use the hardware priority
587 * of the vector to determine the new IPL.
588 */
589 hwpri = (vect_desired_hwpri < vect_inherent_hwpri) ?
590 vect_inherent_hwpri : vect_desired_hwpri;
591
592 /*
593 * Now, to get the right index for apic_vectortoipl,
594 * we need to subtract APIC_BASE_VECT from the
595 * hardware-vector-equivalent (in hwpri). Since hwpri
596 * is already shifted, we shift APIC_BASE_VECT before
597 * doing the subtraction.
598 */
599 hwpri -= (APIC_BASE_VECT >> APIC_IPL_SHIFT);
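/*
 * For example (assuming APIC_BASE_VECT is 0x20 and APIC_IPL_SHIFT is
 * 4): a vector of 0x60 has an inherent hardware priority of 6; if the
 * desired IPL maps to priority 4, the larger value 6 is kept, and
 * 6 - (0x20 >> 4) = 4 then indexes apic_vectortoipl below.
 */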
600
601 ASSERT(hwpri >= 0);
602 ASSERT(hwpri < MAXIPL);
603 max_ipl = apic_vectortoipl[hwpri];
604 apic_ipls[apic_ipls_index] = max_ipl;
605
606 irqp = irqheadptr;
607 while (irqp) {
608 irqp->airq_ipl = (uchar_t)max_ipl;
609 irqp = irqp->airq_next;
610 }
611 } else {
612 /*
613 * No more devices on this IRQ, so reset this vector's
614 * element in apic_ipls to the original IPL for this
615 * vector
616 */
617 apic_ipls[apic_ipls_index] =
618 apic_vectortoipl[vect_inherent_hwpri];
619 }
620 }
621
622 /*
623 * If there are still active interrupts, we are done.
624 */
625 if (irqptr->airq_share)
626 return (PSM_SUCCESS);
627
628 iflag = intr_clear();
629 lock_set(&apic_ioapic_lock);
630
631 if (irqptr->airq_mps_intr_index == MSI_INDEX) {
632 /*
633 * Disable the MSI vector.
634 * Make sure we only disable the mode on the last
635 * enabled vector of a multi-MSI group.
636 */
637 if (i_ddi_intr_get_current_nenables(irqptr->airq_dip) == 1) {
638 apic_pci_msi_disable_mode(irqptr->airq_dip,
639 DDI_INTR_TYPE_MSI);
640 }
641 } else if (irqptr->airq_mps_intr_index == MSIX_INDEX) {
642 /*
643 * Disable the MSI-X vector.
644 * We need to clear the mask and address/data for each MSI-X entry.
645 */
646 apic_pci_msi_unconfigure(irqptr->airq_dip, DDI_INTR_TYPE_MSIX,
647 irqptr->airq_origirq);
648 /*
649 * Make sure we only disable on the last MSI-X
650 */
651 if (i_ddi_intr_get_current_nenables(irqptr->airq_dip) == 1) {
652 apic_pci_msi_disable_mode(irqptr->airq_dip,
653 DDI_INTR_TYPE_MSIX);
654 }
655 } else {
656 /*
657 * The assumption here is that this is safe, even for
658 * systems with IOAPICs that suffer from the hardware
659 * erratum because all devices have been quiesced before
660 * they unregister their interrupt handlers. If that
661 * assumption turns out to be false, this mask operation
662 * can induce the same erratum result we're trying to
663 * avoid.
664 */
665 ioapic_ix = irqptr->airq_ioapicindex;
666 intin = irqptr->airq_intin_no;
667 ioapic_write(ioapic_ix, APIC_RDT_CMD + 2 * intin, AV_MASK);
668 }
669
670 /*
671 * This irq entry is the only one in the chain.
672 */
673 if (irqheadptr->airq_next == NULL) {
674 ASSERT(irqheadptr == irqptr);
675 bind_cpu = irqptr->airq_temp_cpu;
676 if (((uint32_t)bind_cpu != IRQ_UNBOUND) &&
677 ((uint32_t)bind_cpu != IRQ_UNINIT)) {
678 ASSERT(apic_cpu_in_range(bind_cpu));
679 if (bind_cpu & IRQ_USER_BOUND) {
680 /* If hardbound, temp_cpu == cpu */
681 bind_cpu &= ~IRQ_USER_BOUND;
682 apic_cpus[bind_cpu].aci_bound--;
683 } else
684 apic_cpus[bind_cpu].aci_temp_bound--;
685 }
686 irqptr->airq_temp_cpu = IRQ_UNINIT;
687 irqptr->airq_mps_intr_index = FREE_INDEX;
688 lock_clear(&apic_ioapic_lock);
689 intr_restore(iflag);
690 apic_free_vector(irqptr->airq_vector);
691 return (PSM_SUCCESS);
692 }
693
694 /*
695 * If we get here, we are sharing the vector and there is more than
696 * one active irq entry in the chain.
697 */
698 lock_clear(&apic_ioapic_lock);
699 intr_restore(iflag);
700
701 mutex_enter(&airq_mutex);
702 /* Remove the irq entry from the chain */
703 if (irqptr == irqheadptr) { /* The irq entry is at the head */
704 apic_irq_table[irqindex] = irqptr->airq_next;
705 } else {
706 preirqptr->airq_next = irqptr->airq_next;
707 }
708 /* Free the irq entry */
709 kmem_free(irqptr, sizeof (apic_irq_t));
710 mutex_exit(&airq_mutex);
711
712 return (PSM_SUCCESS);
713 }
714
715 /*
716 * apic_introp_xlate() replaces apic_translate_irq() and is
717 * called only from apic_intr_ops(). With the new ADII framework,
718 * the priority can no longer be retrieved through i_ddi_get_intrspec().
719 * It has to be passed in from the caller.
720 *
721 * Return value:
722 * Success: irqno for the given device
723 * Failure: -1
724 */
725 int
726 apic_introp_xlate(dev_info_t *dip, struct intrspec *ispec, int type)
727 {
728 char dev_type[16];
729 int dev_len, pci_irq, newirq, bustype, devid, busid, i;
730 int irqno = ispec->intrspec_vec;
731 ddi_acc_handle_t cfg_handle;
732 uchar_t ipin;
733 struct apic_io_intr *intrp;
734 iflag_t intr_flag;
735 ACPI_SUBTABLE_HEADER *hp;
736 ACPI_MADT_INTERRUPT_OVERRIDE *isop;
737 apic_irq_t *airqp;
738 int parent_is_pci_or_pciex = 0;
739 int child_is_pciex = 0;
740
741 DDI_INTR_IMPLDBG((CE_CONT, "apic_introp_xlate: dip=0x%p name=%s "
742 "type=%d irqno=0x%x\n", (void *)dip, ddi_get_name(dip), type,
743 irqno));
744
745 dev_len = sizeof (dev_type);
746 if (ddi_getlongprop_buf(DDI_DEV_T_ANY, ddi_get_parent(dip),
747 DDI_PROP_DONTPASS, "device_type", (caddr_t)dev_type,
748 &dev_len) == DDI_PROP_SUCCESS) {
749 if ((strcmp(dev_type, "pci") == 0) ||
750 (strcmp(dev_type, "pciex") == 0))
751 parent_is_pci_or_pciex = 1;
752 }
753
754 if (ddi_getlongprop_buf(DDI_DEV_T_ANY, dip,
755 DDI_PROP_DONTPASS, "compatible", (caddr_t)dev_type,
756 &dev_len) == DDI_PROP_SUCCESS) {
757 if (strstr(dev_type, "pciex"))
758 child_is_pciex = 1;
759 }
760
761 if (DDI_INTR_IS_MSI_OR_MSIX(type)) {
762 if ((airqp = apic_find_irq(dip, ispec, type)) != NULL) {
763 airqp->airq_iflag.bustype =
764 child_is_pciex ? BUS_PCIE : BUS_PCI;
765 return (apic_vector_to_irq[airqp->airq_vector]);
766 }
767 return (apic_setup_irq_table(dip, irqno, NULL, ispec,
768 NULL, type));
769 }
770
771 bustype = 0;
772
773 /* check if we have already translated this irq */
774 mutex_enter(&airq_mutex);
775 newirq = apic_min_device_irq;
776 for (; newirq <= apic_max_device_irq; newirq++) {
777 airqp = apic_irq_table[newirq];
778 while (airqp) {
779 if ((airqp->airq_dip == dip) &&
780 (airqp->airq_origirq == irqno) &&
781 (airqp->airq_mps_intr_index != FREE_INDEX)) {
782
783 mutex_exit(&airq_mutex);
784 return (VIRTIRQ(newirq, airqp->airq_share_id));
785 }
786 airqp = airqp->airq_next;
787 }
788 }
789 mutex_exit(&airq_mutex);
790
791 if (apic_defconf)
792 goto defconf;
793
794 if ((dip == NULL) || (!apic_irq_translate && !apic_enable_acpi))
795 goto nonpci;
796
797 if (parent_is_pci_or_pciex) {
798 /* pci device */
799 if (acpica_get_bdf(dip, &busid, &devid, NULL) != 0)
800 goto nonpci;
801 if (busid == 0 && apic_pci_bus_total == 1)
802 busid = (int)apic_single_pci_busid;
803
804 if (pci_config_setup(dip, &cfg_handle) != DDI_SUCCESS)
805 return (-1);
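/*
 * The PCI Interrupt Pin config register reads 1 for INTA# through 4
 * for INTD#; subtracting PCI_INTA yields a zero-based pin number.
 */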
806 ipin = pci_config_get8(cfg_handle, PCI_CONF_IPIN) - PCI_INTA;
807 pci_config_teardown(&cfg_handle);
808 if (apic_enable_acpi && !apic_use_acpi_madt_only) {
809 if (apic_acpi_translate_pci_irq(dip, busid, devid,
810 ipin, &pci_irq, &intr_flag) != ACPI_PSM_SUCCESS)
811 return (-1);
812
813 intr_flag.bustype = child_is_pciex ? BUS_PCIE : BUS_PCI;
814 return (apic_setup_irq_table(dip, pci_irq, NULL, ispec,
815 &intr_flag, type));
816 } else {
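/*
 * Build the MP-spec style source-bus IRQ for a PCI device:
 * device number in bits 6:2, interrupt pin (0 for INTA#
 * through 3 for INTD#) in bits 1:0.
 */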
817 pci_irq = ((devid & 0x1f) << 2) | (ipin & 0x3);
818 if ((intrp = apic_find_io_intr_w_busid(pci_irq, busid))
819 == NULL) {
820 if ((pci_irq = apic_handle_pci_pci_bridge(dip,
821 devid, ipin, &intrp)) == -1)
822 return (-1);
823 }
824 return (apic_setup_irq_table(dip, pci_irq, intrp, ispec,
825 NULL, type));
826 }
827 } else if (strcmp(dev_type, "isa") == 0)
828 bustype = BUS_ISA;
829 else if (strcmp(dev_type, "eisa") == 0)
830 bustype = BUS_EISA;
831
832 nonpci:
833 if (apic_enable_acpi && !apic_use_acpi_madt_only) {
834 /* search iso entries first */
835 if (acpi_iso_cnt != 0) {
836 hp = (ACPI_SUBTABLE_HEADER *)acpi_isop;
837 i = 0;
838 while (i < acpi_iso_cnt) {
839 if (hp->Type ==
840 ACPI_MADT_TYPE_INTERRUPT_OVERRIDE) {
841 isop =
842 (ACPI_MADT_INTERRUPT_OVERRIDE *) hp;
843 if (isop->Bus == 0 &&
844 isop->SourceIrq == irqno) {
845 newirq = isop->GlobalIrq;
846 intr_flag.intr_po =
847 isop->IntiFlags &
848 ACPI_MADT_POLARITY_MASK;
849 intr_flag.intr_el =
850 (isop->IntiFlags &
851 ACPI_MADT_TRIGGER_MASK)
852 >> 2;
853 intr_flag.bustype = BUS_ISA;
854
855 return (apic_setup_irq_table(
856 dip, newirq, NULL, ispec,
857 &intr_flag, type));
858
859 }
860 i++;
861 }
862 hp = (ACPI_SUBTABLE_HEADER *)(((char *)hp) +
863 hp->Length);
864 }
865 }
866 intr_flag.intr_po = INTR_PO_ACTIVE_HIGH;
867 intr_flag.intr_el = INTR_EL_EDGE;
868 intr_flag.bustype = BUS_ISA;
869 return (apic_setup_irq_table(dip, irqno, NULL, ispec,
870 &intr_flag, type));
871 } else {
872 if (bustype == 0) /* not initialized */
873 bustype = eisa_level_intr_mask ? BUS_EISA : BUS_ISA;
874 for (i = 0; i < 2; i++) {
875 if (((busid = apic_find_bus_id(bustype)) != -1) &&
876 ((intrp = apic_find_io_intr_w_busid(irqno, busid))
877 != NULL)) {
878 if ((newirq = apic_setup_irq_table(dip, irqno,
879 intrp, ispec, NULL, type)) != -1) {
880 return (newirq);
881 }
882 goto defconf;
883 }
884 bustype = (bustype == BUS_EISA) ? BUS_ISA : BUS_EISA;
885 }
886 }
887
888 /* MPS default configuration */
889 defconf:
890 newirq = apic_setup_irq_table(dip, irqno, NULL, ispec, NULL, type);
891 if (newirq == -1)
892 return (-1);
893 ASSERT(IRQINDEX(newirq) == irqno);
894 ASSERT(apic_irq_table[irqno]);
895 return (newirq);
896 }
897
898 /*
899 * Attempt to share vector with someone else
900 */
901 static int
902 apic_share_vector(int irqno, iflag_t *intr_flagp, short intr_index, int ipl,
903 uchar_t ioapicindex, uchar_t ipin, apic_irq_t **irqptrp)
904 {
905 #ifdef DEBUG
906 apic_irq_t *tmpirqp = NULL;
907 #endif /* DEBUG */
908 apic_irq_t *irqptr, dummyirq;
909 int newirq, chosen_irq = -1, share = 127;
910 int lowest, highest, i;
911 uchar_t share_id;
912
913 DDI_INTR_IMPLDBG((CE_CONT, "apic_share_vector: irqno=0x%x "
914 "intr_index=0x%x ipl=0x%x\n", irqno, intr_index, ipl));
915
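/*
 * The candidate vectors for sharing are those whose hardware priority
 * corresponds to the requested ipl; [lowest, highest] bounds that range.
 */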
916 highest = apic_ipltopri[ipl] + APIC_VECTOR_MASK;
917 lowest = apic_ipltopri[ipl-1] + APIC_VECTOR_PER_IPL;
918
919 if (highest < lowest) /* Both ipl and ipl-1 map to same pri */
920 lowest -= APIC_VECTOR_PER_IPL;
921 dummyirq.airq_mps_intr_index = intr_index;
922 dummyirq.airq_ioapicindex = ioapicindex;
923 dummyirq.airq_intin_no = ipin;
924 if (intr_flagp)
925 dummyirq.airq_iflag = *intr_flagp;
926 apic_record_rdt_entry(&dummyirq, irqno);
927 for (i = lowest; i <= highest; i++) {
928 newirq = apic_vector_to_irq[i];
929 if (newirq == APIC_RESV_IRQ)
930 continue;
931 irqptr = apic_irq_table[newirq];
932
933 if ((dummyirq.airq_rdt_entry & 0xFF00) !=
934 (irqptr->airq_rdt_entry & 0xFF00))
935 /* not compatible */
936 continue;
937
938 if (irqptr->airq_share < share) {
939 share = irqptr->airq_share;
940 chosen_irq = newirq;
941 }
942 }
943 if (chosen_irq != -1) {
944 /*
945 * Assign a share id which is free or which is larger
946 * than the largest one.
947 */
948 share_id = 1;
949 mutex_enter(&airq_mutex);
950 irqptr = apic_irq_table[chosen_irq];
951 while (irqptr) {
952 if (irqptr->airq_mps_intr_index == FREE_INDEX) {
953 share_id = irqptr->airq_share_id;
954 break;
955 }
956 if (share_id <= irqptr->airq_share_id)
957 share_id = irqptr->airq_share_id + 1;
958 #ifdef DEBUG
959 tmpirqp = irqptr;
960 #endif /* DEBUG */
961 irqptr = irqptr->airq_next;
962 }
963 if (!irqptr) {
964 irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP);
965 irqptr->airq_temp_cpu = IRQ_UNINIT;
966 irqptr->airq_next =
967 apic_irq_table[chosen_irq]->airq_next;
968 apic_irq_table[chosen_irq]->airq_next = irqptr;
969 #ifdef DEBUG
970 tmpirqp = apic_irq_table[chosen_irq];
971 #endif /* DEBUG */
972 }
973 irqptr->airq_mps_intr_index = intr_index;
974 irqptr->airq_ioapicindex = ioapicindex;
975 irqptr->airq_intin_no = ipin;
976 if (intr_flagp)
977 irqptr->airq_iflag = *intr_flagp;
978 irqptr->airq_vector = apic_irq_table[chosen_irq]->airq_vector;
979 irqptr->airq_share_id = share_id;
980 apic_record_rdt_entry(irqptr, irqno);
981 *irqptrp = irqptr;
982 #ifdef DEBUG
983 /* shuffle the pointers to test apic_delspl path */
984 if (tmpirqp) {
985 tmpirqp->airq_next = irqptr->airq_next;
986 irqptr->airq_next = apic_irq_table[chosen_irq];
987 apic_irq_table[chosen_irq] = irqptr;
988 }
989 #endif /* DEBUG */
990 mutex_exit(&airq_mutex);
991 return (VIRTIRQ(chosen_irq, share_id));
992 }
993 return (-1);
994 }
995
996 /*
997 * Allocate/Initialize the apic_irq_table[] entry for given irqno. If the entry
998 * is used already, we will try to allocate a new irqno.
999 *
1000 * Return value:
1001 * Success: irqno
1002 * Failure: -1
1003 */
1004 static int
1005 apic_setup_irq_table(dev_info_t *dip, int irqno, struct apic_io_intr *intrp,
1006 struct intrspec *ispec, iflag_t *intr_flagp, int type)
1007 {
1008 int origirq = ispec->intrspec_vec;
1009 uchar_t ipl = ispec->intrspec_pri;
1010 int newirq, intr_index;
1011 uchar_t ipin, ioapic, ioapicindex, vector;
1012 apic_irq_t *irqptr;
1013 major_t major;
1014 dev_info_t *sdip;
1015
1016 DDI_INTR_IMPLDBG((CE_CONT, "apic_setup_irq_table: dip=0x%p type=%d "
1017 "irqno=0x%x origirq=0x%x\n", (void *)dip, type, irqno, origirq));
1018
1019 ASSERT(ispec != NULL);
1020
1021 major = (dip != NULL) ? ddi_driver_major(dip) : 0;
1022
1023 if (DDI_INTR_IS_MSI_OR_MSIX(type)) {
1024 /* MSI/X doesn't need to set up I/O APIC entries */
1025 ioapicindex = 0xff;
1026 ioapic = 0xff;
1027 ipin = (uchar_t)0xff;
1028 intr_index = (type == DDI_INTR_TYPE_MSI) ? MSI_INDEX :
1029 MSIX_INDEX;
1030 mutex_enter(&airq_mutex);
1031 if ((irqno = apic_allocate_irq(apic_first_avail_irq)) == -1) {
1032 mutex_exit(&airq_mutex);
1033 /* need an irq for MSI/X to index into autovect[] */
1034 cmn_err(CE_WARN, "No interrupt irq: %s instance %d",
1035 ddi_get_name(dip), ddi_get_instance(dip));
1036 return (-1);
1037 }
1038 mutex_exit(&airq_mutex);
1039
1040 } else if (intrp != NULL) {
1041 intr_index = (int)(intrp - apic_io_intrp);
1042 ioapic = intrp->intr_destid;
1043 ipin = intrp->intr_destintin;
1044 /* Find ioapicindex. If destid was ALL, we will exit with 0. */
1045 for (ioapicindex = apic_io_max - 1; ioapicindex; ioapicindex--)
1046 if (apic_io_id[ioapicindex] == ioapic)
1047 break;
1048 ASSERT((ioapic == apic_io_id[ioapicindex]) ||
1049 (ioapic == INTR_ALL_APIC));
1050
1051 /* check whether this intin# has been used by another irqno */
1052 if ((newirq = apic_find_intin(ioapicindex, ipin)) != -1) {
1053 return (newirq);
1054 }
1055
1056 } else if (intr_flagp != NULL) {
1057 /* ACPI case */
1058 intr_index = ACPI_INDEX;
1059 ioapicindex = acpi_find_ioapic(irqno);
1060 ASSERT(ioapicindex != 0xFF);
1061 ioapic = apic_io_id[ioapicindex];
1062 ipin = irqno - apic_io_vectbase[ioapicindex];
1063 if (apic_irq_table[irqno] &&
1064 apic_irq_table[irqno]->airq_mps_intr_index == ACPI_INDEX) {
1065 ASSERT(apic_irq_table[irqno]->airq_intin_no == ipin &&
1066 apic_irq_table[irqno]->airq_ioapicindex ==
1067 ioapicindex);
1068 return (irqno);
1069 }
1070
1071 } else {
1072 /* default configuration */
1073 ioapicindex = 0;
1074 ioapic = apic_io_id[ioapicindex];
1075 ipin = (uchar_t)irqno;
1076 intr_index = DEFAULT_INDEX;
1077 }
1078
1079 if (ispec == NULL) {
1080 APIC_VERBOSE_IOAPIC((CE_WARN, "No intrspec for irqno = %x\n",
1081 irqno));
1082 } else if ((vector = apic_allocate_vector(ipl, irqno, 0)) == 0) {
1083 if ((newirq = apic_share_vector(irqno, intr_flagp, intr_index,
1084 ipl, ioapicindex, ipin, &irqptr)) != -1) {
1085 irqptr->airq_ipl = ipl;
1086 irqptr->airq_origirq = (uchar_t)origirq;
1087 irqptr->airq_dip = dip;
1088 irqptr->airq_major = major;
1089 sdip = apic_irq_table[IRQINDEX(newirq)]->airq_dip;
1090 /* This is OK to do really */
1091 if (sdip == NULL) {
1092 cmn_err(CE_WARN, "Sharing vectors: %s"
1093 " instance %d and SCI",
1094 ddi_get_name(dip), ddi_get_instance(dip));
1095 } else {
1096 cmn_err(CE_WARN, "Sharing vectors: %s"
1097 " instance %d and %s instance %d",
1098 ddi_get_name(sdip), ddi_get_instance(sdip),
1099 ddi_get_name(dip), ddi_get_instance(dip));
1100 }
1101 return (newirq);
1102 }
1103 /* try high priority allocation now that share has failed */
1104 if ((vector = apic_allocate_vector(ipl, irqno, 1)) == 0) {
1105 cmn_err(CE_WARN, "No interrupt vector: %s instance %d",
1106 ddi_get_name(dip), ddi_get_instance(dip));
1107 return (-1);
1108 }
1109 }
1110
1111 mutex_enter(&airq_mutex);
1112 if (apic_irq_table[irqno] == NULL) {
1113 irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP);
1114 irqptr->airq_temp_cpu = IRQ_UNINIT;
1115 apic_irq_table[irqno] = irqptr;
1116 } else {
1117 irqptr = apic_irq_table[irqno];
1118 if (irqptr->airq_mps_intr_index != FREE_INDEX) {
1119 /*
1120 * The slot is used by another irqno, so allocate
1121 * a free irqno for this interrupt
1122 */
1123 newirq = apic_allocate_irq(apic_first_avail_irq);
1124 if (newirq == -1) {
1125 mutex_exit(&airq_mutex);
1126 return (-1);
1127 }
1128 irqno = newirq;
1129 irqptr = apic_irq_table[irqno];
1130 if (irqptr == NULL) {
1131 irqptr = kmem_zalloc(sizeof (apic_irq_t),
1132 KM_SLEEP);
1133 irqptr->airq_temp_cpu = IRQ_UNINIT;
1134 apic_irq_table[irqno] = irqptr;
1135 }
1136 vector = apic_modify_vector(vector, newirq);
1137 }
1138 }
1139 apic_max_device_irq = max(irqno, apic_max_device_irq);
1140 apic_min_device_irq = min(irqno, apic_min_device_irq);
1141 mutex_exit(&airq_mutex);
1142 irqptr->airq_ioapicindex = ioapicindex;
1143 irqptr->airq_intin_no = ipin;
1144 irqptr->airq_ipl = ipl;
1145 irqptr->airq_vector = vector;
1146 irqptr->airq_origirq = (uchar_t)origirq;
1147 irqptr->airq_share_id = 0;
1148 irqptr->airq_mps_intr_index = (short)intr_index;
1149 irqptr->airq_dip = dip;
1150 irqptr->airq_major = major;
1151 irqptr->airq_cpu = apic_bind_intr(dip, irqno, ioapic, ipin);
1152 if (intr_flagp)
1153 irqptr->airq_iflag = *intr_flagp;
1154
1155 if (!DDI_INTR_IS_MSI_OR_MSIX(type)) {
1156 /* setup I/O APIC entry for non-MSI/X interrupts */
1157 apic_record_rdt_entry(irqptr, irqno);
1158 }
1159 return (irqno);
1160 }
1161
1162 /*
1163 * Return the cpu to which this interrupt should be bound.
1164 * Check properties or any other mechanism to see if the user wants it
1165 * bound to a specific CPU. If so, return the cpu id with the high bit set.
1166 * If not, use the policy to choose a cpu and return the id.
1167 */
1168 uint32_t
1169 apic_bind_intr(dev_info_t *dip, int irq, uchar_t ioapicid, uchar_t intin)
1170 {
1171 int instance, instno, prop_len, bind_cpu, count;
1172 uint_t i, rc;
1173 uint32_t cpu;
1174 major_t major;
1175 char *name, *drv_name, *prop_val, *cptr;
1176 char prop_name[32];
1177 ulong_t iflag;
1178
1179
1180 if (apic_intr_policy == INTR_LOWEST_PRIORITY)
1181 return (IRQ_UNBOUND);
1182
1183 if (apic_nproc == 1)
1184 return (0);
1185
1186 drv_name = NULL;
1187 rc = DDI_PROP_NOT_FOUND;
1188 major = (major_t)-1;
1189 if (dip != NULL) {
1190 name = ddi_get_name(dip);
1191 major = ddi_name_to_major(name);
1192 drv_name = ddi_major_to_name(major);
1193 instance = ddi_get_instance(dip);
1194 if (apic_intr_policy == INTR_ROUND_ROBIN_WITH_AFFINITY) {
1195 i = apic_min_device_irq;
1196 for (; i <= apic_max_device_irq; i++) {
1197
1198 if ((i == irq) || (apic_irq_table[i] == NULL) ||
1199 (apic_irq_table[i]->airq_mps_intr_index
1200 == FREE_INDEX))
1201 continue;
1202
1203 if ((apic_irq_table[i]->airq_major == major) &&
1204 (!(apic_irq_table[i]->airq_cpu &
1205 IRQ_USER_BOUND))) {
1206
1207 cpu = apic_irq_table[i]->airq_cpu;
1208
1209 cmn_err(CE_CONT,
1210 "!%s: %s (%s) instance #%d "
1211 "irq 0x%x vector 0x%x ioapic 0x%x "
1212 "intin 0x%x is bound to cpu %d\n",
1213 psm_name,
1214 name, drv_name, instance, irq,
1215 apic_irq_table[irq]->airq_vector,
1216 ioapicid, intin, cpu);
1217 return (cpu);
1218 }
1219 }
1220 }
1221 /*
1222 * Search for the "drvname"_intpt_bind_cpus property first; the
1223 * syntax of the property should be "a[,b,c,...]" where
1224 * instance 0 binds to cpu a, instance 1 binds to cpu b,
1225 * instance 2 binds to cpu c...
1226 * ddi_getlongprop() will search /option first, then /.
1227 * If "drvname"_intpt_bind_cpus doesn't exist, then look for the
1228 * intpt_bind_cpus property. The syntax is the same, and
1229 * it applies to all devices if the "drvname"-specific
1230 * property doesn't exist.
1231 */
1232 (void) strcpy(prop_name, drv_name);
1233 (void) strcat(prop_name, "_intpt_bind_cpus");
1234 rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0, prop_name,
1235 (caddr_t)&prop_val, &prop_len);
1236 if (rc != DDI_PROP_SUCCESS) {
1237 rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0,
1238 "intpt_bind_cpus", (caddr_t)&prop_val, &prop_len);
1239 }
1240 }
1241 if (rc == DDI_PROP_SUCCESS) {
1242 for (i = count = 0; i < (prop_len - 1); i++)
1243 if (prop_val[i] == ',')
1244 count++;
1245 if (prop_val[i-1] != ',')
1246 count++;
1247 /*
1248 * If the binding instances defined in the property are
1249 * not enough for this instance number, then wrap around
1250 * and reuse the pattern until the requested instance is
1251 * reached.
1252 */
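/*
 * For example, with prop_val "0,2,4" and instance 5: count is 3,
 * instno is 5 % 3 = 2, the walk below skips two commas, and
 * bind_cpu parses to 4.
 */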
1253 instno = instance % count;
1254 i = 0;
1255 cptr = prop_val;
1256 while (i < instno)
1257 if (*cptr++ == ',')
1258 i++;
1259 bind_cpu = stoi(&cptr);
1260 /* if specific CPU is bogus, then default to next cpu */
1261 if (!apic_cpu_in_range(bind_cpu)) {
1262 cmn_err(CE_WARN, "%s: %s=%s: CPU %d not present",
1263 psm_name, prop_name, prop_val, bind_cpu);
1264 rc = DDI_PROP_NOT_FOUND;
1265 } else {
1266 /* indicate that we are bound at user request */
1267 bind_cpu |= IRQ_USER_BOUND;
1268 }
1269 kmem_free(prop_val, prop_len);
1270 /*
1271 * no need to check apic_cpus[].aci_status, if specific CPU is
1272 * not up, then post_cpu_start will handle it.
1273 */
1274 }
1275 if (rc != DDI_PROP_SUCCESS) {
1276 iflag = intr_clear();
1277 lock_set(&apic_ioapic_lock);
1278 bind_cpu = apic_get_next_bind_cpu();
1279 lock_clear(&apic_ioapic_lock);
1280 intr_restore(iflag);
1281 }
1282
1283 if (drv_name != NULL)
1284 cmn_err(CE_CONT, "!%s: %s (%s) instance %d irq 0x%x "
1285 "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n",
1286 psm_name, name, drv_name, instance, irq,
1287 apic_irq_table[irq]->airq_vector, ioapicid, intin,
1288 bind_cpu & ~IRQ_USER_BOUND);
1289 else
1290 cmn_err(CE_CONT, "!%s: irq 0x%x "
1291 "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n",
1292 psm_name, irq, apic_irq_table[irq]->airq_vector, ioapicid,
1293 intin, bind_cpu & ~IRQ_USER_BOUND);
1294
1295 return ((uint32_t)bind_cpu);
1296 }
1297
1298 /*
1299 * Mark vector as being in the process of being deleted. Interrupts
1300 * may still come in on some CPU. The moment an interrupt comes with
1301 * the new vector, we know we can free the old one. Called only from
1302 * addspl and delspl with interrupts disabled. Because an interrupt
1303 * can be shared and it is possible that no interrupt from any of the
1304 * sharing devices ever comes in, we also use a timeout mechanism, which
1305 * we arbitrarily set to apic_revector_timeout microseconds.
1306 */
1307 static void
1308 apic_mark_vector(uchar_t oldvector, uchar_t newvector)
1309 {
1310 ulong_t iflag;
1311
1312 iflag = intr_clear();
1313 lock_set(&apic_revector_lock);
1314 if (!apic_oldvec_to_newvec) {
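/*
 * One allocation holds both translation tables: the first
 * APIC_MAX_VECTOR entries map old->new vectors, and the second
 * half (apic_newvec_to_oldvec, set below) maps new->old.
 */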
1315 apic_oldvec_to_newvec =
1316 kmem_zalloc(sizeof (newvector) * APIC_MAX_VECTOR * 2,
1317 KM_NOSLEEP);
1318
1319 if (!apic_oldvec_to_newvec) {
1320 /*
1321 * This failure is not catastrophic.
1322 * But, the oldvec will never be freed.
1323 */
1324 apic_error |= APIC_ERR_MARK_VECTOR_FAIL;
1325 lock_clear(&apic_revector_lock);
1326 intr_restore(iflag);
1327 return;
1328 }
1329 apic_newvec_to_oldvec = &apic_oldvec_to_newvec[APIC_MAX_VECTOR];
1330 }
1331
1332 /* See if we already did this for drivers which do double addintrs */
1333 if (apic_oldvec_to_newvec[oldvector] != newvector) {
1334 apic_oldvec_to_newvec[oldvector] = newvector;
1335 apic_newvec_to_oldvec[newvector] = oldvector;
1336 apic_revector_pending++;
1337 }
1338 lock_clear(&apic_revector_lock);
1339 intr_restore(iflag);
1340 (void) timeout(apic_xlate_vector_free_timeout_handler,
1341 (void *)(uintptr_t)oldvector, drv_usectohz(apic_revector_timeout));
1342 }
1343
1344 /*
1345 * xlate_vector is called from intr_enter if revector_pending is set.
1346 * It will xlate it if needed and mark the old vector as free.
1347 */
1348 uchar_t
1349 apic_xlate_vector(uchar_t vector)
1350 {
1351 uchar_t newvector, oldvector = 0;
1352
1353 lock_set(&apic_revector_lock);
1354 /* Do we really need to do this? */
1355 if (!apic_revector_pending) {
1356 lock_clear(&apic_revector_lock);
1357 return (vector);
1358 }
1359 if ((newvector = apic_oldvec_to_newvec[vector]) != 0)
1360 oldvector = vector;
1361 else {
1362 /*
1363 * The incoming vector is new. See if a stale entry
1364 * remains.
1365 */
1366 if ((oldvector = apic_newvec_to_oldvec[vector]) != 0)
1367 newvector = vector;
1368 }
1369
1370 if (oldvector) {
1371 apic_revector_pending--;
1372 apic_oldvec_to_newvec[oldvector] = 0;
1373 apic_newvec_to_oldvec[newvector] = 0;
1374 apic_free_vector(oldvector);
1375 lock_clear(&apic_revector_lock);
1376 /* There could have been more than one reprogramming! */
1377 return (apic_xlate_vector(newvector));
1378 }
1379 lock_clear(&apic_revector_lock);
1380 return (vector);
1381 }
1382
1383 void
1384 apic_xlate_vector_free_timeout_handler(void *arg)
1385 {
1386 ulong_t iflag;
1387 uchar_t oldvector, newvector;
1388
1389 oldvector = (uchar_t)(uintptr_t)arg;
1390 iflag = intr_clear();
1391 lock_set(&apic_revector_lock);
1392 if ((newvector = apic_oldvec_to_newvec[oldvector]) != 0) {
1393 apic_free_vector(oldvector);
1394 apic_oldvec_to_newvec[oldvector] = 0;
1395 apic_newvec_to_oldvec[newvector] = 0;
1396 apic_revector_pending--;
1397 }
1398
1399 lock_clear(&apic_revector_lock);
1400 intr_restore(iflag);
1401 }
1402
1403 /*
1404 * Bind interrupt corresponding to irq_ptr to bind_cpu.
1405 * Must be called with interrupts disabled and apic_ioapic_lock held
1406 */
1407 int
1408 apic_rebind(apic_irq_t *irq_ptr, int bind_cpu,
1409 struct ioapic_reprogram_data *drep)
1410 {
1411 int ioapicindex, intin_no;
1412 uint32_t airq_temp_cpu;
1413 apic_cpus_info_t *cpu_infop;
1414 uint32_t rdt_entry;
1415 int which_irq;
1416 ioapic_rdt_t irdt;
1417
1418 which_irq = apic_vector_to_irq[irq_ptr->airq_vector];
1419
1420 intin_no = irq_ptr->airq_intin_no;
1421 ioapicindex = irq_ptr->airq_ioapicindex;
1422 airq_temp_cpu = irq_ptr->airq_temp_cpu;
1423 if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu != IRQ_UNBOUND) {
1424 if (airq_temp_cpu & IRQ_USER_BOUND)
1425 /* Mask off high bit so it can be used as array index */
1426 airq_temp_cpu &= ~IRQ_USER_BOUND;
1427
1428 ASSERT(apic_cpu_in_range(airq_temp_cpu));
1429 }
1430
1431 /*
1432 * Can't bind to a CPU that's not accepting interrupts:
1433 */
1434 cpu_infop = &apic_cpus[bind_cpu & ~IRQ_USER_BOUND];
1435 if (!(cpu_infop->aci_status & APIC_CPU_INTR_ENABLE))
1436 return (1);
1437
1438 /*
1439 * If we are about to change the interrupt vector for this interrupt,
1440 * and this interrupt is level-triggered, attached to an IOAPIC,
1441 * has been delivered to a CPU and that CPU has not handled it
1442 * yet, we cannot reprogram the IOAPIC now.
1443 */
1444 if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) {
1445
1446 rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex,
1447 intin_no);
1448
1449 if ((irq_ptr->airq_vector != RDT_VECTOR(rdt_entry)) &&
1450 apic_check_stuck_interrupt(irq_ptr, airq_temp_cpu,
1451 bind_cpu, ioapicindex, intin_no, which_irq, drep) != 0) {
1452
1453 return (0);
1454 }
1455
1456 /*
1457 * NOTE: We do not unmask the RDT here, as an interrupt MAY
1458 * still come in before we have a chance to reprogram it below.
1459 * The reprogramming below will simultaneously change and
1460 * unmask the RDT entry.
1461 */
1462
1463 if ((uint32_t)bind_cpu == IRQ_UNBOUND) {
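/*
 * Unbound: program lowest-priority logical delivery targeting
 * all CPUs (AV_LDEST | AV_LOPRI with an AV_TOALL destination).
 */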
1464 irdt.ir_lo = AV_LDEST | AV_LOPRI |
1465 irq_ptr->airq_rdt_entry;
1466
1467 WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no,
1468 AV_TOALL);
1469
1470 if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu !=
1471 IRQ_UNBOUND)
1472 apic_cpus[airq_temp_cpu].aci_temp_bound--;
1473
1474 /*
1475 * Write the vector, trigger, and polarity portion of
1476 * the RDT
1477 */
1478 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin_no,
1479 irdt.ir_lo);
1480
1481 irq_ptr->airq_temp_cpu = IRQ_UNBOUND;
1482 return (0);
1483 }
1484 }
1485
1486 if (bind_cpu & IRQ_USER_BOUND) {
1487 cpu_infop->aci_bound++;
1488 } else {
1489 cpu_infop->aci_temp_bound++;
1490 }
1491 ASSERT(apic_cpu_in_range(bind_cpu));
1492
1493 if ((airq_temp_cpu != IRQ_UNBOUND) && (airq_temp_cpu != IRQ_UNINIT)) {
1494 apic_cpus[airq_temp_cpu].aci_temp_bound--;
1495 }
1496 if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) {
1497
1498 irdt.ir_lo = AV_PDEST | AV_FIXED | irq_ptr->airq_rdt_entry;
1499 irdt.ir_hi = cpu_infop->aci_local_id;
1500
1501 /* Write the RDT entry -- bind to a specific CPU: */
1502 WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no,
1503 irdt.ir_hi << APIC_ID_BIT_OFFSET);
1504
1505 /* Write the vector, trigger, and polarity portion of the RDT */
1506 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin_no,
1507 irdt.ir_lo);
1508
1509 } else {
1510 int type = (irq_ptr->airq_mps_intr_index == MSI_INDEX) ?
1511 DDI_INTR_TYPE_MSI : DDI_INTR_TYPE_MSIX;
1512 if (type == DDI_INTR_TYPE_MSI) {
1513 if (irq_ptr->airq_ioapicindex ==
1514 irq_ptr->airq_origirq) {
1515 /* first one */
1516 DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call "
1517 "apic_pci_msi_enable_vector\n"));
1518 apic_pci_msi_enable_vector(irq_ptr,
1519 type, which_irq, irq_ptr->airq_vector,
1520 irq_ptr->airq_intin_no,
1521 cpu_infop->aci_local_id);
1522 }
1523 if ((irq_ptr->airq_ioapicindex +
1524 irq_ptr->airq_intin_no - 1) ==
1525 irq_ptr->airq_origirq) { /* last one */
1526 DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call "
1527 "apic_pci_msi_enable_mode\n"));
1528 apic_pci_msi_enable_mode(irq_ptr->airq_dip,
1529 type, which_irq);
1530 }
1531 } else { /* MSI-X */
1532 apic_pci_msi_enable_vector(irq_ptr, type,
1533 irq_ptr->airq_origirq, irq_ptr->airq_vector, 1,
1534 cpu_infop->aci_local_id);
1535 apic_pci_msi_enable_mode(irq_ptr->airq_dip, type,
1536 irq_ptr->airq_origirq);
1537 }
1538 }
1539 irq_ptr->airq_temp_cpu = (uint32_t)bind_cpu;
1540 apic_redist_cpu_skip &= ~(1 << (bind_cpu & ~IRQ_USER_BOUND));
1541 return (0);
1542 }
1543
1544 static void
1545 apic_last_ditch_clear_remote_irr(int ioapic_ix, int intin_no)
1546 {
1547 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no)
1548 & AV_REMOTE_IRR) != 0) {
1549 /*
1550 * Trying to clear the bit through normal
1551 * channels has failed. So as a last-ditch
1552 * effort, try to set the trigger mode to
1553 * edge, then to level. This has been
1554 * observed to work on many systems.
1555 */
1556 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1557 intin_no,
1558 READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1559 intin_no) & ~AV_LEVEL);
1560
1561 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1562 intin_no,
1563 READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1564 intin_no) | AV_LEVEL);
1565
1566 /*
1567 * If the bit's STILL set, this interrupt may
1568 * be hosed.
1569 */
1570 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1571 intin_no) & AV_REMOTE_IRR) != 0) {
1572
1573 prom_printf("%s: Remote IRR still "
1574 "not clear for IOAPIC %d intin %d.\n"
1575 "\tInterrupts to this pin may cease "
1576 "functioning.\n", psm_name, ioapic_ix,
1577 intin_no);
1578 #ifdef DEBUG
1579 apic_last_ditch_reprogram_failures++;
1580 #endif
1581 }
1582 }
1583 }
1584
1585 /*
1586 * This function is protected by apic_ioapic_lock coupled with the
1587 * fact that interrupts are disabled.
1588 */
1589 static void
1590 delete_defer_repro_ent(int which_irq)
1591 {
1592 ASSERT(which_irq >= 0);
1593 ASSERT(which_irq <= 255);
1594 ASSERT(LOCK_HELD(&apic_ioapic_lock));
1595
1596 if (apic_reprogram_info[which_irq].done)
1597 return;
1598
1599 apic_reprogram_info[which_irq].done = B_TRUE;
1600
1601 #ifdef DEBUG
1602 apic_defer_repro_total_retries +=
1603 apic_reprogram_info[which_irq].tries;
1604
1605 apic_defer_repro_successes++;
1606 #endif
1607
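/*
 * The last outstanding deferral is done; restore the normal
 * interrupt-exit handler.
 */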
1608 if (--apic_reprogram_outstanding == 0) {
1609
1610 setlvlx = psm_intr_exit_fn();
1611 }
1612 }
1613
1614
1615 /*
1616 * Interrupts must be disabled during this function to prevent
1617 * self-deadlock. Interrupts are disabled because this function
1618 * is called from apic_check_stuck_interrupt(), which is called
1619 * from apic_rebind(), which requires its caller to disable interrupts.
1620 */
1621 static void
1622 add_defer_repro_ent(apic_irq_t *irq_ptr, int which_irq, int new_bind_cpu)
1623 {
1624 ASSERT(which_irq >= 0);
1625 ASSERT(which_irq <= 255);
1626 ASSERT(!interrupts_enabled());
1627
1628 /*
1629 * On the off-chance that there's already a deferred
1630 * reprogramming on this irq, check, and if so, just update the
1631 * CPU and irq pointer to which the interrupt is targeted, then return.
1632 */
1633 if (!apic_reprogram_info[which_irq].done) {
1634 apic_reprogram_info[which_irq].bindcpu = new_bind_cpu;
1635 apic_reprogram_info[which_irq].irqp = irq_ptr;
1636 return;
1637 }
1638
1639 apic_reprogram_info[which_irq].irqp = irq_ptr;
1640 apic_reprogram_info[which_irq].bindcpu = new_bind_cpu;
1641 apic_reprogram_info[which_irq].tries = 0;
1642 /*
1643 * This must be the last thing set, since we're not
1644 * grabbing any locks; apic_try_deferred_reprogram() will
1645 * make its decision about using this entry iff done
1646 * is false.
1647 */
1648 apic_reprogram_info[which_irq].done = B_FALSE;
1649
1650 /*
1651 * If there were previously no deferred reprogrammings, change
1652 * setlvlx to call apic_try_deferred_reprogram()
1653 */
1654 if (++apic_reprogram_outstanding == 1) {
1655
1656 setlvlx = apic_try_deferred_reprogram;
1657 }
1658 }
1659
1660 static void
1661 apic_try_deferred_reprogram(int prev_ipl, int irq)
1662 {
1663 int reproirq;
1664 ulong_t iflag;
1665 struct ioapic_reprogram_data *drep;
1666
1667 (*psm_intr_exit_fn())(prev_ipl, irq);
1668
1669 if (!lock_try(&apic_defer_reprogram_lock)) {
1670 return;
1671 }
1672
1673 /*
1674 * Acquire the apic_ioapic_lock so that any other operations that
1675 * may affect the apic_reprogram_info state are serialized.
1676 * It's still possible for the last deferred reprogramming to clear
1677 * between the time we entered this function and the time we get to
1678 * the for loop below. In that case, *setlvlx will have been set
1679 * back to *_intr_exit and drep will be NULL. (There's no way to
1680 * stop that from happening -- we would need to grab a lock before
1681 * calling *setlvlx, which is neither realistic nor prudent).
1682 */
1683 iflag = intr_clear();
1684 lock_set(&apic_ioapic_lock);
1685
1686 /*
1687 * For each deferred RDT entry, try to reprogram it now. Note that
1688 * there is no lock acquisition to read apic_reprogram_info because
1689 * '.done' is set only after the other fields in the structure are set.
1690 */
1691
1692 drep = NULL;
1693 for (reproirq = 0; reproirq <= APIC_MAX_VECTOR; reproirq++) {
1694 if (apic_reprogram_info[reproirq].done == B_FALSE) {
1695 drep = &apic_reprogram_info[reproirq];
1696 break;
1697 }
1698 }
1699
1700 /*
1701 * Either we found a deferred action to perform, or
1702 * we entered this function spuriously, after *setlvlx
1703 * was restored to point to *_intr_exit. Any other
1704 * permutation is invalid.
1705 */
1706 ASSERT(drep != NULL || *setlvlx == psm_intr_exit_fn());
1707
1708 /*
1709 * Though we can't really do anything about errors
1710 * at this point, keep track of them for reporting.
1711 * Note that it is very possible for apic_setup_io_intr
1712 * to re-register this very timeout if the Remote IRR bit
1713 * has not yet cleared.
1714 */
1715
1716 #ifdef DEBUG
1717 if (drep != NULL) {
1718 if (apic_setup_io_intr(drep, reproirq, B_TRUE) != 0) {
1719 apic_deferred_setup_failures++;
1720 }
1721 } else {
1722 apic_deferred_spurious_enters++;
1723 }
1724 #else
1725 if (drep != NULL)
1726 (void) apic_setup_io_intr(drep, reproirq, B_TRUE);
1727 #endif
1728
1729 lock_clear(&apic_ioapic_lock);
1730 intr_restore(iflag);
1731
1732 lock_clear(&apic_defer_reprogram_lock);
1733 }
1734
1735 static void
1736 apic_ioapic_wait_pending_clear(int ioapic_ix, int intin_no)
1737 {
1738 int waited;
1739
1740 /*
1741 * Wait for the delivery pending bit to clear.
1742 */
1743 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) &
1744 (AV_LEVEL|AV_PENDING)) == (AV_LEVEL|AV_PENDING)) {
1745
1746 /*
1747 * If we're still waiting on the delivery of this interrupt,
1748 * continue to wait here until it is delivered (this should be
1749 * a very small amount of time, but include a timeout just in
1750 * case).
1751 */
1752 for (waited = 0; waited < apic_max_reps_clear_pending;
1753 waited++) {
1754 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1755 intin_no) & AV_PENDING) == 0) {
1756 break;
1757 }
1758 }
1759 }
1760 }
1761
1762
1763 /*
1764 * Checks to see if the IOAPIC interrupt entry specified has its Remote IRR
1765 * bit set. Calls functions that modify the function that setlvlx points to,
1766 * so that the reprogramming can be retried very shortly.
1767 *
1768 * This function will mask the RDT entry if the interrupt is level-triggered.
1769 * (The caller is responsible for unmasking the RDT entry.)
1770 *
1771 * Returns non-zero if the caller should defer IOAPIC reprogramming.
1772 */
static int
apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu,
    int new_bind_cpu, int ioapic_ix, int intin_no, int which_irq,
    struct ioapic_reprogram_data *drep)
{
	int32_t	rdt_entry;
	int	waited;
	int	reps = 0;

	/*
	 * Wait for the delivery pending bit to clear.
	 */
	do {
		++reps;

		apic_ioapic_wait_pending_clear(ioapic_ix, intin_no);

		/*
		 * Mask the RDT entry, but only if it's a level-triggered
		 * interrupt
		 */
		rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
		    intin_no);
		if ((rdt_entry & (AV_LEVEL|AV_MASK)) == AV_LEVEL) {

			/* Mask it */
			WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no,
			    AV_MASK | rdt_entry);
		}

		if ((rdt_entry & AV_LEVEL) == AV_LEVEL) {
			/*
			 * If there was a race and an interrupt was injected
			 * just before we masked, check for that case here.
			 * Then, unmask the RDT entry and try again. If we're
			 * on our last try, don't unmask (because we want the
			 * RDT entry to remain masked for the rest of the
			 * function).
			 */
			rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
			    intin_no);
			if ((rdt_entry & AV_PENDING) &&
			    (reps < apic_max_reps_clear_pending)) {
				/* Unmask it */
				WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
				    intin_no, rdt_entry & ~AV_MASK);
			}
		}

	} while ((rdt_entry & AV_PENDING) &&
	    (reps < apic_max_reps_clear_pending));

#ifdef DEBUG
	if (rdt_entry & AV_PENDING)
		apic_intr_deliver_timeouts++;
#endif

	/*
	 * If the remote IRR bit is set, then the interrupt has been sent
	 * to a CPU for processing.  We have no choice but to wait for
	 * that CPU to process the interrupt, at which point the remote IRR
	 * bit will be cleared.
	 */
	if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) &
	    (AV_LEVEL|AV_REMOTE_IRR)) == (AV_LEVEL|AV_REMOTE_IRR)) {

		/*
		 * If the CPU that this RDT is bound to is NOT the current
		 * CPU, wait until that CPU handles the interrupt and ACKs
		 * it.  If this interrupt is not bound to any CPU (that is,
		 * if it's bound to the logical destination of "anyone"), it
		 * may have been delivered to the current CPU so handle that
		 * case by deferring the reprogramming (below).
		 */
		if ((old_bind_cpu != IRQ_UNBOUND) &&
		    (old_bind_cpu != IRQ_UNINIT) &&
		    (old_bind_cpu != psm_get_cpu_id())) {
			for (waited = 0; waited < apic_max_reps_clear_pending;
			    waited++) {
				if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
				    intin_no) & AV_REMOTE_IRR) == 0) {

					delete_defer_repro_ent(which_irq);

					/* Remote IRR has cleared! */
					return (0);
				}
			}
		}

		/*
		 * If we waited and the Remote IRR bit still has not cleared,
		 * AND we have already retried this interrupt
		 * APIC_REPROGRAM_MAX_TRIES times, try the last-ditch
		 * workaround:
		 */
		if (drep && drep->tries >= APIC_REPROGRAM_MAX_TRIES) {

			apic_last_ditch_clear_remote_irr(ioapic_ix, intin_no);

			/* Mark this one as reprogrammed: */
			delete_defer_repro_ent(which_irq);

			return (0);
		} else {
#ifdef DEBUG
			apic_intr_deferrals++;
#endif

			/*
			 * If waiting for the Remote IRR bit (above) didn't
			 * allow it to clear, defer the reprogramming.
			 * Add a new deferred-programming entry if the
			 * caller passed a NULL one (and update the existing
			 * one in case anything changed).
			 */
			add_defer_repro_ent(irq_ptr, which_irq, new_bind_cpu);
			if (drep)
				drep->tries++;

			/* Inform caller to defer IOAPIC programming: */
			return (1);
		}

	}

	/* Remote IRR is clear */
	delete_defer_repro_ent(which_irq);

	return (0);
}
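
/*
 * A minimal, self-contained sketch (not part of the driver) of the
 * mask/re-check/retry idiom apic_check_stuck_interrupt() uses to close the
 * race in which an interrupt is latched between reading an RDT entry and
 * masking it.  The entry is modeled as a plain volatile word; PENDING_BIT,
 * MASK_BIT and MAX_MASK_RETRIES are hypothetical stand-ins for AV_PENDING,
 * AV_MASK and apic_max_reps_clear_pending.
 */
#include <stdint.h>

#define	PENDING_BIT	0x1000
#define	MASK_BIT	0x10000
#define	MAX_MASK_RETRIES	100

static int
mask_entry_racefree(volatile uint32_t *entry)
{
	uint32_t val;
	int reps = 0;

	do {
		++reps;

		*entry |= MASK_BIT;	/* mask the source */
		val = *entry;		/* re-read after masking */

		if ((val & PENDING_BIT) && reps < MAX_MASK_RETRIES) {
			/*
			 * An interrupt slipped in before the mask took
			 * effect: unmask so it can be delivered, then retry.
			 * On the final attempt, leave the entry masked.
			 */
			*entry &= ~MASK_BIT;
		}
	} while ((val & PENDING_BIT) && reps < MAX_MASK_RETRIES);

	/* non-zero if the entry is masked with nothing left pending */
	return ((val & PENDING_BIT) == 0);
}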

/*
 * Called to migrate all interrupts at an irq to another cpu.
 * Must be called with interrupts disabled and apic_ioapic_lock held
 */
int
apic_rebind_all(apic_irq_t *irq_ptr, int bind_cpu)
{
	apic_irq_t	*irqptr = irq_ptr;
	int		retval = 0;

	while (irqptr) {
		if (irqptr->airq_temp_cpu != IRQ_UNINIT)
			retval |= apic_rebind(irqptr, bind_cpu, NULL);
		irqptr = irqptr->airq_next;
	}

	return (retval);
}
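
/*
 * A hypothetical usage sketch (not part of the driver) showing how a caller
 * would honor the contract stated above before invoking apic_rebind_all():
 * interrupts disabled and apic_ioapic_lock held.  It mirrors the pattern
 * apic_intr_redistribute() uses below; the function name and return
 * convention here are illustrative only.
 */
static int
rebind_chain_example(apic_irq_t *irqp, int target_cpu)
{
	ulong_t iflag;
	int ret = 1;				/* failure unless rebound */

	iflag = intr_clear();			/* block interrupts */
	if (lock_try(&apic_ioapic_lock)) {
		ret = apic_rebind_all(irqp, target_cpu);
		lock_clear(&apic_ioapic_lock);
	}
	intr_restore(iflag);			/* restore interrupt state */

	return (ret);
}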

/*
 * apic_intr_redistribute does all the messy computations for identifying
 * which interrupt to move to which CPU.  Currently we do just one interrupt
 * at a time.  This reduces the time spent doing all of this within the
 * clock interrupt; if it were done from the idle loop, we could do more
 * than one at a time.
 * First we find the most busy and the most free CPU (based on time spent
 * in ISRs only), skipping those CPUs that have been identified as
 * ineligible (cpu_skip).  Then we look for IRQs whose load is closest to
 * the difference between the most busy CPU and the average ISR load.  We
 * try to find one whose load is less than that difference; if none exists,
 * we choose one larger than the difference, provided it does not make the
 * most idle CPU worse than the most busy one.  In the end, we clear all the
 * busy fields for CPUs.  For IRQs, they are cleared as they are scanned.
 */
void
apic_intr_redistribute(void)
{
	int busiest_cpu, most_free_cpu;
	int cpu_free, cpu_busy, max_busy, min_busy;
	int min_free, diff;
	int average_busy, cpus_online;
	int i, busy;
	ulong_t iflag;
	apic_cpus_info_t *cpu_infop;
	apic_irq_t *min_busy_irq = NULL;
	apic_irq_t *max_busy_irq = NULL;

	busiest_cpu = most_free_cpu = -1;
	cpu_free = cpu_busy = max_busy = average_busy = 0;
	min_free = apic_sample_factor_redistribution;
	cpus_online = 0;
	/*
	 * Below we will check for CPU_INTR_ENABLE, bound, temp_bound,
	 * temp_cpu without holding ioapic_lock.  That is OK as we are just
	 * doing statistical sampling anyway and any inaccuracy now will get
	 * corrected next time.  The call to rebind, which actually changes
	 * things, will make sure we are consistent.
	 */
	for (i = 0; i < apic_nproc; i++) {
		if (apic_cpu_in_range(i) &&
		    !(apic_redist_cpu_skip & (1 << i)) &&
		    (apic_cpus[i].aci_status & APIC_CPU_INTR_ENABLE)) {

			cpu_infop = &apic_cpus[i];
			/*
			 * If no unbound interrupts or only 1 total on this
			 * CPU, skip
			 */
			if (!cpu_infop->aci_temp_bound ||
			    (cpu_infop->aci_bound + cpu_infop->aci_temp_bound)
			    == 1) {
				apic_redist_cpu_skip |= 1 << i;
				continue;
			}

			busy = cpu_infop->aci_busy;
			average_busy += busy;
			cpus_online++;
			if (max_busy < busy) {
				max_busy = busy;
				busiest_cpu = i;
			}
			if (min_free > busy) {
				min_free = busy;
				most_free_cpu = i;
			}
			if (busy > apic_int_busy_mark) {
				cpu_busy |= 1 << i;
			} else {
				if (busy < apic_int_free_mark)
					cpu_free |= 1 << i;
			}
		}
	}
	if ((cpu_busy && cpu_free) ||
	    (max_busy >= (min_free + apic_diff_for_redistribution))) {

		apic_num_imbalance++;
#ifdef DEBUG
		if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
			prom_printf(
			    "redistribute busy=%x free=%x max=%x min=%x",
			    cpu_busy, cpu_free, max_busy, min_free);
		}
#endif /* DEBUG */


		average_busy /= cpus_online;

		diff = max_busy - average_busy;
		min_busy = max_busy; /* start with the max possible value */
		max_busy = 0;
		min_busy_irq = max_busy_irq = NULL;
		i = apic_min_device_irq;
		for (; i <= apic_max_device_irq; i++) {
			apic_irq_t *irq_ptr;
			/* Change to linked list per CPU ? */
			if ((irq_ptr = apic_irq_table[i]) == NULL)
				continue;
			/* Check for irq_busy & decide which one to move */
			/* Also zero them for next round */
			if ((irq_ptr->airq_temp_cpu == busiest_cpu) &&
			    irq_ptr->airq_busy) {
				if (irq_ptr->airq_busy < diff) {
					/*
					 * Check for least busy CPU,
					 * best fit or what ?
					 */
					if (max_busy < irq_ptr->airq_busy) {
						/*
						 * Most busy within the
						 * required differential
						 */
						max_busy = irq_ptr->airq_busy;
						max_busy_irq = irq_ptr;
					}
				} else {
					if (min_busy > irq_ptr->airq_busy) {
						/*
						 * least busy, but more than
						 * the reqd diff
						 */
						if (min_busy <
						    (diff + average_busy -
						    min_free)) {
							/*
							 * Making sure new cpu
							 * will not end up
							 * worse
							 */
							min_busy =
							    irq_ptr->airq_busy;

							min_busy_irq = irq_ptr;
						}
					}
				}
			}
			irq_ptr->airq_busy = 0;
		}

		if (max_busy_irq != NULL) {
#ifdef DEBUG
			if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
				prom_printf("rebinding %x to %x",
				    max_busy_irq->airq_vector, most_free_cpu);
			}
#endif /* DEBUG */
			iflag = intr_clear();
			if (lock_try(&apic_ioapic_lock)) {
				if (apic_rebind_all(max_busy_irq,
				    most_free_cpu) == 0) {
					/* Make change permanent */
					max_busy_irq->airq_cpu =
					    (uint32_t)most_free_cpu;
				}
				lock_clear(&apic_ioapic_lock);
			}
			intr_restore(iflag);

		} else if (min_busy_irq != NULL) {
#ifdef DEBUG
			if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
				prom_printf("rebinding %x to %x",
				    min_busy_irq->airq_vector, most_free_cpu);
			}
#endif /* DEBUG */

			iflag = intr_clear();
			if (lock_try(&apic_ioapic_lock)) {
				if (apic_rebind_all(min_busy_irq,
				    most_free_cpu) == 0) {
					/* Make change permanent */
					min_busy_irq->airq_cpu =
					    (uint32_t)most_free_cpu;
				}
				lock_clear(&apic_ioapic_lock);
			}
			intr_restore(iflag);

		} else {
			if (cpu_busy != (1 << busiest_cpu)) {
				apic_redist_cpu_skip |= 1 << busiest_cpu;
				/*
				 * We leave cpu_skip set so that next time we
				 * can choose another cpu
				 */
			}
		}
		apic_num_rebind++;
	} else {
		/*
		 * Found nothing.  It could be that we skipped over valid
		 * CPUs or that everything is already balanced.  If we had a
		 * variable ticks_for_redistribution, it could be increased
		 * here; apic_int_busy, int_free etc. would also need to be
		 * changed.
		 */
		if (apic_redist_cpu_skip)
			apic_redist_cpu_skip = 0;
	}
	for (i = 0; i < apic_nproc; i++) {
		if (apic_cpu_in_range(i)) {
			apic_cpus[i].aci_busy = 0;
		}
	}
}
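
/*
 * A minimal, self-contained sketch (not part of the driver) of the IRQ
 * selection arithmetic described above apic_intr_redistribute(): given the
 * busy counts of the IRQs currently bound to the busiest CPU, prefer the
 * heaviest IRQ whose load still fits under 'diff' (busiest CPU minus the
 * average ISR load); otherwise take the lightest IRQ above 'diff', but only
 * if moving it would not leave the most-free CPU busier than the busiest
 * CPU was.  All names here are hypothetical; the index of the chosen IRQ is
 * returned, or -1 if nothing qualifies.
 */
#include <stddef.h>

static int
pick_irq_to_move(const int *irq_busy, size_t nirqs, int diff,
    int average_busy, int min_free)
{
	int best_under = -1, best_under_busy = 0;
	int best_over = -1, best_over_busy = 0;
	size_t i;

	for (i = 0; i < nirqs; i++) {
		int busy = irq_busy[i];

		if (busy == 0)
			continue;
		if (busy < diff) {
			/* heaviest candidate within the imbalance */
			if (busy > best_under_busy) {
				best_under_busy = busy;
				best_under = (int)i;
			}
		} else if ((best_over == -1 || busy < best_over_busy) &&
		    busy < diff + average_busy - min_free) {
			/*
			 * lightest candidate above the imbalance that still
			 * leaves the destination no busier than the source
			 * was
			 */
			best_over_busy = busy;
			best_over = (int)i;
		}
	}

	/* prefer the in-budget candidate, as the driver does */
	return (best_under != -1 ? best_under : best_over);
}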

void
apic_cleanup_busy(void)
{
	int i;
	apic_irq_t *irq_ptr;

	for (i = 0; i < apic_nproc; i++) {
		if (apic_cpu_in_range(i)) {
			apic_cpus[i].aci_busy = 0;
		}
	}

	for (i = apic_min_device_irq; i <= apic_max_device_irq; i++) {
		if ((irq_ptr = apic_irq_table[i]) != NULL)
			irq_ptr->airq_busy = 0;
	}
}

int
apic_ioapic_method_probe()
{
	return (PSM_SUCCESS);
}