1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 #define PSMI_1_7
  28 
  29 #include <sys/mutex.h>
  30 #include <sys/types.h>
  31 #include <sys/time.h>
  32 #include <sys/clock.h>
  33 #include <sys/machlock.h>
  34 #include <sys/smp_impldefs.h>
  35 #include <sys/uadmin.h>
  36 #include <sys/promif.h>
  37 #include <sys/psm.h>
  38 #include <sys/psm_common.h>
  39 #include <sys/atomic.h>
  40 #include <sys/apic.h>
  41 #include <sys/archsystm.h>
  42 #include <sys/mach_intr.h>
  43 #include <sys/hypervisor.h>
  44 #include <sys/evtchn_impl.h>
  45 #include <sys/modctl.h>
  46 #include <sys/trap.h>
  47 #include <sys/panic.h>
  48 #include <sys/sysmacros.h>
  49 #include <sys/pci_intr_lib.h>
  50 #include <vm/hat_i86.h>
  51 
  52 #include <xen/public/vcpu.h>
  53 #include <xen/public/physdev.h>
  54 
  55 
  56 /*
  57  * Global Data
  58  */
  59 
  60 int xen_psm_verbose = 0;
  61 
  62 /* As of now we don't support x2apic in xVM */
  63 volatile uint32_t *apicadr = NULL;      /* dummy, so common code will link */
  64 int apic_error = 0;
  65 int apic_verbose = 0;
  66 cpuset_t apic_cpumask;
  67 int apic_forceload = 0;
  68 uchar_t apic_vectortoipl[APIC_AVAIL_VECTOR / APIC_VECTOR_PER_IPL] = {
  69         3, 4, 5, 5, 6, 6, 9, 10, 11, 12, 13, 14, 15, 15
  70 };
  71 uchar_t apic_ipltopri[MAXIPL + 1];
  72 uchar_t apic_ipls[APIC_AVAIL_VECTOR];
  73 uint_t apic_picinit_called;
  74 apic_cpus_info_t *apic_cpus;
  75 int xen_psm_intr_policy = INTR_ROUND_ROBIN_WITH_AFFINITY;
  76 /* use to make sure only one cpu handles the nmi */
  77 static lock_t xen_psm_nmi_lock;
  78 int xen_psm_kmdb_on_nmi = 0;            /* 0 - no, 1 - yes enter kmdb */
  79 int xen_psm_panic_on_nmi = 0;
  80 int xen_psm_num_nmis = 0;
  81 
  82 cpuset_t xen_psm_cpus_online;   /* online cpus */
  83 int xen_psm_ncpus = 1;          /* cpu count */
  84 int xen_psm_next_bind_cpu;      /* next cpu to bind an interrupt to */
  85 
  86 int xen_support_msi = 0;
  87 
  88 static int xen_clock_irq = INVALID_IRQ;
  89 
  90 /* flag definitions for xen_psm_verbose */
  91 #define XEN_PSM_VERBOSE_IRQ_FLAG                0x00000001
  92 #define XEN_PSM_VERBOSE_POWEROFF_FLAG           0x00000002
  93 #define XEN_PSM_VERBOSE_POWEROFF_PAUSE_FLAG     0x00000004
  94 
  95 #define XEN_PSM_VERBOSE_IRQ(fmt) \
  96         if (xen_psm_verbose & XEN_PSM_VERBOSE_IRQ_FLAG) \
  97                 cmn_err fmt;
  98 
  99 #define XEN_PSM_VERBOSE_POWEROFF(fmt) \
 100         if (xen_psm_verbose & XEN_PSM_VERBOSE_POWEROFF_FLAG) \
 101                 prom_printf fmt;
 102 
 103 /*
 104  * Dummy apic array to point common routines at that want to do some apic
 105  * manipulation.  Xen doesn't allow guest apic access so we point at these
 106  * memory locations to fake out those who want to do apic fiddling.
 107  */
 108 uint32_t xen_psm_dummy_apic[APIC_IRR_REG + 1];
 109 
 110 static struct psm_info xen_psm_info;
 111 static void xen_psm_setspl(int);
 112 
 113 int
 114 apic_alloc_msi_vectors(dev_info_t *dip, int inum, int count, int pri,
 115     int behavior);
 116 int
 117 apic_alloc_msix_vectors(dev_info_t *dip, int inum, int count, int pri,
 118     int behavior);
 119 
 120 /*
 121  * Local support routines
 122  */
 123 
 124 /*
 125  * Select vcpu to bind xen virtual device interrupt to.
 126  */
 127 /*ARGSUSED*/
 128 int
 129 xen_psm_bind_intr(int irq)
 130 {
 131         int bind_cpu;
 132         apic_irq_t *irqptr;
 133 
 134         bind_cpu = IRQ_UNBOUND;
 135         if (xen_psm_intr_policy == INTR_LOWEST_PRIORITY)
 136                 return (bind_cpu);
 137         if (irq <= APIC_MAX_VECTOR)
 138                 irqptr = apic_irq_table[irq];
 139         else
 140                 irqptr = NULL;
 141         if (irqptr && (irqptr->airq_cpu != IRQ_UNBOUND))
 142                 bind_cpu = irqptr->airq_cpu & ~IRQ_USER_BOUND;
 143         if (bind_cpu != IRQ_UNBOUND) {
 144                 if (!CPU_IN_SET(xen_psm_cpus_online, bind_cpu))
 145                         bind_cpu = 0;
 146                 goto done;
 147         }
 148         if (xen_psm_intr_policy == INTR_ROUND_ROBIN_WITH_AFFINITY) {
 149                 do {
 150                         bind_cpu = xen_psm_next_bind_cpu++;
 151                         if (xen_psm_next_bind_cpu >= xen_psm_ncpus)
 152                                 xen_psm_next_bind_cpu = 0;
 153                 } while (!CPU_IN_SET(xen_psm_cpus_online, bind_cpu));
 154         } else {
 155                 bind_cpu = 0;
 156         }
 157 done:
 158         return (bind_cpu);
 159 }
 160 
 161 /*
 162  * Autoconfiguration Routines
 163  */
 164 
 165 static int
 166 xen_psm_probe(void)
 167 {
 168         int ret = PSM_SUCCESS;
 169 
 170         if (DOMAIN_IS_INITDOMAIN(xen_info))
 171                 ret = apic_probe_common(xen_psm_info.p_mach_idstring);
 172         return (ret);
 173 }
 174 
 175 static void
 176 xen_psm_softinit(void)
 177 {
 178         /* LINTED logical expression always true: op "||" */
 179         ASSERT((1 << EVTCHN_SHIFT) == NBBY * sizeof (ulong_t));
 180         CPUSET_ATOMIC_ADD(xen_psm_cpus_online, 0);
 181         if (DOMAIN_IS_INITDOMAIN(xen_info)) {
 182                 apic_init_common();
 183         }
 184 }
 185 
 186 #define XEN_NSEC_PER_TICK       10 /* XXX - assume we have a 100 Mhz clock */
 187 
 188 /*ARGSUSED*/
 189 static int
 190 xen_psm_clkinit(int hertz)
 191 {
 192         extern enum tod_fault_type tod_fault(enum tod_fault_type, int);
 193         extern int dosynctodr;
 194 
 195         /*
 196          * domU cannot set the TOD hardware, fault the TOD clock now to
 197          * indicate that and turn off attempts to sync TOD hardware
 198          * with the hires timer.
 199          */
 200         if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
 201                 mutex_enter(&tod_lock);
 202                 (void) tod_fault(TOD_RDONLY, 0);
 203                 dosynctodr = 0;
 204                 mutex_exit(&tod_lock);
 205         }
 206         /*
 207          * The hypervisor provides a timer based on the local APIC timer.
 208          * The interface supports requests of nanosecond resolution.
 209          * A common frequency of the apic clock is 100 Mhz which
 210          * gives a resolution of 10 nsec per tick.  What we would really like
 211          * is a way to get the ns per tick value from xen.
 212          * XXPV - This is an assumption that needs checking and may change
 213          */
 214         return (XEN_NSEC_PER_TICK);
 215 }
 216 
 217 static void
 218 xen_psm_hrtimeinit(void)
 219 {
 220         extern int gethrtime_hires;
 221         gethrtime_hires = 1;
 222 }
 223 
 224 /* xen_psm NMI handler */
 225 /*ARGSUSED*/
 226 static void
 227 xen_psm_nmi_intr(caddr_t arg, struct regs *rp)
 228 {
 229         xen_psm_num_nmis++;
 230 
 231         if (!lock_try(&xen_psm_nmi_lock))
 232                 return;
 233 
 234         if (xen_psm_kmdb_on_nmi && psm_debugger()) {
 235                 debug_enter("NMI received: entering kmdb\n");
 236         } else if (xen_psm_panic_on_nmi) {
 237                 /* Keep panic from entering kmdb. */
 238                 nopanicdebug = 1;
 239                 panic("NMI received\n");
 240         } else {
 241                 /*
 242                  * prom_printf is the best shot we have of something which is
 243                  * problem free from high level/NMI type of interrupts
 244                  */
 245                 prom_printf("NMI received\n");
 246         }
 247 
 248         lock_clear(&xen_psm_nmi_lock);
 249 }
 250 
 251 static void
 252 xen_psm_picinit()
 253 {
 254         int cpu, irqno;
 255         cpuset_t cpus;
 256 
 257         if (DOMAIN_IS_INITDOMAIN(xen_info)) {
 258                 /* set a flag so we know we have run xen_psm_picinit() */
 259                 apic_picinit_called = 1;
 260                 LOCK_INIT_CLEAR(&apic_ioapic_lock);
 261 
 262                 /* XXPV - do we need to do this? */
 263                 picsetup();      /* initialise the 8259 */
 264 
 265                 /* enable apic mode if imcr present */
 266                 /* XXPV - do we need to do this either? */
 267                 if (apic_imcrp) {
 268                         outb(APIC_IMCR_P1, (uchar_t)APIC_IMCR_SELECT);
 269                         outb(APIC_IMCR_P2, (uchar_t)APIC_IMCR_APIC);
 270                 }
 271 
 272                 ioapic_init_intr(IOAPIC_NOMASK);
 273                 /*
 274                  * We never called xen_psm_addspl() when the SCI
 275                  * interrupt was added because that happened before the
 276                  * PSM module was loaded.  Fix that up here by doing
 277                  * any missed operations (e.g. bind to CPU)
 278                  */
 279                 if ((irqno = apic_sci_vect) > 0) {
 280                         if ((cpu = xen_psm_bind_intr(irqno)) == IRQ_UNBOUND) {
 281                                 CPUSET_ZERO(cpus);
 282                                 CPUSET_OR(cpus, xen_psm_cpus_online);
 283                         } else {
 284                                 CPUSET_ONLY(cpus, cpu & ~IRQ_USER_BOUND);
 285                         }
 286                         ec_set_irq_affinity(irqno, cpus);
 287                         apic_irq_table[irqno]->airq_temp_cpu =
 288                             (uchar_t)(cpu & ~IRQ_USER_BOUND);
 289                         ec_enable_irq(irqno);
 290                 }
 291         }
 292 
 293         /* add nmi handler - least priority nmi handler */
 294         LOCK_INIT_CLEAR(&xen_psm_nmi_lock);
 295 
 296         if (!psm_add_nmintr(0, (avfunc) xen_psm_nmi_intr,
 297             "xVM_psm NMI handler", (caddr_t)NULL))
 298                 cmn_err(CE_WARN, "xVM_psm: Unable to add nmi handler");
 299 }
 300 
 301 
 302 /*
 303  * generates an interprocessor interrupt to another CPU
 304  */
 305 static void
 306 xen_psm_send_ipi(int cpun, int ipl)
 307 {
 308         ulong_t flag = intr_clear();
 309 
 310         ec_send_ipi(ipl, cpun);
 311         intr_restore(flag);
 312 }
 313 
 314 /*ARGSUSED*/
 315 static int
 316 xen_psm_addspl(int irqno, int ipl, int min_ipl, int max_ipl)
 317 {
 318         int cpu, ret;
 319         cpuset_t cpus;
 320 
 321         /*
 322          * We are called at splhi() so we can't call anything that might end
 323          * up trying to context switch.
 324          */
 325         if (irqno >= PIRQ_BASE && irqno < NR_PIRQS &&
 326             DOMAIN_IS_INITDOMAIN(xen_info)) {
 327                 /*
 328                  * Priority/affinity/enable for PIRQ's is set in ec_setup_pirq()
 329                  */
 330                 ret = apic_addspl_common(irqno, ipl, min_ipl, max_ipl);
 331         } else {
 332                 /*
 333                  * Set priority/affinity/enable for non PIRQs
 334                  */
 335                 ret = ec_set_irq_priority(irqno, ipl);
 336                 ASSERT(ret == 0);
 337                 if ((cpu = xen_psm_bind_intr(irqno)) == IRQ_UNBOUND) {
 338                         CPUSET_ZERO(cpus);
 339                         CPUSET_OR(cpus, xen_psm_cpus_online);
 340                 } else {
 341                         CPUSET_ONLY(cpus, cpu & ~IRQ_USER_BOUND);
 342                 }
 343                 ec_set_irq_affinity(irqno, cpus);
 344                 ec_enable_irq(irqno);
 345         }
 346         return (ret);
 347 }
 348 
 349 /*
 350  * Acquire ownership of this irq on this cpu
 351  */
 352 void
 353 xen_psm_acquire_irq(int irq)
 354 {
 355         ulong_t flags;
 356         int cpuid;
 357 
 358         /*
 359          * If the irq is currently being serviced by another cpu
 360          * we busy-wait for the other cpu to finish.  Take any
 361          * pending interrupts before retrying.
 362          */
 363         do {
 364                 flags = intr_clear();
 365                 cpuid = ec_block_irq(irq);
 366                 intr_restore(flags);
 367         } while (cpuid != CPU->cpu_id);
 368 }
 369 
/*
 * Remove an interrupt handler's priority binding for irqno.  For dom0
 * physical irqs (PIRQs) this defers to the common apic bookkeeping and
 * only unbinds the event channel when the last sharer goes away; for
 * all other irqs the event channel is unbound unconditionally.
 */
/*ARGSUSED*/
static int
xen_psm_delspl(int irqno, int ipl, int min_ipl, int max_ipl)
{
	apic_irq_t *irqptr;
	int err = PSM_SUCCESS;

	if (irqno >= PIRQ_BASE && irqno < NR_PIRQS &&
	    DOMAIN_IS_INITDOMAIN(xen_info)) {
		irqptr = apic_irq_table[irqno];
		/*
		 * unbind if no more sharers of this irq/evtchn
		 */
		if (irqptr->airq_share == 1) {
			/* take ownership on this cpu before unbinding */
			xen_psm_acquire_irq(irqno);
			ec_unbind_irq(irqno);
		}
		err = apic_delspl_common(irqno, ipl, min_ipl, max_ipl);
		/*
		 * If still in use reset priority
		 */
		if (!err && irqptr->airq_share != 0) {
			/* max_ipl is the highest remaining handler's ipl */
			err = ec_set_irq_priority(irqno, max_ipl);
			return (err);
		}
	} else {
		xen_psm_acquire_irq(irqno);
		ec_unbind_irq(irqno);
	}
	return (err);
}
 401 
 402 static processorid_t
 403 xen_psm_get_next_processorid(processorid_t id)
 404 {
 405         if (id == -1)
 406                 return (0);
 407 
 408         for (id++; id < NCPU; id++) {
 409                 switch (-HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL)) {
 410                 case 0:         /* yeah, that one's there */
 411                         return (id);
 412                 default:
 413                 case X_EINVAL:  /* out of range */
 414                         return (-1);
 415                 case X_ENOENT:  /* not present in the domain */
 416                         /*
 417                          * It's not clear that we -need- to keep looking
 418                          * at this point, if, e.g., we can guarantee
 419                          * the hypervisor always keeps a contiguous range
 420                          * of vcpus around this is equivalent to "out of range".
 421                          *
 422                          * But it would be sad to miss a vcpu we're
 423                          * supposed to be using ..
 424                          */
 425                         break;
 426                 }
 427         }
 428 
 429         return (-1);
 430 }
 431 
 432 /*
 433  * XXPV - undo the start cpu op change; return to ignoring this value
 434  *      - also tweak error handling in main startup loop
 435  */
 436 /*ARGSUSED*/
 437 static int
 438 xen_psm_cpu_start(processorid_t id, caddr_t arg)
 439 {
 440         int ret;
 441 
 442         ASSERT(id > 0);
 443         CPUSET_ATOMIC_ADD(xen_psm_cpus_online, id);
 444         ec_bind_cpu_ipis(id);
 445         (void) ec_bind_virq_to_irq(VIRQ_TIMER, id);
 446         if ((ret = xen_vcpu_up(id)) == 0)
 447                 xen_psm_ncpus++;
 448         else
 449                 ret = EINVAL;
 450         return (ret);
 451 }
 452 
 453 /*
 454  * Allocate an irq for inter cpu signaling
 455  */
 456 /*ARGSUSED*/
 457 static int
 458 xen_psm_get_ipivect(int ipl, int type)
 459 {
 460         return (ec_bind_ipi_to_irq(ipl, 0));
 461 }
 462 
 463 /*ARGSUSED*/
 464 static int
 465 xen_psm_get_clockirq(int ipl)
 466 {
 467         if (xen_clock_irq != INVALID_IRQ)
 468                 return (xen_clock_irq);
 469 
 470         xen_clock_irq = ec_bind_virq_to_irq(VIRQ_TIMER, 0);
 471         return (xen_clock_irq);
 472 }
 473 
 474 /*ARGSUSED*/
 475 static void
 476 xen_psm_shutdown(int cmd, int fcn)
 477 {
 478         XEN_PSM_VERBOSE_POWEROFF(("xen_psm_shutdown(%d,%d);\n", cmd, fcn));
 479 
 480         switch (cmd) {
 481         case A_SHUTDOWN:
 482                 switch (fcn) {
 483                 case AD_BOOT:
 484                 case AD_IBOOT:
 485                         (void) HYPERVISOR_shutdown(SHUTDOWN_reboot);
 486                         break;
 487                 case AD_POWEROFF:
 488                         /* fall through if domU or if poweroff fails */
 489                         if (DOMAIN_IS_INITDOMAIN(xen_info))
 490                                 if (apic_enable_acpi)
 491                                         (void) acpi_poweroff();
 492                         /* FALLTHRU */
 493                 case AD_HALT:
 494                 default:
 495                         (void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
 496                         break;
 497                 }
 498                 break;
 499         case A_REBOOT:
 500                 (void) HYPERVISOR_shutdown(SHUTDOWN_reboot);
 501                 break;
 502         default:
 503                 return;
 504         }
 505 }
 506 
 507 
 508 static int
 509 xen_psm_translate_irq(dev_info_t *dip, int irqno)
 510 {
 511         if (dip == NULL) {
 512                 XEN_PSM_VERBOSE_IRQ((CE_CONT, "!xen_psm: irqno = %d"
 513                     " dip = NULL\n", irqno));
 514                 return (irqno);
 515         }
 516         return (irqno);
 517 }
 518 
 519 /*
 520  * xen_psm_intr_enter() acks the event that triggered the interrupt and
 521  * returns the new priority level,
 522  */
 523 /*ARGSUSED*/
 524 static int
 525 xen_psm_intr_enter(int ipl, int *vector)
 526 {
 527         int newipl;
 528         uint_t intno;
 529         cpu_t *cpu = CPU;
 530 
 531         intno = (*vector);
 532 
 533         ASSERT(intno < NR_IRQS);
 534         ASSERT(cpu->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask != 0);
 535 
 536         if (!ec_is_edge_pirq(intno))
 537                 ec_clear_irq(intno);
 538 
 539         newipl = autovect[intno].avh_hi_pri;
 540         if (newipl == 0) {
 541                 /*
 542                  * (newipl == 0) means we have no service routines for this
 543                  * vector.  We will treat this as a spurious interrupt.
 544                  * We have cleared the pending bit already, clear the event
 545                  * mask and return a spurious interrupt.  This case can happen
 546                  * when an interrupt delivery is racing with the removal of
 547                  * of the service routine for that interrupt.
 548                  */
 549                 ec_unmask_irq(intno);
 550                 newipl = -1;    /* flag spurious interrupt */
 551         } else if (newipl <= cpu->cpu_pri) {
 552                 /*
 553                  * (newipl <= cpu->cpu_pri) means that we must be trying to
 554                  * service a vector that was shared with a higher priority
 555                  * isr.  The higher priority handler has been removed and
 556                  * we need to service this int.  We can't return a lower
 557                  * priority than current cpu priority.  Just synthesize a
 558                  * priority to return that should be acceptable.
 559                  * It should never happen that we synthesize a priority that
 560                  * moves us from low-priority to high-priority that would make
 561                  * a us incorrectly run on the high priority stack.
 562                  */
 563                 newipl = cpu->cpu_pri + 1;   /* synthetic priority */
 564                 ASSERT(newipl != LOCK_LEVEL + 1);
 565         }
 566         return (newipl);
 567 }
 568 
 569 
 570 /*
 571  * xen_psm_intr_exit() restores the old interrupt
 572  * priority level after processing an interrupt.
 573  * It is called with interrupts disabled, and does not enable interrupts.
 574  */
 575 /* ARGSUSED */
 576 static void
 577 xen_psm_intr_exit(int ipl, int vector)
 578 {
 579         ec_try_unmask_irq(vector);
 580         xen_psm_setspl(ipl);
 581 }
 582 
 583 intr_exit_fn_t
 584 psm_intr_exit_fn(void)
 585 {
 586         return (xen_psm_intr_exit);
 587 }
 588 
 589 /*
 590  * Check if new ipl level allows delivery of previously unserviced events
 591  */
 592 static void
 593 xen_psm_setspl(int ipl)
 594 {
 595         struct cpu *cpu = CPU;
 596         volatile vcpu_info_t *vci = cpu->cpu_m.mcpu_vcpu_info;
 597         uint16_t pending;
 598 
 599         ASSERT(vci->evtchn_upcall_mask != 0);
 600 
 601         /*
 602          * If new ipl level will enable any pending interrupts, setup so the
 603          * upcoming sti will cause us to get an upcall.
 604          */
 605         pending = cpu->cpu_m.mcpu_intr_pending & ~((1 << (ipl + 1)) - 1);
 606         if (pending) {
 607                 int i;
 608                 ulong_t pending_sels = 0;
 609                 volatile ulong_t *selp;
 610                 struct xen_evt_data *cpe = cpu->cpu_m.mcpu_evt_pend;
 611 
 612                 for (i = bsrw_insn(pending); i > ipl; i--)
 613                         pending_sels |= cpe->pending_sel[i];
 614                 ASSERT(pending_sels);
 615                 selp = (volatile ulong_t *)&vci->evtchn_pending_sel;
 616                 atomic_or_ulong(selp, pending_sels);
 617                 vci->evtchn_upcall_pending = 1;
 618         }
 619 }
 620 
 621 /*
 622  * This function provides external interface to the nexus for all
 623  * functionality related to the new DDI interrupt framework.
 624  *
 625  * Input:
 626  * dip     - pointer to the dev_info structure of the requested device
 627  * hdlp    - pointer to the internal interrupt handle structure for the
 628  *           requested interrupt
 629  * intr_op - opcode for this call
 630  * result  - pointer to the integer that will hold the result to be
 631  *           passed back if return value is PSM_SUCCESS
 632  *
 633  * Output:
 634  * return value is either PSM_SUCCESS or PSM_FAILURE
 635  */
int
xen_intr_ops(dev_info_t *dip, ddi_intr_handle_impl_t *hdlp,
    psm_intr_op_t intr_op, int *result)
{
	int		cap;
	int		err;
	int		new_priority;
	apic_irq_t	*irqp;
	struct intrspec *ispec;

	DDI_INTR_IMPLDBG((CE_CONT, "xen_intr_ops: dip: %p hdlp: %p "
	    "intr_op: %x\n", (void *)dip, (void *)hdlp, intr_op));

	switch (intr_op) {
	case PSM_INTR_OP_CHECK_MSI:
		/*
		 * Till PCI passthru is supported, only dom0 has MSI/MSIX
		 */
		if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
			/* domU: strip the MSI/MSIX bits from the answer */
			*result = hdlp->ih_type & ~(DDI_INTR_TYPE_MSI |
			    DDI_INTR_TYPE_MSIX);
			break;
		}
		/*
		 * Check MSI/X is supported or not at APIC level and
		 * masked off the MSI/X bits in hdlp->ih_type if not
		 * supported before return.  If MSI/X is supported,
		 * leave the ih_type unchanged and return.
		 *
		 * hdlp->ih_type passed in from the nexus has all the
		 * interrupt types supported by the device.
		 */
		if (xen_support_msi == 0) {
			/*
			 * if xen_support_msi is not set, call
			 * apic_check_msi_support() to check whether msi
			 * is supported first
			 */
			if (apic_check_msi_support() == PSM_SUCCESS)
				xen_support_msi = 1;
			else
				xen_support_msi = -1;
		}
		if (xen_support_msi == 1)
			*result = hdlp->ih_type;
		else
			*result = hdlp->ih_type & ~(DDI_INTR_TYPE_MSI |
			    DDI_INTR_TYPE_MSIX);
		break;
	case PSM_INTR_OP_ALLOC_VECTORS:
		/* count in ih_scratch1, behavior flag in ih_scratch2 */
		if (hdlp->ih_type == DDI_INTR_TYPE_MSI)
			*result = apic_alloc_msi_vectors(dip, hdlp->ih_inum,
			    hdlp->ih_scratch1, hdlp->ih_pri,
			    (int)(uintptr_t)hdlp->ih_scratch2);
		else
			*result = apic_alloc_msix_vectors(dip, hdlp->ih_inum,
			    hdlp->ih_scratch1, hdlp->ih_pri,
			    (int)(uintptr_t)hdlp->ih_scratch2);
		break;
	case PSM_INTR_OP_FREE_VECTORS:
		apic_free_vectors(dip, hdlp->ih_inum, hdlp->ih_scratch1,
		    hdlp->ih_pri, hdlp->ih_type);
		break;
	case PSM_INTR_OP_NAVAIL_VECTORS:
		/*
		 * XXPV - maybe we should make this be:
		 * min(APIC_VECTOR_PER_IPL, count of all avail vectors);
		 */
		if (DOMAIN_IS_INITDOMAIN(xen_info))
			*result = APIC_VECTOR_PER_IPL;
		else
			*result = 1;
		break;
	case PSM_INTR_OP_XLATE_VECTOR:
		ispec = ((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp;
		/* only dom0 PIRQs need the apic translation */
		if (ispec->intrspec_vec >= PIRQ_BASE &&
		    ispec->intrspec_vec < NR_PIRQS &&
		    DOMAIN_IS_INITDOMAIN(xen_info)) {
			*result = apic_introp_xlate(dip, ispec, hdlp->ih_type);
		} else {
			*result = ispec->intrspec_vec;
		}
		break;
	case PSM_INTR_OP_GET_PENDING:
		/* XXPV - is this enough for dom0 or do we need to ref ioapic */
		*result = ec_pending_irq(hdlp->ih_vector);
		break;
	case PSM_INTR_OP_CLEAR_MASK:
		/* XXPV - is this enough for dom0 or do we need to set ioapic */
		if (hdlp->ih_type != DDI_INTR_TYPE_FIXED)
			return (PSM_FAILURE);
		ec_enable_irq(hdlp->ih_vector);
		break;
	case PSM_INTR_OP_SET_MASK:
		/* XXPV - is this enough for dom0 or do we need to set ioapic */
		if (hdlp->ih_type != DDI_INTR_TYPE_FIXED)
			return (PSM_FAILURE);
		ec_disable_irq(hdlp->ih_vector);
		break;
	case PSM_INTR_OP_GET_CAP:
		/* masking is only offered for fixed interrupts */
		cap = DDI_INTR_FLAG_PENDING | DDI_INTR_FLAG_EDGE;
		if (hdlp->ih_type == DDI_INTR_TYPE_FIXED)
			cap |= DDI_INTR_FLAG_MASKABLE;
		*result = cap;
		break;
	case PSM_INTR_OP_GET_SHARED:
		if (DOMAIN_IS_INITDOMAIN(xen_info)) {
			if (hdlp->ih_type != DDI_INTR_TYPE_FIXED)
				return (PSM_FAILURE);
			ispec = ((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp;
			if ((irqp = apic_find_irq(dip, ispec, hdlp->ih_type))
			    == NULL)
				return (PSM_FAILURE);
			*result = (irqp->airq_share > 1) ? 1: 0;
		} else {
			return (PSM_FAILURE);
		}
		break;
	case PSM_INTR_OP_SET_PRI:
		/* new priority is passed in through *result */
		new_priority = *(int *)result;
		err = ec_set_irq_priority(hdlp->ih_vector, new_priority);
		if (err != 0)
			return (PSM_FAILURE);
		break;
	case PSM_INTR_OP_GET_INTR:
		if (!DOMAIN_IS_INITDOMAIN(xen_info))
			return (PSM_FAILURE);
		/*
		 * The interrupt handle given here has been allocated
		 * specifically for this command, and ih_private carries
		 * a pointer to a apic_get_intr_t.
		 */
		if (apic_get_vector_intr_info(
		    hdlp->ih_vector, hdlp->ih_private) != PSM_SUCCESS)
			return (PSM_FAILURE);
		break;
	case PSM_INTR_OP_SET_CAP:
		/* FALLTHRU */
	default:
		return (PSM_FAILURE);
	}
	return (PSM_SUCCESS);
}
 779 
 780 static void
 781 xen_psm_rebind_irq(int irq)
 782 {
 783         cpuset_t ncpu;
 784         processorid_t newcpu;
 785         apic_irq_t *irqptr;
 786 
 787         newcpu = xen_psm_bind_intr(irq);
 788         if (newcpu == IRQ_UNBOUND) {
 789                 CPUSET_ZERO(ncpu);
 790                 CPUSET_OR(ncpu, xen_psm_cpus_online);
 791         } else {
 792                 CPUSET_ONLY(ncpu, newcpu & ~IRQ_USER_BOUND);
 793         }
 794         ec_set_irq_affinity(irq, ncpu);
 795         if (irq <= APIC_MAX_VECTOR) {
 796                 irqptr = apic_irq_table[irq];
 797                 ASSERT(irqptr != NULL);
 798                 irqptr->airq_temp_cpu = (uchar_t)newcpu;
 799         }
 800 }
 801 
 802 /*
 803  * Disable all device interrupts for the given cpu.
 804  * High priority interrupts are not disabled and will still be serviced.
 805  */
 806 static int
 807 xen_psm_disable_intr(processorid_t cpun)
 808 {
 809         int irq;
 810 
 811         /*
 812          * Can't offline VCPU 0 on this hypervisor.  There's no reason
 813          * anyone would want to given that the CPUs are virtual. Also note
 814          * that the hypervisor requires suspend/resume to be on VCPU 0.
 815          */
 816         if (cpun == 0)
 817                 return (PSM_FAILURE);
 818 
 819         CPUSET_ATOMIC_DEL(xen_psm_cpus_online, cpun);
 820         for (irq = 0; irq < NR_IRQS; irq++) {
 821                 if (!ec_irq_needs_rebind(irq, cpun))
 822                         continue;
 823                 xen_psm_rebind_irq(irq);
 824         }
 825         return (PSM_SUCCESS);
 826 }
 827 
 828 static void
 829 xen_psm_enable_intr(processorid_t cpun)
 830 {
 831         int irq;
 832 
 833         if (cpun == 0)
 834                 return;
 835 
 836         CPUSET_ATOMIC_ADD(xen_psm_cpus_online, cpun);
 837 
 838         /*
 839          * Rebalance device interrupts among online processors
 840          */
 841         for (irq = 0; irq < NR_IRQS; irq++) {
 842                 if (!ec_irq_rebindable(irq))
 843                         continue;
 844                 xen_psm_rebind_irq(irq);
 845         }
 846 
 847         if (DOMAIN_IS_INITDOMAIN(xen_info)) {
 848                 apic_cpus[cpun].aci_status |= APIC_CPU_INTR_ENABLE;
 849         }
 850 }
 851 
 852 static int
 853 xen_psm_post_cpu_start()
 854 {
 855         processorid_t cpun;
 856 
 857         cpun = psm_get_cpu_id();
 858         if (DOMAIN_IS_INITDOMAIN(xen_info)) {
 859                 /*
 860                  * Non-virtualized environments can call psm_post_cpu_start
 861                  * from Suspend/Resume with the APIC_CPU_INTR_ENABLE bit set.
 862                  * xen_psm_post_cpu_start() is only called from boot.
 863                  */
 864                 apic_cpus[cpun].aci_status |= APIC_CPU_ONLINE;
 865         }
 866         return (PSM_SUCCESS);
 867 }
 868 
 869 /*
 870  * This function will reprogram the timer.
 871  *
 872  * When in oneshot mode the argument is the absolute time in future at which to
 873  * generate the interrupt.
 874  *
 875  * When in periodic mode, the argument is the interval at which the
 876  * interrupts should be generated. There is no need to support the periodic
 877  * mode timer change at this time.
 878  *
 879  * Note that we must be careful to convert from hrtime to Xen system time (see
 880  * xpv_timestamp.c).
 881  */
static void
xen_psm_timer_reprogram(hrtime_t timer_req)
{
	hrtime_t now, timer_new, time_delta, xen_time;
	ulong_t flags;

	flags = intr_clear();
	/*
	 * We should be called from high PIL context (CBE_HIGH_PIL),
	 * so kpreempt is disabled.
	 */

	/*
	 * Sample both clocks with interrupts blocked so the
	 * hrtime -> Xen-system-time conversion below is based on a
	 * consistent pair of readings.
	 */
	now = xpv_gethrtime();
	xen_time = xpv_getsystime();
	if (timer_req <= now) {
		/*
		 * requested to generate an interrupt in the past
		 * generate an interrupt as soon as possible
		 */
		time_delta = XEN_NSEC_PER_TICK;
	} else
		time_delta = timer_req - now;

	/* Re-express the deadline on the Xen system time base. */
	timer_new = xen_time + time_delta;
	if (HYPERVISOR_set_timer_op(timer_new) != 0)
		panic("can't set hypervisor timer?");
	intr_restore(flags);
}
 910 
 911 /*
 912  * This function will enable timer interrupts.
 913  */
static void
xen_psm_timer_enable(void)
{
	/* Re-allow delivery of the xen clock interrupt's event channel. */
	ec_unmask_irq(xen_clock_irq);
}
 919 
 920 /*
 921  * This function will disable timer interrupts on the current cpu.
 922  */
static void
xen_psm_timer_disable(void)
{
	/* Block further delivery of the clock interrupt on this cpu. */
	(void) ec_block_irq(xen_clock_irq);
	/*
	 * If the clock irq is pending on this cpu then we need to
	 * clear the pending interrupt.
	 */
	ec_unpend_irq(xen_clock_irq);
}
 933 
 934 /*
 935  *
 936  * The following functions are in the platform specific file so that they
 937  * can be different functions depending on whether we are running on
 938  * bare metal or a hypervisor.
 939  */
 940 
 941 /*
 942  * Allocate a free vector for irq at ipl.
 943  */
 944 /* ARGSUSED */
 945 uchar_t
 946 apic_allocate_vector(int ipl, int irq, int pri)
 947 {
 948         physdev_irq_t irq_op;
 949         uchar_t vector;
 950         int rc;
 951 
 952         irq_op.irq = irq;
 953 
 954         if ((rc = HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
 955             != 0)
 956                 panic("Hypervisor alloc vector failed err: %d", -rc);
 957         vector = irq_op.vector;
 958         /*
 959          * No need to worry about vector colliding with our reserved vectors
 960          * e.g. T_FASTTRAP, xen can differentiate between hardware and software
 961          * generated traps and handle them properly.
 962          */
 963         apic_vector_to_irq[vector] = (uchar_t)irq;
 964         return (vector);
 965 }
 966 
 967 /* Mark vector as not being used by any irq */
void
apic_free_vector(uchar_t vector)
{
	/* Returning the slot to APIC_RESV_IRQ marks it free for reuse. */
	apic_vector_to_irq[vector] = APIC_RESV_IRQ;
}
 973 
 974 /*
 975  * This function returns the no. of vectors available for the pri.
 976  * dip is not used at this moment.  If we really don't need that,
 977  * it will be removed.  Since priority is not limited by hardware
 978  * when running on the hypervisor we simply return the maximum no.
 979  * of available contiguous vectors.
 980  */
 981 /*ARGSUSED*/
 982 int
 983 apic_navail_vector(dev_info_t *dip, int pri)
 984 {
 985         int     lowest, highest, i, navail, count;
 986 
 987         DDI_INTR_IMPLDBG((CE_CONT, "apic_navail_vector: dip: %p, pri: %x\n",
 988             (void *)dip, pri));
 989 
 990         highest = APIC_MAX_VECTOR;
 991         lowest = APIC_BASE_VECT;
 992         navail = count = 0;
 993 
 994         /* It has to be contiguous */
 995         for (i = lowest; i < highest; i++) {
 996                 count = 0;
 997                 while ((apic_vector_to_irq[i] == APIC_RESV_IRQ) &&
 998                     (i < highest)) {
 999                         count++;
1000                         i++;
1001                 }
1002                 if (count > navail)
1003                         navail = count;
1004         }
1005         return (navail);
1006 }
1007 
/* Devices already handed to the hypervisor; see xen_manage_device(). */
static physdev_manage_pci_t *managed_devlist;
/* Number of valid entries in managed_devlist. */
static int mdev_cnt;
/* Allocated capacity of managed_devlist; doubled when it fills. */
static int mdev_size = 128;
/* Maps an MSI vector to the pirq the hypervisor assigned for it. */
static uchar_t	msi_vector_to_pirq[APIC_MAX_VECTOR+1];
1012 
1013 /*
1014  * Add devfn on given bus to devices managed by hypervisor
1015  */
1016 static int
1017 xen_manage_device(uint8_t bus, uint8_t devfn)
1018 {
1019         physdev_manage_pci_t manage_pci, *newlist;
1020         int rc, i, oldsize;
1021 
1022         /*
1023          * Check if bus/devfn already managed.  If so just return success.
1024          */
1025         if (managed_devlist == NULL) {
1026                 managed_devlist = kmem_alloc(sizeof (physdev_manage_pci_t) *
1027                     mdev_size, KM_NOSLEEP);
1028                 if (managed_devlist == NULL) {
1029                         cmn_err(CE_WARN,
1030                             "Can't alloc space for managed device list");
1031                         return (0);
1032                 }
1033         };
1034         for (i = 0; i < mdev_cnt; i++) {
1035                 if (managed_devlist[i].bus == bus &&
1036                     managed_devlist[i].devfn == devfn)
1037                         return (1); /* device already managed */
1038         }
1039         manage_pci.bus = bus;
1040         manage_pci.devfn = devfn;
1041         rc = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add, &manage_pci);
1042         if (rc < 0) {
1043                 cmn_err(CE_WARN,
1044                     "hypervisor add pci device call failed bus:0x%x"
1045                     " devfn:0x%x", bus, devfn);
1046                 return (0);
1047         }
1048         /*
1049          * Add device to the managed device list
1050          */
1051         if (i == mdev_size) {
1052                 /*
1053                  * grow the managed device list
1054                  */
1055                 oldsize = mdev_size * sizeof (physdev_manage_pci_t);
1056                 mdev_size *= 2;
1057                 newlist = kmem_alloc(sizeof (physdev_manage_pci_t) * mdev_size,
1058                     KM_NOSLEEP);
1059                 if (newlist == NULL) {
1060                         cmn_err(CE_WARN, "Can't grow managed device list");
1061                         return (0);
1062                 }
1063                 bcopy(managed_devlist, newlist, oldsize);
1064                 kmem_free(managed_devlist, oldsize);
1065                 managed_devlist = newlist;
1066         }
1067         managed_devlist[i].bus = bus;
1068         managed_devlist[i].devfn = devfn;
1069         mdev_cnt++;
1070         return (1);
1071 }
1072 
1073 /*
1074  * allocate an apic irq struct for an MSI interrupt
1075  */
static int
msi_allocate_irq(int irq)
{
	apic_irq_t *irqptr = apic_irq_table[irq];

	if (irqptr == NULL) {
		/* First use of this irq: allocate its table entry. */
		irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_NOSLEEP);
		if (irqptr == NULL) {
			cmn_err(CE_WARN, "xpv_psm: NO memory to allocate IRQ");
			return (-1);
		}
		apic_irq_table[irq] = irqptr;
	} else {
		/*
		 * NOTE(review): a zero airq_mps_intr_index on the reserved
		 * irq presumably means the entry was never initialized;
		 * it is normalized to FREE_INDEX so the in-use check below
		 * treats it as available — confirm.
		 */
		if (irq == APIC_RESV_IRQ && irqptr->airq_mps_intr_index == 0)
			irqptr->airq_mps_intr_index = FREE_INDEX;
		if (irqptr->airq_mps_intr_index != FREE_INDEX) {
			cmn_err(CE_WARN, "xpv_psm: MSI IRQ already in use");
			return (-1);
		}
	}
	/* Caller will set the real index (MSI_INDEX/MSIX_INDEX) later. */
	irqptr->airq_mps_intr_index = FREE_INDEX;
	return (irq);
}
1099 
1100 /*
1101  * read MSI/MSIX vector out of config space
1102  */
1103 static uchar_t
1104 xpv_psm_get_msi_vector(dev_info_t *dip, int type, int entry)
1105 {
1106         uint64_t                msi_data = 0;
1107         int                     cap_ptr = i_ddi_get_msi_msix_cap_ptr(dip);
1108         ddi_acc_handle_t        handle = i_ddi_get_pci_config_handle(dip);
1109         ushort_t                msi_ctrl;
1110         uchar_t                 vector;
1111 
1112         ASSERT((handle != NULL) && (cap_ptr != 0));
1113         if (type == DDI_INTR_TYPE_MSI) {
1114                 msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL);
1115                 /*
1116                  * Get vector
1117                  */
1118                 if (msi_ctrl &  PCI_MSI_64BIT_MASK) {
1119                         msi_data = pci_config_get16(handle,
1120                             cap_ptr + PCI_MSI_64BIT_DATA);
1121                 } else {
1122                         msi_data = pci_config_get16(handle,
1123                             cap_ptr + PCI_MSI_32BIT_DATA);
1124                 }
1125                 vector = (msi_data & 0xff) + entry;
1126         } else if (type == DDI_INTR_TYPE_MSIX) {
1127                 uintptr_t       off;
1128                 ddi_intr_msix_t *msix_p = i_ddi_get_msix(dip);
1129 
1130                 /* Offset into the given entry in the MSI-X table */
1131                 off = (uintptr_t)msix_p->msix_tbl_addr +
1132                     (entry  * PCI_MSIX_VECTOR_SIZE);
1133 
1134                 msi_data = ddi_get32(msix_p->msix_tbl_hdl,
1135                     (uint32_t *)(off + PCI_MSIX_DATA_OFFSET));
1136                 vector = msi_data & 0xff;
1137         }
1138         return (vector);
1139 }
1140 
1141 
1142 static void
1143 get_busdevfn(dev_info_t *dip, int *busp, int *devfnp)
1144 {
1145         pci_regspec_t *regspec;
1146         int reglen;
1147 
1148         /*
1149          * Get device reg spec, first word has PCI bus and
1150          * device/function info we need.
1151          */
1152         if (ddi_getlongprop(DDI_DEV_T_NONE, dip, DDI_PROP_DONTPASS, "reg",
1153             (caddr_t)&regspec, &reglen) != DDI_SUCCESS) {
1154                 cmn_err(CE_WARN,
1155                     "get_busdevfn() failed to get regspec.");
1156                 return;
1157         }
1158         /*
1159          * get PCI bus # from reg spec for device
1160          */
1161         *busp = PCI_REG_BUS_G(regspec[0].pci_phys_hi);
1162         /*
1163          * get combined device/function from reg spec for device.
1164          */
1165         *devfnp = (regspec[0].pci_phys_hi & (PCI_REG_FUNC_M | PCI_REG_DEV_M)) >>
1166             PCI_REG_FUNC_SHIFT;
1167 
1168         kmem_free(regspec, reglen);
1169 }
1170 
1171 /*
1172  * This function allocates "count" MSI vector(s) for the given "dip/pri/type"
1173  */
int
apic_alloc_msi_vectors(dev_info_t *dip, int inum, int count, int pri,
    int behavior)
{
	int	rcount, i, rc, irqno;
	uchar_t vector, cpu;
	major_t major;
	apic_irq_t	*irqptr;
	physdev_map_pirq_t map_irq;
	int busnum, devfn;

	DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_msi_vectors: dip=0x%p "
	    "inum=0x%x  pri=0x%x count=0x%x behavior=%d\n",
	    (void *)dip, inum, pri, count, behavior));

	/*
	 * Multiple MSIs are only granted when apic_multi_msi_enable is
	 * set: a strict multi-vector request fails outright, a non-strict
	 * one is trimmed to a single vector.
	 */
	if (count > 1) {
		if (behavior == DDI_INTR_ALLOC_STRICT &&
		    apic_multi_msi_enable == 0)
			return (0);
		if (apic_multi_msi_enable == 0)
			count = 1;
	}

	if ((rcount = apic_navail_vector(dip, pri)) > count)
		rcount = count;
	else if (rcount == 0 || (rcount < count &&
	    behavior == DDI_INTR_ALLOC_STRICT))
		return (0);

	/* if not ISP2, then round it down (MSI grants powers of two) */
	if (!ISP2(rcount))
		rcount = 1 << (highbit(rcount) - 1);

	/*
	 * get PCI bus #  and devfn from reg spec for device
	 */
	get_busdevfn(dip, &busnum, &devfn);

	/*
	 * Tell xen about this pci device
	 */
	if (!xen_manage_device(busnum, devfn))
		return (0);

	mutex_enter(&airq_mutex);

	major = (dip != NULL) ? ddi_name_to_major(ddi_get_name(dip)) : 0;
	for (i = 0; i < rcount; i++) {
		/*
		 * use PHYSDEVOP_map_pirq to have xen map MSI to a pirq
		 */
		map_irq.domid = DOMID_SELF;
		map_irq.type = MAP_PIRQ_TYPE_MSI;
		map_irq.index = -rcount; /* hypervisor auto allocates vectors */
		map_irq.pirq = -1;
		map_irq.bus = busnum;
		map_irq.devfn = devfn;
		map_irq.entry_nr = i;
		map_irq.table_base = 0;
		rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
		irqno = map_irq.pirq;
		if (rc < 0) {
			mutex_exit(&airq_mutex);
			cmn_err(CE_WARN, "map MSI irq failed err: %d", -rc);
			/* Partial success: report how many were set up. */
			return (i);
		}
		if (irqno < 0) {
			mutex_exit(&airq_mutex);
			cmn_err(CE_NOTE,
			    "!hypervisor not configured for MSI support");
			/* Record globally that MSI is unavailable. */
			xen_support_msi = -1;
			return (0);
		}

		/*
		 * Find out what vector the hypervisor assigned
		 */
		vector = xpv_psm_get_msi_vector(dip, DDI_INTR_TYPE_MSI, i);

		if (msi_allocate_irq(irqno) < 0) {
			mutex_exit(&airq_mutex);
			return (i);
		}
		apic_max_device_irq = max(irqno, apic_max_device_irq);
		apic_min_device_irq = min(irqno, apic_min_device_irq);
		irqptr = apic_irq_table[irqno];
		ASSERT(irqptr != NULL);
#ifdef	DEBUG
		if (apic_vector_to_irq[vector] != APIC_RESV_IRQ)
			DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_msi_vectors: "
			    "apic_vector_to_irq is not APIC_RESV_IRQ\n"));
#endif
		apic_vector_to_irq[vector] = (uchar_t)irqno;
		msi_vector_to_pirq[vector] = (uchar_t)irqno;

		irqptr->airq_vector = vector;
		irqptr->airq_ioapicindex = (uchar_t)inum;	/* start */
		irqptr->airq_intin_no = (uchar_t)rcount;
		irqptr->airq_ipl = pri;
		irqptr->airq_origirq = (uchar_t)(inum + i);
		irqptr->airq_share_id = 0;
		irqptr->airq_mps_intr_index = MSI_INDEX;
		irqptr->airq_dip = dip;
		irqptr->airq_major = major;
		if (i == 0) /* they all bind to the same cpu */
			cpu = irqptr->airq_cpu = xen_psm_bind_intr(irqno);
		else
			irqptr->airq_cpu = cpu;
		DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_msi_vectors: irq=0x%x "
		    "dip=0x%p vector=0x%x origirq=0x%x pri=0x%x\n", irqno,
		    (void *)irqptr->airq_dip, irqptr->airq_vector,
		    irqptr->airq_origirq, pri));
	}
	mutex_exit(&airq_mutex);
	return (rcount);
}
1290 
1291 /*
1292  * This function allocates "count" MSI-X vector(s) for the given "dip/pri/type"
1293  */
int
apic_alloc_msix_vectors(dev_info_t *dip, int inum, int count, int pri,
    int behavior)
{
	int	rcount, i, rc;
	major_t major;
	physdev_map_pirq_t map_irq;
	int busnum, devfn;
	ddi_intr_msix_t *msix_p = i_ddi_get_msix(dip);
	uint64_t table_base;
	pfn_t pfnum;

	/* Initialize MSI-X state for the device on first use. */
	if (msix_p == NULL) {
		msix_p = pci_msix_init(dip);
		if (msix_p != NULL) {
			i_ddi_set_msix(dip, msix_p);
		} else {
			cmn_err(CE_WARN, "apic_alloc_msix_vectors()"
			    " msix_init failed");
			return (0);
		}
	}
	/*
	 * Hypervisor wants PCI config space address of msix table base
	 * NOTE(review): '-' binds tighter than '|' below, so the table
	 * offset is subtracted before the page offset is OR'ed in —
	 * confirm this grouping is intended.
	 */
	pfnum = hat_getpfnum(kas.a_hat, (caddr_t)msix_p->msix_tbl_addr) &
	    ~PFN_IS_FOREIGN_MFN;
	table_base = (uint64_t)((pfnum << PAGESHIFT) - msix_p->msix_tbl_offset |
	    ((uintptr_t)msix_p->msix_tbl_addr & PAGEOFFSET));
	/*
	 * get PCI bus #  and devfn from reg spec for device
	 */
	get_busdevfn(dip, &busnum, &devfn);

	/*
	 * Tell xen about this pci device
	 */
	if (!xen_manage_device(busnum, devfn))
		return (0);
	mutex_enter(&airq_mutex);

	if ((rcount = apic_navail_vector(dip, pri)) > count)
		rcount = count;
	else if (rcount == 0 || (rcount < count &&
	    behavior == DDI_INTR_ALLOC_STRICT)) {
		rcount = 0;
		goto out;
	}

	major = (dip != NULL) ? ddi_name_to_major(ddi_get_name(dip)) : 0;
	for (i = 0; i < rcount; i++) {
		int irqno;
		uchar_t vector;
		apic_irq_t	*irqptr;

		/*
		 * use PHYSDEVOP_map_pirq to have xen map MSI-X to a pirq
		 */
		map_irq.domid = DOMID_SELF;
		map_irq.type = MAP_PIRQ_TYPE_MSI;
		map_irq.index = -1; /* hypervisor auto allocates vector */
		map_irq.pirq = -1;
		map_irq.bus = busnum;
		map_irq.devfn = devfn;
		map_irq.entry_nr = i;
		map_irq.table_base = table_base;
		rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
		irqno = map_irq.pirq;
		if (rc < 0) {
			mutex_exit(&airq_mutex);
			cmn_err(CE_WARN, "map MSI irq failed err: %d", -rc);
			/* Partial success: report how many were set up. */
			return (i);
		}
		if (irqno < 0) {
			mutex_exit(&airq_mutex);
			cmn_err(CE_NOTE,
			    "!hypervisor not configured for MSI support");
			/* Record globally that MSI is unavailable. */
			xen_support_msi = -1;
			return (0);
		}
		/*
		 * Find out what vector the hypervisor assigned
		 */
		vector = xpv_psm_get_msi_vector(dip, DDI_INTR_TYPE_MSIX, i);

		if (msi_allocate_irq(irqno) < 0) {
			mutex_exit(&airq_mutex);
			return (i);
		}
		apic_vector_to_irq[vector] = (uchar_t)irqno;
		msi_vector_to_pirq[vector] = (uchar_t)irqno;
		apic_max_device_irq = max(irqno, apic_max_device_irq);
		apic_min_device_irq = min(irqno, apic_min_device_irq);
		irqptr = apic_irq_table[irqno];
		ASSERT(irqptr != NULL);
		irqptr->airq_vector = (uchar_t)vector;
		irqptr->airq_ipl = pri;
		irqptr->airq_origirq = (uchar_t)(inum + i);
		irqptr->airq_share_id = 0;
		irqptr->airq_mps_intr_index = MSIX_INDEX;
		irqptr->airq_dip = dip;
		irqptr->airq_major = major;
		irqptr->airq_cpu = IRQ_UNBOUND; /* will be bound when addspl */
	}
out:
	mutex_exit(&airq_mutex);
	return (rcount);
}
1402 
1403 
1404 /*
1405  * This finds the apic_irq_t associated with the dip, ispec and type.
1406  * The entry should have already been freed, but it can not have been
1407  * reused yet since the hypervisor can not have reassigned the pirq since
1408  * we have not freed that yet.
1409  */
1410 static apic_irq_t *
1411 msi_find_irq(dev_info_t *dip, struct intrspec *ispec)
1412 {
1413         apic_irq_t      *irqp;
1414         int i;
1415 
1416         for (i = apic_min_device_irq; i <= apic_max_device_irq; i++) {
1417                 if ((irqp = apic_irq_table[i]) == NULL)
1418                         continue;
1419                 if ((irqp->airq_dip == dip) &&
1420                     (irqp->airq_origirq == ispec->intrspec_vec) &&
1421                     (irqp->airq_ipl == ispec->intrspec_pri)) {
1422                         return (irqp);
1423                 }
1424         }
1425         return (NULL);
1426 }
1427 
void
apic_free_vectors(dev_info_t *dip, int inum, int count, int pri, int type)
{
	int i, rc;
	physdev_unmap_pirq_t unmap_pirq;
	apic_irq_t *irqptr;
	struct intrspec ispec;

	DDI_INTR_IMPLDBG((CE_CONT, "apic_free_vectors: dip: %p inum: %x "
	    "count: %x pri: %x type: %x\n",
	    (void *)dip, inum, count, pri, type));

	/* for MSI/X only */
	if (!DDI_INTR_IS_MSI_OR_MSIX(type))
		return;

	for (i = 0; i < count; i++) {
		DDI_INTR_IMPLDBG((CE_CONT, "apic_free_vectors: inum=0x%x "
		    "pri=0x%x count=0x%x\n", inum, pri, count));
		/* Locate the irq entry by (dip, vector, priority). */
		ispec.intrspec_vec = inum + i;
		ispec.intrspec_pri = pri;
		if ((irqptr = msi_find_irq(dip, &ispec)) == NULL) {
			cmn_err(CE_WARN,
			    "couldn't find irq %s,%s dip: 0x%p vec: %x pri: %x",
			    ddi_get_name(dip), ddi_get_name_addr(dip),
			    (void *)dip, inum + i, pri);
			continue;
		}
		/*
		 * use PHYSDEVOP_unmap_pirq to have xen unmap MSI from a pirq
		 */
		unmap_pirq.domid = DOMID_SELF;
		unmap_pirq.pirq = msi_vector_to_pirq[irqptr->airq_vector];
		rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_pirq);
		if (rc < 0) {
			/*
			 * NOTE(review): bailing out here abandons the
			 * remaining entries without freeing them —
			 * confirm this is intentional.
			 */
			cmn_err(CE_WARN, "unmap pirq failed");
			return;
		}
		irqptr->airq_mps_intr_index = FREE_INDEX;
		apic_vector_to_irq[irqptr->airq_vector] = APIC_RESV_IRQ;
	}
}
1470 
1471 /*
1472  * The hypervisor doesn't permit access to local apics directly
1473  */
1474 /* ARGSUSED */
uint32_t *
mapin_apic(uint32_t addr, size_t len, int flags)
{
	/*
	 * Return a pointer to a memory area to fake out the
	 * probe code that wants to read apic registers.
	 * The dummy values will end up being ignored by xen
	 * later on when they are used anyway.
	 */
	/* Advertise an integrated APIC version so probe code proceeds. */
	xen_psm_dummy_apic[APIC_VERS_REG] = APIC_INTEGRATED_VERS;
	return (xen_psm_dummy_apic);
}
1487 
1488 /* ARGSUSED */
uint32_t *
mapin_ioapic(uint32_t addr, size_t len, int flags)
{
	/*
	 * Return non-null here to fake out configure code that calls this.
	 * The i86xpv platform will not reference through the returned value.
	 */
	return ((uint32_t *)0x1);
}
1498 
/* ARGSUSED */
void
mapout_apic(caddr_t addr, size_t len)
{
	/* Nothing to unmap: mapin_apic() returns a static dummy area. */
}
1504 
/* ARGSUSED */
void
mapout_ioapic(caddr_t addr, size_t len)
{
	/* Nothing to unmap: mapin_ioapic() returns a fake pointer. */
}
1510 
1511 uint32_t
1512 ioapic_read(int apic_ix, uint32_t reg)
1513 {
1514         physdev_apic_t apic;
1515 
1516         apic.apic_physbase = (unsigned long)apic_physaddr[apic_ix];
1517         apic.reg = reg;
1518         if (HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic))
1519                 panic("read ioapic %d reg %d failed", apic_ix, reg);
1520         return (apic.value);
1521 }
1522 
1523 void
1524 ioapic_write(int apic_ix, uint32_t reg, uint32_t value)
1525 {
1526         physdev_apic_t apic;
1527 
1528         apic.apic_physbase = (unsigned long)apic_physaddr[apic_ix];
1529         apic.reg = reg;
1530         apic.value = value;
1531         if (HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic))
1532                 panic("write ioapic %d reg %d failed", apic_ix, reg);
1533 }
1534 
1535 /*
1536  * This function was added as part of x2APIC support in pcplusmp.
1537  */
void
ioapic_write_eoi(int apic_ix, uint32_t value)
{
	physdev_apic_t apic;

	/* Route the directed-EOI register write through the hypervisor. */
	apic.apic_physbase = (unsigned long)apic_physaddr[apic_ix];
	apic.reg = APIC_IO_EOI;
	apic.value = value;
	if (HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic))
		panic("write ioapic reg : APIC_IO_EOI %d failed", apic_ix);
}
1549 
1550 /*
1551  * This function was added as part of x2APIC support in pcplusmp to resolve
1552  * undefined symbol in xpv_psm.
1553  */
void
x2apic_update_psm(void)
{
	/*
	 * Intentionally empty: x2APIC is not supported under xVM (see the
	 * comment above); this stub only satisfies the symbol reference.
	 * (Fixed: explicit (void) parameter list.)
	 */
}
1558 
1559 /*
1560  * This function was added as part of x2APIC support in pcplusmp to resolve
1561  * undefined symbol in xpv_psm.
1562  */
void
apic_ret(void)
{
	/*
	 * Intentionally empty: exists only to resolve the symbol
	 * referenced by common pcplusmp code (see the comment above).
	 * (Fixed: explicit (void) parameter list.)
	 */
}
1567 
1568 /*
1569  * Call rebind to do the actual programming.
1570  */
int
apic_setup_io_intr(void *p, int irq, boolean_t deferred)
{
	apic_irq_t *irqptr;
	struct ioapic_reprogram_data *drep = NULL;
	int rv, cpu;
	cpuset_t cpus;

	/*
	 * A deferred call passes reprogram bookkeeping; an immediate
	 * call passes the irq entry itself.
	 */
	if (deferred) {
		drep = (struct ioapic_reprogram_data *)p;
		ASSERT(drep != NULL);
		irqptr = drep->irqp;
	} else {
		irqptr = (apic_irq_t *)p;
	}
	ASSERT(irqptr != NULL);
	/*
	 * Set cpu based on xen idea of online cpu's not apic tables.
	 * Note that xen ignores/sets to it's own preferred value the
	 * target cpu field when programming ioapic anyway.
	 */
	if (irqptr->airq_mps_intr_index == MSI_INDEX)
		cpu = irqptr->airq_cpu; /* MSI cpus are already set */
	else {
		cpu = xen_psm_bind_intr(irq);
		irqptr->airq_cpu = cpu;
	}
	if (cpu == IRQ_UNBOUND) {
		/* Unbound: allow delivery on any online vcpu. */
		CPUSET_ZERO(cpus);
		CPUSET_OR(cpus, xen_psm_cpus_online);
	} else {
		CPUSET_ONLY(cpus, cpu & ~IRQ_USER_BOUND);
	}
	rv = apic_rebind(irqptr, cpu, drep);
	if (rv) {
		/* CPU is not up or interrupt is disabled. Fall back to 0 */
		cpu = 0;
		irqptr->airq_cpu = cpu;
		rv = apic_rebind(irqptr, cpu, drep);
	}
	/*
	 * If rebind successful bind the irq to an event channel
	 */
	if (rv == 0) {
		ec_setup_pirq(irq, irqptr->airq_ipl, &cpus);
		CPUSET_FIND(cpus, cpu);
		apic_irq_table[irq]->airq_temp_cpu = cpu & ~IRQ_USER_BOUND;
	}
	return (rv);
}
1621 
1622 /*
1623  * Allocate a new vector for the given irq
1624  */
/* ARGSUSED */
uchar_t
apic_modify_vector(uchar_t vector, int irq)
{
	/* The previous vector is unused; just allocate a fresh one. */
	return (apic_allocate_vector(0, irq, 0));
}
1631 
1632 /*
1633  * The rest of the file is just generic psm module boilerplate
1634  */
1635 
/*
 * Dispatch table handed to the generic psm framework; entries set to
 * NULL are operations this platform module does not implement.
 */
static struct psm_ops xen_psm_ops = {
	xen_psm_probe,				/* psm_probe		*/

	xen_psm_softinit,			/* psm_init		*/
	xen_psm_picinit,			/* psm_picinit		*/
	xen_psm_intr_enter,			/* psm_intr_enter	*/
	xen_psm_intr_exit,			/* psm_intr_exit	*/
	xen_psm_setspl,				/* psm_setspl		*/
	xen_psm_addspl,				/* psm_addspl		*/
	xen_psm_delspl,				/* psm_delspl		*/
	xen_psm_disable_intr,			/* psm_disable_intr	*/
	xen_psm_enable_intr,			/* psm_enable_intr	*/
	(int (*)(int))NULL,			/* psm_softlvl_to_irq	*/
	(void (*)(int))NULL,			/* psm_set_softintr	*/
	(void (*)(processorid_t))NULL,		/* psm_set_idlecpu	*/
	(void (*)(processorid_t))NULL,		/* psm_unset_idlecpu	*/

	xen_psm_clkinit,			/* psm_clkinit		*/
	xen_psm_get_clockirq,			/* psm_get_clockirq	*/
	xen_psm_hrtimeinit,			/* psm_hrtimeinit	*/
	xpv_gethrtime,				/* psm_gethrtime	*/

	xen_psm_get_next_processorid,		/* psm_get_next_processorid */
	xen_psm_cpu_start,			/* psm_cpu_start	*/
	xen_psm_post_cpu_start,			/* psm_post_cpu_start	*/
	xen_psm_shutdown,			/* psm_shutdown		*/
	xen_psm_get_ipivect,			/* psm_get_ipivect	*/
	xen_psm_send_ipi,			/* psm_send_ipi		*/

	xen_psm_translate_irq,			/* psm_translate_irq	*/

	(void (*)(int, char *))NULL,		/* psm_notify_error	*/
	(void (*)(int msg))NULL,		/* psm_notify_func	*/
	xen_psm_timer_reprogram,		/* psm_timer_reprogram	*/
	xen_psm_timer_enable,			/* psm_timer_enable	*/
	xen_psm_timer_disable,			/* psm_timer_disable	*/
	(void (*)(void *arg))NULL,		/* psm_post_cyclic_setup */
	(void (*)(int, int))NULL,		/* psm_preshutdown	*/
	xen_intr_ops,			/* Advanced DDI Interrupt framework */
	(int (*)(psm_state_request_t *))NULL,	/* psm_state		*/
	(int (*)(psm_cpu_request_t *))NULL	/* psm_cpu_ops		*/
};
1678 
1679 static struct psm_info xen_psm_info = {
1680         PSM_INFO_VER01_5,       /* version                              */
1681         PSM_OWN_EXCLUSIVE,      /* ownership                            */
1682         &xen_psm_ops,               /* operation                            */
1683         "xVM_psm",              /* machine name                         */
1684         "platform module"       /* machine descriptions                 */
1685 };
1686 
/* Opaque handle maintained by psm_mod_* across _init/_fini/_info. */
static void *xen_psm_hdlp;
1688 
/*
 * Register this psm module with the system.
 */
int
_init(void)
{
	return (psm_mod_init(&xen_psm_hdlp, &xen_psm_info));
}
1694 
/*
 * Unregister this psm module.
 */
int
_fini(void)
{
	return (psm_mod_fini(&xen_psm_hdlp, &xen_psm_info));
}
1700 
/*
 * Report module information to modinfo(1M).
 */
int
_info(struct modinfo *modinfop)
{
	return (psm_mod_info(&xen_psm_hdlp, &xen_psm_info, modinfop));
}