1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 #ifdef DEBUG
  28 #define XNB_DEBUG 1
  29 #endif /* DEBUG */
  30 
  31 #include "xnb.h"
  32 
  33 #include <sys/sunddi.h>
  34 #include <sys/sunndi.h>
  35 #include <sys/modctl.h>
  36 #include <sys/conf.h>
  37 #include <sys/mac.h>
  38 #include <sys/mac_impl.h> /* For mac_fix_cksum(). */
  39 #include <sys/dlpi.h>
  40 #include <sys/strsubr.h>
  41 #include <sys/strsun.h>
  42 #include <sys/types.h>
  43 #include <sys/pattr.h>
  44 #include <vm/seg_kmem.h>
  45 #include <vm/hat_i86.h>
  46 #include <xen/sys/xenbus_impl.h>
  47 #include <xen/sys/xendev.h>
  48 #include <sys/balloon_impl.h>
  49 #include <sys/evtchn_impl.h>
  50 #include <sys/gnttab.h>
  51 #include <vm/vm_dep.h>
  52 #include <sys/note.h>
  53 #include <sys/gld.h>
  54 #include <inet/ip.h>
  55 #include <inet/ip_impl.h>
  56 
  57 /*
  58  * The terms "transmit" and "receive" are used from the point of view of
  59  * the peer domU: packets originating from the peer domU are "transmitted"
  60  * to other parts of the system and packets are "received" from them.
  61  */
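     /*
      * In the code below this means that xnb_from_peer() services the
      * transmit path (the tx ring and the tx_* statistics), while
      * xnb_to_peer() and xnb_copy_to_peer() service the receive path (the
      * rx ring and the rx_* statistics).
      */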
  62 
  63 /*
  64  * Should we allow guests to manipulate multicast group membership?
  65  */
  66 static boolean_t        xnb_multicast_control = B_TRUE;
  67 
  68 static boolean_t        xnb_connect_rings(dev_info_t *);
  69 static void             xnb_disconnect_rings(dev_info_t *);
  70 static void             xnb_oe_state_change(dev_info_t *, ddi_eventcookie_t,
  71     void *, void *);
  72 static void             xnb_hp_state_change(dev_info_t *, ddi_eventcookie_t,
  73     void *, void *);
  74 
  75 static int      xnb_txbuf_constructor(void *, void *, int);
  76 static void     xnb_txbuf_destructor(void *, void *);
  77 static void     xnb_tx_notify_peer(xnb_t *, boolean_t);
  78 static void     xnb_tx_mark_complete(xnb_t *, RING_IDX, int16_t);
  79 
  80 mblk_t          *xnb_to_peer(xnb_t *, mblk_t *);
  81 mblk_t          *xnb_copy_to_peer(xnb_t *, mblk_t *);
  82 
  83 static void             setup_gop(xnb_t *, gnttab_copy_t *, uchar_t *,
  84     size_t, size_t, size_t, grant_ref_t);
  85 #pragma inline(setup_gop)
  86 static boolean_t        is_foreign(void *);
  87 #pragma inline(is_foreign)
  88 
  89 #define INVALID_GRANT_HANDLE    ((grant_handle_t)-1)
  90 #define INVALID_GRANT_REF       ((grant_ref_t)-1)
  91 
  92 static kmutex_t xnb_alloc_page_lock;
  93 
  94 /*
  95  * On a 32 bit PAE system physical and machine addresses are larger
  96  * than 32 bits.  ddi_btop() on such systems takes an unsigned long
  97  * argument, and so addresses above 4G are truncated before ddi_btop()
  98  * gets to see them.  To avoid this, code the shift operation here.
  99  */
 100 #define xnb_btop(addr)  ((addr) >> PAGESHIFT)
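     /*
      * For example, with 4k pages (PAGESHIFT == 12) the machine address
      * 0x123456789000 becomes frame number 0x123456789, whereas a 32 bit
      * unsigned long argument would first have been truncated to 0x56789000.
      */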
 101 
 102 /* DMA attributes for transmit and receive data */
 103 static ddi_dma_attr_t buf_dma_attr = {
 104         DMA_ATTR_V0,            /* version of this structure */
 105         0,                      /* lowest usable address */
 106         0xffffffffffffffffULL,  /* highest usable address */
 107         0x7fffffff,             /* maximum DMAable byte count */
 108         MMU_PAGESIZE,           /* alignment in bytes */
 109         0x7ff,                  /* bitmap of burst sizes */
 110         1,                      /* minimum transfer */
 111         0xffffffffU,            /* maximum transfer */
 112         0xffffffffffffffffULL,  /* maximum segment length */
 113         1,                      /* maximum number of segments */
 114         1,                      /* granularity */
 115         0,                      /* flags (reserved) */
 116 };
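     /*
      * A single segment with page alignment is requested so that each
      * transmit buffer allocated against these attributes is backed by
      * exactly one machine frame; xnb_txbuf_constructor() asserts that a
      * single DMA cookie is returned.
      */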
 117 
 118 /* DMA access attributes for data: NOT to be byte swapped. */
 119 static ddi_device_acc_attr_t data_accattr = {
 120         DDI_DEVICE_ATTR_V0,
 121         DDI_NEVERSWAP_ACC,
 122         DDI_STRICTORDER_ACC
 123 };
 124 
 125 /*
 126  * Statistics.
 127  */
 128 static const char * const aux_statistics[] = {
 129         "rx_cksum_deferred",
 130         "tx_cksum_no_need",
 131         "rx_rsp_notok",
 132         "tx_notify_deferred",
 133         "tx_notify_sent",
 134         "rx_notify_deferred",
 135         "rx_notify_sent",
 136         "tx_too_early",
 137         "rx_too_early",
 138         "rx_allocb_failed",
 139         "tx_allocb_failed",
 140         "rx_foreign_page",
 141         "mac_full",
 142         "spurious_intr",
 143         "allocation_success",
 144         "allocation_failure",
 145         "small_allocation_success",
 146         "small_allocation_failure",
 147         "other_allocation_failure",
 148         "rx_pageboundary_crossed",
 149         "rx_cpoparea_grown",
 150         "csum_hardware",
 151         "csum_software",
 152         "tx_overflow_page",
 153         "tx_unexpected_flags",
 154 };
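     /*
      * xnb_ks_aux_update() assigns values to these statistics positionally;
      * any change to this array must be matched by a corresponding change
      * there.
      */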
 155 
 156 static int
 157 xnb_ks_aux_update(kstat_t *ksp, int flag)
 158 {
 159         xnb_t *xnbp;
 160         kstat_named_t *knp;
 161 
 162         if (flag != KSTAT_READ)
 163                 return (EACCES);
 164 
 165         xnbp = ksp->ks_private;
 166         knp = ksp->ks_data;
 167 
 168         /*
 169          * Assignment order should match that of the names in
 170          * aux_statistics.
 171          */
 172         (knp++)->value.ui64 = xnbp->xnb_stat_rx_cksum_deferred;
 173         (knp++)->value.ui64 = xnbp->xnb_stat_tx_cksum_no_need;
 174         (knp++)->value.ui64 = xnbp->xnb_stat_rx_rsp_notok;
 175         (knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_deferred;
 176         (knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_sent;
 177         (knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_deferred;
 178         (knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_sent;
 179         (knp++)->value.ui64 = xnbp->xnb_stat_tx_too_early;
 180         (knp++)->value.ui64 = xnbp->xnb_stat_rx_too_early;
 181         (knp++)->value.ui64 = xnbp->xnb_stat_rx_allocb_failed;
 182         (knp++)->value.ui64 = xnbp->xnb_stat_tx_allocb_failed;
 183         (knp++)->value.ui64 = xnbp->xnb_stat_rx_foreign_page;
 184         (knp++)->value.ui64 = xnbp->xnb_stat_mac_full;
 185         (knp++)->value.ui64 = xnbp->xnb_stat_spurious_intr;
 186         (knp++)->value.ui64 = xnbp->xnb_stat_allocation_success;
 187         (knp++)->value.ui64 = xnbp->xnb_stat_allocation_failure;
 188         (knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_success;
 189         (knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_failure;
 190         (knp++)->value.ui64 = xnbp->xnb_stat_other_allocation_failure;
 191         (knp++)->value.ui64 = xnbp->xnb_stat_rx_pagebndry_crossed;
 192         (knp++)->value.ui64 = xnbp->xnb_stat_rx_cpoparea_grown;
 193         (knp++)->value.ui64 = xnbp->xnb_stat_csum_hardware;
 194         (knp++)->value.ui64 = xnbp->xnb_stat_csum_software;
 195         (knp++)->value.ui64 = xnbp->xnb_stat_tx_overflow_page;
 196         (knp++)->value.ui64 = xnbp->xnb_stat_tx_unexpected_flags;
 197 
 198         return (0);
 199 }
 200 
 201 static boolean_t
 202 xnb_ks_init(xnb_t *xnbp)
 203 {
 204         int nstat = sizeof (aux_statistics) /
 205             sizeof (aux_statistics[0]);
 206         const char * const *cp = aux_statistics;
 207         kstat_named_t *knp;
 208 
 209         /*
 210          * Create and initialise kstats.
 211          */
 212         xnbp->xnb_kstat_aux = kstat_create(ddi_driver_name(xnbp->xnb_devinfo),
 213             ddi_get_instance(xnbp->xnb_devinfo), "aux_statistics", "net",
 214             KSTAT_TYPE_NAMED, nstat, 0);
 215         if (xnbp->xnb_kstat_aux == NULL)
 216                 return (B_FALSE);
 217 
 218         xnbp->xnb_kstat_aux->ks_private = xnbp;
 219         xnbp->xnb_kstat_aux->ks_update = xnb_ks_aux_update;
 220 
 221         knp = xnbp->xnb_kstat_aux->ks_data;
 222         while (nstat > 0) {
 223                 kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
 224 
 225                 knp++;
 226                 cp++;
 227                 nstat--;
 228         }
 229 
 230         kstat_install(xnbp->xnb_kstat_aux);
 231 
 232         return (B_TRUE);
 233 }
 234 
 235 static void
 236 xnb_ks_free(xnb_t *xnbp)
 237 {
 238         kstat_delete(xnbp->xnb_kstat_aux);
 239 }
 240 
 241 /*
 242  * Calculate and insert the transport checksum for an arbitrary packet.
 243  */
 244 static mblk_t *
 245 xnb_software_csum(xnb_t *xnbp, mblk_t *mp)
 246 {
 247         _NOTE(ARGUNUSED(xnbp));
 248 
 249         /*
 250          * XXPV dme: shouldn't rely on mac_fix_cksum(), not least
 251          * because it doesn't cover all of the interesting cases :-(
 252          */
 253         mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM);
 254 
 255         return (mac_fix_cksum(mp));
 256 }
 257 
 258 mblk_t *
 259 xnb_process_cksum_flags(xnb_t *xnbp, mblk_t *mp, uint32_t capab)
 260 {
 261         struct ether_header *ehp;
 262         uint16_t sap;
 263         uint32_t offset;
 264         ipha_t *ipha;
 265 
 266         ASSERT(mp->b_next == NULL);
 267 
 268         /*
 269          * Check that the packet is contained in a single mblk.  In
 270          * the "from peer" path this is true today, but may change
 271          * when scatter gather support is added.  In the "to peer"
 272          * path we cannot be sure, but in most cases it will be true
 273          * (in the xnbo case the packet has come from a MAC device
 274          * which is unlikely to split packets).
 275          */
 276         if (mp->b_cont != NULL)
 277                 goto software;
 278 
 279         /*
 280          * If the MAC has no hardware capability don't do any further
 281          * checking.
 282          */
 283         if (capab == 0)
 284                 goto software;
 285 
 286         ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
 287         ehp = (struct ether_header *)mp->b_rptr;
 288 
 289         if (ntohs(ehp->ether_type) == VLAN_TPID) {
 290                 struct ether_vlan_header *evhp;
 291 
 292                 ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
 293                 evhp = (struct ether_vlan_header *)mp->b_rptr;
 294                 sap = ntohs(evhp->ether_type);
 295                 offset = sizeof (struct ether_vlan_header);
 296         } else {
 297                 sap = ntohs(ehp->ether_type);
 298                 offset = sizeof (struct ether_header);
 299         }
 300 
 301         /*
 302          * We only attempt hardware checksum offload for IPv4 packets.
 303          */
 304         if (sap != ETHERTYPE_IP)
 305                 goto software;
 306 
 307         /*
 308          * We know that this is an IPv4 packet.
 309          */
 310         ipha = (ipha_t *)(mp->b_rptr + offset);
 311 
 312         switch (ipha->ipha_protocol) {
 313         case IPPROTO_TCP:
 314         case IPPROTO_UDP: {
 315                 uint32_t start, length, stuff, cksum;
 316                 uint16_t *stuffp;
 317 
 318                 /*
 319                  * This is a TCP/IPv4 or UDP/IPv4 packet, for which we
 320                  * can use either full or partial checksum offload.
 321                  */
 322                 if ((capab & (HCKSUM_INET_FULL_V4|HCKSUM_INET_PARTIAL)) == 0)
 323                         break;
 324 
 325                 start = IP_SIMPLE_HDR_LENGTH;
 326                 length = ntohs(ipha->ipha_length);
 327                 if (ipha->ipha_protocol == IPPROTO_TCP) {
 328                         stuff = start + TCP_CHECKSUM_OFFSET;
 329                         cksum = IP_TCP_CSUM_COMP;
 330                 } else {
 331                         stuff = start + UDP_CHECKSUM_OFFSET;
 332                         cksum = IP_UDP_CSUM_COMP;
 333                 }
 334                 stuffp = (uint16_t *)(mp->b_rptr + offset + stuff);
 335 
 336                 if (capab & HCKSUM_INET_FULL_V4) {
 337                         /*
 338                          * Some devices require that the checksum
 339                          * field of the packet is zero for full
 340                          * offload.
 341                          */
 342                         *stuffp = 0;
 343 
 344                         mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM);
 345 
 346                         xnbp->xnb_stat_csum_hardware++;
 347 
 348                         return (mp);
 349                 }
 350 
 351                 if (capab & HCKSUM_INET_PARTIAL) {
 352                         if (*stuffp == 0) {
 353                                 ipaddr_t src, dst;
 354 
 355                                 /*
 356                                  * Older Solaris guests don't insert
 357                                  * the pseudo-header checksum, so we
 358                                  * calculate it here.
 359                                  */
 360                                 src = ipha->ipha_src;
 361                                 dst = ipha->ipha_dst;
 362 
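                                     /*
                                      * cksum already holds the protocol
                                      * component set above (IP_TCP_CSUM_COMP
                                      * or IP_UDP_CSUM_COMP); add the source
                                      * and destination addresses and the
                                      * TCP/UDP length, then fold the carries
                                      * so the sum fits in 16 bits.  A sum of
                                      * zero is stored as 0xffff.
                                      */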
 363                                 cksum += (dst >> 16) + (dst & 0xFFFF);
 364                                 cksum += (src >> 16) + (src & 0xFFFF);
 365                                 cksum += length - IP_SIMPLE_HDR_LENGTH;
 366 
 367                                 cksum = (cksum >> 16) + (cksum & 0xFFFF);
 368                                 cksum = (cksum >> 16) + (cksum & 0xFFFF);
 369 
 370                                 ASSERT(cksum <= 0xFFFF);
 371 
 372                                 *stuffp = (uint16_t)(cksum ? cksum : ~cksum);
 373                         }
 374 
 375                         mac_hcksum_set(mp, start, stuff, length, 0,
 376                             HCK_PARTIALCKSUM);
 377 
 378                         xnbp->xnb_stat_csum_hardware++;
 379 
 380                         return (mp);
 381                 }
 382 
 383                 /* NOTREACHED */
 384                 break;
 385         }
 386 
 387         default:
 388                 /* Use software. */
 389                 break;
 390         }
 391 
 392 software:
 393         /*
 394          * We are not able to use any offload so do the whole thing in
 395          * software.
 396          */
 397         xnbp->xnb_stat_csum_software++;
 398 
 399         return (xnb_software_csum(xnbp, mp));
 400 }
 401 
 402 int
 403 xnb_attach(dev_info_t *dip, xnb_flavour_t *flavour, void *flavour_data)
 404 {
 405         xnb_t *xnbp;
 406         char *xsname;
 407         char cachename[32];
 408 
 409         xnbp = kmem_zalloc(sizeof (*xnbp), KM_SLEEP);
 410 
 411         xnbp->xnb_flavour = flavour;
 412         xnbp->xnb_flavour_data = flavour_data;
 413         xnbp->xnb_devinfo = dip;
 414         xnbp->xnb_evtchn = INVALID_EVTCHN;
 415         xnbp->xnb_irq = B_FALSE;
 416         xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
 417         xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
 418         xnbp->xnb_connected = B_FALSE;
 419         xnbp->xnb_hotplugged = B_FALSE;
 420         xnbp->xnb_detachable = B_FALSE;
 421         xnbp->xnb_peer = xvdi_get_oeid(dip);
 422         xnbp->xnb_be_status = XNB_STATE_INIT;
 423         xnbp->xnb_fe_status = XNB_STATE_INIT;
 424 
 425         xnbp->xnb_tx_buf_count = 0;
 426 
 427         xnbp->xnb_rx_hv_copy = B_FALSE;
 428         xnbp->xnb_multicast_control = B_FALSE;
 429 
 430         xnbp->xnb_rx_va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
 431         ASSERT(xnbp->xnb_rx_va != NULL);
 432 
 433         if (ddi_get_iblock_cookie(dip, 0, &xnbp->xnb_icookie)
 434             != DDI_SUCCESS)
 435                 goto failure;
 436 
 437         /* Allocated on demand, when/if we enter xnb_copy_to_peer(). */
 438         xnbp->xnb_rx_cpop = NULL;
 439         xnbp->xnb_rx_cpop_count = 0;
 440 
 441         mutex_init(&xnbp->xnb_tx_lock, NULL, MUTEX_DRIVER,
 442             xnbp->xnb_icookie);
 443         mutex_init(&xnbp->xnb_rx_lock, NULL, MUTEX_DRIVER,
 444             xnbp->xnb_icookie);
 445         mutex_init(&xnbp->xnb_state_lock, NULL, MUTEX_DRIVER,
 446             xnbp->xnb_icookie);
 447 
 448         /* Set driver private pointer now. */
 449         ddi_set_driver_private(dip, xnbp);
 450 
 451         (void) sprintf(cachename, "xnb_tx_buf_cache_%d", ddi_get_instance(dip));
 452         xnbp->xnb_tx_buf_cache = kmem_cache_create(cachename,
 453             sizeof (xnb_txbuf_t), 0,
 454             xnb_txbuf_constructor, xnb_txbuf_destructor,
 455             NULL, xnbp, NULL, 0);
 456         if (xnbp->xnb_tx_buf_cache == NULL)
 457                 goto failure_0;
 458 
 459         if (!xnb_ks_init(xnbp))
 460                 goto failure_1;
 461 
 462         /*
 463          * Receive notification of changes in the state of the
 464          * driver in the guest domain.
 465          */
 466         if (xvdi_add_event_handler(dip, XS_OE_STATE, xnb_oe_state_change,
 467             NULL) != DDI_SUCCESS)
 468                 goto failure_2;
 469 
 470         /*
 471          * Receive notification of hotplug events.
 472          */
 473         if (xvdi_add_event_handler(dip, XS_HP_STATE, xnb_hp_state_change,
 474             NULL) != DDI_SUCCESS)
 475                 goto failure_2;
 476 
 477         xsname = xvdi_get_xsname(dip);
 478 
 479         if (xenbus_printf(XBT_NULL, xsname,
 480             "feature-multicast-control", "%d",
 481             xnb_multicast_control ? 1 : 0) != 0)
 482                 goto failure_3;
 483 
 484         if (xenbus_printf(XBT_NULL, xsname,
 485             "feature-rx-copy", "%d",  1) != 0)
 486                 goto failure_3;
 487         /*
 488          * Linux domUs seem to depend on "feature-rx-flip" being 0
 489          * in addition to "feature-rx-copy" being 1. It seems strange
 490          * to use four possible states to describe a binary decision,
 491          * but we might as well play nice.
 492          */
 493         if (xenbus_printf(XBT_NULL, xsname,
 494             "feature-rx-flip", "%d", 0) != 0)
 495                 goto failure_3;
 496 
 497         (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);
 498         (void) xvdi_post_event(dip, XEN_HP_ADD);
 499 
 500         return (DDI_SUCCESS);
 501 
 502 failure_3:
 503         xvdi_remove_event_handler(dip, NULL);
 504 
 505 failure_2:
 506         xnb_ks_free(xnbp);
 507 
 508 failure_1:
 509         kmem_cache_destroy(xnbp->xnb_tx_buf_cache);
 510 
 511 failure_0:
 512         mutex_destroy(&xnbp->xnb_state_lock);
 513         mutex_destroy(&xnbp->xnb_rx_lock);
 514         mutex_destroy(&xnbp->xnb_tx_lock);
 515 
 516 failure:
 517         vmem_free(heap_arena, xnbp->xnb_rx_va, PAGESIZE);
 518         kmem_free(xnbp, sizeof (*xnbp));
 519         return (DDI_FAILURE);
 520 }
 521 
 522 void
 523 xnb_detach(dev_info_t *dip)
 524 {
 525         xnb_t *xnbp = ddi_get_driver_private(dip);
 526 
 527         ASSERT(xnbp != NULL);
 528         ASSERT(!xnbp->xnb_connected);
 529         ASSERT(xnbp->xnb_tx_buf_count == 0);
 530 
 531         xnb_disconnect_rings(dip);
 532 
 533         xvdi_remove_event_handler(dip, NULL);
 534 
 535         xnb_ks_free(xnbp);
 536 
 537         kmem_cache_destroy(xnbp->xnb_tx_buf_cache);
 538 
 539         ddi_set_driver_private(dip, NULL);
 540 
 541         mutex_destroy(&xnbp->xnb_state_lock);
 542         mutex_destroy(&xnbp->xnb_rx_lock);
 543         mutex_destroy(&xnbp->xnb_tx_lock);
 544 
 545         if (xnbp->xnb_rx_cpop_count > 0)
 546                 kmem_free(xnbp->xnb_rx_cpop, sizeof (xnbp->xnb_rx_cpop[0])
 547                     * xnbp->xnb_rx_cpop_count);
 548 
 549         ASSERT(xnbp->xnb_rx_va != NULL);
 550         vmem_free(heap_arena, xnbp->xnb_rx_va, PAGESIZE);
 551 
 552         kmem_free(xnbp, sizeof (*xnbp));
 553 }
 554 
 555 /*
 556  * Allocate a page from the hypervisor to be flipped to the peer.
 557  *
 558  * Try to get pages in batches to reduce the overhead of calls into
 559  * the balloon driver.
 560  */
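     /* Returns the mfn of the allocated page, or 0 on failure. */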
 561 static mfn_t
 562 xnb_alloc_page(xnb_t *xnbp)
 563 {
 564 #define WARNING_RATE_LIMIT 100
 565 #define BATCH_SIZE 256
 566         static mfn_t mfns[BATCH_SIZE];  /* common across all instances */
 567         static int nth = BATCH_SIZE;
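             /*
              * nth indexes the next unused entry in mfns[]; starting it at
              * BATCH_SIZE forces a refill on first use.  Both statics are
              * protected by xnb_alloc_page_lock.
              */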
 568         mfn_t mfn;
 569 
 570         mutex_enter(&xnb_alloc_page_lock);
 571         if (nth == BATCH_SIZE) {
 572                 if (balloon_alloc_pages(BATCH_SIZE, mfns) != BATCH_SIZE) {
 573                         xnbp->xnb_stat_allocation_failure++;
 574                         mutex_exit(&xnb_alloc_page_lock);
 575 
 576                         /*
 577                          * Try for a single page in low memory situations.
 578                          */
 579                         if (balloon_alloc_pages(1, &mfn) != 1) {
 580                                 if ((xnbp->xnb_stat_small_allocation_failure++
 581                                     % WARNING_RATE_LIMIT) == 0)
 582                                         cmn_err(CE_WARN, "xnb_alloc_page: "
 583                                             "Cannot allocate memory to "
 584                                             "transfer packets to peer.");
 585                                 return (0);
 586                         } else {
 587                                 xnbp->xnb_stat_small_allocation_success++;
 588                                 return (mfn);
 589                         }
 590                 }
 591 
 592                 nth = 0;
 593                 xnbp->xnb_stat_allocation_success++;
 594         }
 595 
 596         mfn = mfns[nth++];
 597         mutex_exit(&xnb_alloc_page_lock);
 598 
 599         ASSERT(mfn != 0);
 600 
 601         return (mfn);
 602 #undef BATCH_SIZE
 603 #undef WARNING_RATE_LIMIT
 604 }
 605 
 606 /*
 607  * Free a page back to the hypervisor.
 608  *
 609  * This happens only in the error path, so batching is not worth the
 610  * complication.
 611  */
 612 static void
 613 xnb_free_page(xnb_t *xnbp, mfn_t mfn)
 614 {
 615         _NOTE(ARGUNUSED(xnbp));
 616         int r;
 617         pfn_t pfn;
 618 
 619         pfn = xen_assign_pfn(mfn);
 620         pfnzero(pfn, 0, PAGESIZE);
 621         xen_release_pfn(pfn);
 622 
 623         if ((r = balloon_free_pages(1, &mfn, NULL, NULL)) != 1) {
 624                 cmn_err(CE_WARN, "xnb_free_page: cannot decrease memory "
 625                     "reservation (%d): page kept but unusable (mfn = 0x%lx).",
 626                     r, mfn);
 627         }
 628 }
 629 
 630 /*
 631  * Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->rx_ring) but using
 632  * local variables. Used in both xnb_to_peer() and xnb_copy_to_peer().
 633  */
 634 #define XNB_RING_HAS_UNCONSUMED_REQUESTS(_r)            \
 635         ((((_r)->sring->req_prod - loop) <             \
 636                 (RING_SIZE(_r) - (loop - prod))) ?      \
 637             ((_r)->sring->req_prod - loop) :              \
 638             (RING_SIZE(_r) - (loop - prod)))
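     /*
      * This evaluates to the smaller of the number of requests the peer has
      * posted beyond the local consumer index (loop) and the number of
      * response slots still free given the local producer index (prod),
      * i.e. how many more requests can be consumed and answered without
      * overrunning the ring.
      */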
 639 
 640 /*
 641  * Pass packets to the peer using page flipping.
 642  */
 643 mblk_t *
 644 xnb_to_peer(xnb_t *xnbp, mblk_t *mp)
 645 {
 646         mblk_t *free = mp, *prev = NULL;
 647         size_t len;
 648         gnttab_transfer_t *gop;
 649         boolean_t notify;
 650         RING_IDX loop, prod, end;
 651 
 652         /*
 653          * For each packet the sequence of operations is:
 654          *
 655          * 1. get a new page from the hypervisor.
 656          * 2. get a request slot from the ring.
 657          * 3. copy the data into the new page.
 658          * 4. transfer the page to the peer.
 659          * 5. update the request slot.
 660          * 6. kick the peer.
 661          * 7. free mp.
 662          *
 663          * In order to reduce the number of hypercalls, we prepare
 664          * several packets for the peer and perform a single hypercall
 665          * to transfer them.
 666          */
 667 
 668         mutex_enter(&xnbp->xnb_rx_lock);
 669 
 670         /*
 671          * If we are not connected to the peer or have not yet
 672          * finished hotplug, it is too early to pass packets to the
 673          * peer.
 674          */
 675         if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
 676                 mutex_exit(&xnbp->xnb_rx_lock);
 677                 DTRACE_PROBE(flip_rx_too_early);
 678                 xnbp->xnb_stat_rx_too_early++;
 679                 return (mp);
 680         }
 681 
 682         loop = xnbp->xnb_rx_ring.req_cons;
 683         prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
 684         gop = xnbp->xnb_rx_top;
 685 
 686         while ((mp != NULL) &&
 687             XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
 688 
 689                 mfn_t mfn;
 690                 pfn_t pfn;
 691                 netif_rx_request_t *rxreq;
 692                 netif_rx_response_t *rxresp;
 693                 char *valoop;
 694                 mblk_t *ml;
 695                 uint16_t cksum_flags;
 696 
 697                 /* 1 */
 698                 if ((mfn = xnb_alloc_page(xnbp)) == 0) {
 699                         xnbp->xnb_stat_rx_defer++;
 700                         break;
 701                 }
 702 
 703                 /* 2 */
 704                 rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
 705 
 706 #ifdef XNB_DEBUG
 707                 if (!(rxreq->id < NET_RX_RING_SIZE))
 708                         cmn_err(CE_PANIC, "xnb_to_peer: "
 709                             "id %d out of range in request 0x%p",
 710                             rxreq->id, (void *)rxreq);
 711 #endif /* XNB_DEBUG */
 712 
 713                 /* Assign a pfn and map the new page at the allocated va. */
 714                 pfn = xen_assign_pfn(mfn);
 715                 hat_devload(kas.a_hat, xnbp->xnb_rx_va, PAGESIZE,
 716                     pfn, PROT_READ | PROT_WRITE, HAT_LOAD);
 717 
 718                 /* 3 */
 719                 len = 0;
 720                 valoop = xnbp->xnb_rx_va;
 721                 for (ml = mp; ml != NULL; ml = ml->b_cont) {
 722                         size_t chunk = ml->b_wptr - ml->b_rptr;
 723 
 724                         bcopy(ml->b_rptr, valoop, chunk);
 725                         valoop += chunk;
 726                         len += chunk;
 727                 }
 728 
 729                 ASSERT(len < PAGESIZE);
 730 
 731                 /* Release the pfn. */
 732                 hat_unload(kas.a_hat, xnbp->xnb_rx_va, PAGESIZE,
 733                     HAT_UNLOAD_UNMAP);
 734                 xen_release_pfn(pfn);
 735 
 736                 /* 4 */
 737                 gop->mfn = mfn;
 738                 gop->domid = xnbp->xnb_peer;
 739                 gop->ref = rxreq->gref;
 740 
 741                 /* 5.1 */
 742                 rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
 743                 rxresp->offset = 0;
 744                 rxresp->flags = 0;
 745 
 746                 cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
 747                 if (cksum_flags != 0)
 748                         xnbp->xnb_stat_rx_cksum_deferred++;
 749                 rxresp->flags |= cksum_flags;
 750 
 751                 rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
 752                 rxresp->status = len;
 753 
 754                 loop++;
 755                 prod++;
 756                 gop++;
 757                 prev = mp;
 758                 mp = mp->b_next;
 759         }
 760 
 761         /*
 762          * Did we actually do anything?
 763          */
 764         if (loop == xnbp->xnb_rx_ring.req_cons) {
 765                 mutex_exit(&xnbp->xnb_rx_lock);
 766                 return (mp);
 767         }
 768 
 769         end = loop;
 770 
 771         /*
 772          * Unlink the end of the 'done' list from the remainder.
 773          */
 774         ASSERT(prev != NULL);
 775         prev->b_next = NULL;
 776 
 777         if (HYPERVISOR_grant_table_op(GNTTABOP_transfer, xnbp->xnb_rx_top,
 778             loop - xnbp->xnb_rx_ring.req_cons) != 0) {
 779                 cmn_err(CE_WARN, "xnb_to_peer: transfer operation failed");
 780         }
 781 
 782         loop = xnbp->xnb_rx_ring.req_cons;
 783         prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
 784         gop = xnbp->xnb_rx_top;
 785 
 786         while (loop < end) {
 787                 int16_t status = NETIF_RSP_OKAY;
 788 
 789                 if (gop->status != 0) {
 790                         status = NETIF_RSP_ERROR;
 791 
 792                         /*
 793                          * If the status is anything other than
 794                          * GNTST_bad_page then we don't own the page
 795                          * any more, so don't try to give it back.
 796                          */
 797                         if (gop->status != GNTST_bad_page)
 798                                 gop->mfn = 0;
 799                 } else {
 800                         /* The page is no longer ours. */
 801                         gop->mfn = 0;
 802                 }
 803 
 804                 if (gop->mfn != 0)
 805                         /*
 806                          * Give back the page, as we won't be using
 807                          * it.
 808                          */
 809                         xnb_free_page(xnbp, gop->mfn);
 810                 else
 811                         /*
 812                          * We gave away a page, update our accounting
 813                          * now.
 814                          */
 815                         balloon_drv_subtracted(1);
 816 
 817                 /* 5.2 */
 818                 if (status != NETIF_RSP_OKAY) {
 819                         RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
 820                             status;
 821                 } else {
 822                         xnbp->xnb_stat_ipackets++;
 823                         xnbp->xnb_stat_rbytes += len;
 824                 }
 825 
 826                 loop++;
 827                 prod++;
 828                 gop++;
 829         }
 830 
 831         xnbp->xnb_rx_ring.req_cons = loop;
 832         xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
 833 
 834         /* 6 */
 835         /* LINTED: constant in conditional context */
 836         RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
 837         if (notify) {
 838                 ec_notify_via_evtchn(xnbp->xnb_evtchn);
 839                 xnbp->xnb_stat_rx_notify_sent++;
 840         } else {
 841                 xnbp->xnb_stat_rx_notify_deferred++;
 842         }
 843 
 844         if (mp != NULL)
 845                 xnbp->xnb_stat_rx_defer++;
 846 
 847         mutex_exit(&xnbp->xnb_rx_lock);
 848 
 849         /* Free mblk_t's that we consumed. */
 850         freemsgchain(free);
 851 
 852         return (mp);
 853 }
 854 
 855 /* Helper functions for xnb_copy_to_peer(). */
 856 
 857 /*
 858  * Grow the array of copy operation descriptors.
 859  */
 860 static boolean_t
 861 grow_cpop_area(xnb_t *xnbp)
 862 {
 863         size_t count;
 864         gnttab_copy_t *new;
 865 
 866         ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
 867 
 868         count = xnbp->xnb_rx_cpop_count + CPOP_DEFCNT;
 869 
 870         if ((new = kmem_alloc(sizeof (new[0]) * count, KM_NOSLEEP)) == NULL) {
 871                 xnbp->xnb_stat_other_allocation_failure++;
 872                 return (B_FALSE);
 873         }
 874 
 875         bcopy(xnbp->xnb_rx_cpop, new,
 876             sizeof (xnbp->xnb_rx_cpop[0]) * xnbp->xnb_rx_cpop_count);
 877 
 878         kmem_free(xnbp->xnb_rx_cpop,
 879             sizeof (xnbp->xnb_rx_cpop[0]) * xnbp->xnb_rx_cpop_count);
 880 
 881         xnbp->xnb_rx_cpop = new;
 882         xnbp->xnb_rx_cpop_count = count;
 883 
 884         xnbp->xnb_stat_rx_cpoparea_grown++;
 885 
 886         return (B_TRUE);
 887 }
 888 
 889 /*
 890  * Check whether an address is on a page that's foreign to this domain.
 891  */
 892 static boolean_t
 893 is_foreign(void *addr)
 894 {
 895         pfn_t pfn = hat_getpfnum(kas.a_hat, addr);
 896 
 897         return ((pfn & PFN_IS_FOREIGN_MFN) == PFN_IS_FOREIGN_MFN);
 898 }
 899 
 900 /*
 901  * Insert a newly allocated mblk into a chain, replacing the old one.
 902  */
 903 static mblk_t *
 904 replace_msg(mblk_t *mp, size_t len, mblk_t *mp_prev, mblk_t *ml_prev)
 905 {
 906         uint32_t        start, stuff, end, value, flags;
 907         mblk_t          *new_mp;
 908 
 909         new_mp = copyb(mp);
 910         if (new_mp == NULL) {
 911                 cmn_err(CE_PANIC, "replace_msg: cannot alloc new message"
 912                     "for %p, len %lu", (void *) mp, len);
 913         }
 914 
 915         mac_hcksum_get(mp, &start, &stuff, &end, &value, &flags);
 916         mac_hcksum_set(new_mp, start, stuff, end, value, flags);
 917 
 918         new_mp->b_next = mp->b_next;
 919         new_mp->b_prev = mp->b_prev;
 920         new_mp->b_cont = mp->b_cont;
 921 
 922         /* Make sure we only overwrite pointers to the mblk being replaced. */
 923         if (mp_prev != NULL && mp_prev->b_next == mp)
 924                 mp_prev->b_next = new_mp;
 925 
 926         if (ml_prev != NULL && ml_prev->b_cont == mp)
 927                 ml_prev->b_cont = new_mp;
 928 
 929         mp->b_next = mp->b_prev = mp->b_cont = NULL;
 930         freemsg(mp);
 931 
 932         return (new_mp);
 933 }
 934 
 935 /*
 936  * Set all the fields in a gnttab_copy_t.
 937  */
 938 static void
 939 setup_gop(xnb_t *xnbp, gnttab_copy_t *gp, uchar_t *rptr,
 940     size_t s_off, size_t d_off, size_t len, grant_ref_t d_ref)
 941 {
 942         ASSERT(xnbp != NULL && gp != NULL);
 943 
 944         gp->source.offset = s_off;
 945         gp->source.u.gmfn = pfn_to_mfn(hat_getpfnum(kas.a_hat, (caddr_t)rptr));
 946         gp->source.domid = DOMID_SELF;
 947 
 948         gp->len = (uint16_t)len;
 949         gp->flags = GNTCOPY_dest_gref;
 950         gp->status = 0;
 951 
 952         gp->dest.u.ref = d_ref;
 953         gp->dest.offset = d_off;
 954         gp->dest.domid = xnbp->xnb_peer;
 955 }
 956 
 957 /*
 958  * Pass packets to the peer using hypervisor copy operations.
 959  */
 960 mblk_t *
 961 xnb_copy_to_peer(xnb_t *xnbp, mblk_t *mp)
 962 {
 963         mblk_t          *free = mp, *mp_prev = NULL, *saved_mp = mp;
 964         mblk_t          *ml, *ml_prev;
 965         boolean_t       notify;
 966         RING_IDX        loop, prod;
 967         int             i;
 968 
 969         /*
 970          * If the peer has not negotiated hypervisor copy for received
 971          * packets, use page flipping to pass packets to it.
 972          */
 973         if (!xnbp->xnb_rx_hv_copy)
 974                 return (xnb_to_peer(xnbp, mp));
 975 
 976         /*
 977          * For each packet the sequence of operations is:
 978          *
 979          *  1. get a request slot from the ring.
 980          *  2. set up data for hypercall (see NOTE below)
 981          *  3. have the hypervisor copy the data
 982          *  4. update the request slot.
 983          *  5. kick the peer.
 984          *
 985          * NOTE ad 2.
 986          *  In order to reduce the number of hypercalls, we prepare
 987          *  several mblks (mp->b_cont != NULL) for the peer and
 988          *  perform a single hypercall to transfer them.  We also have
 989          *  to set up a separate copy operation for every page.
 990          *
 991          * If we have more than one packet (mp->b_next != NULL), we do
 992          * this whole dance repeatedly.
 993          */
 994 
 995         mutex_enter(&xnbp->xnb_rx_lock);
 996 
 997         if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
 998                 mutex_exit(&xnbp->xnb_rx_lock);
 999                 DTRACE_PROBE(copy_rx_too_early);
1000                 xnbp->xnb_stat_rx_too_early++;
1001                 return (mp);
1002         }
1003 
1004         loop = xnbp->xnb_rx_ring.req_cons;
1005         prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
1006 
1007         while ((mp != NULL) &&
1008             XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
1009                 netif_rx_request_t      *rxreq;
1010                 size_t                  d_offset, len;
1011                 int                     item_count;
1012                 gnttab_copy_t           *gop_cp;
1013                 netif_rx_response_t     *rxresp;
1014                 uint16_t                cksum_flags;
1015                 int16_t                 status = NETIF_RSP_OKAY;
1016 
1017                 /* 1 */
1018                 rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
1019 
1020 #ifdef XNB_DEBUG
1021                 if (!(rxreq->id < NET_RX_RING_SIZE))
1022                         cmn_err(CE_PANIC, "xnb_copy_to_peer: "
1023                             "id %d out of range in request 0x%p",
1024                             rxreq->id, (void *)rxreq);
1025 #endif /* XNB_DEBUG */
1026 
1027                 /* 2 */
1028                 d_offset = 0;
1029                 len = 0;
1030                 item_count = 0;
1031 
1032                 gop_cp = xnbp->xnb_rx_cpop;
1033 
1034                 /*
1035                  * We walk the b_cont pointers and set up a
1036                  * gnttab_copy_t for each sub-page chunk in each data
1037                  * block.
1038                  */
1039                 /* 2a */
1040                 for (ml = mp, ml_prev = NULL; ml != NULL; ml = ml->b_cont) {
1041                         size_t  chunk = ml->b_wptr - ml->b_rptr;
1042                         uchar_t *r_tmp, *rpt_align;
1043                         size_t  r_offset;
1044 
1045                         /*
1046                          * The hypervisor will not allow us to
1047                          * reference a foreign page (e.g. one
1048                          * belonging to another domain) by mfn in the
1049                          * copy operation. If the data in this mblk is
1050                          * on such a page we must copy the data into a
1051                          * local page before initiating the hypervisor
1052                          * copy operation.
1053                          */
1054                         if (is_foreign(ml->b_rptr) || is_foreign(ml->b_wptr)) {
1055                                 mblk_t *ml_new = replace_msg(ml, chunk,
1056                                     mp_prev, ml_prev);
1057 
1058                                 /* We can still use old ml, but not *ml! */
1059                                 if (free == ml)
1060                                         free = ml_new;
1061                                 if (mp == ml)
1062                                         mp = ml_new;
1063                                 ml = ml_new;
1064 
1065                                 xnbp->xnb_stat_rx_foreign_page++;
1066                         }
1067 
1068                         rpt_align = (uchar_t *)ALIGN2PAGE(ml->b_rptr);
1069                         r_offset = (uint16_t)(ml->b_rptr - rpt_align);
1070                         r_tmp = ml->b_rptr;
1071 
1072                         if (d_offset + chunk > PAGESIZE)
1073                                 cmn_err(CE_PANIC, "xnb_copy_to_peer: mp %p "
1074                                     "(svd: %p), ml %p,rpt_alg. %p, d_offset "
1075                                     "(%lu) + chunk (%lu) > PAGESIZE %d!",
1076                                     (void *)mp, (void *)saved_mp, (void *)ml,
1077                                     (void *)rpt_align,
1078                                     d_offset, chunk, (int)PAGESIZE);
1079 
1080                         while (chunk > 0) {
1081                                 size_t part_len;
1082 
1083                                 if (item_count == xnbp->xnb_rx_cpop_count) {
1084                                         if (!grow_cpop_area(xnbp))
1085                                                 goto failure;
1086                                         gop_cp = &xnbp->xnb_rx_cpop[item_count];
1087                                 }
1088                                 /*
1089                                  * If our mblk crosses a page boundary, we need
1090                                  * to do a separate copy for each page.
1091                                  */
1092                                 if (r_offset + chunk > PAGESIZE) {
1093                                         part_len = PAGESIZE - r_offset;
1094 
1095                                         DTRACE_PROBE3(mblk_page_crossed,
1096                                             (mblk_t *), ml, int, chunk, int,
1097                                             (int)r_offset);
1098 
1099                                         xnbp->xnb_stat_rx_pagebndry_crossed++;
1100                                 } else {
1101                                         part_len = chunk;
1102                                 }
1103 
1104                                 setup_gop(xnbp, gop_cp, r_tmp, r_offset,
1105                                     d_offset, part_len, rxreq->gref);
1106 
1107                                 chunk -= part_len;
1108 
1109                                 len += part_len;
1110                                 d_offset += part_len;
1111                                 r_tmp += part_len;
1112                                 /*
1113                                  * The 2nd, 3rd ... last copies will always
1114                                  * start at r_tmp, therefore r_offset is 0.
1115                                  */
1116                                 r_offset = 0;
1117                                 gop_cp++;
1118                                 item_count++;
1119                         }
1120                         ml_prev = ml;
1121 
1122                         DTRACE_PROBE4(mblk_loop_end, (mblk_t *), ml, int,
1123                             chunk, int, len, int, item_count);
1124                 }
1125                 /* 3 */
1126                 if (HYPERVISOR_grant_table_op(GNTTABOP_copy, xnbp->xnb_rx_cpop,
1127                     item_count) != 0) {
1128                         cmn_err(CE_WARN, "xnb_copy_to_peer: copy op. failed");
1129                         DTRACE_PROBE(HV_granttableopfailed);
1130                 }
1131 
1132                 /* 4 */
1133                 rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
1134                 rxresp->offset = 0;
1135 
1136                 rxresp->flags = 0;
1137 
1138                 DTRACE_PROBE4(got_RX_rsp, int, (int)rxresp->id, int,
1139                     (int)rxresp->offset, int, (int)rxresp->flags, int,
1140                     (int)rxresp->status);
1141 
1142                 cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
1143                 if (cksum_flags != 0)
1144                         xnbp->xnb_stat_rx_cksum_deferred++;
1145                 rxresp->flags |= cksum_flags;
1146 
1147                 rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
1148                 rxresp->status = len;
1149 
1150                 DTRACE_PROBE4(RX_rsp_set, int, (int)rxresp->id, int,
1151                     (int)rxresp->offset, int, (int)rxresp->flags, int,
1152                     (int)rxresp->status);
1153 
1154                 for (i = 0; i < item_count; i++) {
1155                         if (xnbp->xnb_rx_cpop[i].status != 0) {
1156                                 DTRACE_PROBE2(cpop_status_nonnull, int,
1157                                     (int)xnbp->xnb_rx_cpop[i].status,
1158                                     int, i);
1159                                 status = NETIF_RSP_ERROR;
1160                         }
1161                 }
1162 
1163                 /* 5.2 */
1164                 if (status != NETIF_RSP_OKAY) {
1165                         RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
1166                             status;
1167                         xnbp->xnb_stat_rx_rsp_notok++;
1168                 } else {
1169                         xnbp->xnb_stat_ipackets++;
1170                         xnbp->xnb_stat_rbytes += len;
1171                 }
1172 
1173                 loop++;
1174                 prod++;
1175                 mp_prev = mp;
1176                 mp = mp->b_next;
1177         }
1178 failure:
1179         /*
1180          * Did we actually do anything?
1181          */
1182         if (loop == xnbp->xnb_rx_ring.req_cons) {
1183                 mutex_exit(&xnbp->xnb_rx_lock);
1184                 return (mp);
1185         }
1186 
1187         /*
1188          * Unlink the end of the 'done' list from the remainder.
1189          */
1190         ASSERT(mp_prev != NULL);
1191         mp_prev->b_next = NULL;
1192 
1193         xnbp->xnb_rx_ring.req_cons = loop;
1194         xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
1195 
1196         /* 6 */
1197         /* LINTED: constant in conditional context */
1198         RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
1199         if (notify) {
1200                 ec_notify_via_evtchn(xnbp->xnb_evtchn);
1201                 xnbp->xnb_stat_rx_notify_sent++;
1202         } else {
1203                 xnbp->xnb_stat_rx_notify_deferred++;
1204         }
1205 
1206         if (mp != NULL)
1207                 xnbp->xnb_stat_rx_defer++;
1208 
1209         mutex_exit(&xnbp->xnb_rx_lock);
1210 
1211         /* Free mblk_t structs we have consumed. */
1212         freemsgchain(free);
1213 
1214         return (mp);
1215 }
1216 
1217 
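     /*
      * Push any queued transmit responses to the peer and notify it via the
      * event channel if the ring macro requires it or if "force" is set.
      */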
1218 static void
1219 xnb_tx_notify_peer(xnb_t *xnbp, boolean_t force)
1220 {
1221         boolean_t notify;
1222 
1223         ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1224 
1225         /* LINTED: constant in conditional context */
1226         RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_tx_ring, notify);
1227         if (notify || force) {
1228                 ec_notify_via_evtchn(xnbp->xnb_evtchn);
1229                 xnbp->xnb_stat_tx_notify_sent++;
1230         } else {
1231                 xnbp->xnb_stat_tx_notify_deferred++;
1232         }
1233 }
1234 
1235 static void
1236 xnb_tx_mark_complete(xnb_t *xnbp, RING_IDX id, int16_t status)
1237 {
1238         RING_IDX i;
1239         netif_tx_response_t *txresp;
1240 
1241         ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1242 
1243         i = xnbp->xnb_tx_ring.rsp_prod_pvt;
1244 
1245         txresp = RING_GET_RESPONSE(&xnbp->xnb_tx_ring, i);
1246         txresp->id = id;
1247         txresp->status = status;
1248 
1249         xnbp->xnb_tx_ring.rsp_prod_pvt = i + 1;
1250 
1251         /*
1252          * Note that we don't push the change to the peer here; that
1253          * is the caller's responsibility.
1254          */
1255 }
1256 
1257 static void
1258 xnb_txbuf_recycle(xnb_txbuf_t *txp)
1259 {
1260         xnb_t *xnbp = txp->xt_xnbp;
1261 
1262         kmem_cache_free(xnbp->xnb_tx_buf_cache, txp);
1263 
1264         xnbp->xnb_tx_buf_outstanding--;
1265 }
1266 
1267 static int
1268 xnb_txbuf_constructor(void *buf, void *arg, int kmflag)
1269 {
1270         _NOTE(ARGUNUSED(kmflag));
1271         xnb_txbuf_t *txp = buf;
1272         xnb_t *xnbp = arg;
1273         size_t len;
1274         ddi_dma_cookie_t dma_cookie;
1275         uint_t ncookies;
1276 
1277         txp->xt_free_rtn.free_func = xnb_txbuf_recycle;
1278         txp->xt_free_rtn.free_arg = (caddr_t)txp;
1279         txp->xt_xnbp = xnbp;
1280         txp->xt_next = NULL;
1281 
1282         if (ddi_dma_alloc_handle(xnbp->xnb_devinfo, &buf_dma_attr,
1283             0, 0, &txp->xt_dma_handle) != DDI_SUCCESS)
1284                 goto failure;
1285 
1286         if (ddi_dma_mem_alloc(txp->xt_dma_handle, PAGESIZE, &data_accattr,
1287             DDI_DMA_STREAMING, 0, 0, &txp->xt_buf, &len,
1288             &txp->xt_acc_handle) != DDI_SUCCESS)
1289                 goto failure_1;
1290 
1291         if (ddi_dma_addr_bind_handle(txp->xt_dma_handle, NULL, txp->xt_buf,
1292             len, DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT, 0,
1293             &dma_cookie, &ncookies)
1294             != DDI_DMA_MAPPED)
1295                 goto failure_2;
1296         ASSERT(ncookies == 1);
1297 
1298         txp->xt_mfn = xnb_btop(dma_cookie.dmac_laddress);
1299         txp->xt_buflen = dma_cookie.dmac_size;
1300 
1301         DTRACE_PROBE(txbuf_allocated);
1302 
1303         atomic_inc_32(&xnbp->xnb_tx_buf_count);
1304         xnbp->xnb_tx_buf_outstanding++;
1305 
1306         return (0);
1307 
1308 failure_2:
1309         ddi_dma_mem_free(&txp->xt_acc_handle);
1310 
1311 failure_1:
1312         ddi_dma_free_handle(&txp->xt_dma_handle);
1313 
1314 failure:
1315 
1316         return (-1);
1317 }
1318 
1319 static void
1320 xnb_txbuf_destructor(void *buf, void *arg)
1321 {
1322         xnb_txbuf_t *txp = buf;
1323         xnb_t *xnbp = arg;
1324 
1325         (void) ddi_dma_unbind_handle(txp->xt_dma_handle);
1326         ddi_dma_mem_free(&txp->xt_acc_handle);
1327         ddi_dma_free_handle(&txp->xt_dma_handle);
1328 
1329         atomic_dec_32(&xnbp->xnb_tx_buf_count);
1330 }
1331 
1332 /*
1333  * Take packets from the peer and deliver them onward.
1334  */
1335 static mblk_t *
1336 xnb_from_peer(xnb_t *xnbp)
1337 {
1338         RING_IDX start, end, loop;
1339         gnttab_copy_t *cop;
1340         xnb_txbuf_t **txpp;
1341         netif_tx_request_t *txreq;
1342         boolean_t work_to_do, need_notify = B_FALSE;
1343         mblk_t *head, *tail;
1344         int n_data_req, i;
1345 
1346         ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1347 
1348         head = tail = NULL;
1349 around:
1350 
1351         /* LINTED: constant in conditional context */
1352         RING_FINAL_CHECK_FOR_REQUESTS(&xnbp->xnb_tx_ring, work_to_do);
1353         if (!work_to_do) {
1354 finished:
1355                 xnb_tx_notify_peer(xnbp, need_notify);
1356 
1357                 return (head);
1358         }
1359 
1360         start = xnbp->xnb_tx_ring.req_cons;
1361         end = xnbp->xnb_tx_ring.sring->req_prod;
1362 
1363         if ((end - start) > NET_TX_RING_SIZE) {
1364                 /*
1365                  * This usually indicates that the frontend driver is
1366                  * misbehaving, as it's not possible to have more than
1367                  * NET_TX_RING_SIZE ring elements in play at any one
1368                  * time.
1369                  *
1370                  * We reset the ring pointers to the state declared by
1371                  * the frontend and try to carry on.
1372                  */
1373                 cmn_err(CE_WARN, "xnb_from_peer: domain %d tried to give us %u "
1374                     "items in the ring, resetting and trying to recover.",
1375                     xnbp->xnb_peer, (end - start));
1376 
1377                 /* LINTED: constant in conditional context */
1378                 BACK_RING_ATTACH(&xnbp->xnb_tx_ring,
1379                     (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
1380 
1381                 goto around;
1382         }
1383 
1384         loop = start;
1385         cop = xnbp->xnb_tx_cop;
1386         txpp = xnbp->xnb_tx_bufp;
1387         n_data_req = 0;
1388 
1389         while (loop < end) {
1390                 static const uint16_t acceptable_flags =
1391                     NETTXF_csum_blank |
1392                     NETTXF_data_validated |
1393                     NETTXF_extra_info;
1394                 uint16_t unexpected_flags;
1395 
1396                 txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
1397 
1398                 unexpected_flags = txreq->flags & ~acceptable_flags;
1399                 if (unexpected_flags != 0) {
1400                         /*
1401                          * The peer used flag bits that we do not
1402                          * recognize.
1403                          */
1404                         cmn_err(CE_WARN, "xnb_from_peer: "
1405                             "unexpected flag bits (0x%x) from peer "
1406                             "in transmit request",
1407                             unexpected_flags);
1408                         xnbp->xnb_stat_tx_unexpected_flags++;
1409 
1410                         /* Mark this entry as failed. */
1411                         xnb_tx_mark_complete(xnbp, txreq->id, NETIF_RSP_ERROR);
1412                         need_notify = B_TRUE;
1413 
1414                 } else if (txreq->flags & NETTXF_extra_info) {
1415                         struct netif_extra_info *erp;
1416                         boolean_t status;
1417 
1418                         loop++; /* Consume another slot in the ring. */
1419                         ASSERT(loop <= end);
1420 
1421                         erp = (struct netif_extra_info *)
1422                             RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
1423 
1424                         switch (erp->type) {
1425                         case XEN_NETIF_EXTRA_TYPE_MCAST_ADD:
1426                                 ASSERT(xnbp->xnb_multicast_control);
1427                                 status = xnbp->xnb_flavour->xf_mcast_add(xnbp,
1428                                     &erp->u.mcast.addr);
1429                                 break;
1430                         case XEN_NETIF_EXTRA_TYPE_MCAST_DEL:
1431                                 ASSERT(xnbp->xnb_multicast_control);
1432                                 status = xnbp->xnb_flavour->xf_mcast_del(xnbp,
1433                                     &erp->u.mcast.addr);
1434                                 break;
1435                         default:
1436                                 status = B_FALSE;
1437                                 cmn_err(CE_WARN, "xnb_from_peer: "
1438                                     "unknown extra type %d", erp->type);
1439                                 break;
1440                         }
1441 
1442                         xnb_tx_mark_complete(xnbp, txreq->id,
1443                             status ? NETIF_RSP_OKAY : NETIF_RSP_ERROR);
1444                         need_notify = B_TRUE;
1445 
1446                 } else if ((txreq->offset > PAGESIZE) ||
1447                     (txreq->offset + txreq->size > PAGESIZE)) {
1448                         /*
1449                          * Peer attempted to refer to data beyond the
1450                          * end of the granted page.
1451                          */
1452                         cmn_err(CE_WARN, "xnb_from_peer: "
1453                             "attempt to refer beyond the end of granted "
1454                             "page in txreq (offset %d, size %d).",
1455                             txreq->offset, txreq->size);
1456                         xnbp->xnb_stat_tx_overflow_page++;
1457 
1458                         /* Mark this entry as failed. */
1459                         xnb_tx_mark_complete(xnbp, txreq->id, NETIF_RSP_ERROR);
1460                         need_notify = B_TRUE;
1461 
1462                 } else {
1463                         xnb_txbuf_t *txp;
1464 
1465                         txp = kmem_cache_alloc(xnbp->xnb_tx_buf_cache,
1466                             KM_NOSLEEP);
1467                         if (txp == NULL)
1468                                 break;
1469 
1470                         txp->xt_mblk = desballoc((unsigned char *)txp->xt_buf,
1471                             txp->xt_buflen, 0, &txp->xt_free_rtn);
1472                         if (txp->xt_mblk == NULL) {
1473                                 kmem_cache_free(xnbp->xnb_tx_buf_cache, txp);
1474                                 break;
1475                         }
1476 
1477                         txp->xt_idx = loop;
1478                         txp->xt_id = txreq->id;
1479 
1480                         cop->source.u.ref = txreq->gref;
1481                         cop->source.domid = xnbp->xnb_peer;
1482                         cop->source.offset = txreq->offset;
1483 
1484                         cop->dest.u.gmfn = txp->xt_mfn;
1485                         cop->dest.domid = DOMID_SELF;
1486                         cop->dest.offset = 0;
1487 
1488                         cop->len = txreq->size;
1489                         cop->flags = GNTCOPY_source_gref;
1490                         cop->status = 0;
1491 
1492                         *txpp = txp;
1493 
1494                         txpp++;
1495                         cop++;
1496                         n_data_req++;
1497 
1498                         ASSERT(n_data_req <= NET_TX_RING_SIZE);
1499                 }
1500 
1501                 loop++;
1502         }
1503 
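        /*
         * Every request up to 'loop' has now been examined (and, for the
         * error and extra-info slots, already responded to), so advance
         * the consumer index past them.
         */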
1504         xnbp->xnb_tx_ring.req_cons = loop;
1505 
1506         if (n_data_req == 0)
1507                 goto around;
1508 
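        /*
         * Hand the accumulated copy descriptors to the hypervisor as a
         * single batch.  A non-zero return means the batch itself failed;
         * per-descriptor status is checked in the loop below.
         */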
1509         if (HYPERVISOR_grant_table_op(GNTTABOP_copy,
1510             xnbp->xnb_tx_cop, n_data_req) != 0) {
1511 
1512                 cmn_err(CE_WARN, "xnb_from_peer: copy operation failed");
1513 
1514                 txpp = xnbp->xnb_tx_bufp;
1515                 i = n_data_req;
1516                 while (i > 0) {
1517                         kmem_cache_free(xnbp->xnb_tx_buf_cache, *txpp);
1518                         txpp++;
1519                         i--;
1520                 }
1521 
1522                 goto finished;
1523         }
1524 
1525         txpp = xnbp->xnb_tx_bufp;
1526         cop = xnbp->xnb_tx_cop;
1527         i = n_data_req;
1528 
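        /*
         * Walk the completed copy operations: successful copies are
         * chained into an mblk list for delivery and acknowledged with
         * NETIF_RSP_OKAY; failed copies are acknowledged with
         * NETIF_RSP_ERROR and their buffers freed.
         */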
1529         while (i > 0) {
1530                 xnb_txbuf_t *txp = *txpp;
1531 
1532                 txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, txp->xt_idx);
1533 
1534                 if (cop->status != 0) {
1535 #ifdef XNB_DEBUG
1536                         cmn_err(CE_WARN, "xnb_from_peer: "
1537                             "txpp 0x%p failed (%d)",
1538                             (void *)*txpp, cop->status);
1539 #endif /* XNB_DEBUG */
1540                         xnb_tx_mark_complete(xnbp, txp->xt_id, NETIF_RSP_ERROR);
1541                         freemsg(txp->xt_mblk);
1542                 } else {
1543                         mblk_t *mp;
1544 
1545                         mp = txp->xt_mblk;
1546                         mp->b_rptr = mp->b_wptr = (unsigned char *)txp->xt_buf;
1547                         mp->b_wptr += txreq->size;
1548                         mp->b_next = NULL;
1549 
1550                         /*
1551                          * If there are checksum flags, process them
1552                          * appropriately.
1553                          */
1554                         if ((txreq->flags &
1555                             (NETTXF_csum_blank | NETTXF_data_validated))
1556                             != 0) {
1557                                 mp = xnbp->xnb_flavour->xf_cksum_from_peer(xnbp,
1558                                     mp, txreq->flags);
1559                                 xnbp->xnb_stat_tx_cksum_no_need++;
1560 
1561                                 txp->xt_mblk = mp;
1562                         }
1563 
1564                         if (head == NULL) {
1565                                 ASSERT(tail == NULL);
1566                                 head = mp;
1567                         } else {
1568                                 ASSERT(tail != NULL);
1569                                 tail->b_next = mp;
1570                         }
1571                         tail = mp;
1572 
1573                         xnbp->xnb_stat_opackets++;
1574                         xnbp->xnb_stat_obytes += txreq->size;
1575 
1576                         xnb_tx_mark_complete(xnbp, txp->xt_id, NETIF_RSP_OKAY);
1577                 }
1578 
1579                 txpp++;
1580                 cop++;
1581                 i--;
1582         }
1583 
1584         goto around;
1585         /* NOTREACHED */
1586 }
1587 
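/*
 * Interrupt handler for the event channel shared with the peer: drain
 * any pending requests from the peer's tx ring and hand the resulting
 * packet chain to the flavour-specific receive path.
 */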
1588 static uint_t
1589 xnb_intr(caddr_t arg)
1590 {
1591         xnb_t *xnbp = (xnb_t *)arg;
1592         mblk_t *mp;
1593 
1594         xnbp->xnb_stat_intr++;
1595 
1596         mutex_enter(&xnbp->xnb_tx_lock);
1597 
1598         ASSERT(xnbp->xnb_connected);
1599 
1600         mp = xnb_from_peer(xnbp);
1601 
1602         mutex_exit(&xnbp->xnb_tx_lock);
1603 
1604         if (!xnbp->xnb_hotplugged) {
1605                 xnbp->xnb_stat_tx_too_early++;
1606                 goto fail;
1607         }
1608         if (mp == NULL) {
1609                 xnbp->xnb_stat_spurious_intr++;
1610                 goto fail;
1611         }
1612 
1613         xnbp->xnb_flavour->xf_from_peer(xnbp, mp);
1614 
1615         return (DDI_INTR_CLAIMED);
1616 
1617 fail:
1618         freemsgchain(mp);
1619         return (DDI_INTR_CLAIMED);
1620 }
1621 
1622 /*
1623  * Read our configuration from xenstore.
1624  */
1625 boolean_t
1626 xnb_read_xs_config(xnb_t *xnbp)
1627 {
1628         char *xsname;
1629         char mac[ETHERADDRL * 3];
1630 
1631         xsname = xvdi_get_xsname(xnbp->xnb_devinfo);
1632 
1633         if (xenbus_scanf(XBT_NULL, xsname,
1634             "mac", "%s", mac) != 0) {
1635                 cmn_err(CE_WARN, "xnb_read_xs_config: "
1636                     "cannot read mac address from %s",
1637                     xsname);
1638                 return (B_FALSE);
1639         }
1640 
1641         if (ether_aton(mac, xnbp->xnb_mac_addr) != ETHERADDRL) {
1642                 cmn_err(CE_WARN,
1643                     "xnb_read_xs_config: cannot parse mac address %s",
1644                     mac);
1645                 return (B_FALSE);
1646         }
1647 
1648         return (B_TRUE);
1649 }
1650 
1651 /*
1652  * Read the configuration of the peer from xenstore.
1653  */
1654 boolean_t
1655 xnb_read_oe_config(xnb_t *xnbp)
1656 {
1657         char *oename;
1658         int i;
1659 
1660         oename = xvdi_get_oename(xnbp->xnb_devinfo);
1661 
1662         if (xenbus_gather(XBT_NULL, oename,
1663             "event-channel", "%u", &xnbp->xnb_fe_evtchn,
1664             "tx-ring-ref", "%lu", &xnbp->xnb_tx_ring_ref,
1665             "rx-ring-ref", "%lu", &xnbp->xnb_rx_ring_ref,
1666             NULL) != 0) {
1667                 cmn_err(CE_WARN, "xnb_read_oe_config: "
1668                     "cannot read other-end details from %s",
1669                     oename);
1670                 return (B_FALSE);
1671         }
1672 
1673         /*
1674          * Check whether our peer requests receive-side hypervisor
1675          * copy.
1676          */
1677         if (xenbus_scanf(XBT_NULL, oename,
1678             "request-rx-copy", "%d", &i) != 0)
1679                 i = 0;
1680         if (i != 0)
1681                 xnbp->xnb_rx_hv_copy = B_TRUE;
1682 
1683         /*
1684          * Check whether our peer requests multicast control.
1685          */
1686         if (xenbus_scanf(XBT_NULL, oename,
1687             "request-multicast-control", "%d", &i) != 0)
1688                 i = 0;
1689         if (i != 0)
1690                 xnbp->xnb_multicast_control = B_TRUE;
1691 
1692         /*
1693          * The Linux backend driver here checks to see if the peer has
1694          * set 'feature-no-csum-offload'. This is used to indicate
1695          * that the guest cannot handle receiving packets without a
1696          * valid checksum. We don't check here, because packets passed
1697          * to the peer _always_ have a valid checksum.
1698          *
1699          * There are three cases:
1700          *
1701          * - the NIC is dedicated: packets from the wire should always
1702          *   have a valid checksum. If the hardware validates the
1703          *   checksum then the relevant bit will be set in the packet
1704          *   attributes and we will inform the peer. It can choose to
1705          *   ignore the hardware verification.
1706          *
1707          * - the NIC is shared (VNIC) and a packet originates from the
1708          *   wire: this is the same as the case above - the packets
1709          *   will have a valid checksum.
1710          *
1711          * - the NIC is shared (VNIC) and a packet originates from the
1712          *   host: the MAC layer ensures that all such packets have a
1713          *   valid checksum by calculating one if the stack did not.
1714          */
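        /*
         * If such a check were wanted it would follow the same
         * xenbus_scanf() pattern used above; a sketch only:
         *
         *      if (xenbus_scanf(XBT_NULL, oename,
         *          "feature-no-csum-offload", "%d", &i) != 0)
         *              i = 0;
         */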
1715 
1716         return (B_TRUE);
1717 }
1718 
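/*
 * Called once both the hotplug scripts and the peer are ready: connect
 * the shared rings, let the flavour complete its setup and move the
 * device to the Connected state.
 */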
1719 void
1720 xnb_start_connect(xnb_t *xnbp)
1721 {
1722         dev_info_t  *dip = xnbp->xnb_devinfo;
1723 
1724         if (!xnb_connect_rings(dip)) {
1725                 cmn_err(CE_WARN, "xnb_start_connect: "
1726                     "cannot connect rings");
1727                 goto failed;
1728         }
1729 
1730         if (!xnbp->xnb_flavour->xf_start_connect(xnbp)) {
1731                 cmn_err(CE_WARN, "xnb_start_connect: "
1732                     "flavour failed to connect");
1733                 goto failed;
1734         }
1735 
1736         (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1737         return;
1738 
1739 failed:
1740         xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
1741         xnb_disconnect_rings(dip);
1742         (void) xvdi_switch_state(dip, XBT_NULL,
1743             XenbusStateClosed);
1744         (void) xvdi_post_event(dip, XEN_HP_REMOVE);
1745 }
1746 
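/*
 * Map the peer's tx and rx ring pages into this domain and bind its
 * event channel to an interrupt, following the steps described below.
 */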
1747 static boolean_t
1748 xnb_connect_rings(dev_info_t *dip)
1749 {
1750         xnb_t *xnbp = ddi_get_driver_private(dip);
1751         struct gnttab_map_grant_ref map_op;
1752 
1753         /*
1754          * Cannot attempt to connect the rings if already connected.
1755          */
1756         ASSERT(!xnbp->xnb_connected);
1757 
1758         /*
1759          * 1. allocate a vaddr for the tx page, one for the rx page.
1760          * 2. call GNTTABOP_map_grant_ref to map the relevant pages
1761          *    into the allocated vaddr (one for tx, one for rx).
1762          * 3. call EVTCHNOP_bind_interdomain to have the event channel
1763          *    bound to this domain.
1764          * 4. associate the event channel with an interrupt.
1765          * 5. enable the interrupt.
1766          */
1767 
1768         /* 1.tx */
1769         xnbp->xnb_tx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1770             0, 0, 0, 0, VM_SLEEP);
1771         ASSERT(xnbp->xnb_tx_ring_addr != NULL);
1772 
1773         /* 2.tx */
1774         map_op.host_addr = (uint64_t)((long)xnbp->xnb_tx_ring_addr);
1775         map_op.flags = GNTMAP_host_map;
1776         map_op.ref = xnbp->xnb_tx_ring_ref;
1777         map_op.dom = xnbp->xnb_peer;
1778         hat_prepare_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr, NULL);
1779         if (xen_map_gref(GNTTABOP_map_grant_ref, &map_op, 1, B_FALSE) != 0 ||
1780             map_op.status != 0) {
1781                 cmn_err(CE_WARN, "xnb_connect_rings: cannot map tx-ring page.");
1782                 goto fail;
1783         }
1784         xnbp->xnb_tx_ring_handle = map_op.handle;
1785 
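        /*
         * Wrap the page just mapped from the peer in our backend view
         * of the tx ring.
         */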
1786         /* LINTED: constant in conditional context */
1787         BACK_RING_INIT(&xnbp->xnb_tx_ring,
1788             (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
1789 
1790         /* 1.rx */
1791         xnbp->xnb_rx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1792             0, 0, 0, 0, VM_SLEEP);
1793         ASSERT(xnbp->xnb_rx_ring_addr != NULL);
1794 
1795         /* 2.rx */
1796         map_op.host_addr = (uint64_t)((long)xnbp->xnb_rx_ring_addr);
1797         map_op.flags = GNTMAP_host_map;
1798         map_op.ref = xnbp->xnb_rx_ring_ref;
1799         map_op.dom = xnbp->xnb_peer;
1800         hat_prepare_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr, NULL);
1801         if (xen_map_gref(GNTTABOP_map_grant_ref, &map_op, 1, B_FALSE) != 0 ||
1802             map_op.status != 0) {
1803                 cmn_err(CE_WARN, "xnb_connect_rings: cannot map rx-ring page.");
1804                 goto fail;
1805         }
1806         xnbp->xnb_rx_ring_handle = map_op.handle;
1807 
1808         /* LINTED: constant in conditional context */
1809         BACK_RING_INIT(&xnbp->xnb_rx_ring,
1810             (netif_rx_sring_t *)xnbp->xnb_rx_ring_addr, PAGESIZE);
1811 
1812         /* 3 */
1813         if (xvdi_bind_evtchn(dip, xnbp->xnb_fe_evtchn) != DDI_SUCCESS) {
1814                 cmn_err(CE_WARN, "xnb_connect_rings: "
1815                     "cannot bind event channel %d", xnbp->xnb_fe_evtchn);
1816                 xnbp->xnb_evtchn = INVALID_EVTCHN;
1817                 goto fail;
1818         }
1819         xnbp->xnb_evtchn = xvdi_get_evtchn(dip);
1820 
1821         /*
1822          * It would be good to set the state to XenbusStateConnected
1823          * here as well, but then what if ddi_add_intr() failed?
1824          * Changing the state in the store will be noticed by the peer
1825          * and cannot be "taken back".
1826          */
1827         mutex_enter(&xnbp->xnb_tx_lock);
1828         mutex_enter(&xnbp->xnb_rx_lock);
1829 
1830         xnbp->xnb_connected = B_TRUE;
1831 
1832         mutex_exit(&xnbp->xnb_rx_lock);
1833         mutex_exit(&xnbp->xnb_tx_lock);
1834 
1835         /* 4, 5 */
1836         if (ddi_add_intr(dip, 0, NULL, NULL, xnb_intr, (caddr_t)xnbp)
1837             != DDI_SUCCESS) {
1838                 cmn_err(CE_WARN, "xnb_connect_rings: cannot add interrupt");
1839                 goto fail;
1840         }
1841         xnbp->xnb_irq = B_TRUE;
1842 
1843         return (B_TRUE);
1844 
1845 fail:
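        /*
         * Clear the connected flag; any grant mappings or event channel
         * already established are torn down by xnb_disconnect_rings(),
         * which our caller invokes on failure.
         */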
1846         mutex_enter(&xnbp->xnb_tx_lock);
1847         mutex_enter(&xnbp->xnb_rx_lock);
1848 
1849         xnbp->xnb_connected = B_FALSE;
1850 
1851         mutex_exit(&xnbp->xnb_rx_lock);
1852         mutex_exit(&xnbp->xnb_tx_lock);
1853 
1854         return (B_FALSE);
1855 }
1856 
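/*
 * Undo the work of xnb_connect_rings(): remove the interrupt handler,
 * free the event channel and unmap and free the tx and rx ring pages.
 */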
1857 static void
1858 xnb_disconnect_rings(dev_info_t *dip)
1859 {
1860         xnb_t *xnbp = ddi_get_driver_private(dip);
1861 
1862         if (xnbp->xnb_irq) {
1863                 ddi_remove_intr(dip, 0, NULL);
1864                 xnbp->xnb_irq = B_FALSE;
1865         }
1866 
1867         if (xnbp->xnb_evtchn != INVALID_EVTCHN) {
1868                 xvdi_free_evtchn(dip);
1869                 xnbp->xnb_evtchn = INVALID_EVTCHN;
1870         }
1871 
1872         if (xnbp->xnb_rx_ring_handle != INVALID_GRANT_HANDLE) {
1873                 struct gnttab_unmap_grant_ref unmap_op;
1874 
1875                 unmap_op.host_addr = (uint64_t)(uintptr_t)
1876                     xnbp->xnb_rx_ring_addr;
1877                 unmap_op.dev_bus_addr = 0;
1878                 unmap_op.handle = xnbp->xnb_rx_ring_handle;
1879                 if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1880                     &unmap_op, 1) != 0)
1881                         cmn_err(CE_WARN, "xnb_disconnect_rings: "
1882                             "cannot unmap rx-ring page (%d)",
1883                             unmap_op.status);
1884 
1885                 xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
1886         }
1887 
1888         if (xnbp->xnb_rx_ring_addr != NULL) {
1889                 hat_release_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr);
1890                 vmem_free(heap_arena, xnbp->xnb_rx_ring_addr, PAGESIZE);
1891                 xnbp->xnb_rx_ring_addr = NULL;
1892         }
1893 
1894         if (xnbp->xnb_tx_ring_handle != INVALID_GRANT_HANDLE) {
1895                 struct gnttab_unmap_grant_ref unmap_op;
1896 
1897                 unmap_op.host_addr = (uint64_t)(uintptr_t)
1898                     xnbp->xnb_tx_ring_addr;
1899                 unmap_op.dev_bus_addr = 0;
1900                 unmap_op.handle = xnbp->xnb_tx_ring_handle;
1901                 if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1902                     &unmap_op, 1) != 0)
1903                         cmn_err(CE_WARN, "xnb_disconnect_rings: "
1904                             "cannot unmap tx-ring page (%d)",
1905                             unmap_op.status);
1906 
1907                 xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
1908         }
1909 
1910         if (xnbp->xnb_tx_ring_addr != NULL) {
1911                 hat_release_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr);
1912                 vmem_free(heap_arena, xnbp->xnb_tx_ring_addr, PAGESIZE);
1913                 xnbp->xnb_tx_ring_addr = NULL;
1914         }
1915 }
1916 
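/*
 * Invoked when the peer (other end) changes state in xenstore.
 */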
1917 static void
1918 xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1919     void *arg, void *impl_data)
1920 {
1921         _NOTE(ARGUNUSED(id, arg));
1922         xnb_t *xnbp = ddi_get_driver_private(dip);
1923         XenbusState new_state = *(XenbusState *)impl_data;
1924 
1925         ASSERT(xnbp != NULL);
1926 
1927         switch (new_state) {
1928         case XenbusStateConnected:
1929                 /* spurious state change */
1930                 if (xnbp->xnb_connected)
1931                         return;
1932 
1933                 if (!xnb_read_oe_config(xnbp) ||
1934                     !xnbp->xnb_flavour->xf_peer_connected(xnbp)) {
1935                         cmn_err(CE_WARN, "xnb_oe_state_change: "
1936                             "cannot read other-end config or connect peer");
1937                         (void) xvdi_switch_state(dip, XBT_NULL,
1938                             XenbusStateClosed);
1939                         (void) xvdi_post_event(dip, XEN_HP_REMOVE);
1940 
1941                         break;
1942                 }
1943 
1944 
1945                 mutex_enter(&xnbp->xnb_state_lock);
1946                 xnbp->xnb_fe_status = XNB_STATE_READY;
1947                 if (xnbp->xnb_be_status == XNB_STATE_READY)
1948                         xnb_start_connect(xnbp);
1949                 mutex_exit(&xnbp->xnb_state_lock);
1950 
1951                 /*
1952                  * Now that we've attempted to connect it's reasonable
1953                  * to allow an attempt to detach.
1954                  */
1955                 xnbp->xnb_detachable = B_TRUE;
1956 
1957                 break;
1958 
1959         case XenbusStateClosing:
1960                 (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosing);
1961 
1962                 break;
1963 
1964         case XenbusStateClosed:
1965                 xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
1966 
1967                 mutex_enter(&xnbp->xnb_tx_lock);
1968                 mutex_enter(&xnbp->xnb_rx_lock);
1969 
1970                 xnb_disconnect_rings(dip);
1971                 xnbp->xnb_connected = B_FALSE;
1972 
1973                 mutex_exit(&xnbp->xnb_rx_lock);
1974                 mutex_exit(&xnbp->xnb_tx_lock);
1975 
1976                 (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1977                 (void) xvdi_post_event(dip, XEN_HP_REMOVE);
1978                 /*
1979                  * In all likelihood this is already set (in the above
1980                  * case), but if the peer never attempted to connect
1981                  * and the domain is destroyed we get here without
1982                  * having been through the case above, so we set it to
1983                  * be sure.
1984                  */
1985                 xnbp->xnb_detachable = B_TRUE;
1986 
1987                 break;
1988 
1989         default:
1990                 break;
1991         }
1992 }
1993 
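/*
 * Invoked when the hotplug scripts report a change in hotplug state
 * for this device.
 */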
1994 static void
1995 xnb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1996     void *arg, void *impl_data)
1997 {
1998         _NOTE(ARGUNUSED(id, arg));
1999         xnb_t *xnbp = ddi_get_driver_private(dip);
2000         xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
2001 
2002         ASSERT(xnbp != NULL);
2003 
2004         switch (state) {
2005         case Connected:
2006                 /* spurious hotplug event */
2007                 if (xnbp->xnb_hotplugged)
2008                         break;
2009 
2010                 if (!xnb_read_xs_config(xnbp))
2011                         break;
2012 
2013                 if (!xnbp->xnb_flavour->xf_hotplug_connected(xnbp))
2014                         break;
2015 
2016                 mutex_enter(&xnbp->xnb_tx_lock);
2017                 mutex_enter(&xnbp->xnb_rx_lock);
2018 
2019                 xnbp->xnb_hotplugged = B_TRUE;
2020 
2021                 mutex_exit(&xnbp->xnb_rx_lock);
2022                 mutex_exit(&xnbp->xnb_tx_lock);
2023 
2024                 mutex_enter(&xnbp->xnb_state_lock);
2025                 xnbp->xnb_be_status = XNB_STATE_READY;
2026                 if (xnbp->xnb_fe_status == XNB_STATE_READY)
2027                         xnb_start_connect(xnbp);
2028                 mutex_exit(&xnbp->xnb_state_lock);
2029 
2030                 break;
2031 
2032         default:
2033                 break;
2034         }
2035 }
2036 
2037 static struct modldrv modldrv = {
2038         &mod_miscops, "xnb",
2039 };
2040 
2041 static struct modlinkage modlinkage = {
2042         MODREV_1, { &modldrv, NULL }
2043 };
2044 
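/*
 * Loadable module glue: xnb is a misc module providing the shared
 * backend support used by the flavour-specific drivers.
 */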
2045 int
2046 _init(void)
2047 {
2048         int i;
2049 
2050         mutex_init(&xnb_alloc_page_lock, NULL, MUTEX_DRIVER, NULL);
2051 
2052         i = mod_install(&modlinkage);
2053         if (i != DDI_SUCCESS)
2054                 mutex_destroy(&xnb_alloc_page_lock);
2055 
2056         return (i);
2057 }
2058 
2059 int
2060 _info(struct modinfo *modinfop)
2061 {
2062         return (mod_info(&modlinkage, modinfop));
2063 }
2064 
2065 int
2066 _fini(void)
2067 {
2068         int i;
2069 
2070         i = mod_remove(&modlinkage);
2071         if (i == DDI_SUCCESS)
2072                 mutex_destroy(&xnb_alloc_page_lock);
2073 
2074         return (i);
2075 }