1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*
  28  *
  29  * Copyright (c) 2004 Christian Limpach.
  30  * All rights reserved.
  31  *
  32  * Redistribution and use in source and binary forms, with or without
  33  * modification, are permitted provided that the following conditions
  34  * are met:
  35  * 1. Redistributions of source code must retain the above copyright
  36  *    notice, this list of conditions and the following disclaimer.
  37  * 2. Redistributions in binary form must reproduce the above copyright
  38  *    notice, this list of conditions and the following disclaimer in the
  39  *    documentation and/or other materials provided with the distribution.
  40  * 3. This section intentionally left blank.
  41  * 4. The name of the author may not be used to endorse or promote products
  42  *    derived from this software without specific prior written permission.
  43  *
  44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  54  */
  55 /*
  56  * Section 3 of the above license was updated in response to bug 6379571.
  57  */
  58 
  59 /*
  60  * xnf.c - GLDv3 network driver for domU.
  61  */
  62 
  63 /*
  64  * This driver uses four per-instance locks:
  65  *
  66  * xnf_gref_lock:
  67  *
  68  *    Protects access to the grant reference list stored in
  69  *    xnf_gref_head. Grant references should be acquired and released
  70  *    using gref_get() and gref_put() respectively.
  71  *
  72  * xnf_schedlock:
  73  *
  74  *    Protects:
  75  *    xnf_need_sched - used to record that a previous transmit attempt
  76  *       failed (and consequently it will be necessary to call
  77  *       mac_tx_update() when transmit resources are available).
  78  *    xnf_pending_multicast - the number of multicast requests that
  79  *       have been submitted to the backend for which we have not
  80  *       processed responses.
  81  *
  82  * xnf_txlock:
  83  *
  84  *    Protects the transmit ring (xnf_tx_ring) and associated
  85  *    structures (notably xnf_tx_pkt_id and xnf_tx_pkt_id_head).
  86  *
  87  * xnf_rxlock:
  88  *
  89  *    Protects the receive ring (xnf_rx_ring) and associated
  90  *    structures (notably xnf_rx_pkt_info).
  91  *
  92  * If driver-global state that affects both the transmit and receive
  93  * rings is manipulated, both xnf_txlock and xnf_rxlock should be
  94  * held, in that order.
  95  *
  96  * xnf_schedlock is acquired both with and without xnf_txlock held.
  97  * When both are needed, xnf_txlock must always be acquired
  98  * first.
  99  *
 100  * Notes:
 101  * - atomic_add_64() is used to manipulate counters where we require
 102  *   accuracy. For counters intended only for observation by humans,
 103  *   post increment/decrement are used instead.
 104  */
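
/*
 * A rough sketch (not a real code path in this file) of the intended
 * gref_get()/gref_put() pairing described above; both helpers take
 * xnf_gref_lock internally:
 *
 *        gref = gref_get(xnfp);
 *        if (gref != INVALID_GRANT_REF) {
 *                ... grant a page to the backend, post the request ...
 *                ... and, once the backend has finished with it ...
 *                gref_put(xnfp, gref);
 *        }
 */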
 105 
 106 #include <sys/types.h>
 107 #include <sys/errno.h>
 108 #include <sys/param.h>
 109 #include <sys/sysmacros.h>
 110 #include <sys/systm.h>
 111 #include <sys/stream.h>
 112 #include <sys/strsubr.h>
 113 #include <sys/strsun.h>
 114 #include <sys/conf.h>
 115 #include <sys/ddi.h>
 116 #include <sys/devops.h>
 117 #include <sys/sunddi.h>
 118 #include <sys/sunndi.h>
 119 #include <sys/dlpi.h>
 120 #include <sys/ethernet.h>
 121 #include <sys/strsun.h>
 122 #include <sys/pattr.h>
 123 #include <inet/ip.h>
 124 #include <inet/ip_impl.h>
 125 #include <sys/gld.h>
 126 #include <sys/modctl.h>
 127 #include <sys/mac_provider.h>
 128 #include <sys/mac_ether.h>
 129 #include <sys/bootinfo.h>
 130 #include <sys/mach_mmu.h>
 131 #ifdef  XPV_HVM_DRIVER
 132 #include <sys/xpv_support.h>
 133 #include <sys/hypervisor.h>
 134 #else
 135 #include <sys/hypervisor.h>
 136 #include <sys/evtchn_impl.h>
 137 #include <sys/balloon_impl.h>
 138 #endif
 139 #include <xen/public/io/netif.h>
 140 #include <sys/gnttab.h>
 141 #include <xen/sys/xendev.h>
 142 #include <sys/sdt.h>
 143 #include <sys/note.h>
 144 #include <sys/debug.h>
 145 
 146 #include <io/xnf.h>
 147 
 148 #if defined(DEBUG) || defined(__lint)
 149 #define XNF_DEBUG
 150 #endif
 151 
 152 #ifdef XNF_DEBUG
 153 int xnf_debug = 0;
 154 xnf_t *xnf_debug_instance = NULL;
 155 #endif
 156 
 157 /*
 158  * On a 32 bit PAE system physical and machine addresses are larger
 159  * than 32 bits.  ddi_btop() on such systems takes an unsigned long
 160  * argument, and so addresses above 4G are truncated before ddi_btop()
 161  * gets to see them.  To avoid this, code the shift operation here.
 162  */
 163 #define xnf_btop(addr)  ((addr) >> PAGESHIFT)
 164 
 165 unsigned int    xnf_max_tx_frags = 1;
 166 
 167 /*
 168  * Should we use the multicast control feature if the backend provides
 169  * it?
 170  */
 171 boolean_t xnf_multicast_control = B_TRUE;
 172 
 173 /*
 174  * Received packets below this size are copied to a new streams buffer
 175  * rather than being desballoc'ed.
 176  *
 177  * This value is chosen to accommodate traffic where there are a large
 178  * number of small packets. For data showing a typical distribution,
 179  * see:
 180  *
 181  * Sinha07a:
 182  *      Rishi Sinha, Christos Papadopoulos, and John
 183  *      Heidemann. Internet Packet Size Distributions: Some
 184  *      Observations. Technical Report ISI-TR-2007-643,
 185  *      USC/Information Sciences Institute, May, 2007. Originally
 186  *      released October 2005 as web page
 187  *      http://netweb.usc.edu/~sinha/pkt-sizes/.
 188  *      <http://www.isi.edu/~johnh/PAPERS/Sinha07a.html>.
 189  */
 190 size_t xnf_rx_copy_limit = 64;
 191 
 192 #define INVALID_GRANT_HANDLE    ((grant_handle_t)-1)
 193 #define INVALID_GRANT_REF       ((grant_ref_t)-1)
 194 #define INVALID_TX_ID           ((uint16_t)-1)
 195 
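/*
 * TX_ID_TO_TXID() maps a transmit id to its xnf_txid_t slot in
 * xnf_tx_pkt_id[]; TX_ID_VALID() checks that an id is neither the
 * invalid sentinel nor beyond the end of the TX ring.
 */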
 196 #define TX_ID_TO_TXID(p, id) (&((p)->xnf_tx_pkt_id[(id)]))
 197 #define TX_ID_VALID(i) (((i) != INVALID_TX_ID) && ((i) < NET_TX_RING_SIZE))
 198 
 199 /* Required system entry points */
 200 static int      xnf_attach(dev_info_t *, ddi_attach_cmd_t);
 201 static int      xnf_detach(dev_info_t *, ddi_detach_cmd_t);
 202 
 203 /* Required driver entry points for Nemo */
 204 static int      xnf_start(void *);
 205 static void     xnf_stop(void *);
 206 static int      xnf_set_mac_addr(void *, const uint8_t *);
 207 static int      xnf_set_multicast(void *, boolean_t, const uint8_t *);
 208 static int      xnf_set_promiscuous(void *, boolean_t);
 209 static mblk_t   *xnf_send(void *, mblk_t *);
 210 static uint_t   xnf_intr(caddr_t);
 211 static int      xnf_stat(void *, uint_t, uint64_t *);
 212 static boolean_t xnf_getcapab(void *, mac_capab_t, void *);
 213 
 214 /* Driver private functions */
 215 static int xnf_alloc_dma_resources(xnf_t *);
 216 static void xnf_release_dma_resources(xnf_t *);
 217 static void xnf_release_mblks(xnf_t *);
 218 
 219 static int xnf_buf_constructor(void *, void *, int);
 220 static void xnf_buf_destructor(void *, void *);
 221 static xnf_buf_t *xnf_buf_get(xnf_t *, int, boolean_t);
 222 #pragma inline(xnf_buf_get)
 223 static void xnf_buf_put(xnf_t *, xnf_buf_t *, boolean_t);
 224 #pragma inline(xnf_buf_put)
 225 static void xnf_buf_refresh(xnf_buf_t *);
 226 #pragma inline(xnf_buf_refresh)
 227 static void xnf_buf_recycle(xnf_buf_t *);
 228 
 229 static int xnf_tx_buf_constructor(void *, void *, int);
 230 static void xnf_tx_buf_destructor(void *, void *);
 231 
 232 static grant_ref_t gref_get(xnf_t *);
 233 #pragma inline(gref_get)
 234 static void gref_put(xnf_t *, grant_ref_t);
 235 #pragma inline(gref_put)
 236 
 237 static xnf_txid_t *txid_get(xnf_t *);
 238 #pragma inline(txid_get)
 239 static void txid_put(xnf_t *, xnf_txid_t *);
 240 #pragma inline(txid_put)
 241 
 242 void xnf_send_driver_status(int, int);
 243 static void xnf_rxbuf_hang(xnf_t *, xnf_buf_t *);
 244 static int xnf_tx_clean_ring(xnf_t  *);
 245 static void oe_state_change(dev_info_t *, ddi_eventcookie_t,
 246     void *, void *);
 247 static boolean_t xnf_kstat_init(xnf_t *);
 248 static void xnf_rx_collect(xnf_t *);
 249 
 250 static mac_callbacks_t xnf_callbacks = {
 251         MC_GETCAPAB,
 252         xnf_stat,
 253         xnf_start,
 254         xnf_stop,
 255         xnf_set_promiscuous,
 256         xnf_set_multicast,
 257         xnf_set_mac_addr,
 258         xnf_send,
 259         NULL,
 260         NULL,
 261         xnf_getcapab
 262 };
 263 
 264 /* DMA attributes for network ring buffer */
 265 static ddi_dma_attr_t ringbuf_dma_attr = {
 266         DMA_ATTR_V0,            /* version of this structure */
 267         0,                      /* lowest usable address */
 268         0xffffffffffffffffULL,  /* highest usable address */
 269         0x7fffffff,             /* maximum DMAable byte count */
 270         MMU_PAGESIZE,           /* alignment in bytes */
 271         0x7ff,                  /* bitmap of burst sizes */
 272         1,                      /* minimum transfer */
 273         0xffffffffU,            /* maximum transfer */
 274         0xffffffffffffffffULL,  /* maximum segment length */
 275         1,                      /* maximum number of segments */
 276         1,                      /* granularity */
 277         0,                      /* flags (reserved) */
 278 };
 279 
 280 /* DMA attributes for transmit and receive data */
 281 static ddi_dma_attr_t buf_dma_attr = {
 282         DMA_ATTR_V0,            /* version of this structure */
 283         0,                      /* lowest usable address */
 284         0xffffffffffffffffULL,  /* highest usable address */
 285         0x7fffffff,             /* maximum DMAable byte count */
 286         MMU_PAGESIZE,           /* alignment in bytes */
 287         0x7ff,                  /* bitmap of burst sizes */
 288         1,                      /* minimum transfer */
 289         0xffffffffU,            /* maximum transfer */
 290         0xffffffffffffffffULL,  /* maximum segment length */
 291         1,                      /* maximum number of segments */
 292         1,                      /* granularity */
 293         0,                      /* flags (reserved) */
 294 };
 295 
 296 /* DMA access attributes for registers and descriptors */
 297 static ddi_device_acc_attr_t accattr = {
 298         DDI_DEVICE_ATTR_V0,
 299         DDI_STRUCTURE_LE_ACC,   /* This is a little-endian device */
 300         DDI_STRICTORDER_ACC
 301 };
 302 
 303 /* DMA access attributes for data: NOT to be byte swapped. */
 304 static ddi_device_acc_attr_t data_accattr = {
 305         DDI_DEVICE_ATTR_V0,
 306         DDI_NEVERSWAP_ACC,
 307         DDI_STRICTORDER_ACC
 308 };
 309 
 310 DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach,
 311     nodev, NULL, D_MP, NULL, ddi_quiesce_not_supported);
 312 
 313 static struct modldrv xnf_modldrv = {
 314         &mod_driverops,
 315         "Virtual Ethernet driver",
 316         &xnf_dev_ops
 317 };
 318 
 319 static struct modlinkage modlinkage = {
 320         MODREV_1, { &xnf_modldrv, NULL }
 321 };
 322 
 323 int
 324 _init(void)
 325 {
 326         int r;
 327 
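        /*
         * Register the GLDv3 entry points before installing the
         * module; back the registration out again if the install
         * fails.
         */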
 328         mac_init_ops(&xnf_dev_ops, "xnf");
 329         r = mod_install(&modlinkage);
 330         if (r != DDI_SUCCESS)
 331                 mac_fini_ops(&xnf_dev_ops);
 332 
 333         return (r);
 334 }
 335 
 336 int
 337 _fini(void)
 338 {
 339         return (EBUSY); /* XXPV should be removable */
 340 }
 341 
 342 int
 343 _info(struct modinfo *modinfop)
 344 {
 345         return (mod_info(&modlinkage, modinfop));
 346 }
 347 
 348 /*
 349  * Acquire a grant reference.
 350  */
 351 static grant_ref_t
 352 gref_get(xnf_t *xnfp)
 353 {
 354         grant_ref_t gref;
 355 
 356         mutex_enter(&xnfp->xnf_gref_lock);
 357 
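        /*
         * Claim a reference from the cached list; if the list is
         * empty, allocate a further batch of 16 references and
         * retry.  Give up if the batch allocation itself fails.
         */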
 358         do {
 359                 gref = gnttab_claim_grant_reference(&xnfp->xnf_gref_head);
 360 
 361         } while ((gref == INVALID_GRANT_REF) &&
 362             (gnttab_alloc_grant_references(16, &xnfp->xnf_gref_head) == 0));
 363 
 364         mutex_exit(&xnfp->xnf_gref_lock);
 365 
 366         if (gref == INVALID_GRANT_REF) {
 367                 xnfp->xnf_stat_gref_failure++;
 368         } else {
 369                 atomic_inc_64(&xnfp->xnf_stat_gref_outstanding);
 370                 if (xnfp->xnf_stat_gref_outstanding > xnfp->xnf_stat_gref_peak)
 371                         xnfp->xnf_stat_gref_peak =
 372                             xnfp->xnf_stat_gref_outstanding;
 373         }
 374 
 375         return (gref);
 376 }
 377 
 378 /*
 379  * Release a grant reference.
 380  */
 381 static void
 382 gref_put(xnf_t *xnfp, grant_ref_t gref)
 383 {
 384         ASSERT(gref != INVALID_GRANT_REF);
 385 
 386         mutex_enter(&xnfp->xnf_gref_lock);
 387         gnttab_release_grant_reference(&xnfp->xnf_gref_head, gref);
 388         mutex_exit(&xnfp->xnf_gref_lock);
 389 
 390         atomic_dec_64(&xnfp->xnf_stat_gref_outstanding);
 391 }
 392 
 393 /*
 394  * Acquire a transmit id.
 395  */
 396 static xnf_txid_t *
 397 txid_get(xnf_t *xnfp)
 398 {
 399         xnf_txid_t *tidp;
 400 
 401         ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
 402 
 403         if (xnfp->xnf_tx_pkt_id_head == INVALID_TX_ID)
 404                 return (NULL);
 405 
 406         ASSERT(TX_ID_VALID(xnfp->xnf_tx_pkt_id_head));
 407 
 408         tidp = TX_ID_TO_TXID(xnfp, xnfp->xnf_tx_pkt_id_head);
 409         xnfp->xnf_tx_pkt_id_head = tidp->next;
 410         tidp->next = INVALID_TX_ID;
 411 
 412         ASSERT(tidp->txbuf == NULL);
 413 
 414         return (tidp);
 415 }
 416 
 417 /*
 418  * Release a transmit id.
 419  */
 420 static void
 421 txid_put(xnf_t *xnfp, xnf_txid_t *tidp)
 422 {
 423         ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
 424         ASSERT(TX_ID_VALID(tidp->id));
 425         ASSERT(tidp->next == INVALID_TX_ID);
 426 
 427         tidp->txbuf = NULL;
 428         tidp->next = xnfp->xnf_tx_pkt_id_head;
 429         xnfp->xnf_tx_pkt_id_head = tidp->id;
 430 }
 431 
 432 /*
 433  * Get `wanted' slots in the transmit ring, waiting for at least that
 434  * number if `wait' is B_TRUE. Force the ring to be cleaned by setting
 435  * `wanted' to zero.
 436  *
 437  * Return the number of slots available.
 438  */
 439 static int
 440 tx_slots_get(xnf_t *xnfp, int wanted, boolean_t wait)
 441 {
 442         int slotsfree;
 443         boolean_t forced_clean = (wanted == 0);
 444 
 445         ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
 446 
 447         /* LINTED: constant in conditional context */
 448         while (B_TRUE) {
 449                 slotsfree = RING_FREE_REQUESTS(&xnfp->xnf_tx_ring);
 450 
 451                 if ((slotsfree < wanted) || forced_clean)
 452                         slotsfree = xnf_tx_clean_ring(xnfp);
 453 
 454                 /*
 455                  * If more slots are free than we need, wake any other
 456                  * threads waiting for slots. We hold xnf_txlock, so we
 457                  * can claim the slots we need before anyone else runs.
 458                  */
 459                 if (slotsfree > wanted)
 460                         cv_broadcast(&xnfp->xnf_cv_tx_slots);
 461 
 462                 if (slotsfree >= wanted)
 463                         break;
 464 
 465                 if (!wait)
 466                         break;
 467 
 468                 cv_wait(&xnfp->xnf_cv_tx_slots, &xnfp->xnf_txlock);
 469         }
 470 
 471         ASSERT(slotsfree <= RING_SIZE(&(xnfp->xnf_tx_ring)));
 472 
 473         return (slotsfree);
 474 }
 475 
 476 static int
 477 xnf_setup_rings(xnf_t *xnfp)
 478 {
 479         domid_t                 oeid;
 480         struct xenbus_device    *xsd;
 481         RING_IDX                i;
 482         int                     err;
 483         xnf_txid_t              *tidp;
 484         xnf_buf_t **bdescp;
 485 
 486         oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
 487         xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
 488 
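        /*
         * Grant the backend access to the pages containing the TX
         * and RX rings, first revoking any grants left over from a
         * previous incarnation (e.g. before a suspend/resume).
         */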
 489         if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
 490                 gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
 491 
 492         err = gnttab_grant_foreign_access(oeid,
 493             xnf_btop(pa_to_ma(xnfp->xnf_tx_ring_phys_addr)), 0);
 494         if (err <= 0) {
 495                 err = -err;
 496                 xenbus_dev_error(xsd, err, "granting access to tx ring page");
 497                 goto out;
 498         }
 499         xnfp->xnf_tx_ring_ref = (grant_ref_t)err;
 500 
 501         if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
 502                 gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
 503 
 504         err = gnttab_grant_foreign_access(oeid,
 505             xnf_btop(pa_to_ma(xnfp->xnf_rx_ring_phys_addr)), 0);
 506         if (err <= 0) {
 507                 err = -err;
 508                 xenbus_dev_error(xsd, err, "granting access to rx ring page");
 509                 goto out;
 510         }
 511         xnfp->xnf_rx_ring_ref = (grant_ref_t)err;
 512 
 513         mutex_enter(&xnfp->xnf_txlock);
 514 
 515         /*
 516          * Setup/cleanup the TX ring.  Note that this can lose packets
 517          * after a resume, but we expect to stagger on.
 518          */
 519         xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. empty list. */
 520         for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
 521             i < NET_TX_RING_SIZE;
 522             i++, tidp++) {
 523                 xnf_txbuf_t *txp;
 524 
 525                 tidp->id = i;
 526 
 527                 txp = tidp->txbuf;
 528                 if (txp == NULL) {
 529                         tidp->next = INVALID_TX_ID; /* Appease txid_put(). */
 530                         txid_put(xnfp, tidp);
 531                         continue;
 532                 }
 533 
 534                 ASSERT(txp->tx_txreq.gref != INVALID_GRANT_REF);
 535                 ASSERT(txp->tx_mp != NULL);
 536 
 537                 switch (txp->tx_type) {
 538                 case TX_DATA:
 539                         VERIFY(gnttab_query_foreign_access(txp->tx_txreq.gref)
 540                             == 0);
 541 
 542                         if (txp->tx_bdesc == NULL) {
 543                                 (void) gnttab_end_foreign_access_ref(
 544                                     txp->tx_txreq.gref, 1);
 545                                 gref_put(xnfp, txp->tx_txreq.gref);
 546                                 (void) ddi_dma_unbind_handle(
 547                                     txp->tx_dma_handle);
 548                         } else {
 549                                 xnf_buf_put(xnfp, txp->tx_bdesc, B_TRUE);
 550                         }
 551 
 552                         freemsg(txp->tx_mp);
 553                         txid_put(xnfp, tidp);
 554                         kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
 555 
 556                         break;
 557 
 558                 case TX_MCAST_REQ:
 559                         txp->tx_type = TX_MCAST_RSP;
 560                         txp->tx_status = NETIF_RSP_DROPPED;
 561                         cv_broadcast(&xnfp->xnf_cv_multicast);
 562 
 563                         /*
 564                          * The request consumed two slots in the ring,
 565                          * yet only a single xnf_txid_t is used. Step
 566                          * over the empty slot.
 567                          */
 568                         i++;
 569                         ASSERT(i < NET_TX_RING_SIZE);
 570 
 571                         break;
 572 
 573                 case TX_MCAST_RSP:
 574                         break;
 575                 }
 576         }
 577 
 578         /* LINTED: constant in conditional context */
 579         SHARED_RING_INIT(xnfp->xnf_tx_ring.sring);
 580         /* LINTED: constant in conditional context */
 581         FRONT_RING_INIT(&xnfp->xnf_tx_ring,
 582             xnfp->xnf_tx_ring.sring, PAGESIZE);
 583 
 584         mutex_exit(&xnfp->xnf_txlock);
 585 
 586         mutex_enter(&xnfp->xnf_rxlock);
 587 
 588         /*
 589          * Clean out any buffers currently posted to the receive ring
 590          * before we reset it.
 591          */
 592         for (i = 0, bdescp = &xnfp->xnf_rx_pkt_info[0];
 593             i < NET_RX_RING_SIZE;
 594             i++, bdescp++) {
 595                 if (*bdescp != NULL) {
 596                         xnf_buf_put(xnfp, *bdescp, B_FALSE);
 597                         *bdescp = NULL;
 598                 }
 599         }
 600 
 601         /* LINTED: constant in conditional context */
 602         SHARED_RING_INIT(xnfp->xnf_rx_ring.sring);
 603         /* LINTED: constant in conditional context */
 604         FRONT_RING_INIT(&xnfp->xnf_rx_ring,
 605             xnfp->xnf_rx_ring.sring, PAGESIZE);
 606 
 607         /*
 608          * Fill the ring with buffers.
 609          */
 610         for (i = 0; i < NET_RX_RING_SIZE; i++) {
 611                 xnf_buf_t *bdesc;
 612 
 613                 bdesc = xnf_buf_get(xnfp, KM_SLEEP, B_FALSE);
 614                 VERIFY(bdesc != NULL);
 615                 xnf_rxbuf_hang(xnfp, bdesc);
 616         }
 617 
 618         /* LINTED: constant in conditional context */
 619         RING_PUSH_REQUESTS(&xnfp->xnf_rx_ring);
 620 
 621         mutex_exit(&xnfp->xnf_rxlock);
 622 
 623         return (0);
 624 
 625 out:
 626         if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
 627                 gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
 628         xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
 629 
 630         if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
 631                 gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
 632         xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;
 633 
 634         return (err);
 635 }
 636 
 637 /*
 638  * Connect driver to back end, called to set up communication with
 639  * back end driver both initially and on resume after restore/migrate.
 640  */
 641 void
 642 xnf_be_connect(xnf_t *xnfp)
 643 {
 644         const char      *message;
 645         xenbus_transaction_t xbt;
 646         struct          xenbus_device *xsd;
 647         char            *xsname;
 648         int             err;
 649 
 650         ASSERT(!xnfp->xnf_connected);
 651 
 652         xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
 653         xsname = xvdi_get_xsname(xnfp->xnf_devinfo);
 654 
 655         err = xnf_setup_rings(xnfp);
 656         if (err != 0) {
 657                 cmn_err(CE_WARN, "failed to set up tx/rx rings");
 658                 xenbus_dev_error(xsd, err, "setting up ring");
 659                 return;
 660         }
 661 
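        /*
         * Publish the ring references, event channel and feature
         * flags to xenstore in a single transaction; if ending the
         * transaction returns EAGAIN it raced with another update
         * and the whole transaction is retried.
         */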
 662 again:
 663         err = xenbus_transaction_start(&xbt);
 664         if (err != 0) {
 665                 xenbus_dev_error(xsd, EIO, "starting transaction");
 666                 return;
 667         }
 668 
 669         err = xenbus_printf(xbt, xsname, "tx-ring-ref", "%u",
 670             xnfp->xnf_tx_ring_ref);
 671         if (err != 0) {
 672                 message = "writing tx ring-ref";
 673                 goto abort_transaction;
 674         }
 675 
 676         err = xenbus_printf(xbt, xsname, "rx-ring-ref", "%u",
 677             xnfp->xnf_rx_ring_ref);
 678         if (err != 0) {
 679                 message = "writing rx ring-ref";
 680                 goto abort_transaction;
 681         }
 682 
 683         err = xenbus_printf(xbt, xsname, "event-channel", "%u",
 684             xnfp->xnf_evtchn);
 685         if (err != 0) {
 686                 message = "writing event-channel";
 687                 goto abort_transaction;
 688         }
 689 
 690         err = xenbus_printf(xbt, xsname, "feature-rx-notify", "%d", 1);
 691         if (err != 0) {
 692                 message = "writing feature-rx-notify";
 693                 goto abort_transaction;
 694         }
 695 
 696         err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d", 1);
 697         if (err != 0) {
 698                 message = "writing request-rx-copy";
 699                 goto abort_transaction;
 700         }
 701 
 702         if (xnfp->xnf_be_mcast_control) {
 703                 err = xenbus_printf(xbt, xsname, "request-multicast-control",
 704                     "%d", 1);
 705                 if (err != 0) {
 706                         message = "writing request-multicast-control";
 707                         goto abort_transaction;
 708                 }
 709         }
 710 
 711         err = xvdi_switch_state(xnfp->xnf_devinfo, xbt, XenbusStateConnected);
 712         if (err != 0) {
 713                 message = "switching state to XenbusStateConnected";
 714                 goto abort_transaction;
 715         }
 716 
 717         err = xenbus_transaction_end(xbt, 0);
 718         if (err != 0) {
 719                 if (err == EAGAIN)
 720                         goto again;
 721                 xenbus_dev_error(xsd, err, "completing transaction");
 722         }
 723 
 724         return;
 725 
 726 abort_transaction:
 727         (void) xenbus_transaction_end(xbt, 1);
 728         xenbus_dev_error(xsd, err, "%s", message);
 729 }
 730 
 731 /*
 732  * Read configuration information from xenstore.
 733  */
 734 void
 735 xnf_read_config(xnf_t *xnfp)
 736 {
 737         int err, be_cap;
 738         char mac[ETHERADDRL * 3];
 739         char *oename = xvdi_get_oename(xnfp->xnf_devinfo);
 740 
 741         err = xenbus_scanf(XBT_NULL, oename, "mac",
 742             "%s", (char *)&mac[0]);
 743         if (err != 0) {
 744                 /*
 745                  * Bad: we are supposed to have been given a valid mac
 746                  * address by this point.
 747                  */
 748                 cmn_err(CE_WARN, "%s%d: no mac address",
 749                     ddi_driver_name(xnfp->xnf_devinfo),
 750                     ddi_get_instance(xnfp->xnf_devinfo));
 751                 return;
 752         }
 753         if (ether_aton(mac, xnfp->xnf_mac_addr) != ETHERADDRL) {
 754                 err = ENOENT;
 755                 xenbus_dev_error(xvdi_get_xsd(xnfp->xnf_devinfo), ENOENT,
 756                     "parsing %s/mac", xvdi_get_xsname(xnfp->xnf_devinfo));
 757                 return;
 758         }
 759 
 760         err = xenbus_scanf(XBT_NULL, oename,
 761             "feature-rx-copy", "%d", &be_cap);
 762         /*
 763          * If we fail to read the store we assume that the key is
 764          * absent, implying an older domain at the far end.  Older
 765          * domains cannot do HV copy.
 766          */
 767         if (err != 0)
 768                 be_cap = 0;
 769         xnfp->xnf_be_rx_copy = (be_cap != 0);
 770 
 771         err = xenbus_scanf(XBT_NULL, oename,
 772             "feature-multicast-control", "%d", &be_cap);
 773         /*
 774          * If we fail to read the store we assume that the key is
 775          * absent, implying an older domain at the far end.  Older
 776          * domains do not support multicast control.
 777          */
 778         if (err != 0)
 779                 be_cap = 0;
 780         xnfp->xnf_be_mcast_control = (be_cap != 0) && xnf_multicast_control;
 781 }
 782 
 783 /*
 784  *  attach(9E) -- Attach a device to the system
 785  */
 786 static int
 787 xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
 788 {
 789         mac_register_t *macp;
 790         xnf_t *xnfp;
 791         int err;
 792         char cachename[32];
 793 
 794 #ifdef XNF_DEBUG
 795         if (xnf_debug & XNF_DEBUG_DDI)
 796                 printf("xnf%d: attach(0x%p)\n", ddi_get_instance(devinfo),
 797                     (void *)devinfo);
 798 #endif
 799 
 800         switch (cmd) {
 801         case DDI_RESUME:
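                /*
                 * Resuming after a suspend or migration: bump the
                 * generation count, resume the xenbus device and
                 * bind the interrupt handler to a freshly allocated
                 * event channel.
                 */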
 802                 xnfp = ddi_get_driver_private(devinfo);
 803                 xnfp->xnf_gen++;
 804 
 805                 (void) xvdi_resume(devinfo);
 806                 (void) xvdi_alloc_evtchn(devinfo);
 807                 xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
 808 #ifdef XPV_HVM_DRIVER
 809                 ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr,
 810                     xnfp);
 811 #else
 812                 (void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr,
 813                     (caddr_t)xnfp);
 814 #endif
 815                 return (DDI_SUCCESS);
 816 
 817         case DDI_ATTACH:
 818                 break;
 819 
 820         default:
 821                 return (DDI_FAILURE);
 822         }
 823 
 824         /*
 825          *  Allocate the mac_register_t and xnf_t structures
 826          */
 827         macp = mac_alloc(MAC_VERSION);
 828         if (macp == NULL)
 829                 return (DDI_FAILURE);
 830         xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP);
 831 
 832         macp->m_dip = devinfo;
 833         macp->m_driver = xnfp;
 834         xnfp->xnf_devinfo = devinfo;
 835 
 836         macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
 837         macp->m_src_addr = xnfp->xnf_mac_addr;
 838         macp->m_callbacks = &xnf_callbacks;
 839         macp->m_min_sdu = 0;
 840         macp->m_max_sdu = XNF_MAXPKT;
 841 
 842         xnfp->xnf_running = B_FALSE;
 843         xnfp->xnf_connected = B_FALSE;
 844         xnfp->xnf_be_rx_copy = B_FALSE;
 845         xnfp->xnf_be_mcast_control = B_FALSE;
 846         xnfp->xnf_need_sched = B_FALSE;
 847 
 848         xnfp->xnf_rx_head = NULL;
 849         xnfp->xnf_rx_tail = NULL;
 850         xnfp->xnf_rx_new_buffers_posted = B_FALSE;
 851 
 852 #ifdef XPV_HVM_DRIVER
 853         /*
 854          * Report our version to dom0.
 855          */
 856         if (xenbus_printf(XBT_NULL, "guest/xnf", "version", "%d",
 857             HVMPV_XNF_VERS))
 858                 cmn_err(CE_WARN, "xnf: couldn't write version\n");
 859 #endif
 860 
 861         /*
 862          * Get the iblock cookie with which to initialize the mutexes.
 863          */
 864         if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->xnf_icookie)
 865             != DDI_SUCCESS)
 866                 goto failure;
 867 
 868         mutex_init(&xnfp->xnf_txlock,
 869             NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
 870         mutex_init(&xnfp->xnf_rxlock,
 871             NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
 872         mutex_init(&xnfp->xnf_schedlock,
 873             NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
 874         mutex_init(&xnfp->xnf_gref_lock,
 875             NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
 876 
 877         cv_init(&xnfp->xnf_cv_state, NULL, CV_DEFAULT, NULL);
 878         cv_init(&xnfp->xnf_cv_multicast, NULL, CV_DEFAULT, NULL);
 879         cv_init(&xnfp->xnf_cv_tx_slots, NULL, CV_DEFAULT, NULL);
 880 
 881         (void) sprintf(cachename, "xnf_buf_cache_%d",
 882             ddi_get_instance(devinfo));
 883         xnfp->xnf_buf_cache = kmem_cache_create(cachename,
 884             sizeof (xnf_buf_t), 0,
 885             xnf_buf_constructor, xnf_buf_destructor,
 886             NULL, xnfp, NULL, 0);
 887         if (xnfp->xnf_buf_cache == NULL)
 888                 goto failure_0;
 889 
 890         (void) sprintf(cachename, "xnf_tx_buf_cache_%d",
 891             ddi_get_instance(devinfo));
 892         xnfp->xnf_tx_buf_cache = kmem_cache_create(cachename,
 893             sizeof (xnf_txbuf_t), 0,
 894             xnf_tx_buf_constructor, xnf_tx_buf_destructor,
 895             NULL, xnfp, NULL, 0);
 896         if (xnfp->xnf_tx_buf_cache == NULL)
 897                 goto failure_1;
 898 
 899         xnfp->xnf_gref_head = INVALID_GRANT_REF;
 900 
 901         if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) {
 902                 cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize "
 903                     "driver data structures",
 904                     ddi_get_instance(xnfp->xnf_devinfo));
 905                 goto failure_2;
 906         }
 907 
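        /*
         * Ask for an event as soon as the first response appears on
         * either ring.
         */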
 908         xnfp->xnf_rx_ring.sring->rsp_event =
 909             xnfp->xnf_tx_ring.sring->rsp_event = 1;
 910 
 911         xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
 912         xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;
 913 
 914         /* set driver private pointer now */
 915         ddi_set_driver_private(devinfo, xnfp);
 916 
 917         if (!xnf_kstat_init(xnfp))
 918                 goto failure_3;
 919 
 920         /*
 921          * Allocate an event channel, add the interrupt handler and
 922          * bind it to the event channel.
 923          */
 924         (void) xvdi_alloc_evtchn(devinfo);
 925         xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
 926 #ifdef XPV_HVM_DRIVER
 927         ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, xnfp);
 928 #else
 929         (void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp);
 930 #endif
 931 
 932         err = mac_register(macp, &xnfp->xnf_mh);
 933         mac_free(macp);
 934         macp = NULL;
 935         if (err != 0)
 936                 goto failure_4;
 937 
 938         if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change, NULL)
 939             != DDI_SUCCESS)
 940                 goto failure_5;
 941 
 942 #ifdef XPV_HVM_DRIVER
 943         /*
 944          * In the HVM case, this driver essentially replaces a driver for
 945          * a 'real' PCI NIC. Without the "model" property set to
 946          * "Ethernet controller", like the PCI code does, netbooting does
 947          * not work correctly, as strplumb_get_netdev_path() will not find
 948          * this interface.
 949          */
 950         (void) ndi_prop_update_string(DDI_DEV_T_NONE, devinfo, "model",
 951             "Ethernet controller");
 952 #endif
 953 
 954 #ifdef XNF_DEBUG
 955         if (xnf_debug_instance == NULL)
 956                 xnf_debug_instance = xnfp;
 957 #endif
 958 
 959         return (DDI_SUCCESS);
 960 
 961 failure_5:
 962         (void) mac_unregister(xnfp->xnf_mh);
 963 
 964 failure_4:
 965 #ifdef XPV_HVM_DRIVER
 966         ec_unbind_evtchn(xnfp->xnf_evtchn);
 967         xvdi_free_evtchn(devinfo);
 968 #else
 969         ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
 970 #endif
 971         xnfp->xnf_evtchn = INVALID_EVTCHN;
 972         kstat_delete(xnfp->xnf_kstat_aux);
 973 
 974 failure_3:
 975         xnf_release_dma_resources(xnfp);
 976 
 977 failure_2:
 978         kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
 979 
 980 failure_1:
 981         kmem_cache_destroy(xnfp->xnf_buf_cache);
 982 
 983 failure_0:
 984         cv_destroy(&xnfp->xnf_cv_tx_slots);
 985         cv_destroy(&xnfp->xnf_cv_multicast);
 986         cv_destroy(&xnfp->xnf_cv_state);
 987 
 988         mutex_destroy(&xnfp->xnf_gref_lock);
 989         mutex_destroy(&xnfp->xnf_schedlock);
 990         mutex_destroy(&xnfp->xnf_rxlock);
 991         mutex_destroy(&xnfp->xnf_txlock);
 992 
 993 failure:
 994         kmem_free(xnfp, sizeof (*xnfp));
 995         if (macp != NULL)
 996                 mac_free(macp);
 997 
 998         return (DDI_FAILURE);
 999 }
1000 
1001 /*  detach(9E) -- Detach a device from the system */
1002 static int
1003 xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
1004 {
1005         xnf_t *xnfp;            /* Our private device info */
1006 
1007 #ifdef XNF_DEBUG
1008         if (xnf_debug & XNF_DEBUG_DDI)
1009                 printf("xnf_detach(0x%p)\n", (void *)devinfo);
1010 #endif
1011 
1012         xnfp = ddi_get_driver_private(devinfo);
1013 
1014         switch (cmd) {
1015         case DDI_SUSPEND:
1016 #ifdef XPV_HVM_DRIVER
1017                 ec_unbind_evtchn(xnfp->xnf_evtchn);
1018                 xvdi_free_evtchn(devinfo);
1019 #else
1020                 ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
1021 #endif
1022 
1023                 xvdi_suspend(devinfo);
1024 
1025                 mutex_enter(&xnfp->xnf_rxlock);
1026                 mutex_enter(&xnfp->xnf_txlock);
1027 
1028                 xnfp->xnf_evtchn = INVALID_EVTCHN;
1029                 xnfp->xnf_connected = B_FALSE;
1030                 mutex_exit(&xnfp->xnf_txlock);
1031                 mutex_exit(&xnfp->xnf_rxlock);
1032 
1033                 /* claim link to be down after disconnect */
1034                 mac_link_update(xnfp->xnf_mh, LINK_STATE_DOWN);
1035                 return (DDI_SUCCESS);
1036 
1037         case DDI_DETACH:
1038                 break;
1039 
1040         default:
1041                 return (DDI_FAILURE);
1042         }
1043 
1044         if (xnfp->xnf_connected)
1045                 return (DDI_FAILURE);
1046 
1047         /*
1048          * Cannot detach if we have xnf_buf_t outstanding.
1049          */
1050         if (xnfp->xnf_stat_buf_allocated > 0)
1051                 return (DDI_FAILURE);
1052 
1053         if (mac_unregister(xnfp->xnf_mh) != 0)
1054                 return (DDI_FAILURE);
1055 
1056         kstat_delete(xnfp->xnf_kstat_aux);
1057 
1058         /* Stop the receiver */
1059         xnf_stop(xnfp);
1060 
1061         xvdi_remove_event_handler(devinfo, XS_OE_STATE);
1062 
1063         /* Remove the interrupt */
1064 #ifdef XPV_HVM_DRIVER
1065         ec_unbind_evtchn(xnfp->xnf_evtchn);
1066         xvdi_free_evtchn(devinfo);
1067 #else
1068         ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
1069 #endif
1070 
1071         /* Release any pending xmit mblks */
1072         xnf_release_mblks(xnfp);
1073 
1074         /* Release all DMA resources */
1075         xnf_release_dma_resources(xnfp);
1076 
1077         cv_destroy(&xnfp->xnf_cv_tx_slots);
1078         cv_destroy(&xnfp->xnf_cv_multicast);
1079         cv_destroy(&xnfp->xnf_cv_state);
1080 
1081         kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
1082         kmem_cache_destroy(xnfp->xnf_buf_cache);
1083 
1084         mutex_destroy(&xnfp->xnf_gref_lock);
1085         mutex_destroy(&xnfp->xnf_schedlock);
1086         mutex_destroy(&xnfp->xnf_rxlock);
1087         mutex_destroy(&xnfp->xnf_txlock);
1088 
1089         kmem_free(xnfp, sizeof (*xnfp));
1090 
1091         return (DDI_SUCCESS);
1092 }
1093 
1094 /*
1095  *  xnf_set_mac_addr() -- set the physical network address on the board.
1096  */
1097 static int
1098 xnf_set_mac_addr(void *arg, const uint8_t *macaddr)
1099 {
1100         _NOTE(ARGUNUSED(arg, macaddr));
1101 
1102         /*
1103          * We can't set our macaddr.
1104          */
1105         return (ENOTSUP);
1106 }
1107 
1108 /*
1109  *  xnf_set_multicast() -- set (enable) or disable a multicast address.
1110  *
1111  *  Program the hardware to enable/disable the multicast address
1112  *  in "mca".  Enable if "add" is true, disable if false.
1113  */
1114 static int
1115 xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
1116 {
1117         xnf_t *xnfp = arg;
1118         xnf_txbuf_t *txp;
1119         int n_slots;
1120         RING_IDX slot;
1121         xnf_txid_t *tidp;
1122         netif_tx_request_t *txrp;
1123         struct netif_extra_info *erp;
1124         boolean_t notify, result;
1125 
1126         /*
1127          * If the backend does not support multicast control then we
1128          * must assume that the right packets will just arrive.
1129          */
1130         if (!xnfp->xnf_be_mcast_control)
1131                 return (0);
1132 
1133         txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);
1134 
1135         mutex_enter(&xnfp->xnf_txlock);
1136 
1137         /*
1138          * If we're not yet connected then claim success. This is
1139          * acceptable because we refresh the entire set of multicast
1140          * addresses when we get connected.
1141          *
1142          * We can't wait around here because the MAC layer expects
1143          * this to be a non-blocking operation - waiting ends up
1144          * causing a deadlock during resume.
1145          */
1146         if (!xnfp->xnf_connected) {
1147                 mutex_exit(&xnfp->xnf_txlock);
1148                 return (0);
1149         }
1150 
1151         /*
1152          * 1. Acquire two slots in the ring.
1153          * 2. Fill in the slots.
1154          * 3. Request notification when the operation is done.
1155          * 4. Kick the peer.
1156          * 5. Wait for the response via xnf_tx_clean_ring().
1157          */
1158 
1159         n_slots = tx_slots_get(xnfp, 2, B_TRUE);
1160         ASSERT(n_slots >= 2);
1161 
1162         slot = xnfp->xnf_tx_ring.req_prod_pvt;
1163         tidp = txid_get(xnfp);
1164         VERIFY(tidp != NULL);
1165 
1166         txp->tx_type = TX_MCAST_REQ;
1167         txp->tx_slot = slot;
1168 
1169         txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1170         erp = (struct netif_extra_info *)
1171             RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot + 1);
1172 
1173         txrp->gref = 0;
1174         txrp->size = 0;
1175         txrp->offset = 0;
1176         /* Set tx_txreq.id to appease xnf_tx_clean_ring(). */
1177         txrp->id = txp->tx_txreq.id = tidp->id;
1178         txrp->flags = NETTXF_extra_info;
1179 
1180         erp->type = add ? XEN_NETIF_EXTRA_TYPE_MCAST_ADD :
1181             XEN_NETIF_EXTRA_TYPE_MCAST_DEL;
1182         bcopy((void *)mca, &erp->u.mcast.addr, ETHERADDRL);
1183 
1184         tidp->txbuf = txp;
1185 
1186         xnfp->xnf_tx_ring.req_prod_pvt = slot + 2;
1187 
1188         mutex_enter(&xnfp->xnf_schedlock);
1189         xnfp->xnf_pending_multicast++;
1190         mutex_exit(&xnfp->xnf_schedlock);
1191 
1192         /* LINTED: constant in conditional context */
1193         RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
1194             notify);
1195         if (notify)
1196                 ec_notify_via_evtchn(xnfp->xnf_evtchn);
1197 
1198         while (txp->tx_type == TX_MCAST_REQ)
1199                 cv_wait(&xnfp->xnf_cv_multicast,
1200                     &xnfp->xnf_txlock);
1201 
1202         ASSERT(txp->tx_type == TX_MCAST_RSP);
1203 
1204         mutex_enter(&xnfp->xnf_schedlock);
1205         xnfp->xnf_pending_multicast--;
1206         mutex_exit(&xnfp->xnf_schedlock);
1207 
1208         result = (txp->tx_status == NETIF_RSP_OKAY);
1209 
1210         txid_put(xnfp, tidp);
1211 
1212         mutex_exit(&xnfp->xnf_txlock);
1213 
1214         kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
1215 
1216         return (result ? 0 : 1);
1217 }
1218 
1219 /*
1220  * xnf_set_promiscuous() -- set or reset promiscuous mode on the board
1221  *
1222  *  Program the hardware to enable/disable promiscuous mode.
1223  */
1224 static int
1225 xnf_set_promiscuous(void *arg, boolean_t on)
1226 {
1227         _NOTE(ARGUNUSED(arg, on));
1228 
1229         /*
1230          * We can't really do this, but we pretend that we can in
1231          * order that snoop will work.
1232          */
1233         return (0);
1234 }
1235 
1236 /*
1237  * Clean buffers that we have responses for from the transmit ring.
1238  */
1239 static int
1240 xnf_tx_clean_ring(xnf_t *xnfp)
1241 {
1242         boolean_t work_to_do;
1243 
1244         ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
1245 
1246 loop:
1247         while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_tx_ring)) {
1248                 RING_IDX cons, prod, i;
1249 
1250                 cons = xnfp->xnf_tx_ring.rsp_cons;
1251                 prod = xnfp->xnf_tx_ring.sring->rsp_prod;
1252                 membar_consumer();
1253                 /*
1254                  * Clean tx requests from ring that we have responses
1255                  * for.
1256                  */
1257                 DTRACE_PROBE2(xnf_tx_clean_range, int, cons, int, prod);
1258                 for (i = cons; i != prod; i++) {
1259                         netif_tx_response_t *trp;
1260                         xnf_txid_t *tidp;
1261                         xnf_txbuf_t *txp;
1262 
1263                         trp = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i);
1264                         ASSERT(TX_ID_VALID(trp->id));
1265 
1266                         tidp = TX_ID_TO_TXID(xnfp, trp->id);
1267                         ASSERT(tidp->id == trp->id);
1268                         ASSERT(tidp->next == INVALID_TX_ID);
1269 
1270                         txp = tidp->txbuf;
1271                         ASSERT(txp != NULL);
1272                         ASSERT(txp->tx_txreq.id == trp->id);
1273 
1274                         switch (txp->tx_type) {
1275                         case TX_DATA:
1276                                 if (gnttab_query_foreign_access(
1277                                     txp->tx_txreq.gref) != 0)
1278                                         cmn_err(CE_PANIC,
1279                                             "tx grant %d still in use by "
1280                                             "backend domain",
1281                                             txp->tx_txreq.gref);
1282 
1283                                 if (txp->tx_bdesc == NULL) {
1284                                         (void) gnttab_end_foreign_access_ref(
1285                                             txp->tx_txreq.gref, 1);
1286                                         gref_put(xnfp, txp->tx_txreq.gref);
1287                                         (void) ddi_dma_unbind_handle(
1288                                             txp->tx_dma_handle);
1289                                 } else {
1290                                         xnf_buf_put(xnfp, txp->tx_bdesc,
1291                                             B_TRUE);
1292                                 }
1293 
1294                                 freemsg(txp->tx_mp);
1295                                 txid_put(xnfp, tidp);
1296                                 kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
1297 
1298                                 break;
1299 
1300                         case TX_MCAST_REQ:
1301                                 txp->tx_type = TX_MCAST_RSP;
1302                                 txp->tx_status = trp->status;
1303                                 cv_broadcast(&xnfp->xnf_cv_multicast);
1304 
1305                                 break;
1306 
1307                         case TX_MCAST_RSP:
1308                                 break;
1309 
1310                         default:
1311                                 cmn_err(CE_PANIC, "xnf_tx_clean_ring: "
1312                                     "invalid xnf_txbuf_t type: %d",
1313                                     txp->tx_type);
1314                                 break;
1315                         }
1316                 }
1317                 /*
1318                  * Record the last response we dealt with so that we
1319                  * know where to start next time around.
1320                  */
1321                 xnfp->xnf_tx_ring.rsp_cons = prod;
1322                 membar_enter();
1323         }
1324 
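        /*
         * Re-arm response notification and check whether further
         * responses arrived while we were processing; if so, go
         * round again rather than miss them.
         */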
1325         /* LINTED: constant in conditional context */
1326         RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_tx_ring, work_to_do);
1327         if (work_to_do)
1328                 goto loop;
1329 
1330         return (RING_FREE_REQUESTS(&xnfp->xnf_tx_ring));
1331 }
1332 
1333 /*
1334  * Allocate and fill in a look-aside buffer for the packet `mp'. Used
1335  * to ensure that the packet is physically contiguous and contained
1336  * within a single page.
1337  */
1338 static xnf_buf_t *
1339 xnf_tx_pullup(xnf_t *xnfp, mblk_t *mp)
1340 {
1341         xnf_buf_t *bd;
1342         caddr_t bp;
1343 
1344         bd = xnf_buf_get(xnfp, KM_SLEEP, B_TRUE);
1345         if (bd == NULL)
1346                 return (NULL);
1347 
1348         bp = bd->buf;
1349         while (mp != NULL) {
1350                 size_t len = MBLKL(mp);
1351 
1352                 bcopy(mp->b_rptr, bp, len);
1353                 bp += len;
1354 
1355                 mp = mp->b_cont;
1356         }
1357 
1358         ASSERT((bp - bd->buf) <= PAGESIZE);
1359 
1360         xnfp->xnf_stat_tx_pullup++;
1361 
1362         return (bd);
1363 }
1364 
1365 /*
1366  * Insert the pseudo-header checksum into the packet `buf'.
1367  */
1368 void
1369 xnf_pseudo_cksum(caddr_t buf, int length)
1370 {
1371         struct ether_header *ehp;
1372         uint16_t sap, len, *stuff;
1373         uint32_t cksum;
1374         size_t offset;
1375         ipha_t *ipha;
1376         ipaddr_t src, dst;
1377 
1378         ASSERT(length >= sizeof (*ehp));
1379         ehp = (struct ether_header *)buf;
1380 
1381         if (ntohs(ehp->ether_type) == VLAN_TPID) {
1382                 struct ether_vlan_header *evhp;
1383 
1384                 ASSERT(length >= sizeof (*evhp));
1385                 evhp = (struct ether_vlan_header *)buf;
1386                 sap = ntohs(evhp->ether_type);
1387                 offset = sizeof (*evhp);
1388         } else {
1389                 sap = ntohs(ehp->ether_type);
1390                 offset = sizeof (*ehp);
1391         }
1392 
1393         ASSERT(sap == ETHERTYPE_IP);
1394 
1395         /* Packet should have been pulled up by the caller. */
1396         if ((offset + sizeof (ipha_t)) > length) {
1397                 cmn_err(CE_WARN, "xnf_pseudo_cksum: no room for checksum");
1398                 return;
1399         }
1400 
1401         ipha = (ipha_t *)(buf + offset);
1402 
1403         ASSERT(IPH_HDR_LENGTH(ipha) == IP_SIMPLE_HDR_LENGTH);
1404 
1405         len = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH;
1406 
1407         switch (ipha->ipha_protocol) {
1408         case IPPROTO_TCP:
1409                 stuff = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
1410                 cksum = IP_TCP_CSUM_COMP;
1411                 break;
1412         case IPPROTO_UDP:
1413                 stuff = IPH_UDPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
1414                 cksum = IP_UDP_CSUM_COMP;
1415                 break;
1416         default:
1417                 cmn_err(CE_WARN, "xnf_pseudo_cksum: unexpected protocol %d",
1418                     ipha->ipha_protocol);
1419                 return;
1420         }
1421 
1422         src = ipha->ipha_src;
1423         dst = ipha->ipha_dst;
1424 
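        /*
         * Accumulate the remaining pseudo-header components (source
         * and destination addresses and the upper-layer length) on
         * top of the protocol constant, then fold the 32-bit sum
         * down to 16 bits.
         */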
1425         cksum += (dst >> 16) + (dst & 0xFFFF);
1426         cksum += (src >> 16) + (src & 0xFFFF);
1427         cksum += htons(len);
1428 
1429         cksum = (cksum >> 16) + (cksum & 0xFFFF);
1430         cksum = (cksum >> 16) + (cksum & 0xFFFF);
1431 
1432         ASSERT(cksum <= 0xFFFF);
1433 
1434         *stuff = (uint16_t)(cksum ? cksum : ~cksum);
1435 }
1436 
1437 /*
1438  * Push a list of prepared packets (`txp') into the transmit ring.
1439  */
1440 static xnf_txbuf_t *
1441 tx_push_packets(xnf_t *xnfp, xnf_txbuf_t *txp)
1442 {
1443         int slots_free;
1444         RING_IDX slot;
1445         boolean_t notify;
1446 
1447         mutex_enter(&xnfp->xnf_txlock);
1448 
1449         ASSERT(xnfp->xnf_running);
1450 
1451         /*
1452          * Wait until we are connected to the backend.
1453          */
1454         while (!xnfp->xnf_connected)
1455                 cv_wait(&xnfp->xnf_cv_state, &xnfp->xnf_txlock);
1456 
1457         slots_free = tx_slots_get(xnfp, 1, B_FALSE);
1458         DTRACE_PROBE1(xnf_send_slotsfree, int, slots_free);
1459 
1460         slot = xnfp->xnf_tx_ring.req_prod_pvt;
1461 
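        /*
         * Consume one transmit id and one ring slot per prepared
         * packet until we run out of packets or of free slots; any
         * packets not queued are handed back to the caller.
         */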
1462         while ((txp != NULL) && (slots_free > 0)) {
1463                 xnf_txid_t *tidp;
1464                 netif_tx_request_t *txrp;
1465 
1466                 tidp = txid_get(xnfp);
1467                 VERIFY(tidp != NULL);
1468 
1469                 txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1470 
1471                 txp->tx_slot = slot;
1472                 txp->tx_txreq.id = tidp->id;
1473                 *txrp = txp->tx_txreq;
1474 
1475                 tidp->txbuf = txp;
1476 
1477                 xnfp->xnf_stat_opackets++;
1478                 xnfp->xnf_stat_obytes += txp->tx_txreq.size;
1479 
1480                 txp = txp->tx_next;
1481                 slots_free--;
1482                 slot++;
1483 
1484         }
1485 
1486         xnfp->xnf_tx_ring.req_prod_pvt = slot;
1487 
1488         /*
1489          * Tell the peer that we sent something, if it cares.
1490          */
1491         /* LINTED: constant in conditional context */
1492         RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
1493             notify);
1494         if (notify)
1495                 ec_notify_via_evtchn(xnfp->xnf_evtchn);
1496 
1497         mutex_exit(&xnfp->xnf_txlock);
1498 
1499         return (txp);
1500 }
1501 
1502 /*
1503  * Send the chain of packets `mp'. Called by the MAC framework.
1504  */
1505 static mblk_t *
1506 xnf_send(void *arg, mblk_t *mp)
1507 {
1508         xnf_t *xnfp = arg;
1509         domid_t oeid;
1510         xnf_txbuf_t *head, *tail;
1511         mblk_t *ml;
1512         int prepared;
1513 
1514         oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
1515 
1516         /*
1517          * Prepare packets for transmission.
1518          */
1519         head = tail = NULL;
1520         prepared = 0;
1521         while (mp != NULL) {
1522                 xnf_txbuf_t *txp;
1523                 int n_chunks, length;
1524                 boolean_t page_oops;
1525                 uint32_t pflags;
1526 
1527                 for (ml = mp, n_chunks = length = 0, page_oops = B_FALSE;
1528                     ml != NULL;
1529                     ml = ml->b_cont, n_chunks++) {
1530 
1531                         /*
1532                          * Test if this buffer includes a page
1533                          * boundary. The test assumes that the range
1534                          * b_rptr...b_wptr can include only a single
1535                          * boundary.
1536                          */
1537                         if (xnf_btop((size_t)ml->b_rptr) !=
1538                             xnf_btop((size_t)ml->b_wptr)) {
1539                                 xnfp->xnf_stat_tx_pagebndry++;
1540                                 page_oops = B_TRUE;
1541                         }
1542 
1543                         length += MBLKL(ml);
1544                 }
1545                 DTRACE_PROBE1(xnf_send_b_cont, int, n_chunks);
1546 
1547                 /*
1548                  * Make sure packet isn't too large.
1549                  */
1550                 if (length > XNF_FRAMESIZE) {
1551                         cmn_err(CE_WARN,
1552                             "xnf%d: oversized packet (%d bytes) dropped",
1553                             ddi_get_instance(xnfp->xnf_devinfo), length);
1554                         freemsg(mp);
1555                         continue;
1556                 }
1557 
1558                 txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);
1559 
1560                 txp->tx_type = TX_DATA;
1561 
1562                 if ((n_chunks > xnf_max_tx_frags) || page_oops) {
1563                         /*
1564                          * Loan a side buffer rather than the mblk
1565                          * itself.
1566                          */
1567                         txp->tx_bdesc = xnf_tx_pullup(xnfp, mp);
1568                         if (txp->tx_bdesc == NULL) {
1569                                 kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
1570                                 break;
1571                         }
1572 
1573                         txp->tx_bufp = txp->tx_bdesc->buf;
1574                         txp->tx_mfn = txp->tx_bdesc->buf_mfn;
1575                         txp->tx_txreq.gref = txp->tx_bdesc->grant_ref;
1576 
1577                 } else {
1578                         int rc;
1579                         ddi_dma_cookie_t dma_cookie;
1580                         uint_t ncookies;
1581 
1582                         rc = ddi_dma_addr_bind_handle(txp->tx_dma_handle,
1583                             NULL, (char *)mp->b_rptr, length,
1584                             DDI_DMA_WRITE | DDI_DMA_STREAMING,
1585                             DDI_DMA_DONTWAIT, 0, &dma_cookie,
1586                             &ncookies);
1587                         if (rc != DDI_DMA_MAPPED) {
1588                                 ASSERT(rc != DDI_DMA_INUSE);
1589                                 ASSERT(rc != DDI_DMA_PARTIAL_MAP);
1590 
1591 #ifdef XNF_DEBUG
1592                                 if (rc != DDI_DMA_NORESOURCES)
1593                                         cmn_err(CE_WARN,
1594                                             "xnf%d: bind_handle failed (%x)",
1595                                             ddi_get_instance(xnfp->xnf_devinfo),
1596                                             rc);
1597 #endif
1598                                 kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
1599                                 break;
1600                         }
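                             /*
                              * The payload lies within a single page
                              * (checked above), so the bind should yield
                              * exactly one cookie.
                              */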
1601                         ASSERT(ncookies == 1);
1602 
1603                         txp->tx_bdesc = NULL;
1604                         txp->tx_bufp = (caddr_t)mp->b_rptr;
1605                         txp->tx_mfn =
1606                             xnf_btop(pa_to_ma(dma_cookie.dmac_laddress));
1607                         txp->tx_txreq.gref = gref_get(xnfp);
1608                         if (txp->tx_txreq.gref == INVALID_GRANT_REF) {
1609                                 (void) ddi_dma_unbind_handle(
1610                                     txp->tx_dma_handle);
1611                                 kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
1612                                 break;
1613                         }
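                             /*
                              * Grant the backend read-only access to the
                              * frame holding the payload.
                              */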
1614                         gnttab_grant_foreign_access_ref(txp->tx_txreq.gref,
1615                             oeid, txp->tx_mfn, 1);
1616                 }
1617 
1618                 txp->tx_next = NULL;
1619                 txp->tx_mp = mp;
1620                 txp->tx_txreq.size = length;
1621                 txp->tx_txreq.offset = (uintptr_t)txp->tx_bufp & PAGEOFFSET;
1622                 txp->tx_txreq.flags = 0;
1623                 mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &pflags);
1624                 if (pflags != 0) {
1625                         /*
1626                          * If the local protocol stack requests checksum
1627                          * offload we set the 'checksum blank' flag,
1628                          * indicating to the peer that we need the checksum
1629                          * calculated for us.
1630                          *
1631                          * We _don't_ set the validated flag, because we haven't
1632                          * validated that the data and the checksum match.
1633                          */
1634                         xnf_pseudo_cksum(txp->tx_bufp, length);
1635                         txp->tx_txreq.flags |= NETTXF_csum_blank;
1636 
1637                         xnfp->xnf_stat_tx_cksum_deferred++;
1638                 }
1639 
1640                 if (head == NULL) {
1641                         ASSERT(tail == NULL);
1642 
1643                         head = txp;
1644                 } else {
1645                         ASSERT(tail != NULL);
1646 
1647                         tail->tx_next = txp;
1648                 }
1649                 tail = txp;
1650 
1651                 mp = mp->b_next;
1652                 prepared++;
1653 
1654                 /*
1655                  * There is no point in preparing more than
1656                  * NET_TX_RING_SIZE, as we won't be able to push them
1657                  * into the ring in one go and would hence have to
1658                  * un-prepare the extra.
1659                  */
1660                 if (prepared == NET_TX_RING_SIZE)
1661                         break;
1662         }
1663 
1664         DTRACE_PROBE1(xnf_send_prepared, int, prepared);
1665 
1666         if (mp != NULL) {
1667 #ifdef XNF_DEBUG
1668                 int notprepared = 0;
1669                 mblk_t *l = mp;
1670 
1671                 while (l != NULL) {
1672                         notprepared++;
1673                         l = l->b_next;
1674                 }
1675 
1676                 DTRACE_PROBE1(xnf_send_notprepared, int, notprepared);
1677 #else /* !XNF_DEBUG */
1678                 DTRACE_PROBE1(xnf_send_notprepared, int, -1);
1679 #endif /* XNF_DEBUG */
1680         }
1681 
1682         /*
1683          * Push the packets we have prepared into the ring. They may
1684          * not all go.
1685          */
1686         if (head != NULL)
1687                 head = tx_push_packets(xnfp, head);
1688 
1689         /*
1690          * If some packets that we prepared were not sent, unprepare
1691          * them and add them back to the head of those we didn't
1692          * prepare.
1693          */
1694         {
1695                 xnf_txbuf_t *loop;
1696                 mblk_t *mp_head, *mp_tail;
1697                 int unprepared = 0;
1698 
1699                 mp_head = mp_tail = NULL;
1700                 loop = head;
1701 
1702                 while (loop != NULL) {
1703                         xnf_txbuf_t *next = loop->tx_next;
1704 
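                             /*
                              * A NULL tx_bdesc means the mblk was bound
                              * directly for DMA; revoke the grant and
                              * release the DMA binding. Otherwise return
                              * the look-aside buffer.
                              */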
1705                         if (loop->tx_bdesc == NULL) {
1706                                 (void) gnttab_end_foreign_access_ref(
1707                                     loop->tx_txreq.gref, 1);
1708                                 gref_put(xnfp, loop->tx_txreq.gref);
1709                                 (void) ddi_dma_unbind_handle(
1710                                     loop->tx_dma_handle);
1711                         } else {
1712                                 xnf_buf_put(xnfp, loop->tx_bdesc, B_TRUE);
1713                         }
1714 
1715                         ASSERT(loop->tx_mp != NULL);
1716                         if (mp_head == NULL)
1717                                 mp_head = loop->tx_mp;
1718                         mp_tail = loop->tx_mp;
1719 
1720                         kmem_cache_free(xnfp->xnf_tx_buf_cache, loop);
1721                         loop = next;
1722                         unprepared++;
1723                 }
1724 
1725                 if (mp_tail == NULL) {
1726                         ASSERT(mp_head == NULL);
1727                 } else {
1728                         ASSERT(mp_head != NULL);
1729 
1730                         mp_tail->b_next = mp;
1731                         mp = mp_head;
1732                 }
1733 
1734                 DTRACE_PROBE1(xnf_send_unprepared, int, unprepared);
1735         }
1736 
1737         /*
1738          * If any mblks are left then we have deferred for some reason
1739          * and need to ask for a re-schedule later. This is typically
1740          * due to the ring filling.
1741          */
1742         if (mp != NULL) {
1743                 mutex_enter(&xnfp->xnf_schedlock);
1744                 xnfp->xnf_need_sched = B_TRUE;
1745                 mutex_exit(&xnfp->xnf_schedlock);
1746 
1747                 xnfp->xnf_stat_tx_defer++;
1748         }
1749 
1750         return (mp);
1751 }
1752 
1753 /*
1754  * Notification of RX packets. Currently no TX-complete interrupt is
1755  * used, as we clean the TX ring lazily.
1756  */
1757 static uint_t
1758 xnf_intr(caddr_t arg)
1759 {
1760         xnf_t *xnfp = (xnf_t *)arg;
1761         mblk_t *mp;
1762         boolean_t need_sched, clean_ring;
1763 
1764         mutex_enter(&xnfp->xnf_rxlock);
1765 
1766         /*
1767          * Interrupts before we are connected are spurious.
1768          */
1769         if (!xnfp->xnf_connected) {
1770                 mutex_exit(&xnfp->xnf_rxlock);
1771                 xnfp->xnf_stat_unclaimed_interrupts++;
1772                 return (DDI_INTR_UNCLAIMED);
1773         }
1774 
1775         /*
1776          * Receive side processing.
1777          */
1778         do {
1779                 /*
1780                  * Collect buffers from the ring.
1781                  */
1782                 xnf_rx_collect(xnfp);
1783 
1784                 /*
1785                  * Interrupt me when the next receive buffer is consumed.
1786                  */
1787                 xnfp->xnf_rx_ring.sring->rsp_event =
1788                     xnfp->xnf_rx_ring.rsp_cons + 1;
1789                 xen_mb();
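                     /*
                      * The barrier makes the event index update visible
                      * to the backend before we re-check the ring; the
                      * loop then picks up any responses posted in the
                      * meantime.
                      */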
1790 
1791         } while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring));
1792 
1793         if (xnfp->xnf_rx_new_buffers_posted) {
1794                 boolean_t notify;
1795 
1796                 /*
1797                  * Indicate to the peer that we have re-filled the
1798                  * receive ring, if it cares.
1799                  */
1800                 /* LINTED: constant in conditional context */
1801                 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
1802                 if (notify)
1803                         ec_notify_via_evtchn(xnfp->xnf_evtchn);
1804                 xnfp->xnf_rx_new_buffers_posted = B_FALSE;
1805         }
1806 
1807         mp = xnfp->xnf_rx_head;
1808         xnfp->xnf_rx_head = xnfp->xnf_rx_tail = NULL;
1809 
1810         xnfp->xnf_stat_interrupts++;
1811         mutex_exit(&xnfp->xnf_rxlock);
1812 
1813         if (mp != NULL)
1814                 mac_rx(xnfp->xnf_mh, NULL, mp);
1815 
1816         /*
1817          * Transmit side processing.
1818          *
1819          * If a previous transmit attempt failed or we have pending
1820          * multicast requests, clean the ring.
1821          *
1822          * If we previously stalled transmission and cleaning produces
1823          * some free slots, tell upstream to attempt sending again.
1824          *
1825          * The odd style is to avoid acquiring xnf_txlock unless we
1826          * will actually look inside the tx machinery.
1827          */
1828         mutex_enter(&xnfp->xnf_schedlock);
1829         need_sched = xnfp->xnf_need_sched;
1830         clean_ring = need_sched || (xnfp->xnf_pending_multicast > 0);
1831         mutex_exit(&xnfp->xnf_schedlock);
1832 
1833         if (clean_ring) {
1834                 int free_slots;
1835 
1836                 mutex_enter(&xnfp->xnf_txlock);
1837                 free_slots = tx_slots_get(xnfp, 0, B_FALSE);
1838 
1839                 if (need_sched && (free_slots > 0)) {
1840                         mutex_enter(&xnfp->xnf_schedlock);
1841                         xnfp->xnf_need_sched = B_FALSE;
1842                         mutex_exit(&xnfp->xnf_schedlock);
1843 
1844                         mac_tx_update(xnfp->xnf_mh);
1845                 }
1846                 mutex_exit(&xnfp->xnf_txlock);
1847         }
1848 
1849         return (DDI_INTR_CLAIMED);
1850 }
1851 
1852 /*
1853  *  xnf_start() -- start the interface; begin accepting packets from above.
1854  */
1855 static int
1856 xnf_start(void *arg)
1857 {
1858         xnf_t *xnfp = arg;
1859 
1860 #ifdef XNF_DEBUG
1861         if (xnf_debug & XNF_DEBUG_TRACE)
1862                 printf("xnf%d start(0x%p)\n",
1863                     ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
1864 #endif
1865 
1866         mutex_enter(&xnfp->xnf_rxlock);
1867         mutex_enter(&xnfp->xnf_txlock);
1868 
1869         /* Accept packets from above. */
1870         xnfp->xnf_running = B_TRUE;
1871 
1872         mutex_exit(&xnfp->xnf_txlock);
1873         mutex_exit(&xnfp->xnf_rxlock);
1874 
1875         return (0);
1876 }
1877 
1878 /* xnf_stop() - stop accepting packets from above */
1879 static void
1880 xnf_stop(void *arg)
1881 {
1882         xnf_t *xnfp = arg;
1883 
1884 #ifdef XNF_DEBUG
1885         if (xnf_debug & XNF_DEBUG_TRACE)
1886                 printf("xnf%d stop(0x%p)\n",
1887                     ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
1888 #endif
1889 
1890         mutex_enter(&xnfp->xnf_rxlock);
1891         mutex_enter(&xnfp->xnf_txlock);
1892 
1893         xnfp->xnf_running = B_FALSE;
1894 
1895         mutex_exit(&xnfp->xnf_txlock);
1896         mutex_exit(&xnfp->xnf_rxlock);
1897 }
1898 
1899 /*
1900  * Hang buffer `bdesc' on the RX ring.
1901  */
1902 static void
1903 xnf_rxbuf_hang(xnf_t *xnfp, xnf_buf_t *bdesc)
1904 {
1905         netif_rx_request_t *reqp;
1906         RING_IDX hang_ix;
1907 
1908         ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
1909 
1910         reqp = RING_GET_REQUEST(&xnfp->xnf_rx_ring,
1911             xnfp->xnf_rx_ring.req_prod_pvt);
1912         hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0));
1913         ASSERT(xnfp->xnf_rx_pkt_info[hang_ix] == NULL);
1914 
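             /*
              * The slot index doubles as the request id; the id in the
              * corresponding response is later used to look this buffer
              * up in xnf_rx_pkt_info.
              */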
1915         reqp->id = bdesc->id = hang_ix;
1916         reqp->gref = bdesc->grant_ref;
1917 
1918         xnfp->xnf_rx_pkt_info[hang_ix] = bdesc;
1919         xnfp->xnf_rx_ring.req_prod_pvt++;
1920 
1921         xnfp->xnf_rx_new_buffers_posted = B_TRUE;
1922 }
1923 
1924 /*
1925  * Collect packets from the RX ring, storing them in `xnfp' for later
1926  * use.
1927  */
1928 static void
1929 xnf_rx_collect(xnf_t *xnfp)
1930 {
1931         mblk_t *head, *tail;
1932 
1933         ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
1934 
1935         /*
1936          * Loop over unconsumed responses:
1937          * 1. get a response
1938          * 2. take corresponding buffer off recv. ring
1939          * 3. indicate this by setting slot to NULL
1940          * 4. create a new message and
1941          * 5. copy data in, adjust ptr
1942          */
1943 
1944         head = tail = NULL;
1945 
1946         while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {
1947                 netif_rx_response_t *rxpkt;
1948                 xnf_buf_t *bdesc;
1949                 ssize_t len;
1950                 size_t off;
1951                 mblk_t *mp = NULL;
1952                 boolean_t hwcsum = B_FALSE;
1953                 grant_ref_t ref;
1954 
1955                 /* 1. */
1956                 rxpkt = RING_GET_RESPONSE(&xnfp->xnf_rx_ring,
1957                     xnfp->xnf_rx_ring.rsp_cons);
1958 
1959                 DTRACE_PROBE4(xnf_rx_got_rsp, int, (int)rxpkt->id,
1960                     int, (int)rxpkt->offset,
1961                     int, (int)rxpkt->flags,
1962                     int, (int)rxpkt->status);
1963 
1964                 /*
1965                  * 2.
1966                  */
1967                 bdesc = xnfp->xnf_rx_pkt_info[rxpkt->id];
1968 
1969                 /*
1970                  * 3.
1971                  */
1972                 xnfp->xnf_rx_pkt_info[rxpkt->id] = NULL;
1973                 ASSERT(bdesc->id == rxpkt->id);
1974 
1975                 ref = bdesc->grant_ref;
1976                 off = rxpkt->offset;
1977                 len = rxpkt->status;
1978 
1979                 if (!xnfp->xnf_running) {
1980                         DTRACE_PROBE4(xnf_rx_not_running,
1981                             int, rxpkt->status,
1982                             char *, bdesc->buf, int, rxpkt->offset,
1983                             char *, ((char *)bdesc->buf) + rxpkt->offset);
1984 
1985                         xnfp->xnf_stat_drop++;
1986 
1987                 } else if (len <= 0) {
1988                         DTRACE_PROBE4(xnf_rx_pkt_status_negative,
1989                             int, rxpkt->status,
1990                             char *, bdesc->buf, int, rxpkt->offset,
1991                             char *, ((char *)bdesc->buf) + rxpkt->offset);
1992 
1993                         xnfp->xnf_stat_errrx++;
1994 
1995                         switch (len) {
1996                         case 0:
1997                                 xnfp->xnf_stat_runt++;
1998                                 break;
1999                         case NETIF_RSP_ERROR:
2000                                 xnfp->xnf_stat_mac_rcv_error++;
2001                                 break;
2002                         case NETIF_RSP_DROPPED:
2003                                 xnfp->xnf_stat_norxbuf++;
2004                                 break;
2005                         }
2006 
2007                 } else if (bdesc->grant_ref == INVALID_GRANT_REF) {
2008                         cmn_err(CE_WARN, "Bad rx grant reference %d "
2009                             "from domain %d", ref,
2010                             xvdi_get_oeid(xnfp->xnf_devinfo));
2011 
2012                 } else if ((off + len) > PAGESIZE) {
2013                         cmn_err(CE_WARN, "Rx packet overflows page "
2014                             "(offset %ld, length %ld) from domain %d",
2015                             off, len, xvdi_get_oeid(xnfp->xnf_devinfo));
2016                 } else {
2017                         xnf_buf_t *nbuf = NULL;
2018 
2019                         DTRACE_PROBE4(xnf_rx_packet, int, len,
2020                             char *, bdesc->buf, int, off,
2021                             char *, ((char *)bdesc->buf) + off);
2022 
2023                         ASSERT(off + len <= PAGESIZE);
2024 
2025                         if (rxpkt->flags & NETRXF_data_validated)
2026                                 hwcsum = B_TRUE;
2027 
2028                         /*
2029                          * If the packet is below a pre-determined
2030                          * size we will copy data out rather than
2031                          * replace it.
2032                          */
2033                         if (len > xnf_rx_copy_limit)
2034                                 nbuf = xnf_buf_get(xnfp, KM_NOSLEEP, B_FALSE);
2035 
2036                         /*
2037                          * If we have a replacement buffer, attempt to
2038                          * wrap the existing one with an mblk_t in
2039                          * order that the upper layers of the stack
2040                          * might use it directly.
2041                          */
2042                         if (nbuf != NULL) {
2043                                 mp = desballoc((unsigned char *)bdesc->buf,
2044                                     bdesc->len, 0, &bdesc->free_rtn);
2045                                 if (mp == NULL) {
2046                                         xnfp->xnf_stat_rx_desballoc_fail++;
2047                                         xnfp->xnf_stat_norxbuf++;
2048 
2049                                         xnf_buf_put(xnfp, nbuf, B_FALSE);
2050                                         nbuf = NULL;
2051                                 } else {
2052                                         mp->b_rptr = mp->b_rptr + off;
2053                                         mp->b_wptr = mp->b_rptr + len;
2054 
2055                                         /*
2056                                          * Release the grant reference
2057                                          * associated with this buffer
2058                                          * - they are scarce and the
2059                                          * upper layers of the stack
2060                                          * don't need it.
2061                                          */
2062                                         (void) gnttab_end_foreign_access_ref(
2063                                             bdesc->grant_ref, 0);
2064                                         gref_put(xnfp, bdesc->grant_ref);
2065                                         bdesc->grant_ref = INVALID_GRANT_REF;
2066 
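                                             /*
                                              * The loaned buffer comes
                                              * back via free_rtn (see
                                              * xnf_buf_recycle()) when
                                              * the stack frees the mblk;
                                              * hang the replacement
                                              * buffer on the ring
                                              * instead.
                                              */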
2067                                         bdesc = nbuf;
2068                                 }
2069                         }
2070 
2071                         if (nbuf == NULL) {
2072                                 /*
2073                                  * No replacement buffer allocated -
2074                                  * attempt to copy the data out and
2075                                  * re-hang the existing buffer.
2076                                  */
2077 
2078                                 /* 4. */
2079                                 mp = allocb(len, BPRI_MED);
2080                                 if (mp == NULL) {
2081                                         xnfp->xnf_stat_rx_allocb_fail++;
2082                                         xnfp->xnf_stat_norxbuf++;
2083                                 } else {
2084                                         /* 5. */
2085                                         bcopy(bdesc->buf + off, mp->b_wptr,
2086                                             len);
2087                                         mp->b_wptr += len;
2088                                 }
2089                         }
2090                 }
2091 
2092                 /* Re-hang the buffer. */
2093                 xnf_rxbuf_hang(xnfp, bdesc);
2094 
2095                 if (mp != NULL) {
2096                         if (hwcsum) {
2097                                 /*
2098                                  * If the peer says that the data has
2099                                  * been validated then we declare that
2100                                  * the full checksum has been
2101                                  * verified.
2102                                  *
2103                                  * We don't look at the "checksum
2104                                  * blank" flag, and hence could have a
2105                                  * packet here that we are asserting
2106                                  * is good with a blank checksum.
2107                                  */
2108                                 mac_hcksum_set(mp, 0, 0, 0, 0,
2109                                     HCK_FULLCKSUM_OK);
2110                                 xnfp->xnf_stat_rx_cksum_no_need++;
2111                         }
2112                         if (head == NULL) {
2113                                 ASSERT(tail == NULL);
2114 
2115                                 head = mp;
2116                         } else {
2117                                 ASSERT(tail != NULL);
2118 
2119                                 tail->b_next = mp;
2120                         }
2121                         tail = mp;
2122 
2123                         ASSERT(mp->b_next == NULL);
2124 
2125                         xnfp->xnf_stat_ipackets++;
2126                         xnfp->xnf_stat_rbytes += len;
2127                 }
2128 
2129                 xnfp->xnf_rx_ring.rsp_cons++;
2130         }
2131 
2132         /*
2133          * Store the mblks we have collected.
2134          */
2135         if (head != NULL) {
2136                 ASSERT(tail != NULL);
2137 
2138                 if (xnfp->xnf_rx_head == NULL) {
2139                         ASSERT(xnfp->xnf_rx_tail == NULL);
2140 
2141                         xnfp->xnf_rx_head = head;
2142                 } else {
2143                         ASSERT(xnfp->xnf_rx_tail != NULL);
2144 
2145                         xnfp->xnf_rx_tail->b_next = head;
2146                 }
2147                 xnfp->xnf_rx_tail = tail;
2148         }
2149 }
2150 
2151 /*
2152  *  xnf_alloc_dma_resources() -- allocate the driver's DMA structures
2153  */
2154 static int
2155 xnf_alloc_dma_resources(xnf_t *xnfp)
2156 {
2157         dev_info_t              *devinfo = xnfp->xnf_devinfo;
2158         size_t                  len;
2159         ddi_dma_cookie_t        dma_cookie;
2160         uint_t                  ncookies;
2161         int                     rc;
2162         caddr_t                 rptr;
2163 
2164         /*
2165          * The code below allocates all the DMA data structures that
2166          * need to be released when the driver is detached.
2167          *
2168          * Allocate page for the transmit descriptor ring.
2169          */
2170         if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
2171             DDI_DMA_SLEEP, 0, &xnfp->xnf_tx_ring_dma_handle) != DDI_SUCCESS)
2172                 goto alloc_error;
2173 
2174         if (ddi_dma_mem_alloc(xnfp->xnf_tx_ring_dma_handle,
2175             PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
2176             DDI_DMA_SLEEP, 0, &rptr, &len,
2177             &xnfp->xnf_tx_ring_dma_acchandle) != DDI_SUCCESS) {
2178                 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2179                 xnfp->xnf_tx_ring_dma_handle = NULL;
2180                 goto alloc_error;
2181         }
2182 
2183         if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_tx_ring_dma_handle, NULL,
2184             rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2185             DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2186                 ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2187                 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2188                 xnfp->xnf_tx_ring_dma_handle = NULL;
2189                 xnfp->xnf_tx_ring_dma_acchandle = NULL;
2190                 if (rc == DDI_DMA_NORESOURCES)
2191                         goto alloc_error;
2192                 else
2193                         goto error;
2194         }
2195 
2196         ASSERT(ncookies == 1);
2197         bzero(rptr, PAGESIZE);
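             /*
              * Initialise the shared ring structure and then our private
              * front-end view of it.
              */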
2198         /* LINTED: constant in conditional context */
2199         SHARED_RING_INIT((netif_tx_sring_t *)rptr);
2200         /* LINTED: constant in conditional context */
2201         FRONT_RING_INIT(&xnfp->xnf_tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE);
2202         xnfp->xnf_tx_ring_phys_addr = dma_cookie.dmac_laddress;
2203 
2204         /*
2205          * Allocate page for the receive descriptor ring.
2206          */
2207         if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
2208             DDI_DMA_SLEEP, 0, &xnfp->xnf_rx_ring_dma_handle) != DDI_SUCCESS)
2209                 goto alloc_error;
2210 
2211         if (ddi_dma_mem_alloc(xnfp->xnf_rx_ring_dma_handle,
2212             PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
2213             DDI_DMA_SLEEP, 0, &rptr, &len,
2214             &xnfp->xnf_rx_ring_dma_acchandle) != DDI_SUCCESS) {
2215                 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2216                 xnfp->xnf_rx_ring_dma_handle = NULL;
2217                 goto alloc_error;
2218         }
2219 
2220         if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_rx_ring_dma_handle, NULL,
2221             rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2222             DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2223                 ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2224                 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2225                 xnfp->xnf_rx_ring_dma_handle = NULL;
2226                 xnfp->xnf_rx_ring_dma_acchandle = NULL;
2227                 if (rc == DDI_DMA_NORESOURCES)
2228                         goto alloc_error;
2229                 else
2230                         goto error;
2231         }
2232 
2233         ASSERT(ncookies == 1);
2234         bzero(rptr, PAGESIZE);
2235         /* LINTED: constant in conditional context */
2236         SHARED_RING_INIT((netif_rx_sring_t *)rptr);
2237         /* LINTED: constant in conditional context */
2238         FRONT_RING_INIT(&xnfp->xnf_rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE);
2239         xnfp->xnf_rx_ring_phys_addr = dma_cookie.dmac_laddress;
2240 
2241         return (DDI_SUCCESS);
2242 
2243 alloc_error:
2244         cmn_err(CE_WARN, "xnf%d: could not allocate enough DMA memory",
2245             ddi_get_instance(xnfp->xnf_devinfo));
2246 error:
2247         xnf_release_dma_resources(xnfp);
2248         return (DDI_FAILURE);
2249 }
2250 
2251 /*
2252  * Release all DMA resources in the opposite order from acquisition
2253  */
2254 static void
2255 xnf_release_dma_resources(xnf_t *xnfp)
2256 {
2257         int i;
2258 
2259         /*
2260          * Free receive buffers which are currently associated with
2261          * descriptors.
2262          */
2263         mutex_enter(&xnfp->xnf_rxlock);
2264         for (i = 0; i < NET_RX_RING_SIZE; i++) {
2265                 xnf_buf_t *bp;
2266 
2267                 if ((bp = xnfp->xnf_rx_pkt_info[i]) == NULL)
2268                         continue;
2269                 xnfp->xnf_rx_pkt_info[i] = NULL;
2270                 xnf_buf_put(xnfp, bp, B_FALSE);
2271         }
2272         mutex_exit(&xnfp->xnf_rxlock);
2273 
2274         /* Free the receive ring buffer. */
2275         if (xnfp->xnf_rx_ring_dma_acchandle != NULL) {
2276                 (void) ddi_dma_unbind_handle(xnfp->xnf_rx_ring_dma_handle);
2277                 ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2278                 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2279                 xnfp->xnf_rx_ring_dma_acchandle = NULL;
2280         }
2281         /* Free the transmit ring buffer. */
2282         if (xnfp->xnf_tx_ring_dma_acchandle != NULL) {
2283                 (void) ddi_dma_unbind_handle(xnfp->xnf_tx_ring_dma_handle);
2284                 ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2285                 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2286                 xnfp->xnf_tx_ring_dma_acchandle = NULL;
2287         }
2288 
2289 }
2290 
2291 /*
2292  * Release any packets and associated structures used by the TX ring.
2293  */
2294 static void
2295 xnf_release_mblks(xnf_t *xnfp)
2296 {
2297         RING_IDX i;
2298         xnf_txid_t *tidp;
2299 
2300         for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
2301             i < NET_TX_RING_SIZE;
2302             i++, tidp++) {
2303                 xnf_txbuf_t *txp = tidp->txbuf;
2304 
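                     /*
                      * A non-NULL txbuf means that the packet was still
                      * outstanding on the TX ring; free the mblk, release
                      * the transmit id and return the buffer to its cache.
                      */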
2305                 if (txp != NULL) {
2306                         ASSERT(txp->tx_mp != NULL);
2307                         freemsg(txp->tx_mp);
2308 
2309                         txid_put(xnfp, tidp);
2310                         kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
2311                 }
2312         }
2313 }
2314 
2315 static int
2316 xnf_buf_constructor(void *buf, void *arg, int kmflag)
2317 {
2318         int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP;
2319         xnf_buf_t *bdesc = buf;
2320         xnf_t *xnfp = arg;
2321         ddi_dma_cookie_t dma_cookie;
2322         uint_t ncookies;
2323         size_t len;
2324 
2325         if (kmflag & KM_NOSLEEP)
2326                 ddiflags = DDI_DMA_DONTWAIT;
2327 
2328         /* Allocate a DMA access handle for the buffer. */
2329         if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &buf_dma_attr,
2330             ddiflags, 0, &bdesc->dma_handle) != DDI_SUCCESS)
2331                 goto failure;
2332 
2333         /* Allocate DMA-able memory for buffer. */
2334         if (ddi_dma_mem_alloc(bdesc->dma_handle,
2335             PAGESIZE, &data_accattr, DDI_DMA_STREAMING, ddiflags, 0,
2336             &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
2337                 goto failure_1;
2338 
2339         /* Bind to virtual address of buffer to get physical address. */
2340         if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL,
2341             bdesc->buf, len, DDI_DMA_RDWR | DDI_DMA_STREAMING,
2342             ddiflags, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED)
2343                 goto failure_2;
2344         ASSERT(ncookies == 1);
2345 
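             /*
              * Record the machine frame number used when granting the
              * page to the backend and the current generation, so that a
              * stale MFN can be refreshed after suspend/resume (see
              * xnf_buf_refresh()).
              */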
2346         bdesc->free_rtn.free_func = xnf_buf_recycle;
2347         bdesc->free_rtn.free_arg = (caddr_t)bdesc;
2348         bdesc->xnfp = xnfp;
2349         bdesc->buf_phys = dma_cookie.dmac_laddress;
2350         bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
2351         bdesc->len = dma_cookie.dmac_size;
2352         bdesc->grant_ref = INVALID_GRANT_REF;
2353         bdesc->gen = xnfp->xnf_gen;
2354 
2355         atomic_inc_64(&xnfp->xnf_stat_buf_allocated);
2356 
2357         return (0);
2358 
2359 failure_2:
2360         ddi_dma_mem_free(&bdesc->acc_handle);
2361 
2362 failure_1:
2363         ddi_dma_free_handle(&bdesc->dma_handle);
2364 
2365 failure:
2366 
2367         ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */
2368         return (-1);
2369 }
2370 
2371 static void
2372 xnf_buf_destructor(void *buf, void *arg)
2373 {
2374         xnf_buf_t *bdesc = buf;
2375         xnf_t *xnfp = arg;
2376 
2377         (void) ddi_dma_unbind_handle(bdesc->dma_handle);
2378         ddi_dma_mem_free(&bdesc->acc_handle);
2379         ddi_dma_free_handle(&bdesc->dma_handle);
2380 
2381         atomic_dec_64(&xnfp->xnf_stat_buf_allocated);
2382 }
2383 
2384 static xnf_buf_t *
2385 xnf_buf_get(xnf_t *xnfp, int flags, boolean_t readonly)
2386 {
2387         grant_ref_t gref;
2388         xnf_buf_t *bufp;
2389 
2390         /*
2391          * Usually grant references are more scarce than memory, so we
2392          * attempt to acquire a grant reference first.
2393          */
2394         gref = gref_get(xnfp);
2395         if (gref == INVALID_GRANT_REF)
2396                 return (NULL);
2397 
2398         bufp = kmem_cache_alloc(xnfp->xnf_buf_cache, flags);
2399         if (bufp == NULL) {
2400                 gref_put(xnfp, gref);
2401                 return (NULL);
2402         }
2403 
2404         ASSERT(bufp->grant_ref == INVALID_GRANT_REF);
2405 
2406         bufp->grant_ref = gref;
2407 
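             /*
              * Machine frame numbers change across suspend/resume;
              * refresh a stale buffer before granting the backend access
              * to it.
              */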
2408         if (bufp->gen != xnfp->xnf_gen)
2409                 xnf_buf_refresh(bufp);
2410 
2411         gnttab_grant_foreign_access_ref(bufp->grant_ref,
2412             xvdi_get_oeid(bufp->xnfp->xnf_devinfo),
2413             bufp->buf_mfn, readonly ? 1 : 0);
2414 
2415         atomic_inc_64(&xnfp->xnf_stat_buf_outstanding);
2416 
2417         return (bufp);
2418 }
2419 
2420 static void
2421 xnf_buf_put(xnf_t *xnfp, xnf_buf_t *bufp, boolean_t readonly)
2422 {
2423         if (bufp->grant_ref != INVALID_GRANT_REF) {
2424                 (void) gnttab_end_foreign_access_ref(
2425                     bufp->grant_ref, readonly ? 1 : 0);
2426                 gref_put(xnfp, bufp->grant_ref);
2427                 bufp->grant_ref = INVALID_GRANT_REF;
2428         }
2429 
2430         kmem_cache_free(xnfp->xnf_buf_cache, bufp);
2431 
2432         atomic_dec_64(&xnfp->xnf_stat_buf_outstanding);
2433 }
2434 
2435 /*
2436  * Refresh any cached data about a buffer after resume.
2437  */
2438 static void
2439 xnf_buf_refresh(xnf_buf_t *bdesc)
2440 {
2441         bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
2442         bdesc->gen = bdesc->xnfp->xnf_gen;
2443 }
2444 
2445 /*
2446  * Streams `freeb' routine for `xnf_buf_t' when used as transmit
2447  * look-aside buffers.
2448  */
2449 static void
2450 xnf_buf_recycle(xnf_buf_t *bdesc)
2451 {
2452         xnf_t *xnfp = bdesc->xnfp;
2453 
2454         xnf_buf_put(xnfp, bdesc, B_TRUE);
2455 }
2456 
2457 static int
2458 xnf_tx_buf_constructor(void *buf, void *arg, int kmflag)
2459 {
2460         int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP;
2461         xnf_txbuf_t *txp = buf;
2462         xnf_t *xnfp = arg;
2463 
2464         if (kmflag & KM_NOSLEEP)
2465                 ddiflags = DDI_DMA_DONTWAIT;
2466 
2467         if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &buf_dma_attr,
2468             ddiflags, 0, &txp->tx_dma_handle) != DDI_SUCCESS) {
2469                 ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */
2470                 return (-1);
2471         }
2472 
2473         return (0);
2474 }
2475 
2476 static void
2477 xnf_tx_buf_destructor(void *buf, void *arg)
2478 {
2479         _NOTE(ARGUNUSED(arg));
2480         xnf_txbuf_t *txp = buf;
2481 
2482         ddi_dma_free_handle(&txp->tx_dma_handle);
2483 }
2484 
2485 /*
2486  * Statistics.
2487  */
2488 static char *xnf_aux_statistics[] = {
2489         "tx_cksum_deferred",
2490         "rx_cksum_no_need",
2491         "interrupts",
2492         "unclaimed_interrupts",
2493         "tx_pullup",
2494         "tx_pagebndry",
2495         "tx_attempt",
2496         "buf_allocated",
2497         "buf_outstanding",
2498         "gref_outstanding",
2499         "gref_failure",
2500         "gref_peak",
2501         "rx_allocb_fail",
2502         "rx_desballoc_fail",
2503 };
2504 
2505 static int
2506 xnf_kstat_aux_update(kstat_t *ksp, int flag)
2507 {
2508         xnf_t *xnfp;
2509         kstat_named_t *knp;
2510 
2511         if (flag != KSTAT_READ)
2512                 return (EACCES);
2513 
2514         xnfp = ksp->ks_private;
2515         knp = ksp->ks_data;
2516 
2517         /*
2518          * Assignment order must match that of the names in
2519          * xnf_aux_statistics.
2520          */
2521         (knp++)->value.ui64 = xnfp->xnf_stat_tx_cksum_deferred;
2522         (knp++)->value.ui64 = xnfp->xnf_stat_rx_cksum_no_need;
2523 
2524         (knp++)->value.ui64 = xnfp->xnf_stat_interrupts;
2525         (knp++)->value.ui64 = xnfp->xnf_stat_unclaimed_interrupts;
2526         (knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup;
2527         (knp++)->value.ui64 = xnfp->xnf_stat_tx_pagebndry;
2528         (knp++)->value.ui64 = xnfp->xnf_stat_tx_attempt;
2529 
2530         (knp++)->value.ui64 = xnfp->xnf_stat_buf_allocated;
2531         (knp++)->value.ui64 = xnfp->xnf_stat_buf_outstanding;
2532         (knp++)->value.ui64 = xnfp->xnf_stat_gref_outstanding;
2533         (knp++)->value.ui64 = xnfp->xnf_stat_gref_failure;
2534         (knp++)->value.ui64 = xnfp->xnf_stat_gref_peak;
2535         (knp++)->value.ui64 = xnfp->xnf_stat_rx_allocb_fail;
2536         (knp++)->value.ui64 = xnfp->xnf_stat_rx_desballoc_fail;
2537 
2538         return (0);
2539 }
2540 
2541 static boolean_t
2542 xnf_kstat_init(xnf_t *xnfp)
2543 {
2544         int nstat = sizeof (xnf_aux_statistics) /
2545             sizeof (xnf_aux_statistics[0]);
2546         char **cp = xnf_aux_statistics;
2547         kstat_named_t *knp;
2548 
2549         /*
2550          * Create and initialise kstats.
2551          */
2552         if ((xnfp->xnf_kstat_aux = kstat_create("xnf",
2553             ddi_get_instance(xnfp->xnf_devinfo),
2554             "aux_statistics", "net", KSTAT_TYPE_NAMED,
2555             nstat, 0)) == NULL)
2556                 return (B_FALSE);
2557 
2558         xnfp->xnf_kstat_aux->ks_private = xnfp;
2559         xnfp->xnf_kstat_aux->ks_update = xnf_kstat_aux_update;
2560 
2561         knp = xnfp->xnf_kstat_aux->ks_data;
2562         while (nstat > 0) {
2563                 kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
2564 
2565                 knp++;
2566                 cp++;
2567                 nstat--;
2568         }
2569 
2570         kstat_install(xnfp->xnf_kstat_aux);
2571 
2572         return (B_TRUE);
2573 }
2574 
2575 static int
2576 xnf_stat(void *arg, uint_t stat, uint64_t *val)
2577 {
2578         xnf_t *xnfp = arg;
2579 
2580         mutex_enter(&xnfp->xnf_rxlock);
2581         mutex_enter(&xnfp->xnf_txlock);
2582 
2583 #define mac_stat(q, r)                          \
2584         case (MAC_STAT_##q):                    \
2585                 *val = xnfp->xnf_stat_##r;   \
2586                 break
2587 
2588 #define ether_stat(q, r)                        \
2589         case (ETHER_STAT_##q):                  \
2590                 *val = xnfp->xnf_stat_##r;   \
2591                 break
2592 
2593         switch (stat) {
2594 
2595         mac_stat(IPACKETS, ipackets);
2596         mac_stat(OPACKETS, opackets);
2597         mac_stat(RBYTES, rbytes);
2598         mac_stat(OBYTES, obytes);
2599         mac_stat(NORCVBUF, norxbuf);
2600         mac_stat(IERRORS, errrx);
2601         mac_stat(NOXMTBUF, tx_defer);
2602 
2603         ether_stat(MACRCV_ERRORS, mac_rcv_error);
2604         ether_stat(TOOSHORT_ERRORS, runt);
2605 
2606         /* always claim to be in full duplex mode */
2607         case ETHER_STAT_LINK_DUPLEX:
2608                 *val = LINK_DUPLEX_FULL;
2609                 break;
2610 
2611         /* always claim to be at 1Gb/s link speed */
2612         case MAC_STAT_IFSPEED:
2613                 *val = 1000000000ull;
2614                 break;
2615 
2616         default:
2617                 mutex_exit(&xnfp->xnf_txlock);
2618                 mutex_exit(&xnfp->xnf_rxlock);
2619 
2620                 return (ENOTSUP);
2621         }
2622 
2623 #undef mac_stat
2624 #undef ether_stat
2625 
2626         mutex_exit(&xnfp->xnf_txlock);
2627         mutex_exit(&xnfp->xnf_rxlock);
2628 
2629         return (0);
2630 }
2631 
2632 static boolean_t
2633 xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
2634 {
2635         _NOTE(ARGUNUSED(arg));
2636 
2637         switch (cap) {
2638         case MAC_CAPAB_HCKSUM: {
2639                 uint32_t *capab = cap_data;
2640 
2641                 /*
2642                  * Whilst the flag used to communicate with the IO
2643                  * domain is called "NETTXF_csum_blank", the checksum
2644                  * in the packet must contain the pseudo-header
2645                  * checksum and not zero.
2646                  *
2647                  * To help out the IO domain, we might use
2648                  * HCKSUM_INET_PARTIAL. Unfortunately our stack will
2649                  * then use checksum offload for IPv6 packets, which
2650                  * the IO domain can't handle.
2651                  *
2652                  * As a result, we declare ourselves capable of
2653                  * HCKSUM_INET_FULL_V4. This means that we receive
2654                  * IPv4 packets from the stack with a blank checksum
2655                  * field and must insert the pseudo-header checksum
2656                  * before passing the packet to the IO domain.
2657                  */
2658                 *capab = HCKSUM_INET_FULL_V4;
2659                 break;
2660         }
2661         default:
2662                 return (B_FALSE);
2663         }
2664 
2665         return (B_TRUE);
2666 }
2667 
2668 /*
2669  * The state of the peer has changed - react accordingly.
2670  */
2671 static void
2672 oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
2673     void *arg, void *impl_data)
2674 {
2675         _NOTE(ARGUNUSED(id, arg));
2676         xnf_t *xnfp = ddi_get_driver_private(dip);
2677         XenbusState new_state = *(XenbusState *)impl_data;
2678 
2679         ASSERT(xnfp != NULL);
2680 
2681         switch (new_state) {
2682         case XenbusStateUnknown:
2683         case XenbusStateInitialising:
2684         case XenbusStateInitialised:
2685         case XenbusStateClosing:
2686         case XenbusStateClosed:
2687         case XenbusStateReconfiguring:
2688         case XenbusStateReconfigured:
2689                 break;
2690 
2691         case XenbusStateInitWait:
2692                 xnf_read_config(xnfp);
2693 
2694                 if (!xnfp->xnf_be_rx_copy) {
2695                         cmn_err(CE_WARN,
2696                             "The xnf driver requires a dom0 that "
2697                             "supports 'feature-rx-copy'.");
2698                         (void) xvdi_switch_state(xnfp->xnf_devinfo,
2699                             XBT_NULL, XenbusStateClosed);
2700                         break;
2701                 }
2702 
2703                 /*
2704                  * Connect to the backend.
2705                  */
2706                 xnf_be_connect(xnfp);
2707 
2708                 /*
2709                  * Our MAC address as discovered by xnf_read_config().
2710                  */
2711                 mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr);
2712 
2713                 break;
2714 
2715         case XenbusStateConnected:
2716                 mutex_enter(&xnfp->xnf_rxlock);
2717                 mutex_enter(&xnfp->xnf_txlock);
2718 
2719                 xnfp->xnf_connected = B_TRUE;
2720                 /*
2721                  * Wake up any threads waiting to send data to
2722                  * backend.
2723                  */
2724                 cv_broadcast(&xnfp->xnf_cv_state);
2725 
2726                 mutex_exit(&xnfp->xnf_txlock);
2727                 mutex_exit(&xnfp->xnf_rxlock);
2728 
2729                 /*
2730                  * Kick the peer in case it missed any transmit
2731                  * requests in the TX ring.
2732                  */
2733                 ec_notify_via_evtchn(xnfp->xnf_evtchn);
2734 
2735                 /*
2736                  * The backend may already have placed completed
2737                  * receive responses in the ring after connecting but
2738                  * before we saw its state change here, so call
2739                  * xnf_intr() to process any that are pending.
2740                  */
2741                 (void) xnf_intr((caddr_t)xnfp);
2742 
2743                 /*
2744                  * Mark the link up now that we are connected.
2745                  */
2746                 mac_link_update(xnfp->xnf_mh, LINK_STATE_UP);
2747 
2748                 /*
2749                  * Tell the backend about the multicast addresses in
2750                  * which we are interested.
2751                  */
2752                 mac_multicast_refresh(xnfp->xnf_mh, NULL, xnfp, B_TRUE);
2753 
2754                 break;
2755 
2756         default:
2757                 break;
2758         }
2759 }