/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008-2013 Solarflare Communications Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/atomic.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/strft.h>
#include <sys/ksynch.h>
#include <sys/ethernet.h>
#include <sys/crc32.h>
#include <sys/pattr.h>
#include <sys/cpu.h>

#include <inet/ip.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

#include "sfxge.h"

#include "efx.h"

/* RXQ flush response timeout (in microseconds) */
#define SFXGE_RX_QFLUSH_USEC    (2000000)

/* RXQ default packet buffer preallocation (number of packet buffers) */
#define SFXGE_RX_QPREALLOC      (0)
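
/*
 * This default can be overridden via the "rx_prealloc_pkt_buffers" driver
 * property (see sfxge_rx_qctor() below).
 */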

/* Receive packet DMA attributes */
static ddi_device_acc_attr_t sfxge_rx_packet_devacc = {

        DDI_DEVICE_ATTR_V0,     /* devacc_attr_version */
        DDI_NEVERSWAP_ACC,      /* devacc_attr_endian_flags */
        DDI_STRICTORDER_ACC     /* devacc_attr_dataorder */
};

static ddi_dma_attr_t sfxge_rx_packet_dma_attr = {
        DMA_ATTR_V0,            /* dma_attr_version     */
        0,                      /* dma_attr_addr_lo     */
        0xffffffffffffffffull,  /* dma_attr_addr_hi     */
        0xffffffffffffffffull,  /* dma_attr_count_max   */
        SFXGE_CPU_CACHE_SIZE,   /* dma_attr_align       */
        0xffffffff,             /* dma_attr_burstsizes  */
        1,                      /* dma_attr_minxfer     */
        0xffffffffffffffffull,  /* dma_attr_maxxfer     */
        0xffffffffffffffffull,  /* dma_attr_seg         */
        1,                      /* dma_attr_sgllen      */
        1,                      /* dma_attr_granular    */
        0                       /* dma_attr_flags       */
};

/* Receive queue DMA attributes */
static ddi_device_acc_attr_t sfxge_rxq_devacc = {

        DDI_DEVICE_ATTR_V0,     /* devacc_attr_version */
        DDI_NEVERSWAP_ACC,      /* devacc_attr_endian_flags */
        DDI_STRICTORDER_ACC     /* devacc_attr_dataorder */
};

static ddi_dma_attr_t sfxge_rxq_dma_attr = {
        DMA_ATTR_V0,            /* dma_attr_version     */
        0,                      /* dma_attr_addr_lo     */
        0xffffffffffffffffull,  /* dma_attr_addr_hi     */
        0xffffffffffffffffull,  /* dma_attr_count_max   */
        EFX_BUF_SIZE,           /* dma_attr_align       */
        0xffffffff,             /* dma_attr_burstsizes  */
        1,                      /* dma_attr_minxfer     */
        0xffffffffffffffffull,  /* dma_attr_maxxfer     */
        0xffffffffffffffffull,  /* dma_attr_seg         */
        1,                      /* dma_attr_sgllen      */
        1,                      /* dma_attr_granular    */
        0                       /* dma_attr_flags       */
};

/* Forward declaration */
static int
sfxge_rx_qpreallocate(sfxge_rxq_t *srp, int nprealloc);

static int
sfxge_rx_packet_ctor(void *buf, void *arg, int kmflags)
{
        sfxge_rx_packet_t *srpp = buf;
        sfxge_t *sp = arg;
        dev_info_t *dip = sp->s_dip;
        int err;

        ASSERT3U(sizeof (srpp->__srp_u1.__srp_s1), <=,
            sizeof (srpp->__srp_u1.__srp_pad));
        ASSERT3U(sizeof (srpp->__srp_u2.__srp_s2), <=,
            sizeof (srpp->__srp_u2.__srp_pad));

        bzero(buf, sizeof (sfxge_rx_packet_t));

        /* Allocate a DMA handle */
        err = ddi_dma_alloc_handle(dip, &sfxge_rx_packet_dma_attr,
            (kmflags == KM_SLEEP) ? DDI_DMA_SLEEP : DDI_DMA_DONTWAIT,
            NULL, &(srpp->srp_dma_handle));
        if (err != DDI_SUCCESS)
                goto fail1;

        return (0);

fail1:
        DTRACE_PROBE1(fail1, int, err);

        SFXGE_OBJ_CHECK(srpp, sfxge_rx_packet_t);

        return (-1);
}

static void
sfxge_rx_packet_dtor(void *buf, void *arg)
{
        sfxge_rx_packet_t *srpp = buf;

        _NOTE(ARGUNUSED(arg))

        /* Free the DMA handle */
        ddi_dma_free_handle(&(srpp->srp_dma_handle));
        srpp->srp_dma_handle = NULL;

        SFXGE_OBJ_CHECK(srpp, sfxge_rx_packet_t);
}

static int
sfxge_rx_qctor(void *buf, void *arg, int kmflags)
{
        sfxge_rxq_t *srp = buf;
        efsys_mem_t *esmp = &(srp->sr_mem);
        sfxge_t *sp = arg;
        sfxge_dma_buffer_attr_t dma_attr;
        sfxge_rx_fpp_t *srfppp;
        int nprealloc;
        unsigned int id;
        int rc;

        /* Compile-time structure layout checks */
        EFX_STATIC_ASSERT(sizeof (srp->__sr_u1.__sr_s1) <=
            sizeof (srp->__sr_u1.__sr_pad));
        EFX_STATIC_ASSERT(sizeof (srp->__sr_u2.__sr_s2) <=
            sizeof (srp->__sr_u2.__sr_pad));
        EFX_STATIC_ASSERT(sizeof (srp->__sr_u3.__sr_s3) <=
            sizeof (srp->__sr_u3.__sr_pad));

        bzero(buf, sizeof (sfxge_rxq_t));

        srp->sr_sp = sp;

        dma_attr.sdba_dip        = sp->s_dip;
        dma_attr.sdba_dattrp     = &sfxge_rxq_dma_attr;
        dma_attr.sdba_callback   = DDI_DMA_SLEEP;
        dma_attr.sdba_length     = EFX_RXQ_SIZE(sp->s_rxq_size);
        dma_attr.sdba_memflags   = DDI_DMA_CONSISTENT;
        dma_attr.sdba_devaccp    = &sfxge_rxq_devacc;
        dma_attr.sdba_bindflags  = DDI_DMA_READ | DDI_DMA_CONSISTENT;
        dma_attr.sdba_maxcookies = 1;
        dma_attr.sdba_zeroinit   = B_FALSE;

        if ((rc = sfxge_dma_buffer_create(esmp, &dma_attr)) != 0)
                goto fail1;

        /* Allocate some buffer table entries */
        if ((rc = sfxge_sram_buf_tbl_alloc(sp, EFX_RXQ_NBUFS(sp->s_rxq_size),
            &(srp->sr_id))) != 0)
                goto fail2;

        /* Allocate the context array */
        if ((srp->sr_srpp = kmem_zalloc(sizeof (sfxge_rx_packet_t *) *
            sp->s_rxq_size, kmflags)) == NULL) {
                rc = ENOMEM;
                goto fail3;
        }

        /* Allocate the flow table */
        if ((srp->sr_flow = kmem_zalloc(sizeof (sfxge_rx_flow_t) *
            SFXGE_MAX_FLOW, kmflags)) == NULL) {
                rc = ENOMEM;
                goto fail4;
        }

        srp->sr_srfpp = &(srp->sr_srfp);
        srp->sr_rto = drv_usectohz(200000);

        srp->sr_mpp = &(srp->sr_mp);

        /* Initialize the free packet pool */
        srfppp = &(srp->sr_fpp);
        if ((srfppp->srfpp_putp = kmem_zalloc(SFXGE_CPU_CACHE_SIZE *
                SFXGE_RX_FPP_NSLOTS, kmflags)) == NULL) {
                rc = ENOMEM;
                goto fail5;
        }
        for (id = 0; id < SFXGE_RX_FPP_NSLOTS; id++) {
                sfxge_rx_fpp_putlist_t *putp;
                size_t off;

                off = id * SFXGE_CPU_CACHE_SIZE;
                putp = (void *)(srfppp->srfpp_putp + off);

                putp->srfpl_putp = NULL;
                putp->srfpl_putpp = &(putp->srfpl_putp);
                mutex_init(&(putp->srfpl_lock), NULL, MUTEX_DRIVER,
                    DDI_INTR_PRI(sp->s_intr.si_intr_pri));
        }

        cv_init(&(srp->sr_flush_kv), NULL, CV_DRIVER, NULL);

        /* Preallocate some packets in the free packet pool */
        nprealloc = ddi_prop_get_int(DDI_DEV_T_ANY, sp->s_dip,
            DDI_PROP_DONTPASS, "rx_prealloc_pkt_buffers", SFXGE_RX_QPREALLOC);
        sfxge_rx_qpreallocate(srp, nprealloc);

        return (0);

fail5:
        DTRACE_PROBE(fail5);

        srp->sr_mpp = NULL;

        srp->sr_rto = 0;
        srp->sr_srfpp = NULL;

        /* Free the flow table */
        kmem_free(srp->sr_flow, sizeof (sfxge_rx_flow_t) *
            SFXGE_MAX_FLOW);
        srp->sr_flow = NULL;

fail4:
        DTRACE_PROBE(fail4);

        /* Free the context array */
        kmem_free(srp->sr_srpp, sizeof (sfxge_rx_packet_t *) *
            sp->s_rxq_size);
        srp->sr_srpp = NULL;

fail3:
        DTRACE_PROBE(fail3);

        /* Free the buffer table entries */
        sfxge_sram_buf_tbl_free(sp, srp->sr_id,
            EFX_RXQ_NBUFS(sp->s_rxq_size));
        srp->sr_id = 0;

fail2:
        DTRACE_PROBE(fail2);
        /* Remove dma setup */
        sfxge_dma_buffer_destroy(esmp);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        srp->sr_sp = NULL;

        SFXGE_OBJ_CHECK(srp, sfxge_rxq_t);

        return (-1);
}

static void
sfxge_rx_qdtor(void *buf, void *arg)
{
        sfxge_rxq_t *srp = buf;
        efsys_mem_t *esmp = &(srp->sr_mem);
        sfxge_t *sp = srp->sr_sp;
        sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
        unsigned int id;

        _NOTE(ARGUNUSED(arg))

        cv_destroy(&(srp->sr_flush_kv));

        /* Tear down the free packet pool */
        for (id = 0; id < SFXGE_RX_FPP_NSLOTS; id++) {
                sfxge_rx_fpp_putlist_t *putp;
                size_t off;

                off = id * SFXGE_CPU_CACHE_SIZE;
                putp = (void *)(srfppp->srfpp_putp + off);

                putp->srfpl_putpp = NULL;
                mutex_destroy(&(putp->srfpl_lock));

                SFXGE_OBJ_CHECK(putp, sfxge_rx_fpp_putlist_t);
        }
        kmem_free(srfppp->srfpp_putp, SFXGE_CPU_CACHE_SIZE *
            SFXGE_RX_FPP_NSLOTS);
        srfppp->srfpp_putp = NULL;

        srp->sr_mpp = NULL;

        srp->sr_rto = 0;
        srp->sr_srfpp = NULL;

        /* Free the flow table */
        kmem_free(srp->sr_flow, sizeof (sfxge_rx_flow_t) *
            SFXGE_MAX_FLOW);
        srp->sr_flow = NULL;

        /* Free the context array */
        kmem_free(srp->sr_srpp, sizeof (sfxge_rx_packet_t *) *
            sp->s_rxq_size);
        srp->sr_srpp = NULL;

        /* Free the buffer table entries */
        sfxge_sram_buf_tbl_free(sp, srp->sr_id,
            EFX_RXQ_NBUFS(sp->s_rxq_size));
        srp->sr_id = 0;

        /* Tear down dma setup */
        sfxge_dma_buffer_destroy(esmp);

        SFXGE_OBJ_CHECK(srp, sfxge_rxq_t);
}

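/*
 * The free packet pool is split into a set of per-CPU "put" lists, each
 * padded to a cache line and protected by its own lock, plus a single
 * "get" list that is only manipulated under the event queue lock.  Freed
 * packets are appended to the local CPU's put list, so the common free
 * path avoids cross-CPU lock contention.
 */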
/* Note: This function takes ownership of *srpp. */
static inline void
sfxge_rx_qfpp_put(sfxge_rxq_t *srp, sfxge_rx_packet_t *srpp)
{
        sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
        mblk_t *mp = srpp->srp_mp;
        unsigned int id;
        size_t off;
        sfxge_rx_fpp_putlist_t *putp;

        ASSERT3P(mp->b_next, ==, NULL);
        ASSERT3P(mp->b_prev, ==, NULL);

        id = CPU->cpu_seqid & SFXGE_RX_FPP_MASK;
        off = id * SFXGE_CPU_CACHE_SIZE;

        ASSERT3P(srpp->srp_putp, ==, srfppp->srfpp_putp);
        putp = (void *)(srpp->srp_putp + off);

        mutex_enter(&(putp->srfpl_lock));
        putp->srfpl_count++;
        *putp->srfpl_putpp = mp;
        putp->srfpl_putpp = &(mp->b_next);
        mutex_exit(&(putp->srfpl_lock));
}

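/*
 * Move the contents of every put list onto the get list.  The current
 * CPU's put list is deliberately visited last.  Returns the number of
 * packets still on loan to the stack, i.e. not yet returned to any
 * put list.
 */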
static unsigned int
sfxge_rx_qfpp_swizzle(sfxge_rxq_t *srp)
{
        sfxge_t *sp = srp->sr_sp;
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];
        sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
        unsigned int start;
        unsigned int id;
        mblk_t *p;
        mblk_t **pp;
        unsigned int count;
        unsigned int loaned;

        ASSERT(mutex_owned(&(sep->se_lock)));

        /* We want to access the put list for the current CPU last */
        id = start = (CPU->cpu_seqid + 1) & SFXGE_RX_FPP_MASK;

        do {
                sfxge_rx_fpp_putlist_t *putp;
                size_t off;

                off = id * SFXGE_CPU_CACHE_SIZE;
                id  = (id + 1) & SFXGE_RX_FPP_MASK;

                putp = (void *)(srfppp->srfpp_putp + off);

                /* Acquire the put list */
                mutex_enter(&(putp->srfpl_lock));

                p = putp->srfpl_putp;
                pp = putp->srfpl_putpp;
                count = putp->srfpl_count;

                putp->srfpl_putp = NULL;
                putp->srfpl_putpp = &(putp->srfpl_putp);
                putp->srfpl_count = 0;

                mutex_exit(&(putp->srfpl_lock));

                if (p == NULL)
                        continue;

                /* Add the list to the head of the get list */
                *pp = srfppp->srfpp_get;
                srfppp->srfpp_get = p;

                /* Adjust the counters */
                ASSERT3U(srfppp->srfpp_loaned, >=, count);
                srfppp->srfpp_loaned -= count;
                srfppp->srfpp_count += count;

#if 0
                /* NOTE: this probe is disabled because it is expensive!! */
                DTRACE_PROBE2(count,
                    unsigned int, (id - 1) & SFXGE_RX_FPP_MASK,
                    unsigned int, count);
#endif

        } while (id != start);

        /* Return the number of packets yet to appear in the put list */
        loaned = srfppp->srfpp_loaned;

        return (loaned);
}

#define DB_FRTNP(mp)    ((mp)->b_datap->db_frtnp)

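/*
 * Release every packet held in the free packet pool.  Recycling is
 * disabled on each packet before it is freed so that the esballoc free
 * routine destroys the packet rather than returning it to the pool.
 */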
static void
sfxge_rx_qfpp_empty(sfxge_rxq_t *srp)
{
        sfxge_t *sp = srp->sr_sp;
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];
        sfxge_rx_fpp_t *srfppp;
        mblk_t *mp;

        mutex_enter(&(sep->se_lock));
        srfppp = &(srp->sr_fpp);

        /* Swizzle put list to get list */
        (void) sfxge_rx_qfpp_swizzle(srp);
        ASSERT3U(srfppp->srfpp_loaned, ==, 0);

        mp = srfppp->srfpp_get;
        srfppp->srfpp_get = NULL;

        /* Free the remainder */
        while (mp != NULL) {
                mblk_t *next;
                frtn_t *freep;
                sfxge_rx_packet_t *srpp;

                next = mp->b_next;
                mp->b_next = NULL;

                ASSERT3U(srfppp->srfpp_count, >, 0);
                srfppp->srfpp_count--;

                freep = DB_FRTNP(mp);
                /*
                 * ASSERT3P(freep->free_func, ==, sfxge_rx_qpacket_free);
                 *   is implied by srpp test below
                 */
                /*LINTED*/
                srpp = (sfxge_rx_packet_t *)(freep->free_arg);
                ASSERT3P(srpp->srp_mp, ==, mp);
                ASSERT3P(mp->b_cont, ==, NULL);
                srpp->srp_recycle = B_FALSE;

                freeb(mp);

                mp = next;
        }
        ASSERT3U(srfppp->srfpp_count, ==, 0);

        srfppp->srfpp_min = 0;

        mutex_exit(&(sep->se_lock));
}

/*
 * This is an estimate of all memory consumed per RX packet. It can be
 * inaccurate, but sp->s_rx_pkt_mem_alloc mustn't drift.
 */
static uint64_t
sfxge_rx_pkt_mem_approx(const sfxge_rx_packet_t *srpp)
{
        return (srpp->srp_mblksize + sizeof (mblk_t) + sizeof (dblk_t) +
            sizeof (sfxge_rx_packet_t));
}

static void
sfxge_rx_qpacket_destroy(sfxge_rxq_t *srp, sfxge_rx_packet_t *srpp)
{
        sfxge_t *sp = srp->sr_sp;
        int64_t delta = sfxge_rx_pkt_mem_approx(srpp);

        ASSERT(!(srpp->srp_recycle));
        ASSERT3P(srpp->srp_mp, ==, NULL);

        srpp->srp_off = 0;
        srpp->srp_thp = NULL;
        srpp->srp_iphp = NULL;
        srpp->srp_etherhp = NULL;
        srpp->srp_size = 0;
        srpp->srp_flags = 0;

        bzero(&(srpp->srp_free), sizeof (frtn_t));

        srpp->srp_mblksize = 0;
        srpp->srp_base = NULL;

        /* Unbind the DMA memory from the DMA handle */
        srpp->srp_addr = 0;
        (void) ddi_dma_unbind_handle(srpp->srp_dma_handle);

        /* Free the DMA memory */
        srpp->srp_base = NULL;
        ddi_dma_mem_free(&(srpp->srp_acc_handle));
        srpp->srp_acc_handle = NULL;

        srpp->srp_putp = NULL;
        srpp->srp_srp = NULL;

        kmem_cache_free(sp->s_rpc, srpp);
        if (sp->s_rx_pkt_mem_max)
                atomic_add_64(&sp->s_rx_pkt_mem_alloc, -delta);
}

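/*
 * Packet free routine, called when the stack frees a receive mblk.  If
 * the packet is still marked for recycling it is returned to the free
 * packet pool; otherwise it is destroyed.  The two variants below differ
 * only in whether the mblk is recycled in place (xesballoc) or a
 * replacement mblk is allocated with desballoc().
 */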
#ifdef _USE_XESBALLOC
static void
sfxge_rx_qpacket_free(void *arg, mblk_t *mp, boolean_t *recyclep)
{
        sfxge_rx_packet_t *srpp = arg;
        sfxge_rxq_t *srp = srpp->srp_srp;

        /*
         * WARNING "man -s 9f esballoc" states:
         * => runs async in a background context
         * => must not sleep, or access data structures that could be freed
         */
        ASSERT3P(DB_BASE(mp), ==, srpp->srp_base);
        ASSERT3P(MBLKSIZE(mp), ==, srpp->srp_mblksize);

        /* Check whether we want to recycle the receive packets */
        if (srpp->srp_recycle) {
                ASSERT3P(DB_FRTNP(mp), ==, &(srpp->srp_free));

                srpp->srp_mp = mp;

                /* NORMAL recycled case */
                sfxge_rx_qfpp_put(srp, srpp);
                *recyclep = B_TRUE;
                return;
        }

        srpp->srp_mp = NULL;

        sfxge_rx_qpacket_destroy(srp, srpp);
        *recyclep = B_FALSE;
}
#endif  /* _USE_XESBALLOC */

#ifdef _USE_DESBALLOC
static void
sfxge_rx_qpacket_free(void *arg)
{
        sfxge_rx_packet_t *srpp = arg;
        sfxge_rxq_t *srp = srpp->srp_srp;

        /*
         * WARNING "man -s 9f esballoc" states:
         * => runs sync from the thread calling freeb()
         * => must not sleep, or access data structures that could be freed
         */

        /* Check whether we want to recycle the receive packets */
        if (srpp->srp_recycle) {
                frtn_t *freep;
                mblk_t *mp;
                size_t size;

                freep = &(srpp->srp_free);
                ASSERT3P(freep->free_func, ==, sfxge_rx_qpacket_free);
                ASSERT3P(freep->free_arg, ==, (caddr_t)srpp);

                /*
                 * Allocate a matching mblk_t before the current one is
                 * freed.
                 */
                size = srpp->srp_mblksize;

                if ((mp = desballoc(srpp->srp_base, size, BPRI_HI,
                    freep)) != NULL) {
                        srpp->srp_mp = mp;

                        /* NORMAL recycled case */
                        sfxge_rx_qfpp_put(srp, srpp);
                        return;
                }
        }

        srpp->srp_mp = NULL;

        sfxge_rx_qpacket_destroy(srp, srpp);
}
#endif  /* _USE_DESBALLOC */

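/*
 * Allocate and map a new receive packet: take a packet header from the
 * kmem cache, allocate and bind DMA memory for the receive buffer, and
 * wrap the buffer in a STREAMS block whose free routine implements the
 * recycle scheme above.
 */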
static sfxge_rx_packet_t *
sfxge_rx_qpacket_create(sfxge_rxq_t *srp)
{
        sfxge_t *sp = srp->sr_sp;
        sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
        sfxge_rx_packet_t *srpp;
        size_t size;
        caddr_t base;
        size_t unit;
        ddi_dma_cookie_t dmac;
        unsigned int ncookies;
        frtn_t *freep;
        mblk_t *mp;
        int err;
        int rc;

        size = sp->s_rx_buffer_size;

        if (sp->s_rx_pkt_mem_max &&
            (sp->s_rx_pkt_mem_alloc + size >= sp->s_rx_pkt_mem_max)) {
                DTRACE_PROBE(rx_pkt_mem_max);
                srp->sr_kstat.srk_rx_pkt_mem_limit++;
                return (NULL);
        }

        /* Allocate a new packet */
        if ((srpp = kmem_cache_alloc(sp->s_rpc, KM_NOSLEEP)) == NULL) {
                srp->sr_kstat.srk_kcache_alloc_nomem++;
                rc = ENOMEM;
                goto fail1;
        }

        srpp->srp_srp = srp;
        srpp->srp_putp = srfppp->srfpp_putp;

        /* Allocate some DMA memory */
        err = ddi_dma_mem_alloc(srpp->srp_dma_handle, size,
            &sfxge_rx_packet_devacc, DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
            NULL, &base, &unit, &(srpp->srp_acc_handle));
        switch (err) {
        case DDI_SUCCESS:
                break;

        case DDI_FAILURE:
                srp->sr_kstat.srk_dma_alloc_nomem++;
                rc = ENOMEM;
                goto fail2;

        default:
                srp->sr_kstat.srk_dma_alloc_fail++;
                rc = EFAULT;
                goto fail2;
        }

        /* Adjust the buffer to align the start of the DMA area correctly */
        base += sp->s_rx_buffer_align;
        size -= sp->s_rx_buffer_align;

        /* Bind the DMA memory to the DMA handle */
        err = ddi_dma_addr_bind_handle(srpp->srp_dma_handle, NULL,
            base, size, DDI_DMA_READ | DDI_DMA_STREAMING,
            DDI_DMA_DONTWAIT, NULL, &dmac, &ncookies);
        switch (err) {
        case DDI_DMA_MAPPED:
                break;

        case DDI_DMA_INUSE:
                srp->sr_kstat.srk_dma_bind_fail++;
                rc = EEXIST;
                goto fail3;

        case DDI_DMA_NORESOURCES:
                srp->sr_kstat.srk_dma_bind_nomem++;
                rc = ENOMEM;
                goto fail3;

        case DDI_DMA_NOMAPPING:
                srp->sr_kstat.srk_dma_bind_fail++;
                rc = ENOTSUP;
                goto fail3;

        case DDI_DMA_TOOBIG:
                srp->sr_kstat.srk_dma_bind_fail++;
                rc = EFBIG;
                goto fail3;

        default:
                srp->sr_kstat.srk_dma_bind_fail++;
                rc = EFAULT;
                goto fail3;
        }
        ASSERT3U(ncookies, ==, 1);

        srpp->srp_addr = dmac.dmac_laddress;

        srpp->srp_base = (unsigned char *)base;
        srpp->srp_mblksize = size;

        /*
         * Allocate a STREAMS block wrapping the DMA buffer, attaching a
         * free routine so the block can be recycled or destroyed on freeb().
         */
        freep = &(srpp->srp_free);
        freep->free_func = sfxge_rx_qpacket_free;
        freep->free_arg  = (caddr_t)srpp;

#ifdef _USE_XESBALLOC
        if ((mp = xesballoc(srpp->srp_base, size, BPRI_HI, freep)) == NULL) {
                srp->sr_kstat.srk_xesballoc_fail++;
                rc = ENOMEM;
                goto fail4;
        }
#endif  /* _USE_XESBALLOC */

#ifdef _USE_DESBALLOC
        if ((mp = desballoc(srpp->srp_base, size, BPRI_HI, freep)) == NULL) {
                srp->sr_kstat.srk_desballoc_fail++;
                rc = ENOMEM;
                goto fail4;
        }
#endif  /* _USE_DESBALLOC */

        srpp->srp_mp = mp;
        srpp->srp_recycle = B_TRUE;

        if (sp->s_rx_pkt_mem_max) {
                int64_t delta = sfxge_rx_pkt_mem_approx(srpp);
                atomic_add_64(&sp->s_rx_pkt_mem_alloc, delta);
        }

        return (srpp);

fail4:
        DTRACE_PROBE(fail4);

        bzero(&(srpp->srp_free), sizeof (frtn_t));

        srpp->srp_mblksize = 0;
        srpp->srp_base = NULL;

        /* Unbind the DMA memory from the DMA handle */
        srpp->srp_addr = 0;
        (void) ddi_dma_unbind_handle(srpp->srp_dma_handle);

fail3:
        DTRACE_PROBE(fail3);

        /* Free the DMA memory */
        ddi_dma_mem_free(&(srpp->srp_acc_handle));
        srpp->srp_acc_handle = NULL;

fail2:
        DTRACE_PROBE(fail2);

        srpp->srp_putp = NULL;
        srpp->srp_srp = NULL;

        kmem_cache_free(sp->s_rpc, srpp);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        return (NULL);
}

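/*
 * Receive descriptors are posted to the hardware in batches of this size
 * to amortize the cost of efx_rx_qpost() calls.
 */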
#define SFXGE_REFILL_BATCH  64

/* Try to refill the RX descriptor ring from the associated free pkt pool */
static void
sfxge_rx_qrefill(sfxge_rxq_t *srp, unsigned int target)
{
        sfxge_t *sp = srp->sr_sp;
        sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];
        efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
        mblk_t *mp;
        int ntodo;
        unsigned int count;
        unsigned int batch;
        unsigned int rxfill;
        unsigned int mblksize;

        prefetch_read_many(sp->s_enp);
        prefetch_read_many(srp->sr_erp);

        ASSERT(mutex_owned(&(sep->se_lock)));

        if (srp->sr_state != SFXGE_RXQ_STARTED)
                return;

        rxfill = srp->sr_added - srp->sr_completed;
        ASSERT3U(rxfill, <=, EFX_RXQ_LIMIT(sp->s_rxq_size));
        ntodo = min(EFX_RXQ_LIMIT(sp->s_rxq_size) - rxfill, target);
        ASSERT3U(ntodo, <=, EFX_RXQ_LIMIT(sp->s_rxq_size));

        if (ntodo == 0)
                goto out;

        (void) sfxge_rx_qfpp_swizzle(srp);

        mp = srfppp->srfpp_get;
        count = srfppp->srfpp_count;
        mblksize = sp->s_rx_buffer_size - sp->s_rx_buffer_align;

        batch = 0;
        while (ntodo-- > 0) {
                mblk_t *next;
                frtn_t *freep;
                sfxge_rx_packet_t *srpp;
                unsigned int id;

                if (mp == NULL)
                        break;

                next = mp->b_next;
                mp->b_next = NULL;

                if (next != NULL)
                        prefetch_read_many(next);

                freep = DB_FRTNP(mp);
                /*LINTED*/
                srpp = (sfxge_rx_packet_t *)(freep->free_arg);
                ASSERT3P(srpp->srp_mp, ==, mp);

                /* The MTU may have changed since the packet was allocated */
                if (MBLKSIZE(mp) != mblksize) {
                        srpp->srp_recycle = B_FALSE;

                        freeb(mp);

                        --count;
                        mp = next;
                        continue;
                }

                srpp->srp_off = 0;
                srpp->srp_thp = NULL;
                srpp->srp_iphp = NULL;
                srpp->srp_etherhp = NULL;
                srpp->srp_size = 0;
                srpp->srp_flags = EFX_DISCARD;

                id = (srp->sr_added + batch) & (sp->s_rxq_size - 1);
                ASSERT(srp->sr_srpp[id] == NULL);
                srp->sr_srpp[id] = srpp;

                addr[batch++] = srpp->srp_addr;
                if (batch == SFXGE_REFILL_BATCH) {
                        efx_rx_qpost(srp->sr_erp, addr, mblksize, batch,
                            srp->sr_completed, srp->sr_added);
                        srp->sr_added += batch;
                        batch = 0;
                }

                --count;
                mp = next;
        }

        srfppp->srfpp_get = mp;
        srfppp->srfpp_count = count;

        if (batch != 0) {
                efx_rx_qpost(srp->sr_erp, addr, mblksize, batch,
                    srp->sr_completed, srp->sr_added);
                srp->sr_added += batch;
        }

        /* Make the descriptors visible to the hardware */
        (void) ddi_dma_sync(srp->sr_mem.esm_dma_handle,
            0,
            EFX_RXQ_SIZE(sp->s_rxq_size),
            DDI_DMA_SYNC_FORDEV);

        efx_rx_qpush(srp->sr_erp, srp->sr_added);

out:
        if (srfppp->srfpp_count < srfppp->srfpp_min)
                srfppp->srfpp_min = srfppp->srfpp_count;
}

/* Preallocate packets and put them in the free packet pool */
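/*
 * Preallocation is best effort: if packet allocation fails the loop stops
 * early and the function still returns 0.  The requested count also sets
 * the pool's low water mark used by sfxge_rx_qfpp_trim().
 */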
static int
sfxge_rx_qpreallocate(sfxge_rxq_t *srp, int nprealloc)
{
        sfxge_rx_fpp_t *srfppp = &((srp)->sr_fpp);
        srfppp->srfpp_lowat = nprealloc;
        while (nprealloc-- > 0) {
                sfxge_rx_packet_t *srpp;

                if ((srpp = sfxge_rx_qpacket_create(srp)) == NULL)
                        break;
                sfxge_rx_qfpp_put(srp, srpp);
        }
        return (0);
}

/* Try to refill the RX descriptor ring by allocating new packets */
static void
sfxge_rx_qfill(sfxge_rxq_t *srp, unsigned int target)
{
        sfxge_t *sp = srp->sr_sp;
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];
        unsigned int batch;
        unsigned int rxfill;
        unsigned int mblksize;
        int ntodo;
        efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
        mblk_t *mp = NULL;

        prefetch_read_many(sp->s_enp);
        prefetch_read_many(srp->sr_erp);

        ASSERT(mutex_owned(&(sep->se_lock)));

        if (srp->sr_state != SFXGE_RXQ_STARTED)
                return;

        rxfill = srp->sr_added - srp->sr_completed;
        ASSERT3U(rxfill, <=, EFX_RXQ_LIMIT(sp->s_rxq_size));
        ntodo = min(EFX_RXQ_LIMIT(sp->s_rxq_size) - rxfill, target);
        ASSERT3U(ntodo, <=, EFX_RXQ_LIMIT(sp->s_rxq_size));

        if (ntodo == 0)
                return;

        mblksize = sp->s_rx_buffer_size - sp->s_rx_buffer_align;

        batch = 0;
        while (ntodo-- > 0) {
                sfxge_rx_packet_t *srpp;
                unsigned int id;

                if ((srpp = sfxge_rx_qpacket_create(srp)) == NULL)
                        break;

                mp = srpp->srp_mp;

                ASSERT3U(MBLKSIZE(mp), ==, mblksize);

                ASSERT3U(srpp->srp_off, ==, 0);
                ASSERT3P(srpp->srp_thp, ==, NULL);
                ASSERT3P(srpp->srp_iphp, ==, NULL);
                ASSERT3P(srpp->srp_etherhp, ==, NULL);
                ASSERT3U(srpp->srp_size, ==, 0);

                srpp->srp_flags = EFX_DISCARD;

                id = (srp->sr_added + batch) & (sp->s_rxq_size - 1);
                ASSERT(srp->sr_srpp[id] == NULL);
                srp->sr_srpp[id] = srpp;

                addr[batch++] = srpp->srp_addr;
                if (batch == SFXGE_REFILL_BATCH) {
                        efx_rx_qpost(srp->sr_erp, addr, mblksize, batch,
                            srp->sr_completed, srp->sr_added);
                        srp->sr_added += batch;
                        batch = 0;
                }
        }

        if (batch != 0) {
                efx_rx_qpost(srp->sr_erp, addr, mblksize, batch,
                    srp->sr_completed, srp->sr_added);
                srp->sr_added += batch;
        }

        /* Make the descriptors visible to the hardware */
        (void) ddi_dma_sync(srp->sr_mem.esm_dma_handle,
            0,
            EFX_RXQ_SIZE(sp->s_rxq_size),
            DDI_DMA_SYNC_FORDEV);

        efx_rx_qpush(srp->sr_erp, srp->sr_added);
}

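/*
 * Trim the free packet pool back towards its low water mark.  The
 * srfpp_min field tracks the smallest pool occupancy seen since the last
 * trim, so only packets that have sat unused for a whole trim interval
 * are freed.
 */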
void
sfxge_rx_qfpp_trim(sfxge_rxq_t *srp)
{
        sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
        sfxge_t *sp = srp->sr_sp;
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];
        mblk_t *p;
        mblk_t **pp;
        int count;

        ASSERT(mutex_owned(&(sep->se_lock)));

        if (srp->sr_state != SFXGE_RXQ_STARTED)
                goto done;

        /* Make sure the queue is full */
        sfxge_rx_qrefill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));

        /* The refill may have emptied the pool */
        if (srfppp->srfpp_min == 0)
                goto done;

        /* Don't trim below the pool's low water mark */
        if (srfppp->srfpp_count <= srfppp->srfpp_lowat)
                goto done;

        ASSERT(srfppp->srfpp_min <= srfppp->srfpp_count);

        /* Trim to the largest of srfppp->srfpp_min and srfppp->srfpp_lowat */
        if (srfppp->srfpp_lowat > srfppp->srfpp_min)
                count = srfppp->srfpp_count - srfppp->srfpp_lowat;
        else
                count = srfppp->srfpp_count - srfppp->srfpp_min;

        /* Walk the get list */
        pp = &(srfppp->srfpp_get);
        while (--count >= 0) {
                ASSERT(pp);
                p = *pp;
                ASSERT(p != NULL);

                pp = &(p->b_next);
        }
        ASSERT(pp);
        p = *pp;

        /* Truncate the get list */
        *pp = NULL;

        /* Free the remainder */
        while (p != NULL) {
                mblk_t *next;
                frtn_t *freep;
                sfxge_rx_packet_t *srpp;

                next = p->b_next;
                p->b_next = NULL;

                ASSERT3U(srfppp->srfpp_min, >, 0);
                srfppp->srfpp_min--;
                srfppp->srfpp_count--;

                freep = DB_FRTNP(p);
                /*LINTED*/
                srpp = (sfxge_rx_packet_t *)(freep->free_arg);
                ASSERT3P(srpp->srp_mp, ==, p);

                srpp->srp_recycle = B_FALSE;

                freeb(p);

                p = next;
        }

done:
        srfppp->srfpp_min = srfppp->srfpp_count;
}

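/*
 * Periodic timeout handler for the receive queue.  It posts a software
 * event to the paired event queue so that the free packet pool is trimmed
 * in event queue context, then re-arms itself.
 */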
static void
sfxge_rx_qpoll(void *arg)
{
        sfxge_rxq_t *srp = arg;
        sfxge_t *sp = srp->sr_sp;
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];
        uint16_t magic;

        /*
         * man timeout(9f) states that this code should adhere to the
         * same requirements as a softirq handler - DO NOT BLOCK
         */

        /*
         * Post an event to the event queue to cause the free packet pool to be
         * trimmed if it is oversize.
         */
        magic = SFXGE_MAGIC_RX_QFPP_TRIM | index;

#if defined(DEBUG)
        /* This is guaranteed due to the start/stop order of rx and ev */
        ASSERT3U(sep->se_state, ==, SFXGE_EVQ_STARTED);
        ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_STARTED);
#else
        /*
         * Bug22691 WORKAROUND:
         * This handler has been observed in the field to be invoked for a
         * queue in the INITIALIZED state, which should never happen.
         * Until the mechanism for this is properly understood, add defensive
         * checks.
         */
        if ((sep->se_state != SFXGE_EVQ_STARTED) ||
            (srp->sr_state != SFXGE_RXQ_STARTED) ||
            (!sep->se_eep)) {
                cmn_err(CE_WARN, SFXGE_CMN_ERR
                        "[%s%d] RXQ[%d] bad state in sfxge_rx_qpoll %d %d %p",
                        ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip),
                        index, sep->se_state, srp->sr_state, sep->se_eep);
                return;
        }
#endif
        efx_ev_qpost(sep->se_eep, magic);

        srp->sr_tid = timeout(sfxge_rx_qpoll, srp,
                drv_usectohz(sp->s_rxq_poll_usec));
}

static void
sfxge_rx_qpoll_start(sfxge_rxq_t *srp)
{
        sfxge_t *sp = srp->sr_sp;
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];

        ASSERT(mutex_owned(&(sep->se_lock)));
        ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_STARTED);

        /* Schedule a poll */
        ASSERT3P(srp->sr_tid, ==, 0);
        srp->sr_tid = timeout(sfxge_rx_qpoll, srp, 0);
}

static void
sfxge_rx_qpoll_stop(sfxge_rxq_t *srp)
{
        sfxge_t *sp = srp->sr_sp;
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];
        timeout_id_t tid;

        ASSERT(mutex_owned(&(sep->se_lock)));
        ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_STARTED);

        /*
         * Cancel the qpoll timer. Care is needed as this function
         * can race with sfxge_rx_qpoll() for timeout id updates.
         *
         * Do not hold locks used by any timeout(9f) handlers across
         * calls to untimeout(9f) as this will deadlock.
         */
        tid = 0;
        while ((srp->sr_tid != 0) && (srp->sr_tid != tid)) {
                tid = srp->sr_tid;
                (void) untimeout(tid);
        }
        srp->sr_tid = 0;
}

static int
sfxge_rx_kstat_update(kstat_t *ksp, int rw)
{
        sfxge_rxq_t *srp = ksp->ks_private;
        sfxge_t *sp = srp->sr_sp;
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];
        kstat_named_t *knp;
        int rc;

        if (rw != KSTAT_READ) {
                rc = EACCES;
                goto fail1;
        }

        ASSERT(mutex_owned(&(sep->se_lock)));
        if (srp->sr_state != SFXGE_RXQ_STARTED)
                goto done;

        knp = ksp->ks_data;
        /* NB pointer post-increment below */
        knp++->value.ui32 = srp->sr_kstat.srk_rx_pkt_mem_limit;
        knp++->value.ui32 = srp->sr_kstat.srk_kcache_alloc_nomem;
        knp++->value.ui32 = srp->sr_kstat.srk_dma_alloc_nomem;
        knp++->value.ui32 = srp->sr_kstat.srk_dma_alloc_fail;
        knp++->value.ui32 = srp->sr_kstat.srk_dma_bind_nomem;
        knp++->value.ui32 = srp->sr_kstat.srk_dma_bind_fail;
#ifdef _USE_XESBALLOC
        knp++->value.ui32 = srp->sr_kstat.srk_xesballoc_fail;
#endif
#ifdef _USE_DESBALLOC
        knp++->value.ui32 = srp->sr_kstat.srk_desballoc_fail;
#endif
        knp++->value.ui32 = srp->sr_kstat.srk_rxq_empty_discard;

done:
        return (0);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        return (rc);
}

static int
sfxge_rx_kstat_init(sfxge_rxq_t *srp)
{
        sfxge_t *sp = srp->sr_sp;
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];
        dev_info_t *dip = sp->s_dip;
        char name[MAXNAMELEN];
        kstat_t *ksp;
        kstat_named_t *knp;
        int rc;

        /* Create the set */
        (void) snprintf(name, MAXNAMELEN - 1, "%s_rxq%04d",
            ddi_driver_name(dip), index);

        if ((ksp = kstat_create((char *)ddi_driver_name(dip),
            ddi_get_instance(dip), name, "rxq", KSTAT_TYPE_NAMED,
            SFXGE_RX_NSTATS, 0)) == NULL) {
                rc = ENOMEM;
                goto fail1;
        }

        srp->sr_ksp = ksp;

        ksp->ks_update = sfxge_rx_kstat_update;
        ksp->ks_private = srp;
        ksp->ks_lock = &(sep->se_lock);

        /* Initialise the named stats */
        knp = ksp->ks_data;
        kstat_named_init(knp, "rx_pkt_mem_limit", KSTAT_DATA_UINT32);
        knp++;
        kstat_named_init(knp, "kcache_alloc_nomem", KSTAT_DATA_UINT32);
        knp++;
        kstat_named_init(knp, "dma_alloc_nomem", KSTAT_DATA_UINT32);
        knp++;
        kstat_named_init(knp, "dma_alloc_fail", KSTAT_DATA_UINT32);
        knp++;
        kstat_named_init(knp, "dma_bind_nomem", KSTAT_DATA_UINT32);
        knp++;
        kstat_named_init(knp, "dma_bind_fail", KSTAT_DATA_UINT32);
        knp++;
#ifdef _USE_XESBALLOC
        kstat_named_init(knp, "xesballoc_fail", KSTAT_DATA_UINT32);
#endif
#ifdef _USE_DESBALLOC
        kstat_named_init(knp, "desballoc_fail", KSTAT_DATA_UINT32);
#endif
        kstat_named_init(knp, "rxq_empty_discard", KSTAT_DATA_UINT32);

        kstat_install(ksp);
        return (0);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        return (rc);
}

static int
sfxge_rx_qinit(sfxge_t *sp, unsigned int index)
{
        sfxge_rxq_t *srp;
        int rc;

        ASSERT3U(index, <, SFXGE_RX_SCALE_MAX);

        srp = kmem_cache_alloc(sp->s_rqc, KM_SLEEP);

        ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_UNINITIALIZED);

        srp->sr_index = index;
        sp->s_srp[index] = srp;

        if ((rc = sfxge_rx_kstat_init(srp)) != 0)
                goto fail1;

        srp->sr_state = SFXGE_RXQ_INITIALIZED;

        return (0);
fail1:
        DTRACE_PROBE1(fail1, int, rc);
        kmem_cache_free(sp->s_rqc, srp);

        return (rc);
}

static int
sfxge_rx_qstart(sfxge_t *sp, unsigned int index)
{
        sfxge_evq_t *sep = sp->s_sep[index];
        sfxge_rxq_t *srp;
        efsys_mem_t *esmp;
        efx_nic_t *enp;
        unsigned int level;
        int rc;

        mutex_enter(&(sep->se_lock));
        srp = sp->s_srp[index];
        enp = sp->s_enp;
        esmp = &(srp->sr_mem);

        ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_INITIALIZED);
        ASSERT3U(sep->se_state, ==, SFXGE_EVQ_STARTED);

        /* Zero the memory */
        (void) memset(esmp->esm_base, 0, EFX_RXQ_SIZE(sp->s_rxq_size));

        /* Program the buffer table */
        if ((rc = sfxge_sram_buf_tbl_set(sp, srp->sr_id, esmp,
            EFX_RXQ_NBUFS(sp->s_rxq_size))) != 0)
                goto fail1;

        /* Create the receive queue */
        if ((rc = efx_rx_qcreate(enp, index, index, EFX_RXQ_TYPE_DEFAULT,
            esmp, sp->s_rxq_size, srp->sr_id, sep->se_eep, &(srp->sr_erp)))
            != 0)
                goto fail2;

        /* Enable the receive queue */
        efx_rx_qenable(srp->sr_erp);

        /* Set the water marks */
        srp->sr_hiwat = EFX_RXQ_LIMIT(sp->s_rxq_size) * 9 / 10;
        srp->sr_lowat = srp->sr_hiwat / 2;

        srp->sr_state = SFXGE_RXQ_STARTED;

        sfxge_rx_qpoll_start(srp);

        /* Try to fill the queue from the pool */
        sfxge_rx_qrefill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));

        /*
         * If there were insufficient buffers in the pool to reach at least
         * a batch, then allocate some.
         */
        level = srp->sr_added - srp->sr_completed;
        if (level < SFXGE_RX_BATCH)
                sfxge_rx_qfill(srp, SFXGE_RX_BATCH);

        mutex_exit(&(sep->se_lock));

        return (0);

fail2:
        DTRACE_PROBE(fail2);

        /* Clear entries from the buffer table */
        sfxge_sram_buf_tbl_clear(sp, srp->sr_id,
            EFX_RXQ_NBUFS(sp->s_rxq_size));

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        mutex_exit(&(sep->se_lock));

        return (rc);
}

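/*
 * Complete a coalesced flow: patch the IP total length, and the final
 * TCP ack, window and flags, into the headers of the first segment, then
 * append the merged chain to the queue's deferred packet list.
 */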
static void
sfxge_rx_qflow_complete(sfxge_rxq_t *srp, sfxge_rx_flow_t *srfp)
{
        mblk_t *mp;
        struct ether_header *etherhp;
        struct ip *iphp;
        struct tcphdr *thp;

        if (srfp->srf_mp == NULL)
                return;

        mp = srfp->srf_mp;
        etherhp = srfp->srf_etherhp;
        iphp = srfp->srf_iphp;
        thp = srfp->srf_last_thp;

        ASSERT3U(((etherhp->ether_type == htons(ETHERTYPE_VLAN)) ?
            sizeof (struct ether_vlan_header) :
            sizeof (struct ether_header)) +
            srfp->srf_len, ==, msgdsize(mp));

        ASSERT3U(srfp->srf_len & 0xffff, ==, srfp->srf_len);
        iphp->ip_len = htons(srfp->srf_len);

        srfp->srf_first_thp->th_ack = thp->th_ack;
        srfp->srf_first_thp->th_win = thp->th_win;
        srfp->srf_first_thp->th_flags = thp->th_flags;

        DTRACE_PROBE2(flow_complete, uint32_t, srfp->srf_tag,
            size_t, srfp->srf_len);

        srfp->srf_mp = NULL;
        srfp->srf_len = 0;

        ASSERT(mp->b_next == NULL);
        *(srp->sr_mpp) = mp;
        srp->sr_mpp = &(mp->b_next);
}

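/*
 * Try to add a parsed TCP segment to a flow.  Stale or out-of-order
 * segments reset or halve the in-order count; coalescing only begins once
 * SFXGE_SLOW_START consecutive in-order segments have been seen, and the
 * merged size limit then scales from 4KB up to a maximum of 64KB.
 */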
static boolean_t
sfxge_rx_qflow_add(sfxge_rxq_t *srp, sfxge_rx_flow_t *srfp,
    sfxge_rx_packet_t *srpp, clock_t now)
{
        sfxge_t *sp = srp->sr_sp;
        struct ether_header *etherhp = srpp->srp_etherhp;
        struct ip *iphp = srpp->srp_iphp;
        struct tcphdr *thp = srpp->srp_thp;
        size_t off = srpp->srp_off;
        size_t size = (size_t)(srpp->srp_size);
        mblk_t *mp = srpp->srp_mp;
        uint32_t seq;
        unsigned int shift;

        ASSERT3U(MBLKL(mp), ==, off + size);
        ASSERT3U(DB_CKSUMFLAGS(mp), ==,
            HCK_FULLCKSUM | HCK_FULLCKSUM_OK | HCK_IPV4_HDRCKSUM);

        seq = htonl(thp->th_seq);

        /*
         * If the time between this segment and the last is greater than RTO
         * then consider this a new flow.
         */
        if (now - srfp->srf_lbolt > srp->sr_rto) {
                srfp->srf_count = 1;
                srfp->srf_seq = seq + size;

                goto fail1;
        }

        if (seq != srfp->srf_seq) {
                if (srfp->srf_count > SFXGE_SLOW_START)
                        srfp->srf_count = SFXGE_SLOW_START;

                srfp->srf_count >>= 1;

                srfp->srf_count++;
                srfp->srf_seq = seq + size;

                goto fail2;
        }

        /* Update the in-order segment count and sequence number */
        srfp->srf_count++;
        srfp->srf_seq = seq + size;

        /* Don't merge across pure ACK, URG, SYN or RST segments */
        if (size == 0 || thp->th_flags & (TH_URG | TH_SYN | TH_RST) ||
            thp->th_urp != 0)
                goto fail3;

        /*
         * If the in-order segment count has not yet reached the slow-start
         * threshold then we cannot coalesce.
         */
        if (srfp->srf_count < SFXGE_SLOW_START)
                goto fail4;

        /* Scale up the packet size from 4k (the maximum being 64k) */
        ASSERT3U(srfp->srf_count, >=, SFXGE_SLOW_START);
        shift = MIN(srfp->srf_count - SFXGE_SLOW_START + 12, 16);
        if (srfp->srf_len + size >= (1 << shift))
                sfxge_rx_qflow_complete(srp, srfp);

        ASSERT(mp->b_cont == NULL);

#ifdef _USE_GLD_V3_SOL10
        /*
         * The IP and UDP layers in Solaris 10 have slow paths for
         * handling mblks with more than 2 fragments.
         * UDP: see OpenSolaris CR 6305037
         * IP: see <http://www.mail-archive.com/networking-discuss@
         *   opensolaris.org/msg07366.html>
         */
        if (srfp->srf_mp && srfp->srf_mp->b_cont) {
                sfxge_rx_qflow_complete(srp, srfp);
        }
#endif

        if (srfp->srf_mp == NULL) {
                /* First packet in this flow */
                srfp->srf_etherhp = etherhp;
                srfp->srf_iphp = iphp;
                srfp->srf_first_thp = srfp->srf_last_thp = thp;

                ASSERT3P(mp->b_cont, ==, NULL);
                srfp->srf_mp = mp;
                srfp->srf_mpp = &(mp->b_cont);

                srfp->srf_len = ntohs(iphp->ip_len);

                /*
                 * If the flow is not already in the list of occupied flows then
                 * add it.
                 */
                if (srfp->srf_next == NULL &&
                    srp->sr_srfpp != &(srfp->srf_next)) {
                        *(srp->sr_srfpp) = srfp;
                        srp->sr_srfpp = &(srfp->srf_next);
                }
        } else {
                /* Later packet in this flow - skip TCP header */
                srfp->srf_last_thp = thp;

                mp->b_rptr += off;
                ASSERT3U(MBLKL(mp), ==, size);

                ASSERT3P(mp->b_cont, ==, NULL);
                *(srfp->srf_mpp) = mp;
                srfp->srf_mpp = &(mp->b_cont);

                srfp->srf_len += size;

                ASSERT(srfp->srf_next != NULL ||
                    srp->sr_srfpp == &(srfp->srf_next));
        }

        DTRACE_PROBE2(flow_add, uint32_t, srfp->srf_tag, size_t, size);

        /*
         * Try to align coalesced segments on push boundaries, unless they
         * are too frequent.
         */
        if (sp->s_rx_coalesce_mode == SFXGE_RX_COALESCE_ALLOW_PUSH &&
            thp->th_flags & TH_PUSH)
                sfxge_rx_qflow_complete(srp, srfp);

        srfp->srf_lbolt = now;
        return (B_TRUE);

fail4:
fail3:
fail2:
fail1:
        sfxge_rx_qflow_complete(srp, srfp);

        srfp->srf_lbolt = now;
        return (B_FALSE);
}

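/*
 * Walk the queue's deferred packet list, parsing each TCP segment and
 * hashing its 4-tuple (and VLAN tag) into the flow table.  Matching
 * segments are appended to flows; packets that cannot be coalesced take
 * the reject path.
 */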
1556 void
1557 sfxge_rx_qpacket_coalesce(sfxge_rxq_t *srp)
1558 {
1559         sfxge_t *sp = srp->sr_sp;
1560         clock_t now;
1561         mblk_t *mp;
1562         sfxge_rx_flow_t *srfp;
1563 
1564         ASSERT(sp->s_rx_coalesce_mode != SFXGE_RX_COALESCE_OFF);
1565 
1566         now = ddi_get_lbolt();
1567 
1568         mp = srp->sr_mp;
1569 
1570         srp->sr_mp = NULL;
1571         srp->sr_mpp = &(srp->sr_mp);
1572 
1573         /* Start with the last flow to be appended to */
1574         srfp = *(srp->sr_srfpp);
1575 
1576         while (mp != NULL) {
1577                 frtn_t *freep;
1578                 sfxge_rx_packet_t *srpp;
1579                 struct ether_header *etherhp;
1580                 struct ip *iphp;
1581                 struct tcphdr *thp;
1582                 size_t off;
1583                 size_t size;
1584                 uint16_t ether_tci;
1585                 uint16_t hash;
1586                 uint32_t tag;
1587                 mblk_t *next;
1588 
1589                 next = mp->b_next;
1590                 mp->b_next = NULL;
1591 
1592                 if (next != NULL)
1593                         prefetch_read_many(next);
1594 
1595                 freep = DB_FRTNP(mp);
1596                 /*LINTED*/
1597                 srpp = (sfxge_rx_packet_t *)(freep->free_arg);
1598                 ASSERT3P(srpp->srp_mp, ==, mp);
1599 
1600                 /* If the packet is not TCP then we cannot coalesce it */
1601                 if (~(srpp->srp_flags) & EFX_PKT_TCP)
1602                         goto reject;
1603 
1604                 /*
1605                  * If the packet is not fully checksummed then we cannot
1606                  * coalesce it.
1607                  */
1608                 if (~(srpp->srp_flags) & (EFX_CKSUM_TCPUDP | EFX_CKSUM_IPV4))
1609                         goto reject;
1610 
1611                 /* Parse the TCP header */
1612                 sfxge_tcp_parse(mp, &etherhp, &iphp, &thp, &off,
1613                     &size);
1614                 ASSERT(etherhp != NULL);
1615                 ASSERT(iphp != NULL);
1616                 ASSERT(thp != NULL);
1617                 ASSERT(off != 0);
1618 
1619                 if ((iphp->ip_off & ~htons(IP_DF)) != 0)
1620                         goto reject;
1621 
1622                 if (etherhp->ether_type == htons(ETHERTYPE_VLAN)) {
1623                         struct ether_vlan_header *ethervhp;
1624 
1625                         ethervhp = (struct ether_vlan_header *)etherhp;
1626                         ether_tci = ethervhp->ether_tci;
1627                 } else {
1628                         ether_tci = 0;
1629                 }
1630 
1631                 /*
1632                  * Make sure any minimum length padding is stripped
1633                  * before we try to add the packet to a flow.
1634                  */
1635                 ASSERT3U(sp->s_rx_prefix_size + MBLKL(mp), ==,
1636                     (size_t)(srpp->srp_size));
1637                 ASSERT3U(sp->s_rx_prefix_size + off + size, <=,
1638                     (size_t)(srpp->srp_size));
1639 
1640                 if (sp->s_rx_prefix_size + off + size <
1641                     (size_t)(srpp->srp_size))
1642                         mp->b_wptr = mp->b_rptr + off + size;
1643 
1644                 /*
1645                  * If there is no current flow, or the segment does not match
1646                  * the current flow then we must attempt to look up the
1647                  * correct flow in the table.
1648                  */
1649                 if (srfp == NULL)
1650                         goto lookup;
1651 
1652                 if (srfp->srf_saddr != iphp->ip_src.s_addr ||
1653                     srfp->srf_daddr != iphp->ip_dst.s_addr)
1654                         goto lookup;
1655 
1656                 if (srfp->srf_sport != thp->th_sport ||
1657                     srfp->srf_dport != thp->th_dport)
1658                         goto lookup;
1659 
1660                 if (srfp->srf_tci != ether_tci)
1661                         goto lookup;
1662 
1663 add:
1664                 ASSERT(srfp != NULL);
1665 
1666                 srpp->srp_etherhp = etherhp;
1667                 srpp->srp_iphp = iphp;
1668                 srpp->srp_thp = thp;
1669                 srpp->srp_off = off;
1670 
1671                 ASSERT3U(size, <, (1 << 16));
1672                 srpp->srp_size = (uint16_t)size;
1673 
1674                 /* Try to append the packet to the flow */
1675                 if (!sfxge_rx_qflow_add(srp, srfp, srpp, now))
1676                         goto reject;
1677 
1678                 mp = next;
1679                 continue;
1680 
1681 lookup:
1682                 /*
1683                  * If there is a prefix area then read the hash from that,
1684                  * otherwise calculate it.
1685                  */
1686                 if (sp->s_rx_prefix_size != 0) {
1687                         hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_LFSR,
1688                             DB_BASE(mp));
1689                 } else {
1690                         SFXGE_TCP_HASH(
1691                             ntohl(iphp->ip_src.s_addr),
1692                             ntohs(thp->th_sport),
1693                             ntohl(iphp->ip_dst.s_addr),
1694                             ntohs(thp->th_dport),
1695                             hash);
1696                 }
1697 
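                     /*
                      * The upper bits of the hash select a bucket; the
                      * full hash, biased to be non-zero, is kept as a tag
                      * so that unused, stale or aliased flows can be
                      * detected below.
                      */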
1698                 srfp = &(srp->sr_flow[(hash >> 6) % SFXGE_MAX_FLOW]);
1699                 tag = (uint32_t)hash + 1; /* Make sure it's not zero */
1700 
1701                 /*
1702                  * If the flow we have found does not match the hash then
1703                  * it may be an unused flow, or it may be stale.
1704                  */
1705                 if (tag != srfp->srf_tag) {
1706                         if (srfp->srf_count != 0) {
1707                                 if (now - srfp->srf_lbolt <= srp->sr_rto)
1708                                         goto reject;
1709                         }
1710 
1711                         if (srfp->srf_mp != NULL)
1712                                 goto reject;
1713 
1714                         /* Start a new flow */
1715                         ASSERT(srfp->srf_next == NULL);
1716 
1717                         srfp->srf_tag = tag;
1718 
1719                         srfp->srf_saddr = iphp->ip_src.s_addr;
1720                         srfp->srf_daddr = iphp->ip_dst.s_addr;
1721                         srfp->srf_sport = thp->th_sport;
1722                         srfp->srf_dport = thp->th_dport;
1723                         srfp->srf_tci = ether_tci;
1724 
1725                         srfp->srf_count = 0;
1726                         srfp->srf_seq = ntohl(thp->th_seq);
1727 
1728                         srfp->srf_lbolt = now;
1729                         goto add;
1730                 }
1731 
1732                 /*
1733                  * If the flow we have found does match the hash then it could
1734                  * still be an alias.
1735                  */
1736                 if (srfp->srf_saddr != iphp->ip_src.s_addr ||
1737                     srfp->srf_daddr != iphp->ip_dst.s_addr)
1738                         goto reject;
1739 
1740                 if (srfp->srf_sport != thp->th_sport ||
1741                     srfp->srf_dport != thp->th_dport)
1742                         goto reject;
1743 
1744                 if (srfp->srf_tci != ether_tci)
1745                         goto reject;
1746 
1747                 goto add;
1748 
1749 reject:
1750                 *(srp->sr_mpp) = mp;
1751                 srp->sr_mpp = &(mp->b_next);
1752 
1753                 mp = next;
1754         }
1755 }
1756 
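     /*
      * Process receive completions: for each newly completed descriptor,
      * validate and trim the packet, attach checksum metadata and chain
      * it onto sr_mp.  TCP packets are then optionally coalesced, any
      * pending flows are completed at end-of-poll, the chain is passed
      * up to GLD and the ring is topped up.  Called with the EVQ lock
      * held.
      */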
1757 void
1758 sfxge_rx_qcomplete(sfxge_rxq_t *srp, boolean_t eop)
1759 {
1760         sfxge_t *sp = srp->sr_sp;
1761         unsigned int index = srp->sr_index;
1762         sfxge_evq_t *sep = sp->s_sep[index];
1763         unsigned int completed;
1764         sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
1765         unsigned int level;
1766 
1767         ASSERT(mutex_owned(&(sep->se_lock)));
1768 
1769         ASSERT(srp->sr_mp == NULL);
1770         ASSERT(srp->sr_mpp == &(srp->sr_mp));
1771 
1772         completed = srp->sr_completed;
1773         while (completed != srp->sr_pending) {
1774                 unsigned int id;
1775                 sfxge_rx_packet_t *srpp;
1776                 mblk_t *mp;
1777                 size_t size;
1778                 uint16_t flags;
1779 
1780                 id = completed++ & (sp->s_rxq_size - 1);
1781 
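                     /* Prefetch four entries ahead to hide packet fetch latency */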
1782                 if (srp->sr_pending - completed >= 4) {
1783                         unsigned int prefetch;
1784 
1785                         prefetch = (id + 4) & (sp->s_rxq_size - 1);
1786 
1787                         srpp = srp->sr_srpp[prefetch];
1788                         ASSERT(srpp != NULL);
1789 
1790                         mp = srpp->srp_mp;
1791                         prefetch_read_many(mp->b_datap);
1792                 } else if (completed == srp->sr_pending) {
1793                         prefetch_read_many(srp->sr_mp);
1794                 }
1795 
1796                 srpp = srp->sr_srpp[id];
1797                 ASSERT(srpp != NULL);
1798 
1799                 srp->sr_srpp[id] = NULL;
1800 
1801                 mp = srpp->srp_mp;
1802                 ASSERT(mp->b_cont == NULL);
1803 
1804                 /* when called from sfxge_rx_qstop() */
1805                 if (srp->sr_state != SFXGE_RXQ_STARTED)
1806                         goto discard;
1807 
1808                 if (srpp->srp_flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
1809                         goto discard;
1810 
1811                 /* Set up the packet length */
1812                 ASSERT3P(mp->b_rptr, ==, DB_BASE(mp));
1813                 mp->b_rptr += sp->s_rx_prefix_size;
1814 
1815                 prefetch_read_many(mp->b_rptr);
1816 
1817                 ASSERT3P(mp->b_wptr, ==, DB_BASE(mp));
1818                 mp->b_wptr += (size_t)(srpp->srp_size);
1819                 ASSERT3P(mp->b_wptr, <=, DB_LIM(mp));
1820 
1821                 /* Calculate the maximum packet size */
1822                 size = sp->s_mtu;
1823                 size += (srpp->srp_flags & EFX_PKT_VLAN_TAGGED) ?
1824                     sizeof (struct ether_vlan_header) :
1825                     sizeof (struct ether_header);
1826 
1827                 if (MBLKL(mp) > size)
1828                         goto discard;
1829 
1830                 /* Make the data visible to the kernel */
1831                 (void) ddi_dma_sync(srpp->srp_dma_handle, 0,
1832                     (size_t)(srpp->srp_size), DDI_DMA_SYNC_FORKERNEL);
1833 
1834                 /* Check for loopback packets */
1835                 if (!(srpp->srp_flags & EFX_PKT_IPV4) &&
1836                     !(srpp->srp_flags & EFX_PKT_IPV6)) {
1837                         struct ether_header *etherhp;
1838 
1839                         /*LINTED*/
1840                         etherhp = (struct ether_header *)(mp->b_rptr);
1841 
1842                         if (etherhp->ether_type ==
1843                             htons(SFXGE_ETHERTYPE_LOOPBACK)) {
1844                                 DTRACE_PROBE(loopback);
1845 
1846                                 srp->sr_loopback++;
1847                                 goto discard;
1848                         }
1849                 }
1850 
1851                 /* Set up the checksum information */
1852                 flags = 0;
1853 
1854                 if (srpp->srp_flags & EFX_CKSUM_IPV4) {
1855                         ASSERT(srpp->srp_flags & EFX_PKT_IPV4);
1856                         flags |= HCK_IPV4_HDRCKSUM;
1857                 }
1858 
1859                 if (srpp->srp_flags & EFX_CKSUM_TCPUDP) {
1860                         ASSERT(srpp->srp_flags & EFX_PKT_TCP ||
1861                             srpp->srp_flags & EFX_PKT_UDP);
1862                         flags |= HCK_FULLCKSUM | HCK_FULLCKSUM_OK;
1863                 }
1864 
1865                 DB_CKSUMSTART(mp) = 0;
1866                 DB_CKSUMSTUFF(mp) = 0;
1867                 DB_CKSUMEND(mp) = 0;
1868                 DB_CKSUMFLAGS(mp) = flags;
1869                 DB_CKSUM16(mp) = 0;
1870 
1871                 /* Add the packet to the tail of the chain */
1872                 srfppp->srfpp_loaned++;
1873 
1874                 ASSERT(mp->b_next == NULL);
1875                 *(srp->sr_mpp) = mp;
1876                 srp->sr_mpp = &(mp->b_next);
1877 
1878                 continue;
1879 
1880 discard:
1881                 /* Return the packet to the pool */
1882                 srfppp->srfpp_loaned++;
1883                 freeb(mp); /* Equivalent to freemsg() as b_cont == NULL */
1884         }
1885         srp->sr_completed = completed;
1886 
1887         /* Attempt to coalesce any TCP packets */
1888         if (sp->s_rx_coalesce_mode != SFXGE_RX_COALESCE_OFF)
1889                 sfxge_rx_qpacket_coalesce(srp);
1890 
1891         /*
1892          * If there are any pending flows and this is the end of the
1893          * poll then they must be completed.
1894          */
1895         if (srp->sr_srfp != NULL && eop) {
1896                 sfxge_rx_flow_t *srfp;
1897 
1898                 srfp = srp->sr_srfp;
1899 
1900                 srp->sr_srfp = NULL;
1901                 srp->sr_srfpp = &(srp->sr_srfp);
1902 
1903                 do {
1904                         sfxge_rx_flow_t *next;
1905 
1906                         next = srfp->srf_next;
1907                         srfp->srf_next = NULL;
1908 
1909                         sfxge_rx_qflow_complete(srp, srfp);
1910 
1911                         srfp = next;
1912                 } while (srfp != NULL);
1913         }
1914 
1915         level = srp->sr_added - srp->sr_completed;
1916 
1917         /* If there are any packets then pass them up the stack */
1918         if (srp->sr_mp != NULL) {
1919                 mblk_t *mp;
1920 
1921                 mp = srp->sr_mp;
1922 
1923                 srp->sr_mp = NULL;
1924                 srp->sr_mpp = &(srp->sr_mp);
1925 
1926                 if (level == 0) {
1927                         /* Try to refill ASAP */
1928                         sfxge_rx_qrefill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));
1929                         level = srp->sr_added - srp->sr_completed;
1930                 }
1931 
1932                 /*
1933                  * If the RXQ is still empty, discard and recycle the
1934                  * current entry to ensure that the ring always
1935                  * contains at least one descriptor. This ensures that
1936                  * the next hardware RX will trigger an event
1937                  * (possibly delayed by interrupt moderation) and
1938                  * trigger another refill/fill attempt.
1939                  *
1940                  * Note this drops a complete LRO fragment from the
1941                  * start of the batch.
1942                  *
1943                  * Note also that copymsgchain() does not help with
1944                  * resource starvation here, unless we are short of DMA
1945                  * mappings.
1946                  */
1947                 if (level == 0) {
1948                         mblk_t *nmp;
1949 
1950                         srp->sr_kstat.srk_rxq_empty_discard++;
1951                         DTRACE_PROBE1(rxq_empty_discard, int, index);
1952                         nmp = mp->b_next;
1953                         if (nmp != NULL)
1954                                 sfxge_gld_rx_post(sp, index, nmp);
1955                         /* Buffer is recycled via the swizzle/refill below */
1956                         freemsg(mp);
1957                 } else {
1958                         sfxge_gld_rx_post(sp, index, mp);
1959                 }
1960         }
1961 
1962         /* Top up the queue if necessary */
1963         if (level < srp->sr_hiwat) {
1964                 sfxge_rx_qrefill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));
1965 
1966                 level = srp->sr_added - srp->sr_completed;
1967                 if (level < srp->sr_lowat)
1968                         sfxge_rx_qfill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));
1969         }
1970 }
1971 
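     /* Read and reset the RXQ's loopback packet count, under the EVQ lock */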
1972 static unsigned int
1973 sfxge_rx_qloopback(sfxge_t *sp, unsigned int index)
1974 {
1975         sfxge_evq_t *sep = sp->s_sep[index];
1976         sfxge_rxq_t *srp;
1977         unsigned int count;
1978 
1979         mutex_enter(&(sep->se_lock));
1980         srp = sp->s_srp[index];
1981         count = srp->sr_loopback;
1982         srp->sr_loopback = 0;
1983         mutex_exit(&(sep->se_lock));
1984 
1985         return (count);
1986 }
1987 
1988 void
1989 sfxge_rx_qflush_done(sfxge_rxq_t *srp)
1990 {
1991         sfxge_t *sp = srp->sr_sp;
1992         unsigned int index = srp->sr_index;
1993         sfxge_evq_t *sep = sp->s_sep[index];
1994 
1995         ASSERT(mutex_owned(&(sep->se_lock)));
1996 
1997         /* SFCbug22989: events may be delayed. EVQs are stopped after RXQs */
1998         if ((srp->sr_state != SFXGE_RXQ_INITIALIZED) ||
1999             (srp->sr_flush == SFXGE_FLUSH_DONE))
2000                 return;
2001 
2002         /* Flush successful: wakeup sfxge_rx_qstop() */
2003         srp->sr_flush = SFXGE_FLUSH_DONE;
2004         cv_broadcast(&(srp->sr_flush_kv));
2005 }
2006 
2007 void
2008 sfxge_rx_qflush_failed(sfxge_rxq_t *srp)
2009 {
2010         sfxge_t *sp = srp->sr_sp;
2011         unsigned int index = srp->sr_index;
2012         sfxge_evq_t *sep = sp->s_sep[index];
2013 
2014         ASSERT(mutex_owned(&(sep->se_lock)));
2015 
2016         /* SFCbug22989: events may be delayed. EVQs are stopped after RXQs */
2017         if ((srp->sr_state != SFXGE_RXQ_INITIALIZED) ||
2018             (srp->sr_flush == SFXGE_FLUSH_DONE))
2019                 return;
2024 
2025         /* Flush failed, so retry until timeout in sfxge_rx_qstop() */
2026         srp->sr_flush = SFXGE_FLUSH_FAILED;
2027         efx_rx_qflush(srp->sr_erp);
2028 }
2029 
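     /*
      * Stop the RXQ: halt polling, flush the hardware queue (waiting up
      * to SFXGE_RX_QFLUSH_USEC for the flush event), then destroy the
      * queue, clear its buffer table entries and reclaim all outstanding
      * packets.
      */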
2030 static void
2031 sfxge_rx_qstop(sfxge_t *sp, unsigned int index)
2032 {
2033         sfxge_evq_t *sep = sp->s_sep[index];
2034         sfxge_rxq_t *srp;
2035         clock_t timeout;
2036 
2037         mutex_enter(&(sep->se_lock));
2038 
2039         srp = sp->s_srp[index];
2040         ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_STARTED);
2041 
2042         sfxge_rx_qpoll_stop(srp);
2043 
2044         srp->sr_state = SFXGE_RXQ_INITIALIZED;
2045 
2046         if (sp->s_hw_err == SFXGE_HW_OK) {
2047                 /* Wait up to 2 seconds for queue flushing to complete */
2048                 srp->sr_flush = SFXGE_FLUSH_PENDING;
2049                 efx_rx_qflush(srp->sr_erp);
2050         } else {
2051                 /* Do not attempt a flush after an indication of H/W failure */
2052                 srp->sr_flush = SFXGE_FLUSH_DONE;
2053         }
2054 
2055         timeout = ddi_get_lbolt() + drv_usectohz(SFXGE_RX_QFLUSH_USEC);
2056 
2057         while (srp->sr_flush != SFXGE_FLUSH_DONE) {
2058                 if (cv_timedwait(&(srp->sr_flush_kv), &(sep->se_lock),
2059                         timeout) < 0) {
2060                         /* Timeout waiting for successful flush */
2061                         dev_info_t *dip = sp->s_dip;
2062 
2064                         cmn_err(CE_NOTE,
2065                             SFXGE_CMN_ERR "[%s%d] rxq[%d] flush timeout",
2066                             ddi_driver_name(dip), ddi_get_instance(dip), index);
2067                         break;
2068                 }
2069         }
2070 
2071         DTRACE_PROBE1(flush, sfxge_flush_state_t, srp->sr_flush);
2072         srp->sr_flush = SFXGE_FLUSH_DONE;
2073 
2074         /* Destroy the receive queue */
2075         efx_rx_qdestroy(srp->sr_erp);
2076         srp->sr_erp = NULL;
2077 
2078         /* Clear entries from the buffer table */
2079         sfxge_sram_buf_tbl_clear(sp, srp->sr_id,
2080             EFX_RXQ_NBUFS(sp->s_rxq_size));
2081 
2082         /*
2083          * Free any unused RX packets which had descriptors on the RXQ.
2084          * They will be discarded as state != STARTED.
2085          */
2086         srp->sr_pending = srp->sr_added;
2087         sfxge_rx_qcomplete(srp, B_TRUE);
2088 
2089         ASSERT3U(srp->sr_completed, ==, srp->sr_pending);
2090 
2091         srp->sr_added = 0;
2092         srp->sr_pending = 0;
2093         srp->sr_completed = 0;
2094         srp->sr_loopback = 0;
2095 
2096         srp->sr_lowat = 0;
2097         srp->sr_hiwat = 0;
2098 
2099         mutex_exit(&(sep->se_lock));
2100 }
2101 
2102 static void
2103 sfxge_rx_kstat_fini(sfxge_rxq_t *srp)
2104 {
2105         kstat_delete(srp->sr_ksp);
2106         srp->sr_ksp = NULL;
2107 }
2108 
2109 static void
2110 sfxge_rx_qfini(sfxge_t *sp, unsigned int index)
2111 {
2112         sfxge_rxq_t *srp = sp->s_srp[index];
2113 
2114         ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_INITIALIZED);
2115 
2116         sp->s_srp[index] = NULL;
2117         srp->sr_state = SFXGE_RXQ_UNINITIALIZED;
2118 
2119         sfxge_rx_kstat_fini(srp);
2120 
2121         /* Empty the pool */
2122         sfxge_rx_qfpp_empty(srp);
2123 
2124         srp->sr_index = 0;
2125 
2126         kmem_cache_free(sp->s_rqc, srp);
2127 }
2128 
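     /*
      * kstat update callback: report how many entries of the expanded
      * RSS table currently map to each event queue, plus the configured
      * scale count.
      */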
2129 static int
2130 sfxge_rx_scale_kstat_update(kstat_t *ksp, int rw)
2131 {
2132         sfxge_t *sp = ksp->ks_private;
2133         sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2134         sfxge_intr_t *sip = &(sp->s_intr);
2135         kstat_named_t *knp;
2136         unsigned int index;
2137         unsigned int entry;
2138         unsigned int *freq;
2139         int rc;
2140 
2141         ASSERT(mutex_owned(&(srsp->srs_lock)));
2142 
2143         if (rw != KSTAT_READ) {
2144                 rc = EACCES;
2145                 goto fail1;
2146         }
2147 
2148         if ((freq = kmem_zalloc(sizeof (unsigned int) * sip->si_nalloc,
2149                                 KM_NOSLEEP)) == NULL) {
2150                 rc = ENOMEM;
2151                 goto fail2;
2152         }
2153 
2157         for (entry = 0; entry < SFXGE_RX_SCALE_MAX; entry++) {
2158                 index = srsp->srs_tbl[entry];
2159 
2160                 freq[index]++;
2161         }
2162 
2163         knp = ksp->ks_data;
2164         for (index = 0; index < sip->si_nalloc; index++) {
2165                 knp->value.ui64 = freq[index];
2166                 knp++;
2167         }
2168 
2169         knp->value.ui64 = srsp->srs_count;
2170 
2171         kmem_free(freq, sizeof (unsigned int) * sip->si_nalloc);
2172 
2173         return (0);
2174 
2175 fail2:
2176         DTRACE_PROBE(fail2);
2177 fail1:
2178         DTRACE_PROBE1(fail1, int, rc);
2179         return (rc);
2180 }
2181 
2182 static int
2183 sfxge_rx_scale_kstat_init(sfxge_t *sp)
2184 {
2185         dev_info_t *dip = sp->s_dip;
2186         sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2187         sfxge_intr_t *sip = &(sp->s_intr);
2188         char name[MAXNAMELEN];
2189         kstat_t *ksp;
2190         kstat_named_t *knp;
2191         unsigned int index;
2192         int rc;
2193 
2194         /* Create the set */
2195         (void) snprintf(name, MAXNAMELEN - 1, "%s_rss", ddi_driver_name(dip));
2196 
2197         if ((ksp = kstat_create((char *)ddi_driver_name(dip),
2198             ddi_get_instance(dip), name, "rss", KSTAT_TYPE_NAMED,
2199             sip->si_nalloc + 1, 0)) == NULL) {
2200                 rc = ENOMEM;
2201                 goto fail1;
2202         }
2203 
2204         srsp->srs_ksp = ksp;
2205 
2206         ksp->ks_update = sfxge_rx_scale_kstat_update;
2207         ksp->ks_private = sp;
2208         ksp->ks_lock = &(srsp->srs_lock);
2209 
2210         /* Initialize the named stats */
2211         knp = ksp->ks_data;
2212         for (index = 0; index < sip->si_nalloc; index++) {
2215                 (void) snprintf(name, MAXNAMELEN - 1, "evq%04d_count", index);
2216                 kstat_named_init(knp, name, KSTAT_DATA_UINT64);
2217                 knp++;
2218         }
2219 
2220         kstat_named_init(knp, "scale", KSTAT_DATA_UINT64);
2221 
2222         kstat_install(ksp);
2223         return (0);
2224 
2225 fail1:
2226         DTRACE_PROBE1(fail1, int, rc);
2227 
2228         return (rc);
2229 }
2230 
2231 static void
2232 sfxge_rx_scale_kstat_fini(sfxge_t *sp)
2233 {
2234         sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2235 
2236         /* Destroy the set */
2237         kstat_delete(srsp->srs_ksp);
2238         srsp->srs_ksp = NULL;
2239 }
2240 
2242 unsigned int
2243 sfxge_rx_scale_prop_get(sfxge_t *sp)
2244 {
2245         int rx_scale;
2246 
2247         rx_scale = ddi_prop_get_int(DDI_DEV_T_ANY, sp->s_dip,
2248                                     DDI_PROP_DONTPASS, "rx_scale_count",
2249                                     SFXGE_RX_SCALE_MAX);
2250         /* Zero and negative values select the number of logical CPUs */
2251         if (rx_scale <= 0)
2252                 rx_scale = ncpus;
2253 
2254         return (rx_scale);
2255 }
2256 
2258 static int
2259 sfxge_rx_scale_init(sfxge_t *sp)
2260 {
2261         sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2262         sfxge_intr_t *sip = &(sp->s_intr);
2263         int rc;
2264 
2265         ASSERT3U(srsp->srs_state, ==, SFXGE_RX_SCALE_UNINITIALIZED);
2266 
2267         /* Create tables for CPU, core, cache and chip counts */
2268         srsp->srs_cpu = kmem_zalloc(sizeof (unsigned int) * NCPU, KM_SLEEP);
2269 #ifdef  _USE_CPU_PHYSID
2270         srsp->srs_core = kmem_zalloc(sizeof (unsigned int) * NCPU, KM_SLEEP);
2271         srsp->srs_cache = kmem_zalloc(sizeof (unsigned int) * NCPU, KM_SLEEP);
2272         srsp->srs_chip = kmem_zalloc(sizeof (unsigned int) * NCPU, KM_SLEEP);
2273 #endif
2274 
2275         mutex_init(&(srsp->srs_lock), NULL, MUTEX_DRIVER, NULL);
2276 
2277         /* We need at least one event queue */
2278         srsp->srs_count = sfxge_rx_scale_prop_get(sp);
2279         if (srsp->srs_count > sip->si_nalloc)
2280                 srsp->srs_count = sip->si_nalloc;
2281         if (srsp->srs_count < 1)
2282                 srsp->srs_count = 1;
2283 
2284         /* Set up the kstats */
2285         if ((rc = sfxge_rx_scale_kstat_init(sp)) != 0)
2286                 goto fail1;
2287 
2288         srsp->srs_state = SFXGE_RX_SCALE_INITIALIZED;
2289 
2290         return (0);
2291 
2292 fail1:
2293         DTRACE_PROBE1(fail1, int, rc);
2294         mutex_destroy(&(srsp->srs_lock));
2295 
2296         return (rc);
2297 }
2298 
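     /*
      * Rebuild the RSS table.  Event queues are rated by the global
      * contention for the CPU (and, with _USE_CPU_PHYSID, the core,
      * cache and chip) each is bound to; srs_count queues are chosen,
      * lowest rated first, and spread round-robin across all
      * SFXGE_RX_SCALE_MAX table entries.
      */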
2299 void
2300 sfxge_rx_scale_update(void *arg)
2301 {
2302         sfxge_t *sp = arg;
2303         sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2304         sfxge_intr_t *sip;
2305         processorid_t id;
2306         unsigned int count;
2307         unsigned int *tbl;
2308         unsigned int *rating;
2309         unsigned int entry;
2310         int rc;
2311 
2312         mutex_enter(&(srsp->srs_lock));
2313 
2314         if (srsp->srs_state != SFXGE_RX_SCALE_STARTED) {
2315                 rc = EFAULT;
2316                 goto fail1;
2317         }
2318 
2319         if ((tbl =  kmem_zalloc(sizeof (unsigned int) * SFXGE_RX_SCALE_MAX,
2320                             KM_NOSLEEP)) == NULL) {
2321                 rc = ENOMEM;
2322                 goto fail2;
2323         }
2324 
2325         sip = &(sp->s_intr);
2326         if ((rating = kmem_zalloc(sizeof (unsigned int) * sip->si_nalloc,
2327                             KM_NOSLEEP)) == NULL) {
2328                 rc = ENOMEM;
2329                 goto fail3;
2330         }
2331 
2332         mutex_enter(&cpu_lock);
2333 
2334         /*
2335          * Subtract any current CPU, core, cache and chip usage from the
2336          * global contention tables.
2337          */
2338         for (id = 0; id < NCPU; id++) {
2339                 ASSERT3U(sfxge_cpu[id], >=, srsp->srs_cpu[id]);
2340                 sfxge_cpu[id] -= srsp->srs_cpu[id];
2341                 srsp->srs_cpu[id] = 0;
2342 
2343 #ifdef  _USE_CPU_PHYSID
2344                 ASSERT3U(sfxge_core[id], >=, srsp->srs_core[id]);
2345                 sfxge_core[id] -= srsp->srs_core[id];
2346                 srsp->srs_core[id] = 0;
2347 
2348                 ASSERT3U(sfxge_cache[id], >=, srsp->srs_cache[id]);
2349                 sfxge_cache[id] -= srsp->srs_cache[id];
2350                 srsp->srs_cache[id] = 0;
2351 
2352                 ASSERT3U(sfxge_chip[id], >=, srsp->srs_chip[id]);
2353                 sfxge_chip[id] -= srsp->srs_chip[id];
2354                 srsp->srs_chip[id] = 0;
2355 #endif
2356         }
2357 
2358         ASSERT(srsp->srs_count != 0);
2359 
2360         /* Choose as many event queues as we need */
2361         for (count = 0; count < srsp->srs_count; count++) {
2362                 unsigned int index;
2363                 sfxge_evq_t *sep;
2364                 unsigned int choice;
2365                 unsigned int choice_rating;
2366 
2367                 bzero(rating, sizeof (unsigned int) * sip->si_nalloc);
2368 
2369                 /*
2370                  * Rate each event queue on its global level of CPU
2371                  * contention.
2372                  */
2373                 for (index = 0; index < sip->si_nalloc; index++) {
2374                         sep = sp->s_sep[index];
2375 
2376                         id = sep->se_cpu_id;
2377                         rating[index] += sfxge_cpu[id];
2378 
2379 #ifdef  _USE_CPU_PHYSID
2380                         id = sep->se_core_id;
2381                         rating[index] += sfxge_core[id];
2382 
2383                         id = sep->se_cache_id;
2384                         rating[index] += sfxge_cache[id];
2385 
2386                         id = sep->se_chip_id;
2387                         rating[index] += sfxge_chip[id];
2388 #endif
2389                 }
2390 
2391                 /* Choose the queue with the lowest CPU contention */
2392                 choice = 0;
2393                 choice_rating = rating[0];
2394 
2395                 for (index = 1; index < sip->si_nalloc; index++) {
2396                         if (rating[index] < choice_rating) {
2397                                 choice = index;
2398                                 choice_rating = rating[index];
2399                         }
2400                 }
2401 
2402                 /* Add our choice to the condensed RSS table */
2403                 tbl[count] = choice;
2404 
2405                 /* Add information to the global contention tables */
2406                 sep = sp->s_sep[choice];
2407 
2408                 id = sep->se_cpu_id;
2409                 srsp->srs_cpu[id]++;
2410                 sfxge_cpu[id]++;
2411 
2412 #ifdef  _USE_CPU_PHYSID
2413                 id = sep->se_core_id;
2414                 srsp->srs_core[id]++;
2415                 sfxge_core[id]++;
2416 
2417                 id = sep->se_cache_id;
2418                 srsp->srs_cache[id]++;
2419                 sfxge_cache[id]++;
2420 
2421                 id = sep->se_chip_id;
2422                 srsp->srs_chip[id]++;
2423                 sfxge_chip[id]++;
2424 #endif
2425         }
2426 
2427         mutex_exit(&cpu_lock);
2428 
2429         /* Build the expanded RSS table */
2430         count = 0;
2431         for (entry = 0; entry < SFXGE_RX_SCALE_MAX; entry++) {
2432                 unsigned int index;
2433 
2434                 index = tbl[count];
2435                 count = (count + 1) % srsp->srs_count;
2436 
2437                 srsp->srs_tbl[entry] = index;
2438         }
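             /*
              * e.g. (hypothetical values) with srs_count == 3 and
              * tbl[] == { 1, 4, 6 }, srs_tbl[] becomes 1, 4, 6, 1, 4, 6, ...
              * across all SFXGE_RX_SCALE_MAX entries.
              */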
2439 
2440         /* Program the expanded RSS table into the hardware */
2441         (void) efx_rx_scale_tbl_set(sp->s_enp, srsp->srs_tbl,
2442             SFXGE_RX_SCALE_MAX);
2443 
2444         mutex_exit(&(srsp->srs_lock));
2445         kmem_free(rating, sizeof (unsigned int) * sip->si_nalloc);
2446         kmem_free(tbl, sizeof (unsigned int) * SFXGE_RX_SCALE_MAX);
2447         return;
2448 
2449 fail3:
2450         DTRACE_PROBE(fail3);
2451         kmem_free(tbl, sizeof (unsigned int) * SFXGE_RX_SCALE_MAX);
2452 fail2:
2453         DTRACE_PROBE(fail2);
2454 fail1:
2455         DTRACE_PROBE1(fail1, int, rc);
2456 
2457         mutex_exit(&(srsp->srs_lock));
2458 }
2459 
2460 static int
2461 sfxge_rx_scale_start(sfxge_t *sp)
2462 {
2463         sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2464         const efx_nic_cfg_t *encp;
2465         int rc;
2466 
2467         mutex_enter(&(srsp->srs_lock));
2468 
2469         ASSERT3U(srsp->srs_state, ==, SFXGE_RX_SCALE_INITIALIZED);
2470 
2471         /* Clear down the RSS table */
2472         bzero(srsp->srs_tbl, sizeof (unsigned int) * SFXGE_RX_SCALE_MAX);
2473 
2474         (void) efx_rx_scale_tbl_set(sp->s_enp, srsp->srs_tbl,
2475             SFXGE_RX_SCALE_MAX);
2476 
2477         /* Make sure the LFSR hash is selected */
2478         encp = efx_nic_cfg_get(sp->s_enp);
2479         if ((rc = efx_rx_scale_mode_set(sp->s_enp, EFX_RX_HASHALG_LFSR, 0,
2480             (encp->enc_features & EFX_FEATURE_LFSR_HASH_INSERT))) != 0)
2481                 goto fail1;
2482 
2483         srsp->srs_state = SFXGE_RX_SCALE_STARTED;
2484 
2485         mutex_exit(&(srsp->srs_lock));
2486 
2487         /* sfxge_t->s_state_lock held */
2488         (void) ddi_taskq_dispatch(sp->s_tqp, sfxge_rx_scale_update, sp,
2489             DDI_SLEEP);
2490 
2491         return (0);
2492 
2493 fail1:
2494         DTRACE_PROBE1(fail1, int, rc);
2495 
2496         mutex_exit(&(srsp->srs_lock));
2497 
2498         return (rc);
2499 }
2500 
2501 int
2502 sfxge_rx_scale_count_get(sfxge_t *sp, unsigned int *countp)
2503 {
2504         sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2505         int rc;
2506 
2507         mutex_enter(&(srsp->srs_lock));
2508 
2509         if (srsp->srs_state != SFXGE_RX_SCALE_INITIALIZED &&
2510             srsp->srs_state != SFXGE_RX_SCALE_STARTED) {
2511                 rc = ENOTSUP;
2512                 goto fail1;
2513         }
2514 
2515         *countp = srsp->srs_count;
2516 
2517         mutex_exit(&(srsp->srs_lock));
2518 
2519         return (0);
2520 
2521 fail1:
2522         DTRACE_PROBE1(fail1, int, rc);
2523 
2524         mutex_exit(&(srsp->srs_lock));
2525 
2526         return (rc);
2527 }
2528 
2529 int
2530 sfxge_rx_scale_count_set(sfxge_t *sp, unsigned int count)
2531 {
2532         sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2533         sfxge_intr_t *sip = &(sp->s_intr);
2534         int dispatch = 1;
2535         int rc;
2536 
2537         if (count < 1 || count > sip->si_nalloc) {
2538                 rc = EINVAL;
2539                 goto fail1;
2540         }
2541 
2542         mutex_enter(&(srsp->srs_lock));
2543 
2544         if (srsp->srs_state != SFXGE_RX_SCALE_INITIALIZED &&
2545             srsp->srs_state != SFXGE_RX_SCALE_STARTED) {
2546                 rc = ENOTSUP;
2547                 goto fail2;
2548         }
2549 
2550         srsp->srs_count = count;
2551 
2552         if (srsp->srs_state != SFXGE_RX_SCALE_STARTED)
2553                 dispatch = 0;
2554 
2555         mutex_exit(&(srsp->srs_lock));
2556 
2557         if (dispatch)
2558                 /* no locks held */
2559                 (void) ddi_taskq_dispatch(sp->s_tqp, sfxge_rx_scale_update, sp,
2560                     DDI_SLEEP);
2561 
2562         return (0);
2563 
2564 fail2:
2565         DTRACE_PROBE(fail2);
2566 
2567         mutex_exit(&(srsp->srs_lock));
2568 
2569 fail1:
2570         DTRACE_PROBE1(fail1, int, rc);
2571 
2572         return (rc);
2573 }
2574 
2575 static void
2576 sfxge_rx_scale_stop(sfxge_t *sp)
2577 {
2578         sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2579         processorid_t id;
2580 
2581         mutex_enter(&(srsp->srs_lock));
2582 
2583         ASSERT3U(srsp->srs_state, ==, SFXGE_RX_SCALE_STARTED);
2584 
2585         srsp->srs_state = SFXGE_RX_SCALE_INITIALIZED;
2586 
2587         mutex_enter(&cpu_lock);
2588 
2589         /*
2590          * Subtract any current CPU, core, cache and chip usage from the
2591          * global contention tables.
2592          */
2593         for (id = 0; id < NCPU; id++) {
2594                 ASSERT3U(sfxge_cpu[id], >=, srsp->srs_cpu[id]);
2595                 sfxge_cpu[id] -= srsp->srs_cpu[id];
2596                 srsp->srs_cpu[id] = 0;
2597 
2598 #ifdef  _USE_CPU_PHYSID
2599                 ASSERT3U(sfxge_core[id], >=, srsp->srs_core[id]);
2600                 sfxge_core[id] -= srsp->srs_core[id];
2601                 srsp->srs_core[id] = 0;
2602 
2603                 ASSERT3U(sfxge_cache[id], >=, srsp->srs_cache[id]);
2604                 sfxge_cache[id] -= srsp->srs_cache[id];
2605                 srsp->srs_cache[id] = 0;
2606 
2607                 ASSERT3U(sfxge_chip[id], >=, srsp->srs_chip[id]);
2608                 sfxge_chip[id] -= srsp->srs_chip[id];
2609                 srsp->srs_chip[id] = 0;
2610 #endif
2611         }
2612 
2613         mutex_exit(&cpu_lock);
2614 
2615         /* Clear down the RSS table */
2616         bzero(srsp->srs_tbl, sizeof (unsigned int) * SFXGE_RX_SCALE_MAX);
2617 
2618         (void) efx_rx_scale_tbl_set(sp->s_enp, srsp->srs_tbl,
2619             SFXGE_RX_SCALE_MAX);
2620 
2621         mutex_exit(&(srsp->srs_lock));
2622 }
2623 
2624 static void
2625 sfxge_rx_scale_fini(sfxge_t *sp)
2626 {
2627         sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2628 
2629         ASSERT3U(srsp->srs_state, ==, SFXGE_RX_SCALE_INITIALIZED);
2630 
2631         srsp->srs_state = SFXGE_RX_SCALE_UNINITIALIZED;
2632 
2633         /* Tear down the kstats */
2634         sfxge_rx_scale_kstat_fini(sp);
2635 
2636         srsp->srs_count = 0;
2637 
2638         mutex_destroy(&(srsp->srs_lock));
2639 
2640         /* Destroy tables */
2641 #ifdef  _USE_CPU_PHYSID
2642         kmem_free(srsp->srs_chip, sizeof (unsigned int) * NCPU);
2643         srsp->srs_chip = NULL;
2644 
2645         kmem_free(srsp->srs_cache, sizeof (unsigned int) * NCPU);
2646         srsp->srs_cache = NULL;
2647 
2648         kmem_free(srsp->srs_core, sizeof (unsigned int) * NCPU);
2649         srsp->srs_core = NULL;
2650 #endif
2651         kmem_free(srsp->srs_cpu, sizeof (unsigned int) * NCPU);
2652         srsp->srs_cpu = NULL;
2653 }
2654 
2655 int
2656 sfxge_rx_init(sfxge_t *sp)
2657 {
2658         sfxge_intr_t *sip = &(sp->s_intr);
2660         char name[MAXNAMELEN];
2661         int index;
2662         int rc;
2663 
2664         if (sip->si_state == SFXGE_INTR_UNINITIALIZED) {
2665                 rc = EINVAL;
2666                 goto fail1;
2667         }
2668 
2670         if ((rc = sfxge_rx_scale_init(sp)) != 0)
2671                 goto fail2;
2672 
2673         (void) snprintf(name, MAXNAMELEN - 1, "%s%d_rx_packet_cache",
2674             ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));
2675 
2676         sp->s_rpc = kmem_cache_create(name, sizeof (sfxge_rx_packet_t),
2677             SFXGE_CPU_CACHE_SIZE, sfxge_rx_packet_ctor, sfxge_rx_packet_dtor,
2678             NULL, sp, NULL, 0);
2679         ASSERT(sp->s_rpc != NULL);
2680 
2681         (void) snprintf(name, MAXNAMELEN - 1, "%s%d_rxq_cache",
2682             ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));
2683 
2684         sp->s_rqc = kmem_cache_create(name, sizeof (sfxge_rxq_t),
2685             SFXGE_CPU_CACHE_SIZE, sfxge_rx_qctor, sfxge_rx_qdtor, NULL, sp,
2686             NULL, 0);
2687         ASSERT(sp->s_rqc != NULL);
2688 
2689         sp->s_rx_pkt_mem_max = ddi_prop_get_int64(DDI_DEV_T_ANY, sp->s_dip,
2690             DDI_PROP_DONTPASS, "rx_pkt_mem_max", 0); /* disabled */
2691 
2692         /* Initialize the receive queue(s) */
2693         for (index = 0; index < sip->si_nalloc; index++) {
2694                 if ((rc = sfxge_rx_qinit(sp, index)) != 0)
2695                         goto fail3;
2696         }
2697 
2698         sp->s_rx_coalesce_mode = ddi_prop_get_int(DDI_DEV_T_ANY, sp->s_dip,
2699             DDI_PROP_DONTPASS, "rx_coalesce_mode", SFXGE_RX_COALESCE_OFF);
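             /*
              * e.g. a driver.conf entry such as "rx_coalesce_mode=2;" would
              * request SFXGE_RX_COALESCE_ALLOW_PUSH, assuming the
              * enumeration values 0 = off, 1 = disallow push and
              * 2 = allow push.
              */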
2700 
2701         return (0);
2702 
2703 fail3:
2704         DTRACE_PROBE(fail3);
2705 
2706         /* Tear down the receive queue(s) */
2707         while (--index >= 0)
2708                 sfxge_rx_qfini(sp, index);
2709 
2710         kmem_cache_destroy(sp->s_rqc);
2711         sp->s_rqc = NULL;
2712 
2713         kmem_cache_destroy(sp->s_rpc);
2714         sp->s_rpc = NULL;
2715 
2716         sfxge_rx_scale_fini(sp);
2717 
2718 fail2:
2719         DTRACE_PROBE(fail2);
2720 fail1:
2721         DTRACE_PROBE1(fail1, int, rc);
2722 
2723         return (rc);
2724 }
2725 
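     /*
      * Start the receive path: size and align the packet buffers to suit
      * the NIC's hash-prefix insertion, initialize the common RX module,
      * program RSS and start every RXQ.
      */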
2726 int
2727 sfxge_rx_start(sfxge_t *sp)
2728 {
2729         sfxge_mac_t *smp = &(sp->s_mac);
2730         sfxge_intr_t *sip;
2731         const efx_nic_cfg_t *encp;
2732         int index;
2733         int rc;
2734 
2735         mutex_enter(&(smp->sm_lock));
2736 
2737         /* Calculate the receive packet buffer size and alignment */
2738         sp->s_rx_buffer_size = EFX_MAC_PDU(sp->s_mtu);
2739 
2740         encp = efx_nic_cfg_get(sp->s_enp);
2741         if (encp->enc_features & EFX_FEATURE_LFSR_HASH_INSERT) {
2742                 size_t align;
2743 
2744                 sp->s_rx_prefix_size = EFX_RX_PREFIX_SIZE;
2745 
2746                 /*
2747                  * Place the start of the buffer a prefix length minus 2
2748                  * before the start of a cache line. This ensures that the
2749                  * last two bytes of the prefix (which is where the LFSR hash
2750                  * is located) are in the same cache line as the headers, and
2751                  * the IP header is 32-bit aligned.
2752                  */
2753                 align = SFXGE_CPU_CACHE_SIZE + SFXGE_IP_ALIGN -
2754                     EFX_RX_PREFIX_SIZE;
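                     /*
                      * e.g. assuming a 64 byte cache line and a 16 byte
                      * prefix, align == 50: the buffer starts 14 bytes
                      * before a cache line boundary, so the 2 byte hash
                      * at the end of the prefix shares a cache line with
                      * the headers, and the IP header (14 bytes into the
                      * packet) is 32-bit aligned.
                      */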
2755 
2756                 sp->s_rx_buffer_align = align;
2757                 sp->s_rx_buffer_size += align;
2758         } else {
2759                 sp->s_rx_prefix_size = 0;
2760 
2761                 /*
2762                  * Place the start of the buffer 2 bytes after a cache line
2763                  * boundary so that the headers fit into the cache line and
2764                  * the IP header is 32-bit aligned.
2765                  */
2766 
2767                 sp->s_rx_buffer_align = SFXGE_IP_ALIGN;
2768                 sp->s_rx_buffer_size += SFXGE_IP_ALIGN;
2769         }
2770 
2771         /* Initialize the receive module */
2772         if ((rc = efx_rx_init(sp->s_enp)) != 0)
2773                 goto fail1;
2774 
2775         mutex_exit(&(smp->sm_lock));
2776 
2777         if ((rc = sfxge_rx_scale_start(sp)) != 0)
2778                 goto fail2;
2779 
2780         /* Start the receive queue(s) */
2781         sip = &(sp->s_intr);
2782         for (index = 0; index < sip->si_nalloc; index++) {
2783                 if ((rc = sfxge_rx_qstart(sp, index)) != 0)
2784                         goto fail3;
2785         }
2786 
2787         return (0);
2788 
2789 fail3:
2790         DTRACE_PROBE(fail3);
2791 
2792         /* Stop the receive queue(s) */
2793         while (--index >= 0)
2794                 sfxge_rx_qstop(sp, index);
2795 
2796         sfxge_rx_scale_stop(sp);
2797 
2798 fail2:
2799         DTRACE_PROBE(fail2);
2800 
2801         mutex_enter(&(smp->sm_lock));
2802 
2803         /* Tear down the receive module */
2804         efx_rx_fini(sp->s_enp);
2805 
2806 fail1:
2807         DTRACE_PROBE1(fail1, int, rc);
2808 
2809         mutex_exit(&(smp->sm_lock));
2810 
2811         return (rc);
2812 }
2813 
2814 void
2815 sfxge_rx_coalesce_mode_get(sfxge_t *sp, sfxge_rx_coalesce_mode_t *modep)
2816 {
2817         *modep = sp->s_rx_coalesce_mode;
2818 }
2819 
2820 int
2821 sfxge_rx_coalesce_mode_set(sfxge_t *sp, sfxge_rx_coalesce_mode_t mode)
2822 {
2823         int rc;
2824 
2825         switch (mode) {
2826         case SFXGE_RX_COALESCE_OFF:
2827         case SFXGE_RX_COALESCE_DISALLOW_PUSH:
2828         case SFXGE_RX_COALESCE_ALLOW_PUSH:
2829                 break;
2830 
2831         default:
2832                 rc = EINVAL;
2833                 goto fail1;
2834         }
2835 
2836         sp->s_rx_coalesce_mode = mode;
2837 
2838         return (0);
2839 
2840 fail1:
2841         DTRACE_PROBE1(fail1, int, rc);
2842 
2843         return (rc);
2844 }
2845 
2846 void
2847 sfxge_rx_loopback(sfxge_t *sp, unsigned int *countp)
2848 {
2849         sfxge_intr_t *sip = &(sp->s_intr);
2850         int index;
2851 
2852         *countp = 0;
2853         for (index = 0; index < sip->si_nalloc; index++)
2854                 *countp += sfxge_rx_qloopback(sp, index);
2855 }
2856 
2857 int
2858 sfxge_rx_ioctl(sfxge_t *sp, sfxge_rx_ioc_t *srip)
2859 {
2860         int rc;
2861 
2862         switch (srip->sri_op) {
2863         case SFXGE_RX_OP_LOOPBACK: {
2864                 unsigned int count;
2865 
2866                 sfxge_rx_loopback(sp, &count);
2867 
2868                 srip->sri_data = count;
2869 
2870                 break;
2871         }
2872         default:
2873                 rc = ENOTSUP;
2874                 goto fail1;
2875         }
2876 
2877         return (0);
2878 
2879 fail1:
2880         DTRACE_PROBE1(fail1, int, rc);
2881 
2882         return (rc);
2883 }
2884 
2885 void
2886 sfxge_rx_stop(sfxge_t *sp)
2887 {
2888         sfxge_mac_t *smp = &(sp->s_mac);
2889         sfxge_intr_t *sip = &(sp->s_intr);
2890         efx_nic_t *enp = sp->s_enp;
2892         int index;
2893 
2894         /* Stop the receive queue(s) */
2895         index = sip->si_nalloc;
2896         while (--index >= 0)
2897                 /* TBD: Flush RXQs in parallel; HW has limit + may need retry */
2898                 sfxge_rx_qstop(sp, index);
2899 
2901         sfxge_rx_scale_stop(sp);
2902 
2903         mutex_enter(&(smp->sm_lock));
2904 
2905         /* Tear down the receive module */
2906         efx_rx_fini(enp);
2907 
2908         sp->s_rx_buffer_align = 0;
2909         sp->s_rx_prefix_size = 0;
2910         sp->s_rx_buffer_size = 0;
2911 
2912         mutex_exit(&(smp->sm_lock));
2913 }
2914 
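     /*
      * Return the number of packet buffers still loaned to the stack,
      * after swizzling each RXQ's deferred free list back into its pool.
      */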
2915 unsigned int
2916 sfxge_rx_loaned(sfxge_t *sp)
2917 {
2918         sfxge_intr_t *sip = &(sp->s_intr);
2919         int index;
2920         unsigned int loaned;
2921 
2922         ASSERT3U(sip->si_state, ==, SFXGE_INTR_INITIALIZED);
2923 
2924         loaned = 0;
2925         for (index = 0; index < sip->si_nalloc; index++) {
2926                 sfxge_rxq_t *srp = sp->s_srp[index];
2927                 sfxge_evq_t *sep = sp->s_sep[srp->sr_index];
2928 
2929                 mutex_enter(&(sep->se_lock));
2930 
2931                 loaned += sfxge_rx_qfpp_swizzle(srp);
2932 
2933                 mutex_exit(&(sep->se_lock));
2934         }
2935 
2936         return (loaned);
2937 }
2938 
2939 void
2940 sfxge_rx_fini(sfxge_t *sp)
2941 {
2942         sfxge_intr_t *sip = &(sp->s_intr);
2944         int index;
2945 
2946         ASSERT3U(sip->si_state, ==, SFXGE_INTR_INITIALIZED);
2947 
2948         sp->s_rx_coalesce_mode = SFXGE_RX_COALESCE_OFF;
2949 
2950         /* Tear down the receive queue(s) */
2951         index = sip->si_nalloc;
2952         while (--index >= 0)
2953                 sfxge_rx_qfini(sp, index);
2954 
2955         ASSERT3U(sp->s_rx_pkt_mem_alloc, ==, 0);
2956 
2957         kmem_cache_destroy(sp->s_rqc);
2958         sp->s_rqc = NULL;
2959 
2960         kmem_cache_destroy(sp->s_rpc);
2961         sp->s_rpc = NULL;
2962 
2964         sfxge_rx_scale_fini(sp);
2965 }