/*
 * Copyright (c) 2008-2015 Solarflare Communications Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are
 * those of the authors and should not be interpreted as representing official
 * policies, either expressed or implied, of the FreeBSD Project.
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/atomic.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/strft.h>
#include <sys/ksynch.h>
#include <sys/ethernet.h>
#include <sys/crc32.h>
#include <sys/pattr.h>
#include <sys/cpu.h>

#include <inet/ip.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

#include "sfxge.h"

#include "efx.h"

/* RXQ flush response timeout (in microseconds) */
#define SFXGE_RX_QFLUSH_USEC    (2000000)

/* Number of RXQ flush attempts in the case of failure */
#define SFXGE_RX_QFLUSH_TRIES   (5)

/* RXQ default packet buffer preallocation (number of packet buffers) */
#define SFXGE_RX_QPREALLOC      (0)
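
/*
 * SFXGE_RX_QPREALLOC above is only a default: sfxge_rx_qctor() reads the
 * "rx_prealloc_pkt_buffers" property with ddi_prop_get_int(), so the
 * preallocation count can be tuned per driver instance without a rebuild.
 * An illustrative (hypothetical) sfxge.conf fragment:
 *
 *      rx_prealloc_pkt_buffers=256;
 */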

/* Receive packet DMA attributes */
static ddi_device_acc_attr_t sfxge_rx_packet_devacc = {

        DDI_DEVICE_ATTR_V0,     /* devacc_attr_version */
        DDI_NEVERSWAP_ACC,      /* devacc_attr_endian_flags */
        DDI_STRICTORDER_ACC     /* devacc_attr_dataorder */
};

static ddi_dma_attr_t sfxge_rx_packet_dma_attr = {
        DMA_ATTR_V0,            /* dma_attr_version     */
        0,                      /* dma_attr_addr_lo     */
        0xffffffffffffffffull,  /* dma_attr_addr_hi     */
        0xffffffffffffffffull,  /* dma_attr_count_max   */
        SFXGE_CPU_CACHE_SIZE,   /* dma_attr_align       */
        0xffffffff,             /* dma_attr_burstsizes  */
        1,                      /* dma_attr_minxfer     */
        0xffffffffffffffffull,  /* dma_attr_maxxfer     */
        0xffffffffffffffffull,  /* dma_attr_seg         */
        1,                      /* dma_attr_sgllen      */
        1,                      /* dma_attr_granular    */
        0                       /* dma_attr_flags       */
};

/* Receive queue DMA attributes */
static ddi_device_acc_attr_t sfxge_rxq_devacc = {

        DDI_DEVICE_ATTR_V0,     /* devacc_attr_version */
        DDI_NEVERSWAP_ACC,      /* devacc_attr_endian_flags */
        DDI_STRICTORDER_ACC     /* devacc_attr_dataorder */
};

static ddi_dma_attr_t sfxge_rxq_dma_attr = {
        DMA_ATTR_V0,            /* dma_attr_version     */
        0,                      /* dma_attr_addr_lo     */
        0xffffffffffffffffull,  /* dma_attr_addr_hi     */
        0xffffffffffffffffull,  /* dma_attr_count_max   */
        EFX_BUF_SIZE,           /* dma_attr_align       */
        0xffffffff,             /* dma_attr_burstsizes  */
        1,                      /* dma_attr_minxfer     */
        0xffffffffffffffffull,  /* dma_attr_maxxfer     */
        0xffffffffffffffffull,  /* dma_attr_seg         */
        1,                      /* dma_attr_sgllen      */
        1,                      /* dma_attr_granular    */
        0                       /* dma_attr_flags       */
};
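
/*
 * A note on the attributes above: dma_attr_sgllen is 1, so every bind must
 * yield a single DMA cookie (sfxge_rx_qpacket_create() asserts
 * ncookies == 1).  Packet buffers are cache-line aligned
 * (SFXGE_CPU_CACHE_SIZE), while the descriptor ring is aligned to
 * EFX_BUF_SIZE as required by the hardware buffer table.
 */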

/* Forward declaration */
static void sfxge_rx_qpreallocate(sfxge_rxq_t *srp, int nprealloc);

static int
sfxge_rx_packet_ctor(void *buf, void *arg, int kmflags)
{
        sfxge_rx_packet_t *srpp = buf;
        sfxge_t *sp = arg;
        dev_info_t *dip = sp->s_dip;
        int err;

        ASSERT3U(sizeof (srpp->__srp_u1.__srp_s1), <=,
            sizeof (srpp->__srp_u1.__srp_pad));
        ASSERT3U(sizeof (srpp->__srp_u2.__srp_s2), <=,
            sizeof (srpp->__srp_u2.__srp_pad));

        bzero(buf, sizeof (sfxge_rx_packet_t));

        /* Allocate a DMA handle */
        err = ddi_dma_alloc_handle(dip, &sfxge_rx_packet_dma_attr,
            (kmflags == KM_SLEEP) ? DDI_DMA_SLEEP : DDI_DMA_DONTWAIT,
            NULL, &(srpp->srp_dma_handle));
        if (err != DDI_SUCCESS)
                goto fail1;

        return (0);

fail1:
        DTRACE_PROBE1(fail1, int, err);

        SFXGE_OBJ_CHECK(srpp, sfxge_rx_packet_t);

        return (-1);
}

static void
sfxge_rx_packet_dtor(void *buf, void *arg)
{
        sfxge_rx_packet_t *srpp = buf;

        _NOTE(ARGUNUSED(arg))

        /* Free the DMA handle */
        ddi_dma_free_handle(&(srpp->srp_dma_handle));
        srpp->srp_dma_handle = NULL;

        SFXGE_OBJ_CHECK(srpp, sfxge_rx_packet_t);
}

static int
sfxge_rx_qctor(void *buf, void *arg, int kmflags)
{
        sfxge_rxq_t *srp = buf;
        efsys_mem_t *esmp = &(srp->sr_mem);
        sfxge_t *sp = arg;
        sfxge_dma_buffer_attr_t dma_attr;
        sfxge_rx_fpp_t *srfppp;
        int nprealloc;
        unsigned int id;
        int rc;

        /* Compile-time structure layout checks */
        EFX_STATIC_ASSERT(sizeof (srp->__sr_u1.__sr_s1) <=
            sizeof (srp->__sr_u1.__sr_pad));
        EFX_STATIC_ASSERT(sizeof (srp->__sr_u2.__sr_s2) <=
            sizeof (srp->__sr_u2.__sr_pad));
        EFX_STATIC_ASSERT(sizeof (srp->__sr_u3.__sr_s3) <=
            sizeof (srp->__sr_u3.__sr_pad));

        bzero(buf, sizeof (sfxge_rxq_t));

        srp->sr_sp = sp;

        dma_attr.sdba_dip        = sp->s_dip;
        dma_attr.sdba_dattrp     = &sfxge_rxq_dma_attr;
        dma_attr.sdba_callback   = DDI_DMA_SLEEP;
        dma_attr.sdba_length     = EFX_RXQ_SIZE(sp->s_rxq_size);
        dma_attr.sdba_memflags   = DDI_DMA_CONSISTENT;
        dma_attr.sdba_devaccp    = &sfxge_rxq_devacc;
        dma_attr.sdba_bindflags  = DDI_DMA_READ | DDI_DMA_CONSISTENT;
        dma_attr.sdba_maxcookies = 1;
        dma_attr.sdba_zeroinit   = B_FALSE;

        if ((rc = sfxge_dma_buffer_create(esmp, &dma_attr)) != 0)
                goto fail1;

        /* Allocate some buffer table entries */
        if ((rc = sfxge_sram_buf_tbl_alloc(sp, EFX_RXQ_NBUFS(sp->s_rxq_size),
            &(srp->sr_id))) != 0)
                goto fail2;

        /* Allocate the context array */
        if ((srp->sr_srpp = kmem_zalloc(sizeof (sfxge_rx_packet_t *) *
            sp->s_rxq_size, kmflags)) == NULL) {
                rc = ENOMEM;
                goto fail3;
        }

        /* Allocate the flow table */
        if ((srp->sr_flow = kmem_zalloc(sizeof (sfxge_rx_flow_t) *
            SFXGE_MAX_FLOW, kmflags)) == NULL) {
                rc = ENOMEM;
                goto fail4;
        }

        srp->sr_srfpp = &(srp->sr_srfp);
        srp->sr_rto = drv_usectohz(200000);

        srp->sr_mpp = &(srp->sr_mp);

        /* Initialize the free packet pool */
        srfppp = &(srp->sr_fpp);
        if ((srfppp->srfpp_putp = kmem_zalloc(SFXGE_CPU_CACHE_SIZE *
            SFXGE_RX_FPP_NSLOTS, kmflags)) == NULL) {
                rc = ENOMEM;
                goto fail5;
        }
        for (id = 0; id < SFXGE_RX_FPP_NSLOTS; id++) {
                sfxge_rx_fpp_putlist_t *putp;
                size_t off;

                off = id * SFXGE_CPU_CACHE_SIZE;
                putp = (void *)(srfppp->srfpp_putp + off);

                putp->srfpl_putp = NULL;
                putp->srfpl_putpp = &(putp->srfpl_putp);
                mutex_init(&(putp->srfpl_lock), NULL, MUTEX_DRIVER,
                    DDI_INTR_PRI(sp->s_intr.si_intr_pri));
        }

        cv_init(&(srp->sr_flush_kv), NULL, CV_DRIVER, NULL);

        /* Preallocate some packets on the free packet pool */
        nprealloc = ddi_prop_get_int(DDI_DEV_T_ANY, sp->s_dip,
            DDI_PROP_DONTPASS, "rx_prealloc_pkt_buffers", SFXGE_RX_QPREALLOC);
        sfxge_rx_qpreallocate(srp, nprealloc);

        return (0);

fail5:
        DTRACE_PROBE(fail5);

        srp->sr_mpp = NULL;

        srp->sr_rto = 0;
        srp->sr_srfpp = NULL;

        /* Free the flow table */
        kmem_free(srp->sr_flow, sizeof (sfxge_rx_flow_t) *
            SFXGE_MAX_FLOW);
        srp->sr_flow = NULL;

fail4:
        DTRACE_PROBE(fail4);

        /* Free the context array */
        kmem_free(srp->sr_srpp, sizeof (sfxge_rx_packet_t *) *
            sp->s_rxq_size);
        srp->sr_srpp = NULL;

fail3:
        DTRACE_PROBE(fail3);

        /* Free the buffer table entries */
        sfxge_sram_buf_tbl_free(sp, srp->sr_id,
            EFX_RXQ_NBUFS(sp->s_rxq_size));
        srp->sr_id = 0;

fail2:
        DTRACE_PROBE(fail2);
        /* Remove dma setup */
        sfxge_dma_buffer_destroy(esmp);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        srp->sr_sp = NULL;

        SFXGE_OBJ_CHECK(srp, sfxge_rxq_t);

        return (-1);
}

static void
sfxge_rx_qdtor(void *buf, void *arg)
{
        sfxge_rxq_t *srp = buf;
        efsys_mem_t *esmp = &(srp->sr_mem);
        sfxge_t *sp = srp->sr_sp;
        sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
        unsigned int id;

        _NOTE(ARGUNUSED(arg))

        cv_destroy(&(srp->sr_flush_kv));

        /* Tear down the free packet pool */
        for (id = 0; id < SFXGE_RX_FPP_NSLOTS; id++) {
                sfxge_rx_fpp_putlist_t *putp;
                size_t off;

                off = id * SFXGE_CPU_CACHE_SIZE;
                putp = (void *)(srfppp->srfpp_putp + off);

                putp->srfpl_putpp = NULL;
                mutex_destroy(&(putp->srfpl_lock));

                SFXGE_OBJ_CHECK(putp, sfxge_rx_fpp_putlist_t);
        }
        kmem_free(srfppp->srfpp_putp, SFXGE_CPU_CACHE_SIZE *
            SFXGE_RX_FPP_NSLOTS);
        srfppp->srfpp_putp = NULL;

        srp->sr_mpp = NULL;

        srp->sr_rto = 0;
        srp->sr_srfpp = NULL;

        /* Free the flow table */
        kmem_free(srp->sr_flow, sizeof (sfxge_rx_flow_t) *
            SFXGE_MAX_FLOW);
        srp->sr_flow = NULL;

        /* Free the context array */
        kmem_free(srp->sr_srpp, sizeof (sfxge_rx_packet_t *) *
            sp->s_rxq_size);
        srp->sr_srpp = NULL;

        /* Free the buffer table entries */
        sfxge_sram_buf_tbl_free(sp, srp->sr_id,
            EFX_RXQ_NBUFS(sp->s_rxq_size));
        srp->sr_id = 0;

        /* Tear down dma setup */
        sfxge_dma_buffer_destroy(esmp);

        SFXGE_OBJ_CHECK(srp, sfxge_rxq_t);
}

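/*
 * Free packet pool (FPP) overview
 *
 * Recycled packets are returned on one of SFXGE_RX_FPP_NSLOTS "put" lists,
 * each padded to SFXGE_CPU_CACHE_SIZE so that free callbacks running on
 * different CPUs do not false-share a cache line.  A producer picks its
 * slot from the current CPU, as in the function below:
 *
 *      id  = CPU->cpu_seqid & SFXGE_RX_FPP_MASK;
 *      off = id * SFXGE_CPU_CACHE_SIZE;
 *
 * The consumer periodically splices all of the put lists onto a single
 * "get" list; see sfxge_rx_qfpp_swizzle() further down.
 */
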
/* Note: This function takes ownership of *srpp. */
static inline void
sfxge_rx_qfpp_put(sfxge_rxq_t *srp, sfxge_rx_packet_t *srpp)
{
        sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
        mblk_t *mp = srpp->srp_mp;
        unsigned int id;
        size_t off;
        sfxge_rx_fpp_putlist_t *putp;

        ASSERT3P(mp->b_next, ==, NULL);
        ASSERT3P(mp->b_prev, ==, NULL);

        id = CPU->cpu_seqid & SFXGE_RX_FPP_MASK;
        off = id * SFXGE_CPU_CACHE_SIZE;

        ASSERT3P(srpp->srp_putp, ==, srfppp->srfpp_putp);
        putp = (void *)(srpp->srp_putp + off);

        mutex_enter(&(putp->srfpl_lock));
        putp->srfpl_count++;
        *putp->srfpl_putpp = mp;
        putp->srfpl_putpp = &(mp->b_next);
        mutex_exit(&(putp->srfpl_lock));
}

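/*
 * "Swizzling" detaches each put list under its own lock and splices it
 * onto the head of the get list, which is private to the event queue
 * thread (se_lock is asserted held).  srfpp_loaned counts packets still
 * out with the stack and srfpp_count packets available on the get list;
 * the two counters are rebalanced here.  The current CPU's slot is
 * deliberately visited last, on the theory that it is the one most
 * likely to receive further frees while the scan is in progress.
 */
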
static unsigned int
sfxge_rx_qfpp_swizzle(sfxge_rxq_t *srp)
{
        sfxge_t *sp = srp->sr_sp;
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];
        sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
        unsigned int start;
        unsigned int id;
        mblk_t *p;
        mblk_t **pp;
        unsigned int count;
        unsigned int loaned;

        ASSERT(mutex_owned(&(sep->se_lock)));

        /* We want to access the put list for the current CPU last */
        id = start = (CPU->cpu_seqid + 1) & SFXGE_RX_FPP_MASK;

        do {
                sfxge_rx_fpp_putlist_t *putp;
                size_t off;

                off = id * SFXGE_CPU_CACHE_SIZE;
                id  = (id + 1) & SFXGE_RX_FPP_MASK;

                putp = (void *)(srfppp->srfpp_putp + off);

                /* Acquire the put list */
                mutex_enter(&(putp->srfpl_lock));

                p = putp->srfpl_putp;
                pp = putp->srfpl_putpp;
                count = putp->srfpl_count;

                putp->srfpl_putp = NULL;
                putp->srfpl_putpp = &(putp->srfpl_putp);
                putp->srfpl_count = 0;

                mutex_exit(&(putp->srfpl_lock));

                if (p == NULL)
                        continue;

                /* Add the list to the head of the get list */
                *pp = srfppp->srfpp_get;
                srfppp->srfpp_get = p;

                /* Adjust the counters */
                ASSERT3U(srfppp->srfpp_loaned, >=, count);
                srfppp->srfpp_loaned -= count;
                srfppp->srfpp_count += count;

#if 0
                /* NOTE: this probe is disabled because it is expensive!! */
                DTRACE_PROBE2(count,
                    unsigned int, (id - 1) & SFXGE_RX_FPP_MASK,
                    unsigned int, count);
#endif

        } while (id != start);

        /* Return the number of packets yet to appear in the put list */
        loaned = srfppp->srfpp_loaned;

        return (loaned);
}

#define DB_FRTNP(mp)    ((mp)->b_datap->db_frtnp)

static void
sfxge_rx_qfpp_empty(sfxge_rxq_t *srp)
{
        sfxge_t *sp = srp->sr_sp;
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];
        sfxge_rx_fpp_t *srfppp;
        mblk_t *mp;

        mutex_enter(&(sep->se_lock));
        srfppp = &(srp->sr_fpp);

        /* Swizzle put list to get list */
        (void) sfxge_rx_qfpp_swizzle(srp);
        ASSERT3U(srfppp->srfpp_loaned, ==, 0);

        mp = srfppp->srfpp_get;
        srfppp->srfpp_get = NULL;

        /* Free the remainder */
        while (mp != NULL) {
                mblk_t *next;
                frtn_t *freep;
                sfxge_rx_packet_t *srpp;

                next = mp->b_next;
                mp->b_next = NULL;

                ASSERT3U(srfppp->srfpp_count, >, 0);
                srfppp->srfpp_count--;

                freep = DB_FRTNP(mp);
                /*
                 * ASSERT3P(freep->free_func, ==, sfxge_rx_qpacket_free);
                 *   is implied by srpp test below
                 */
                /*LINTED*/
                srpp = (sfxge_rx_packet_t *)(freep->free_arg);
                ASSERT3P(srpp->srp_mp, ==, mp);
                ASSERT3P(mp->b_cont, ==, NULL);
                srpp->srp_recycle = B_FALSE;

                freeb(mp);

                mp = next;
        }
        ASSERT3U(srfppp->srfpp_count, ==, 0);

        srfppp->srfpp_min = 0;

        mutex_exit(&(sep->se_lock));
}

/*
 * This is an estimate of all memory consumed per RX packet.
 * It can be inaccurate, but sp->s_rx_pkt_mem_alloc mustn't drift.
 */
static uint64_t
sfxge_rx_pkt_mem_approx(const sfxge_rx_packet_t *srpp)
{
        return (srpp->srp_mblksize + sizeof (mblk_t) + sizeof (dblk_t) +
            sizeof (sfxge_rx_packet_t));
}

static void
sfxge_rx_qpacket_destroy(sfxge_rxq_t *srp, sfxge_rx_packet_t *srpp)
{
        sfxge_t *sp = srp->sr_sp;
        int64_t delta = sfxge_rx_pkt_mem_approx(srpp);

        ASSERT(!(srpp->srp_recycle));
        ASSERT3P(srpp->srp_mp, ==, NULL);

        srpp->srp_off = 0;
        srpp->srp_thp = NULL;
        srpp->srp_iphp = NULL;
        srpp->srp_etherhp = NULL;
        srpp->srp_size = 0;
        srpp->srp_flags = 0;

        bzero(&(srpp->srp_free), sizeof (frtn_t));

        srpp->srp_mblksize = 0;
        srpp->srp_base = NULL;

        /* Unbind the DMA memory from the DMA handle */
        srpp->srp_addr = 0;
        (void) ddi_dma_unbind_handle(srpp->srp_dma_handle);

        /* Free the DMA memory */
        srpp->srp_base = NULL;
        ddi_dma_mem_free(&(srpp->srp_acc_handle));
        srpp->srp_acc_handle = NULL;

        srpp->srp_putp = NULL;
        srpp->srp_srp = NULL;

        kmem_cache_free(sp->s_rpc, srpp);
        if (sp->s_rx_pkt_mem_max)
                atomic_add_64(&sp->s_rx_pkt_mem_alloc, -delta);
}

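/*
 * This is the frtn_t callback wired up by desballoc() in
 * sfxge_rx_qpacket_create().  When the stack frees a loaned mblk, the
 * callback either re-arms the packet with a fresh mblk and returns it to
 * the free packet pool, or, if srp_recycle has been cleared, tears the
 * packet down entirely.  A minimal sketch of the registration, using
 * names from this file:
 *
 *      freep->free_func = sfxge_rx_qpacket_free;
 *      freep->free_arg  = (caddr_t)srpp;
 *      mp = desballoc(srpp->srp_base, size, BPRI_HI, freep);
 */
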
static void
sfxge_rx_qpacket_free(void *arg)
{
        sfxge_rx_packet_t *srpp = arg;
        sfxge_rxq_t *srp = srpp->srp_srp;

        /*
         * WARNING: "man -s 9f esballoc" states:
         * => runs sync from the thread calling freeb()
         * => must not sleep, or access data structures that could be freed
         */

        /* Check whether we want to recycle the receive packets */
        if (srpp->srp_recycle) {
                frtn_t *freep;
                mblk_t *mp;
                size_t size;

                freep = &(srpp->srp_free);
                ASSERT3P(freep->free_func, ==, sfxge_rx_qpacket_free);
                ASSERT3P(freep->free_arg, ==, (caddr_t)srpp);

                /*
                 * Allocate a matching mblk_t before the current one is
                 * freed.
                 */
                size = srpp->srp_mblksize;

                if ((mp = desballoc(srpp->srp_base, size, BPRI_HI,
                    freep)) != NULL) {
                        srpp->srp_mp = mp;

                        /* NORMAL recycled case */
                        sfxge_rx_qfpp_put(srp, srpp);
                        return;
                }
        }

        srpp->srp_mp = NULL;

        sfxge_rx_qpacket_destroy(srp, srpp);
}

static sfxge_rx_packet_t *
sfxge_rx_qpacket_create(sfxge_rxq_t *srp)
{
        sfxge_t *sp = srp->sr_sp;
        sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
        sfxge_rx_packet_t *srpp;
        size_t size;
        caddr_t base;
        size_t unit;
        ddi_dma_cookie_t dmac;
        unsigned int ncookies;
        frtn_t *freep;
        mblk_t *mp;
        int err;
        int rc;

        size = sp->s_rx_buffer_size;

        if (sp->s_rx_pkt_mem_max &&
            (sp->s_rx_pkt_mem_alloc + size >= sp->s_rx_pkt_mem_max)) {
                DTRACE_PROBE(rx_pkt_mem_max);
                srp->sr_kstat.srk_rx_pkt_mem_limit++;
                return (NULL);
        }

        /* Allocate a new packet */
        if ((srpp = kmem_cache_alloc(sp->s_rpc, KM_NOSLEEP)) == NULL) {
                srp->sr_kstat.srk_kcache_alloc_nomem++;
                rc = ENOMEM;
                goto fail1;
        }

        srpp->srp_srp = srp;
        srpp->srp_putp = srfppp->srfpp_putp;

        /* Allocate some DMA memory */
        err = ddi_dma_mem_alloc(srpp->srp_dma_handle, size,
            &sfxge_rx_packet_devacc, DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
            NULL, &base, &unit, &(srpp->srp_acc_handle));
        switch (err) {
        case DDI_SUCCESS:
                break;

        case DDI_FAILURE:
                srp->sr_kstat.srk_dma_alloc_nomem++;
                rc = ENOMEM;
                goto fail2;

        default:
                srp->sr_kstat.srk_dma_alloc_fail++;
                rc = EFAULT;
                goto fail2;
        }

        /* Adjust the buffer to align the start of the DMA area correctly */
        base += sp->s_rx_buffer_align;
        size -= sp->s_rx_buffer_align;

        /* Bind the DMA memory to the DMA handle */
        err = ddi_dma_addr_bind_handle(srpp->srp_dma_handle, NULL,
            base, size, DDI_DMA_READ | DDI_DMA_STREAMING,
            DDI_DMA_DONTWAIT, NULL, &dmac, &ncookies);
        switch (err) {
        case DDI_DMA_MAPPED:
                break;

        case DDI_DMA_INUSE:
                srp->sr_kstat.srk_dma_bind_fail++;
                rc = EEXIST;
                goto fail3;

        case DDI_DMA_NORESOURCES:
                srp->sr_kstat.srk_dma_bind_nomem++;
                rc = ENOMEM;
                goto fail3;

        case DDI_DMA_NOMAPPING:
                srp->sr_kstat.srk_dma_bind_fail++;
                rc = ENOTSUP;
                goto fail3;

        case DDI_DMA_TOOBIG:
                srp->sr_kstat.srk_dma_bind_fail++;
                rc = EFBIG;
                goto fail3;

        default:
                srp->sr_kstat.srk_dma_bind_fail++;
                rc = EFAULT;
                goto fail3;
        }
        ASSERT3U(ncookies, ==, 1);

        srpp->srp_addr = dmac.dmac_laddress;

        srpp->srp_base = (unsigned char *)base;
        srpp->srp_mblksize = size;

        /*
         * Allocate a STREAMS block: We use size 1 so that the allocator will
         * use the first (and smallest) dblk cache.
         */
        freep = &(srpp->srp_free);
        freep->free_func = sfxge_rx_qpacket_free;
        freep->free_arg  = (caddr_t)srpp;

        if ((mp = desballoc(srpp->srp_base, size, BPRI_HI, freep)) == NULL) {
                srp->sr_kstat.srk_desballoc_fail++;
                rc = ENOMEM;
                goto fail4;
        }

        srpp->srp_mp = mp;
        srpp->srp_recycle = B_TRUE;

        if (sp->s_rx_pkt_mem_max) {
                int64_t delta = sfxge_rx_pkt_mem_approx(srpp);
                atomic_add_64(&sp->s_rx_pkt_mem_alloc, delta);
        }

        return (srpp);

fail4:
        DTRACE_PROBE(fail4);

        bzero(&(srpp->srp_free), sizeof (frtn_t));

        srpp->srp_mblksize = 0;
        srpp->srp_base = NULL;

        /* Unbind the DMA memory from the DMA handle */
        srpp->srp_addr = 0;
        (void) ddi_dma_unbind_handle(srpp->srp_dma_handle);

fail3:
        DTRACE_PROBE(fail3);

        /* Free the DMA memory */
        ddi_dma_mem_free(&(srpp->srp_acc_handle));
        srpp->srp_acc_handle = NULL;

fail2:
        DTRACE_PROBE(fail2);

        srpp->srp_putp = NULL;
        srpp->srp_srp = NULL;

        kmem_cache_free(sp->s_rpc, srpp);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        return (NULL);
}

#define SFXGE_REFILL_BATCH  64

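/*
 * Descriptor posting below is batched: buffer addresses are staged in a
 * local array and handed to efx_rx_qpost() up to SFXGE_REFILL_BATCH at a
 * time, with a single efx_rx_qpush() doorbell write once the loop
 * finishes, amortising the register write over many descriptors.
 */
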
/* Try to refill the RX descriptor ring from the associated free pkt pool */
static void
sfxge_rx_qrefill(sfxge_rxq_t *srp, unsigned int target)
{
        sfxge_t *sp = srp->sr_sp;
        sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];
        efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
        mblk_t *mp;
        int ntodo;
        unsigned int count;
        unsigned int batch;
        unsigned int rxfill;
        unsigned int mblksize;

        prefetch_read_many(sp->s_enp);
        prefetch_read_many(srp->sr_erp);

        ASSERT(mutex_owned(&(sep->se_lock)));

        if (srp->sr_state != SFXGE_RXQ_STARTED)
                return;

        rxfill = srp->sr_added - srp->sr_completed;
        ASSERT3U(rxfill, <=, EFX_RXQ_LIMIT(sp->s_rxq_size));
        ntodo = min(EFX_RXQ_LIMIT(sp->s_rxq_size) - rxfill, target);
        ASSERT3U(ntodo, <=, EFX_RXQ_LIMIT(sp->s_rxq_size));

        if (ntodo == 0)
                goto out;

        (void) sfxge_rx_qfpp_swizzle(srp);

        mp = srfppp->srfpp_get;
        count = srfppp->srfpp_count;
        mblksize = sp->s_rx_buffer_size - sp->s_rx_buffer_align;

        batch = 0;
        while (ntodo-- > 0) {
                mblk_t *next;
                frtn_t *freep;
                sfxge_rx_packet_t *srpp;
                unsigned int id;

                if (mp == NULL)
                        break;

                next = mp->b_next;
                mp->b_next = NULL;

                if (next != NULL)
                        prefetch_read_many(next);

                freep = DB_FRTNP(mp);
                /*LINTED*/
                srpp = (sfxge_rx_packet_t *)(freep->free_arg);
                ASSERT3P(srpp->srp_mp, ==, mp);

                /* The MTU may have changed since the packet was allocated */
                if (MBLKSIZE(mp) != mblksize) {
                        srpp->srp_recycle = B_FALSE;

                        freeb(mp);

                        --count;
                        mp = next;
                        continue;
                }

                srpp->srp_off = 0;
                srpp->srp_thp = NULL;
                srpp->srp_iphp = NULL;
                srpp->srp_etherhp = NULL;
                srpp->srp_size = 0;
                srpp->srp_flags = EFX_DISCARD;

                id = (srp->sr_added + batch) & (sp->s_rxq_size - 1);
                ASSERT(srp->sr_srpp[id] == NULL);
                srp->sr_srpp[id] = srpp;

                addr[batch++] = srpp->srp_addr;
                if (batch == SFXGE_REFILL_BATCH) {
                        efx_rx_qpost(srp->sr_erp, addr, mblksize, batch,
                            srp->sr_completed, srp->sr_added);
                        srp->sr_added += batch;
                        batch = 0;
                }

                --count;
                mp = next;
        }

        srfppp->srfpp_get = mp;
        srfppp->srfpp_count = count;

        if (batch != 0) {
                efx_rx_qpost(srp->sr_erp, addr, mblksize, batch,
                    srp->sr_completed, srp->sr_added);
                srp->sr_added += batch;
        }

        efx_rx_qpush(srp->sr_erp, srp->sr_added, &srp->sr_pushed);

out:
        if (srfppp->srfpp_count < srfppp->srfpp_min)
                srfppp->srfpp_min = srfppp->srfpp_count;
}

/* Preallocate packets and put them in the free packet pool */
static void
sfxge_rx_qpreallocate(sfxge_rxq_t *srp, int nprealloc)
{
        sfxge_rx_fpp_t *srfppp = &((srp)->sr_fpp);
        srfppp->srfpp_lowat = nprealloc;
        while (nprealloc-- > 0) {
                sfxge_rx_packet_t *srpp;

                if ((srpp = sfxge_rx_qpacket_create(srp)) == NULL)
                        break;
                sfxge_rx_qfpp_put(srp, srpp);
        }
}

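/*
 * sfxge_rx_qrefill() above consumes packets already in the free packet
 * pool; sfxge_rx_qfill() below creates brand new packets with
 * sfxge_rx_qpacket_create() when the pool cannot satisfy demand.  Both
 * use the same SFXGE_REFILL_BATCH posting scheme.
 */
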
/* Try to refill the RX descriptor ring by allocating new packets */
static void
sfxge_rx_qfill(sfxge_rxq_t *srp, unsigned int target)
{
        sfxge_t *sp = srp->sr_sp;
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];
        unsigned int batch;
        unsigned int rxfill;
        unsigned int mblksize;
        int ntodo;
        efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
        mblk_t *mp = NULL;

        prefetch_read_many(sp->s_enp);
        prefetch_read_many(srp->sr_erp);

        ASSERT(mutex_owned(&(sep->se_lock)));

        if (srp->sr_state != SFXGE_RXQ_STARTED)
                return;

        rxfill = srp->sr_added - srp->sr_completed;
        ASSERT3U(rxfill, <=, EFX_RXQ_LIMIT(sp->s_rxq_size));
        ntodo = min(EFX_RXQ_LIMIT(sp->s_rxq_size) - rxfill, target);
        ASSERT3U(ntodo, <=, EFX_RXQ_LIMIT(sp->s_rxq_size));

        if (ntodo == 0)
                return;

        mblksize = sp->s_rx_buffer_size - sp->s_rx_buffer_align;

        batch = 0;
        while (ntodo-- > 0) {
                sfxge_rx_packet_t *srpp;
                unsigned int id;

                if ((srpp = sfxge_rx_qpacket_create(srp)) == NULL)
                        break;

                mp = srpp->srp_mp;

                ASSERT3U(MBLKSIZE(mp), ==, mblksize);

                ASSERT3U(srpp->srp_off, ==, 0);
                ASSERT3P(srpp->srp_thp, ==, NULL);
                ASSERT3P(srpp->srp_iphp, ==, NULL);
                ASSERT3P(srpp->srp_etherhp, ==, NULL);
                ASSERT3U(srpp->srp_size, ==, 0);

                srpp->srp_flags = EFX_DISCARD;

                id = (srp->sr_added + batch) & (sp->s_rxq_size - 1);
                ASSERT(srp->sr_srpp[id] == NULL);
                srp->sr_srpp[id] = srpp;

                addr[batch++] = srpp->srp_addr;
                if (batch == SFXGE_REFILL_BATCH) {
                        efx_rx_qpost(srp->sr_erp, addr, mblksize, batch,
                            srp->sr_completed, srp->sr_added);
                        srp->sr_added += batch;
                        batch = 0;
                }
        }

        if (batch != 0) {
                efx_rx_qpost(srp->sr_erp, addr, mblksize, batch,
                    srp->sr_completed, srp->sr_added);
                srp->sr_added += batch;
        }

        efx_rx_qpush(srp->sr_erp, srp->sr_added, &srp->sr_pushed);
}

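/*
 * srfpp_min tracks the lowest level the get list has reached since the
 * last trim, i.e. the surplus that went unused over the interval.  The
 * trim below frees that surplus, but never dips under srfpp_lowat (the
 * preallocation floor), so an idle queue gradually returns memory to the
 * system while a busy one keeps its working set.
 */
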
void
sfxge_rx_qfpp_trim(sfxge_rxq_t *srp)
{
        sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
        sfxge_t *sp = srp->sr_sp;
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];
        mblk_t *p;
        mblk_t **pp;
        int count;

        ASSERT(mutex_owned(&(sep->se_lock)));

        if (srp->sr_state != SFXGE_RXQ_STARTED)
                goto done;

        /* Make sure the queue is full */
        sfxge_rx_qrefill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));

        /* The refill may have emptied the pool */
        if (srfppp->srfpp_min == 0)
                goto done;

        /* Don't trim below the pool's low water mark */
        if (srfppp->srfpp_count <= srfppp->srfpp_lowat)
                goto done;

        ASSERT(srfppp->srfpp_min <= srfppp->srfpp_count);

        /* Trim to the largest of srfppp->srfpp_min and srfppp->srfpp_lowat */
        if (srfppp->srfpp_lowat > srfppp->srfpp_min)
                count = srfppp->srfpp_count - srfppp->srfpp_lowat;
        else
                count = srfppp->srfpp_count - srfppp->srfpp_min;

        /* Walk the get list */
        pp = &(srfppp->srfpp_get);
        while (--count >= 0) {
                ASSERT(pp);
                p = *pp;
                ASSERT(p != NULL);

                pp = &(p->b_next);
        }
        ASSERT(pp);
        p = *pp;

        /* Truncate the get list */
        *pp = NULL;

        /* Free the remainder */
        while (p != NULL) {
                mblk_t *next;
                frtn_t *freep;
                sfxge_rx_packet_t *srpp;

                next = p->b_next;
                p->b_next = NULL;

                ASSERT3U(srfppp->srfpp_min, >, 0);
                srfppp->srfpp_min--;
                srfppp->srfpp_count--;

                freep = DB_FRTNP(p);
                /*LINTED*/
                srpp = (sfxge_rx_packet_t *)(freep->free_arg);
                ASSERT3P(srpp->srp_mp, ==, p);

                srpp->srp_recycle = B_FALSE;

                freeb(p);

                p = next;
        }

done:
        srfppp->srfpp_min = srfppp->srfpp_count;
}

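/*
 * The poll timer does not trim the pool directly.  Instead it posts a
 * software event tagged SFXGE_MAGIC_RX_QFPP_TRIM to the paired event
 * queue, so the trim itself always runs in event queue context under
 * se_lock, and then re-arms itself.
 */
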
static void
sfxge_rx_qpoll(void *arg)
{
        sfxge_rxq_t *srp = arg;
        sfxge_t *sp = srp->sr_sp;
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];
        uint16_t magic;

        /*
         * man timeout(9f) states that this code should adhere to the
         * same requirements as a softirq handler - DO NOT BLOCK
         */

        /*
         * Post an event to the event queue to cause the free packet pool to be
         * trimmed if it is oversize.
         */
        magic = SFXGE_MAGIC_RX_QFPP_TRIM | index;

#if defined(DEBUG)
        /* This is guaranteed due to the start/stop order of rx and ev */
        ASSERT3U(sep->se_state, ==, SFXGE_EVQ_STARTED);
        ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_STARTED);
#else
        /*
         * Bug22691 WORKAROUND:
         * This handler has been observed in the field to be invoked for a
         * queue in the INITIALIZED state, which should never happen.
         * Until the mechanism for this is properly understood, add defensive
         * checks.
         */
        if ((sep->se_state != SFXGE_EVQ_STARTED) ||
            (srp->sr_state != SFXGE_RXQ_STARTED) ||
            (!sep->se_eep)) {
                dev_err(sp->s_dip, CE_WARN, SFXGE_CMN_ERR
                    "RXQ[%d] bad state in sfxge_rx_qpoll %d %d %p",
                    index, sep->se_state, srp->sr_state, sep->se_eep);
                return;
        }
#endif
        efx_ev_qpost(sep->se_eep, magic);

        srp->sr_tid = timeout(sfxge_rx_qpoll, srp,
            drv_usectohz(sp->s_rxq_poll_usec));
}

static void
sfxge_rx_qpoll_start(sfxge_rxq_t *srp)
{
        sfxge_t *sp = srp->sr_sp;
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];

        ASSERT(mutex_owned(&(sep->se_lock)));
        ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_STARTED);

        /* Schedule a poll */
        ASSERT3P(srp->sr_tid, ==, 0);
        srp->sr_tid = timeout(sfxge_rx_qpoll, srp, 0);
}

static void
sfxge_rx_qpoll_stop(sfxge_rxq_t *srp)
{
        sfxge_t *sp = srp->sr_sp;
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];
        timeout_id_t tid;

        ASSERT(mutex_owned(&(sep->se_lock)));
        ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_STARTED);

        /*
         * Cancel the qpoll timer. Care is needed as this function
         * can race with sfxge_rx_qpoll() for timeout id updates.
         *
         * Do not hold locks used by any timeout(9f) handlers across
         * calls to untimeout(9f) as this will deadlock.
         */
        tid = 0;
        while ((srp->sr_tid != 0) && (srp->sr_tid != tid)) {
                tid = srp->sr_tid;
                (void) untimeout(tid);
        }
        srp->sr_tid = 0;
}

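/*
 * Note that sfxge_rx_kstat_init() below points ks_lock at the event
 * queue lock, so this update callback runs with se_lock held; hence the
 * mutex_owned() assertion.
 */
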
static int
sfxge_rx_kstat_update(kstat_t *ksp, int rw)
{
        sfxge_rxq_t *srp = ksp->ks_private;
        sfxge_t *sp = srp->sr_sp;
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];
        kstat_named_t *knp;
        int rc;

        if (rw != KSTAT_READ) {
                rc = EACCES;
                goto fail1;
        }

        ASSERT(mutex_owned(&(sep->se_lock)));
        if (srp->sr_state != SFXGE_RXQ_STARTED)
                goto done;

        knp = ksp->ks_data;
        /* NB pointer post-increment below */
        knp++->value.ui32 = srp->sr_kstat.srk_rx_pkt_mem_limit;
        knp++->value.ui32 = srp->sr_kstat.srk_kcache_alloc_nomem;
        knp++->value.ui32 = srp->sr_kstat.srk_dma_alloc_nomem;
        knp++->value.ui32 = srp->sr_kstat.srk_dma_alloc_fail;
        knp++->value.ui32 = srp->sr_kstat.srk_dma_bind_nomem;
        knp++->value.ui32 = srp->sr_kstat.srk_dma_bind_fail;
        knp++->value.ui32 = srp->sr_kstat.srk_desballoc_fail;
        knp++->value.ui32 = srp->sr_kstat.srk_rxq_empty_discard;

done:
        return (0);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        return (rc);
}

static int
sfxge_rx_kstat_init(sfxge_rxq_t *srp)
{
        sfxge_t *sp = srp->sr_sp;
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];
        dev_info_t *dip = sp->s_dip;
        char name[MAXNAMELEN];
        kstat_t *ksp;
        kstat_named_t *knp;
        int rc;

        /* Create the set */
        (void) snprintf(name, MAXNAMELEN - 1, "%s_rxq%04d",
            ddi_driver_name(dip), index);

        if ((ksp = kstat_create((char *)ddi_driver_name(dip),
            ddi_get_instance(dip), name, "rxq", KSTAT_TYPE_NAMED,
            SFXGE_RX_NSTATS, 0)) == NULL) {
                rc = ENOMEM;
                goto fail1;
        }

        srp->sr_ksp = ksp;

        ksp->ks_update = sfxge_rx_kstat_update;
        ksp->ks_private = srp;
        ksp->ks_lock = &(sep->se_lock);

        /* Initialise the named stats */
        knp = ksp->ks_data;
        kstat_named_init(knp, "rx_pkt_mem_limit", KSTAT_DATA_UINT32);
        knp++;
        kstat_named_init(knp, "kcache_alloc_nomem", KSTAT_DATA_UINT32);
        knp++;
        kstat_named_init(knp, "dma_alloc_nomem", KSTAT_DATA_UINT32);
        knp++;
        kstat_named_init(knp, "dma_alloc_fail", KSTAT_DATA_UINT32);
        knp++;
        kstat_named_init(knp, "dma_bind_nomem", KSTAT_DATA_UINT32);
        knp++;
        kstat_named_init(knp, "dma_bind_fail", KSTAT_DATA_UINT32);
        knp++;
        kstat_named_init(knp, "desballoc_fail", KSTAT_DATA_UINT32);
        knp++;
        kstat_named_init(knp, "rxq_empty_discard", KSTAT_DATA_UINT32);

        kstat_install(ksp);
        return (0);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        return (rc);
}

static int
sfxge_rx_qinit(sfxge_t *sp, unsigned int index)
{
        sfxge_rxq_t *srp;
        int rc;

        ASSERT3U(index, <, SFXGE_RX_SCALE_MAX);

        if ((srp = kmem_cache_alloc(sp->s_rqc, KM_SLEEP)) == NULL) {
                rc = ENOMEM;
                goto fail1;
        }
        ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_UNINITIALIZED);

        srp->sr_index = index;
        sp->s_srp[index] = srp;

        if ((rc = sfxge_rx_kstat_init(srp)) != 0)
                goto fail2;

        srp->sr_state = SFXGE_RXQ_INITIALIZED;

        return (0);

fail2:
        DTRACE_PROBE(fail2);
        kmem_cache_free(sp->s_rqc, srp);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        return (rc);
}

static int
sfxge_rx_qstart(sfxge_t *sp, unsigned int index)
{
        sfxge_evq_t *sep = sp->s_sep[index];
        sfxge_rxq_t *srp;
        efsys_mem_t *esmp;
        efx_nic_t *enp;
        unsigned int level;
        int rc;

        mutex_enter(&(sep->se_lock));
        srp = sp->s_srp[index];
        enp = sp->s_enp;
        esmp = &(srp->sr_mem);

        ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_INITIALIZED);
        ASSERT3U(sep->se_state, ==, SFXGE_EVQ_STARTED);

        /* Zero the memory */
        bzero(esmp->esm_base, EFX_RXQ_SIZE(sp->s_rxq_size));

        /* Program the buffer table */
        if ((rc = sfxge_sram_buf_tbl_set(sp, srp->sr_id, esmp,
            EFX_RXQ_NBUFS(sp->s_rxq_size))) != 0)
                goto fail1;

        /* Create the receive queue */
        if ((rc = efx_rx_qcreate(enp, index, index, EFX_RXQ_TYPE_DEFAULT,
            esmp, sp->s_rxq_size, srp->sr_id, sep->se_eep, &(srp->sr_erp)))
            != 0)
                goto fail2;

        /* Enable the receive queue */
        efx_rx_qenable(srp->sr_erp);

        /* Set the water marks */
        srp->sr_hiwat = EFX_RXQ_LIMIT(sp->s_rxq_size) * 9 / 10;
        srp->sr_lowat = srp->sr_hiwat / 2;

        srp->sr_state = SFXGE_RXQ_STARTED;
        srp->sr_flush = SFXGE_FLUSH_INACTIVE;

        sfxge_rx_qpoll_start(srp);

        /* Try to fill the queue from the pool */
        sfxge_rx_qrefill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));

        /*
         * If there were insufficient buffers in the pool to reach at least
         * a batch then allocate some.
         */
        level = srp->sr_added - srp->sr_completed;
        if (level < SFXGE_RX_BATCH)
                sfxge_rx_qfill(srp, SFXGE_RX_BATCH);

        mutex_exit(&(sep->se_lock));

        return (0);

fail2:
        DTRACE_PROBE(fail2);

        /* Clear entries from the buffer table */
        sfxge_sram_buf_tbl_clear(sp, srp->sr_id,
            EFX_RXQ_NBUFS(sp->s_rxq_size));

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        mutex_exit(&(sep->se_lock));

        return (rc);
}

static void
sfxge_rx_qflow_complete(sfxge_rxq_t *srp, sfxge_rx_flow_t *srfp)
{
        mblk_t *mp;
        struct ether_header *etherhp;
        struct ip *iphp;
        struct tcphdr *thp;

        if (srfp->srf_mp == NULL)
                return;

        mp = srfp->srf_mp;
        etherhp = srfp->srf_etherhp;
        iphp = srfp->srf_iphp;
        thp = srfp->srf_last_thp;

        ASSERT3U(((etherhp->ether_type == htons(ETHERTYPE_VLAN)) ?
            sizeof (struct ether_vlan_header) :
            sizeof (struct ether_header)) +
            srfp->srf_len, ==, msgdsize(mp));

        ASSERT3U(srfp->srf_len & 0xffff, ==, srfp->srf_len);
        iphp->ip_len = htons(srfp->srf_len);

        srfp->srf_first_thp->th_ack = thp->th_ack;
        srfp->srf_first_thp->th_win = thp->th_win;
        srfp->srf_first_thp->th_flags = thp->th_flags;

        DTRACE_PROBE2(flow_complete, uint32_t, srfp->srf_tag,
            size_t, srfp->srf_len);

        srfp->srf_mp = NULL;
        srfp->srf_len = 0;

        ASSERT(mp->b_next == NULL);
        *(srp->sr_mpp) = mp;
        srp->sr_mpp = &(mp->b_next);
}

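/*
 * Flow coalescing uses a TCP-like slow start: srf_count tracks
 * consecutive in-order segments and merging only begins once it reaches
 * SFXGE_SLOW_START.  The merged super-packet is then allowed to grow
 * from 4KB towards 64KB:
 *
 *      shift = MIN(srf_count - SFXGE_SLOW_START + 12, 16);
 *      limit = 1 << shift;
 *
 * An out-of-order segment roughly halves the count, so a misbehaving
 * flow falls back towards per-packet delivery.
 */
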
static boolean_t
sfxge_rx_qflow_add(sfxge_rxq_t *srp, sfxge_rx_flow_t *srfp,
    sfxge_rx_packet_t *srpp, clock_t now)
{
        sfxge_t *sp = srp->sr_sp;
        struct ether_header *etherhp = srpp->srp_etherhp;
        struct ip *iphp = srpp->srp_iphp;
        struct tcphdr *thp = srpp->srp_thp;
        size_t off = srpp->srp_off;
        size_t size = (size_t)(srpp->srp_size);
        mblk_t *mp = srpp->srp_mp;
        uint32_t seq;
        unsigned int shift;

        ASSERT3U(MBLKL(mp), ==, off + size);
        ASSERT3U(DB_CKSUMFLAGS(mp), ==,
            HCK_FULLCKSUM | HCK_FULLCKSUM_OK | HCK_IPV4_HDRCKSUM);

        seq = ntohl(thp->th_seq);

        /*
         * If the time between this segment and the last is greater than RTO
         * then consider this a new flow.
         */
        if (now - srfp->srf_lbolt > srp->sr_rto) {
                srfp->srf_count = 1;
                srfp->srf_seq = seq + size;

                goto fail1;
        }

        if (seq != srfp->srf_seq) {
                if (srfp->srf_count > SFXGE_SLOW_START)
                        srfp->srf_count = SFXGE_SLOW_START;

                srfp->srf_count >>= 1;

                srfp->srf_count++;
                srfp->srf_seq = seq + size;

                goto fail2;
        }

        /* Update the in-order segment count and sequence number */
        srfp->srf_count++;
        srfp->srf_seq = seq + size;

        /* Don't merge across pure ACK, URG, SYN or RST segments */
        if (size == 0 || thp->th_flags & (TH_URG | TH_SYN | TH_RST) ||
            thp->th_urp != 0)
                goto fail3;

        /*
         * If the in-order segment count has not yet reached the slow-start
         * threshold then we cannot coalesce.
         */
        if (srfp->srf_count < SFXGE_SLOW_START)
                goto fail4;

        /* Scale up the packet size from 4k (the maximum being 64k) */
        ASSERT3U(srfp->srf_count, >=, SFXGE_SLOW_START);
        shift = MIN(srfp->srf_count - SFXGE_SLOW_START + 12, 16);
        if (srfp->srf_len + size >= (1 << shift))
                sfxge_rx_qflow_complete(srp, srfp);

        ASSERT(mp->b_cont == NULL);

        if (srfp->srf_mp == NULL) {
                /* First packet in this flow */
                srfp->srf_etherhp = etherhp;
                srfp->srf_iphp = iphp;
                srfp->srf_first_thp = srfp->srf_last_thp = thp;

                ASSERT3P(mp->b_cont, ==, NULL);
                srfp->srf_mp = mp;
                srfp->srf_mpp = &(mp->b_cont);

                srfp->srf_len = ntohs(iphp->ip_len);

                /*
                 * If the flow is not already in the list of occupied flows then
                 * add it.
                 */
                if (srfp->srf_next == NULL &&
                    srp->sr_srfpp != &(srfp->srf_next)) {
                        *(srp->sr_srfpp) = srfp;
                        srp->sr_srfpp = &(srfp->srf_next);
                }
        } else {
                /* Later packet in this flow - skip TCP header */
                srfp->srf_last_thp = thp;

                mp->b_rptr += off;
                ASSERT3U(MBLKL(mp), ==, size);

                ASSERT3P(mp->b_cont, ==, NULL);
                *(srfp->srf_mpp) = mp;
                srfp->srf_mpp = &(mp->b_cont);

                srfp->srf_len += size;

                ASSERT(srfp->srf_next != NULL ||
                    srp->sr_srfpp == &(srfp->srf_next));
        }

        DTRACE_PROBE2(flow_add, uint32_t, srfp->srf_tag, size_t, size);

        /*
         * Try to align coalesced segments on push boundaries, unless they
         * are too frequent.
         */
        if (sp->s_rx_coalesce_mode == SFXGE_RX_COALESCE_ALLOW_PUSH &&
            thp->th_flags & TH_PUSH)
                sfxge_rx_qflow_complete(srp, srfp);

        srfp->srf_lbolt = now;
        return (B_TRUE);

fail4:
fail3:
fail2:
fail1:
        sfxge_rx_qflow_complete(srp, srfp);

        srfp->srf_lbolt = now;
        return (B_FALSE);
}

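/*
 * Flow lookup hashes the TCP/IPv4 4-tuple, either read back from the
 * hardware prefix area via efx_psuedo_hdr_hash_get() or computed with
 * SFXGE_TCP_HASH, into the SFXGE_MAX_FLOW-entry table.  hash + 1 is
 * stored as a guaranteed non-zero tag so that stale or aliased entries
 * can be detected; colliding flows are simply not coalesced.
 */
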
1487 void
1488 sfxge_rx_qpacket_coalesce(sfxge_rxq_t *srp)
1489 {
1490         sfxge_t *sp = srp->sr_sp;
1491         clock_t now;
1492         mblk_t *mp;
1493         sfxge_rx_flow_t *srfp;
1494 
1495         ASSERT(sp->s_rx_coalesce_mode != SFXGE_RX_COALESCE_OFF);
1496 
1497         now = ddi_get_lbolt();
1498 
1499         mp = srp->sr_mp;
1500 
1501         srp->sr_mp = NULL;
1502         srp->sr_mpp = &(srp->sr_mp);
1503 
1504         /* Start with the last flow to be appended to */
1505         srfp = *(srp->sr_srfpp);
1506 
1507         while (mp != NULL) {
1508                 frtn_t *freep;
1509                 sfxge_rx_packet_t *srpp;
1510                 struct ether_header *etherhp;
1511                 struct ip *iphp;
1512                 struct tcphdr *thp;
1513                 size_t off;
1514                 size_t size;
1515                 uint16_t ether_tci;
1516                 uint32_t hash;
1517                 uint32_t tag;
1518                 mblk_t *next;
1519                 sfxge_packet_type_t pkt_type;
1520                 uint16_t sport, dport;
1521 
1522                 next = mp->b_next;
1523                 mp->b_next = NULL;
1524 
1525                 if (next != NULL)
1526                         prefetch_read_many(next);
1527 
1528                 freep = DB_FRTNP(mp);
1529                 /*LINTED*/
1530                 srpp = (sfxge_rx_packet_t *)(freep->free_arg);
1531                 ASSERT3P(srpp->srp_mp, ==, mp);
1532 
1533                 /* If the packet is not TCP then we cannot coalesce it */
1534                 if (~(srpp->srp_flags) & EFX_PKT_TCP)
1535                         goto reject;
1536 
1537                 /*
1538                  * If the packet is not fully checksummed then we cannot
1539                  * coalesce it.
1540                  */
1541                 if (~(srpp->srp_flags) & (EFX_CKSUM_TCPUDP | EFX_CKSUM_IPV4))
1542                         goto reject;
1543 
1544                 /* Parse the TCP header */
1545                 pkt_type = sfxge_pkthdr_parse(mp, &etherhp, &iphp, &thp, &off,
1546                     &size, &sport, &dport);
1547                 ASSERT(pkt_type == SFXGE_PACKET_TYPE_IPV4_TCP);
1548                 ASSERT(etherhp != NULL);
1549                 ASSERT(iphp != NULL);
1550                 ASSERT(thp != NULL);
1551                 ASSERT(off != 0);
1552 
1553                 if ((iphp->ip_off & ~htons(IP_DF)) != 0)
1554                         goto reject;
1555 
1556                 if (etherhp->ether_type == htons(ETHERTYPE_VLAN)) {
1557                         struct ether_vlan_header *ethervhp;
1558 
1559                         ethervhp = (struct ether_vlan_header *)etherhp;
1560                         ether_tci = ethervhp->ether_tci;
1561                 } else {
1562                         ether_tci = 0;
1563                 }
1564 
1565                 /*
1566                  * Make sure any minimum length padding is stripped
1567                  * before we try to add the packet to a flow.
1568                  */
1569                 ASSERT3U(sp->s_rx_prefix_size + MBLKL(mp), ==,
1570                     (size_t)(srpp->srp_size));
1571                 ASSERT3U(sp->s_rx_prefix_size + off + size, <=,
1572                     (size_t)(srpp->srp_size));
1573 
1574                 if (sp->s_rx_prefix_size + off + size <
1575                     (size_t)(srpp->srp_size))
1576                         mp->b_wptr = mp->b_rptr + off + size;
1577 
1578                 /*
1579                  * If there is no current flow, or the segment does not match
1580                  * the current flow then we must attempt to look up the
1581                  * correct flow in the table.
1582                  */
1583                 if (srfp == NULL)
1584                         goto lookup;
1585 
1586                 if (srfp->srf_saddr != iphp->ip_src.s_addr ||
1587                     srfp->srf_daddr != iphp->ip_dst.s_addr)
1588                         goto lookup;
1589 
1590                 if (srfp->srf_sport != thp->th_sport ||
1591                     srfp->srf_dport != thp->th_dport)
1592                         goto lookup;
1593 
1594                 if (srfp->srf_tci != ether_tci)
1595                         goto lookup;
1596 
1597 add:
1598                 ASSERT(srfp != NULL);
1599 
1600                 srpp->srp_etherhp = etherhp;
1601                 srpp->srp_iphp = iphp;
1602                 srpp->srp_thp = thp;
1603                 srpp->srp_off = off;
1604 
1605                 ASSERT3U(size, <, (1 << 16));
1606                 srpp->srp_size = (uint16_t)size;
1607 
1608                 /* Try to append the packet to the flow */
1609                 if (!sfxge_rx_qflow_add(srp, srfp, srpp, now))
1610                         goto reject;
1611 
1612                 mp = next;
1613                 continue;
1614 
1615 lookup:
1616                 /*
1617                  * If there is a prefix area then read the hash from that,
1618                  * otherwise calculate it.
1619                  */
1620                 if (sp->s_rx_prefix_size != 0) {
1621                         hash = efx_psuedo_hdr_hash_get(sp->s_enp,
1622                             EFX_RX_HASHALG_TOEPLITZ,
1623                             DB_BASE(mp));
1624                 } else {
1625                         SFXGE_TCP_HASH(sp,
1626                             &iphp->ip_src.s_addr,
1627                             thp->th_sport,
1628                             &iphp->ip_dst.s_addr,
1629                             thp->th_dport,
1630                             hash);
1631                 }
1632 
1633                 srfp = &(srp->sr_flow[(hash >> 6) % SFXGE_MAX_FLOW]);
1634                 tag = hash + 1; /* Make sure it's not zero */
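                /*
                 * A sketch of the slot scheme, assuming a hash of 0x1234:
                 *
                 *      index = (0x1234 >> 6) % SFXGE_MAX_FLOW;
                 *      tag = 0x1235;                   (never zero)
                 *
                 * The tag is biased by one so that it can never equal the
                 * zero tag of an unused slot, and the comparison below
                 * catches flows that merely collide on the table index.
                 */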
1635 
1636                 /*
1637                  * If the flow we have found does not match the hash then
1638                  * it may be an unused flow, or it may be stale.
1639                  */
1640                 if (tag != srfp->srf_tag) {
1641                         if (srfp->srf_count != 0) {
1642                                 if (now - srfp->srf_lbolt <= srp->sr_rto)
1643                                         goto reject;
1644                         }
1645 
1646                         if (srfp->srf_mp != NULL)
1647                                 goto reject;
1648 
1649                         /* Start a new flow */
1650                         ASSERT(srfp->srf_next == NULL);
1651 
1652                         srfp->srf_tag = tag;
1653 
1654                         srfp->srf_saddr = iphp->ip_src.s_addr;
1655                         srfp->srf_daddr = iphp->ip_dst.s_addr;
1656                         srfp->srf_sport = thp->th_sport;
1657                         srfp->srf_dport = thp->th_dport;
1658                         srfp->srf_tci = ether_tci;
1659 
1660                         srfp->srf_count = 0;
1661                         srfp->srf_seq = ntohl(thp->th_seq);
1662 
1663                         srfp->srf_lbolt = now;
1664                         goto add;
1665                 }
1666 
1667                 /*
1668                  * If the flow we have found does match the hash then it could
1669                  * still be an alias.
1670                  */
1671                 if (srfp->srf_saddr != iphp->ip_src.s_addr ||
1672                     srfp->srf_daddr != iphp->ip_dst.s_addr)
1673                         goto reject;
1674 
1675                 if (srfp->srf_sport != thp->th_sport ||
1676                     srfp->srf_dport != thp->th_dport)
1677                         goto reject;
1678 
1679                 if (srfp->srf_tci != ether_tci)
1680                         goto reject;
1681 
1682                 goto add;
1683 
1684 reject:
1685                 *(srp->sr_mpp) = mp;
1686                 srp->sr_mpp = &(mp->b_next);
1687 
1688                 mp = next;
1689         }
1690 }
1691 
1692 void
1693 sfxge_rx_qcomplete(sfxge_rxq_t *srp, boolean_t eop)
1694 {
1695         sfxge_t *sp = srp->sr_sp;
1696         unsigned int index = srp->sr_index;
1697         sfxge_evq_t *sep = sp->s_sep[index];
1698         unsigned int completed;
1699         sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
1700         unsigned int level;
1701 
1702         ASSERT(mutex_owned(&(sep->se_lock)));
1703 
1704         ASSERT(srp->sr_mp == NULL);
1705         ASSERT(srp->sr_mpp == &(srp->sr_mp));
1706 
1707         completed = srp->sr_completed;
1708         while (completed != srp->sr_pending) {
1709                 unsigned int id;
1710                 sfxge_rx_packet_t *srpp;
1711                 mblk_t *mp;
1712                 size_t size;
1713                 uint16_t flags;
1714                 int rc;
1715 
1716                 id = completed++ & (sp->s_rxq_size - 1);
1717 
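                /*
                 * The ring size is a power of two, so masking the
                 * free-running counters with (size - 1) yields ring
                 * indices. Prefetching the packet four entries ahead
                 * gives its dblk time to reach the CPU cache before
                 * it is needed.
                 */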
1718                 if (srp->sr_pending - completed >= 4) {
1719                         unsigned int prefetch;
1720 
1721                         prefetch = (id + 4) & (sp->s_rxq_size - 1);
1722 
1723                         srpp = srp->sr_srpp[prefetch];
1724                         ASSERT(srpp != NULL);
1725 
1726                         mp = srpp->srp_mp;
1727                         prefetch_read_many(mp->b_datap);
1728                 } else if (completed == srp->sr_pending) {
1729                         prefetch_read_many(srp->sr_mp);
1730                 }
1731 
1732                 srpp = srp->sr_srpp[id];
1733                 ASSERT(srpp != NULL);
1734 
1735                 srp->sr_srpp[id] = NULL;
1736 
1737                 mp = srpp->srp_mp;
1738                 ASSERT(mp->b_cont == NULL);
1739 
1740                 /* when called from sfxge_rx_qstop() */
1741                 if (srp->sr_state != SFXGE_RXQ_STARTED)
1742                         goto discard;
1743 
1744                 if (srpp->srp_flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
1745                         goto discard;
1746 
1747                 /* Make the data visible to the kernel */
1748                 rc = ddi_dma_sync(srpp->srp_dma_handle, 0,
1749                     sp->s_rx_buffer_size, DDI_DMA_SYNC_FORKERNEL);
1750                 ASSERT3S(rc, ==, DDI_SUCCESS);
1751 
1752                 /* Read the length from the pseudo-header if required */
1753                 if (srpp->srp_flags & EFX_PKT_PREFIX_LEN) {
1754                         rc = efx_psuedo_hdr_pkt_length_get(sp->s_enp,
1755                             mp->b_rptr,
1756                             &srpp->srp_size);
1757                         ASSERT3S(rc, ==, 0);
1758                         srpp->srp_size += sp->s_rx_prefix_size;
1759                 }
1760 
1761                 /* Set up the packet length */
1762                 ASSERT3P(mp->b_rptr, ==, DB_BASE(mp));
1763                 mp->b_rptr += sp->s_rx_prefix_size;
1764 
1765                 prefetch_read_many(mp->b_rptr);
1766 
1767                 ASSERT3P(mp->b_wptr, ==, DB_BASE(mp));
1768                 mp->b_wptr += (size_t)(srpp->srp_size);
1769                 ASSERT3P(mp->b_wptr, <=, DB_LIM(mp));
1770 
1771                 /* Calculate the maximum packet size */
1772                 size = sp->s_mtu;
1773                 size += (srpp->srp_flags & EFX_PKT_VLAN_TAGGED) ?
1774                     sizeof (struct ether_vlan_header) :
1775                     sizeof (struct ether_header);
1776 
1777                 if (MBLKL(mp) > size)
1778                         goto discard;
1779 
1780                 /* Check for loopback packets */
1781                 if (!(srpp->srp_flags & EFX_PKT_IPV4) &&
1782                     !(srpp->srp_flags & EFX_PKT_IPV6)) {
1783                         struct ether_header *etherhp;
1784 
1785                         /*LINTED*/
1786                         etherhp = (struct ether_header *)(mp->b_rptr);
1787 
1788                         if (etherhp->ether_type ==
1789                             htons(SFXGE_ETHERTYPE_LOOPBACK)) {
1790                                 DTRACE_PROBE(loopback);
1791 
1792                                 srp->sr_loopback++;
1793                                 goto discard;
1794                         }
1795                 }
1796 
1797                 /* Set up the checksum information */
1798                 flags = 0;
1799 
1800                 if (srpp->srp_flags & EFX_CKSUM_IPV4) {
1801                         ASSERT(srpp->srp_flags & EFX_PKT_IPV4);
1802                         flags |= HCK_IPV4_HDRCKSUM;
1803                 }
1804 
1805                 if (srpp->srp_flags & EFX_CKSUM_TCPUDP) {
1806                         ASSERT(srpp->srp_flags & EFX_PKT_TCP ||
1807                             srpp->srp_flags & EFX_PKT_UDP);
1808                         flags |= HCK_FULLCKSUM | HCK_FULLCKSUM_OK;
1809                 }
1810 
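                /*
                 * HCK_IPV4_HDRCKSUM marks the IPv4 header checksum as
                 * verified in hardware, and HCK_FULLCKSUM together with
                 * HCK_FULLCKSUM_OK vouches for the TCP/UDP checksum, so
                 * the stack can skip both verifications.
                 */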
1811                 DB_CKSUMSTART(mp) = 0;
1812                 DB_CKSUMSTUFF(mp) = 0;
1813                 DB_CKSUMEND(mp) = 0;
1814                 DB_CKSUMFLAGS(mp) = flags;
1815                 DB_CKSUM16(mp) = 0;
1816 
1817                 /* Add the packet to the tail of the chain */
1818                 srfppp->srfpp_loaned++;
1819 
1820                 ASSERT(mp->b_next == NULL);
1821                 *(srp->sr_mpp) = mp;
1822                 srp->sr_mpp = &(mp->b_next);
1823 
1824                 continue;
1825 
1826 discard:
1827                 /* Return the packet to the pool */
1828                 srfppp->srfpp_loaned++;
1829                 freeb(mp); /* Equivalent to freemsg() as b_cont==0 */
1830         }
1831         srp->sr_completed = completed;
1832 
1833         /* Attempt to coalesce any TCP packets */
1834         if (sp->s_rx_coalesce_mode != SFXGE_RX_COALESCE_OFF)
1835                 sfxge_rx_qpacket_coalesce(srp);
1836 
1837         /*
1838          * If there are any pending flows and this is the end of the
1839          * poll then they must be completed.
1840          */
1841         if (srp->sr_srfp != NULL && eop) {
1842                 sfxge_rx_flow_t *srfp;
1843 
1844                 srfp = srp->sr_srfp;
1845 
1846                 srp->sr_srfp = NULL;
1847                 srp->sr_srfpp = &(srp->sr_srfp);
1848 
1849                 do {
1850                         sfxge_rx_flow_t *next;
1851 
1852                         next = srfp->srf_next;
1853                         srfp->srf_next = NULL;
1854 
1855                         sfxge_rx_qflow_complete(srp, srfp);
1856 
1857                         srfp = next;
1858                 } while (srfp != NULL);
1859         }
1860 
1861         level = srp->sr_pushed - srp->sr_completed;
1862 
1863         /* If there are any packets then pass them up the stack */
1864         if (srp->sr_mp != NULL) {
1865                 mblk_t *mp;
1866 
1867                 mp = srp->sr_mp;
1868 
1869                 srp->sr_mp = NULL;
1870                 srp->sr_mpp = &(srp->sr_mp);
1871 
1872                 if (level == 0) {
1873                         /* Try to refill ASAP */
1874                         sfxge_rx_qrefill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));
1875                         level = srp->sr_pushed - srp->sr_completed;
1876                 }
1877 
1878                 /*
1879                  * If the RXQ is still empty, discard and recycle the
1880                  * current entry to ensure that the ring always
1881                  * contains at least one descriptor. This ensures that
1882                  * the next hardware RX will trigger an event
1883                  * (possibly delayed by interrupt moderation) and
1884                  * trigger another refill/fill attempt.
1885                  *
1886                  * Note this drops a complete LRO fragment from the
1887                  * start of the batch.
1888                  *
1889                  * Note also that copymsgchain() does not help with
1890                  * resource starvation here, unless we are short of DMA
1891                  * mappings.
1892                  */
1893                 if (level == 0) {
1894                         mblk_t *nmp;
1895 
1896                         srp->sr_kstat.srk_rxq_empty_discard++;
1897                         DTRACE_PROBE1(rxq_empty_discard, int, index);
1898                         nmp = mp->b_next;
1899                         if (nmp != NULL)
1900                                 sfxge_gld_rx_post(sp, index, nmp);
1901                         /* level == 0: the freed head is swizzled and re-posted below */
1902                         freemsg(mp);
1903                 } else {
1904                         sfxge_gld_rx_post(sp, index, mp);
1905                 }
1906         }
1907 
1908         /* Top up the queue if necessary */
1909         if (level < srp->sr_hiwat) {
1910                 sfxge_rx_qrefill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));
1911 
1912                 level = srp->sr_added - srp->sr_completed;
1913                 if (level < srp->sr_lowat)
1914                         sfxge_rx_qfill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));
1915         }
1916 }
1917 
1918 void
1919 sfxge_rx_qflush_done(sfxge_rxq_t *srp)
1920 {
1921         sfxge_t *sp = srp->sr_sp;
1922         unsigned int index = srp->sr_index;
1923         sfxge_evq_t *sep = sp->s_sep[index];
1924         boolean_t flush_pending;
1925 
1926         ASSERT(mutex_owned(&(sep->se_lock)));
1927 
1928         /*
1929          * Flush successful: wakeup sfxge_rx_qstop() if flush is pending.
1930          *
1931          * A delayed flush event received after RxQ stop has timed out
1932          * will be ignored, as then the flush state will not be PENDING
1933          * (see SFCbug22989).
1934          */
1935         flush_pending = (srp->sr_flush == SFXGE_FLUSH_PENDING);
1936         srp->sr_flush = SFXGE_FLUSH_DONE;
1937         if (flush_pending)
1938                 cv_broadcast(&(srp->sr_flush_kv));
1939 }
1940 
1941 void
1942 sfxge_rx_qflush_failed(sfxge_rxq_t *srp)
1943 {
1944         sfxge_t *sp = srp->sr_sp;
1945         unsigned int index = srp->sr_index;
1946         sfxge_evq_t *sep = sp->s_sep[index];
1947         boolean_t flush_pending;
1948 
1949         ASSERT(mutex_owned(&(sep->se_lock)));
1950 
1951         /*
1952          * Flush failed: wakeup sfxge_rx_qstop() if flush is pending.
1953          *
1954          * A delayed flush event received after RxQ stop has timed out
1955          * will be ignored, as then the flush state will not be PENDING
1956          * (see SFCbug22989).
1957          */
1958         flush_pending = (srp->sr_flush == SFXGE_FLUSH_PENDING);
1959         srp->sr_flush = SFXGE_FLUSH_FAILED;
1960         if (flush_pending)
1961                 cv_broadcast(&(srp->sr_flush_kv));
1962 }
1963 
1964 static void
1965 sfxge_rx_qstop(sfxge_t *sp, unsigned int index)
1966 {
1967         dev_info_t *dip = sp->s_dip;
1968         sfxge_evq_t *sep = sp->s_sep[index];
1969         sfxge_rxq_t *srp;
1970         clock_t timeout;
1971         unsigned int flush_tries = SFXGE_RX_QFLUSH_TRIES;
1972         int rc;
1973 
1974         ASSERT(mutex_owned(&(sp->s_state_lock)));
1975 
1976         mutex_enter(&(sep->se_lock));
1977 
1978         srp = sp->s_srp[index];
1979         ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_STARTED);
1980 
1981         sfxge_rx_qpoll_stop(srp);
1982 
1983         /* Further packets are discarded by sfxge_rx_qcomplete() */
1984         srp->sr_state = SFXGE_RXQ_INITIALIZED;
1985 
1986         if (sp->s_hw_err != SFXGE_HW_OK) {
1987                 /*
1988                  * Flag indicates possible hardware failure.
1989                  * Attempt flush but do not wait for it to complete.
1990                  */
1991                 srp->sr_flush = SFXGE_FLUSH_DONE;
1992                 (void) efx_rx_qflush(srp->sr_erp);
1993         }
1994 
1995         /* Wait up to 2 seconds (SFXGE_RX_QFLUSH_USEC) for the flush to complete */
1996         timeout = ddi_get_lbolt() + drv_usectohz(SFXGE_RX_QFLUSH_USEC);
1997 
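        /*
         * Retry the flush request up to SFXGE_RX_QFLUSH_TRIES times, but
         * bound all attempts by the single absolute deadline computed
         * above; cv_timedwait() returns -1 once that deadline passes.
         */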
1998         while (srp->sr_flush != SFXGE_FLUSH_DONE && flush_tries-- > 0) {
1999                 if ((rc = efx_rx_qflush(srp->sr_erp)) != 0) {
2000                         if (rc == EALREADY)
2001                                 srp->sr_flush = SFXGE_FLUSH_DONE;
2002                         else
2003                                 srp->sr_flush = SFXGE_FLUSH_FAILED;
2004                         break;
2005                 }
2006                 srp->sr_flush = SFXGE_FLUSH_PENDING;
2007                 if (cv_timedwait(&(srp->sr_flush_kv), &(sep->se_lock),
2008                     timeout) < 0) {
2009                         /* Timeout waiting for successful or failed flush */
2010                         dev_err(dip, CE_NOTE,
2011                             SFXGE_CMN_ERR "rxq[%d] flush timeout", index);
2012                         break;
2013                 }
2014         }
2015 
2016         if (srp->sr_flush == SFXGE_FLUSH_FAILED)
2017                 dev_err(dip, CE_NOTE,
2018                     SFXGE_CMN_ERR "rxq[%d] flush failed", index);
2019 
2020         DTRACE_PROBE1(flush, sfxge_flush_state_t, srp->sr_flush);
2021         srp->sr_flush = SFXGE_FLUSH_DONE;
2022 
2023         /* Destroy the receive queue */
2024         efx_rx_qdestroy(srp->sr_erp);
2025         srp->sr_erp = NULL;
2026 
2027         /* Clear entries from the buffer table */
2028         sfxge_sram_buf_tbl_clear(sp, srp->sr_id,
2029             EFX_RXQ_NBUFS(sp->s_rxq_size));
2030 
2031         /*
2032          * Free any unused RX packets that still had descriptors on the RXQ.
2033          * The packets will be discarded, as the state is no longer STARTED.
2034          */
2035         srp->sr_pending = srp->sr_added;
2036         sfxge_rx_qcomplete(srp, B_TRUE);
2037 
2038         ASSERT3U(srp->sr_completed, ==, srp->sr_pending);
2039 
2040         srp->sr_added = 0;
2041         srp->sr_pushed = 0;
2042         srp->sr_pending = 0;
2043         srp->sr_completed = 0;
2044         srp->sr_loopback = 0;
2045 
2046         srp->sr_lowat = 0;
2047         srp->sr_hiwat = 0;
2048 
2049         mutex_exit(&(sep->se_lock));
2050 }
2051 
2052 static void
2053 sfxge_rx_kstat_fini(sfxge_rxq_t *srp)
2054 {
2055         kstat_delete(srp->sr_ksp);
2056         srp->sr_ksp = NULL;
2057 }
2058 
2059 static void
2060 sfxge_rx_qfini(sfxge_t *sp, unsigned int index)
2061 {
2062         sfxge_rxq_t *srp = sp->s_srp[index];
2063 
2064         ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_INITIALIZED);
2065 
2066         sp->s_srp[index] = NULL;
2067         srp->sr_state = SFXGE_RXQ_UNINITIALIZED;
2068 
2069         sfxge_rx_kstat_fini(srp);
2070 
2071         /* Empty the pool */
2072         sfxge_rx_qfpp_empty(srp);
2073 
2074         srp->sr_index = 0;
2075 
2076         kmem_cache_free(sp->s_rqc, srp);
2077 }
2078 
2079 static int
2080 sfxge_rx_scale_kstat_update(kstat_t *ksp, int rw)
2081 {
2082         sfxge_t *sp = ksp->ks_private;
2083         sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2084         sfxge_intr_t *sip = &(sp->s_intr);
2085         kstat_named_t *knp;
2086         unsigned int index;
2087         unsigned int entry;
2088         unsigned int *freq;
2089         int rc;
2090 
2091         ASSERT(mutex_owned(&(srsp->srs_lock)));
2092 
2093         if (rw != KSTAT_READ) {
2094                 rc = EACCES;
2095                 goto fail1;
2096         }
2097 
2098         if ((freq = kmem_zalloc(sizeof (unsigned int) * sip->si_nalloc,
2099             KM_NOSLEEP)) == NULL) {
2100                 rc = ENOMEM;
2101                 goto fail2;
2102         }
2103 
2104         for (entry = 0; entry < SFXGE_RX_SCALE_MAX; entry++) {
2105                 index = srsp->srs_tbl[entry];
2106 
2107                 freq[index]++;
2108         }
2109 
2110         knp = ksp->ks_data;
2111         for (index = 0; index < sip->si_nalloc; index++) {
2112                 knp->value.ui64 = freq[index];
2113                 knp++;
2114         }
2115 
2116         knp->value.ui64 = srsp->srs_count;
2117 
2118         kmem_free(freq, sizeof (unsigned int) * sip->si_nalloc);
2119 
2120         return (0);
2121 
2122 fail2:
2123         DTRACE_PROBE(fail2);
2124 fail1:
2125         DTRACE_PROBE1(fail1, int, rc);
2126         return (rc);
2127 }
2128 
2129 static int
2130 sfxge_rx_scale_kstat_init(sfxge_t *sp)
2131 {
2132         dev_info_t *dip = sp->s_dip;
2133         sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2134         sfxge_intr_t *sip = &(sp->s_intr);
2135         char name[MAXNAMELEN];
2136         kstat_t *ksp;
2137         kstat_named_t *knp;
2138         unsigned int index;
2139         int rc;
2140 
2141         /* Create the set */
2142         (void) snprintf(name, MAXNAMELEN - 1, "%s_rss", ddi_driver_name(dip));
2143 
2144         if ((ksp = kstat_create((char *)ddi_driver_name(dip),
2145             ddi_get_instance(dip), name, "rss", KSTAT_TYPE_NAMED,
2146             sip->si_nalloc + 1, 0)) == NULL) {
2147                 rc = ENOMEM;
2148                 goto fail1;
2149         }
2150 
2151         srsp->srs_ksp = ksp;
2152 
2153         ksp->ks_update = sfxge_rx_scale_kstat_update;
2154         ksp->ks_private = sp;
2155         ksp->ks_lock = &(srsp->srs_lock);
2156 
2157         /* Initialise the named stats */
2158         knp = ksp->ks_data;
2159         for (index = 0; index < sip->si_nalloc; index++) {
2160                 char name[MAXNAMELEN];
2161 
2162                 (void) snprintf(name, MAXNAMELEN - 1, "evq%04d_count", index);
2163                 kstat_named_init(knp, name, KSTAT_DATA_UINT64);
2164                 knp++;
2165         }
2166 
2167         kstat_named_init(knp, "scale", KSTAT_DATA_UINT64);
2168 
2169         kstat_install(ksp);
2170         return (0);
2171 
2172 fail1:
2173         DTRACE_PROBE1(fail1, int, rc);
2174 
2175         return (rc);
2176 }
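
/*
 * The named kstats created above can be inspected from userland, e.g.
 * (a hypothetical instance 0 of the driver):
 *
 *      $ kstat -m sfxge -i 0 -n sfxge_rss
 */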
2177 
2178 static void
2179 sfxge_rx_scale_kstat_fini(sfxge_t *sp)
2180 {
2181         sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2182 
2183         /* Destroy the set */
2184         kstat_delete(srsp->srs_ksp);
2185         srsp->srs_ksp = NULL;
2186 }
2187 
2188 
2189 unsigned int
2190 sfxge_rx_scale_prop_get(sfxge_t *sp)
2191 {
2192         int rx_scale;
2193 
2194         rx_scale = ddi_prop_get_int(DDI_DEV_T_ANY, sp->s_dip,
2195             DDI_PROP_DONTPASS, "rx_scale_count", SFXGE_RX_SCALE_MAX);
2196         /* Zero and all negative values select the number of logical CPUs */
2197         if (rx_scale <= 0)
2198                 rx_scale = ncpus;
2199 
2200         return (rx_scale);
2201 }
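
/*
 * The property is read with DDI_PROP_DONTPASS, so it can be tuned from the
 * driver configuration file, e.g. (a hypothetical /kernel/drv/sfxge.conf
 * entry):
 *
 *      rx_scale_count=4;
 */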
2202 
2203 
2204 static int
2205 sfxge_rx_scale_init(sfxge_t *sp)
2206 {
2207         sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2208         sfxge_intr_t *sip = &(sp->s_intr);
2209         int rc;
2210 
2211         ASSERT3U(srsp->srs_state, ==, SFXGE_RX_SCALE_UNINITIALIZED);
2212 
2213         /* Create a table of per-CPU usage counts */
2214         srsp->srs_cpu = kmem_zalloc(sizeof (unsigned int) * NCPU, KM_SLEEP);
2215 
2216         mutex_init(&(srsp->srs_lock), NULL, MUTEX_DRIVER, NULL);
2217 
2218         /* We need at least one event queue */
2219         srsp->srs_count = sfxge_rx_scale_prop_get(sp);
2220         if (srsp->srs_count > sip->si_nalloc)
2221                 srsp->srs_count = sip->si_nalloc;
2222         if (srsp->srs_count < 1)
2223                 srsp->srs_count = 1;
2224 
2225         /* Set up the kstats */
2226         if ((rc = sfxge_rx_scale_kstat_init(sp)) != 0)
2227                 goto fail1;
2228 
2229         srsp->srs_state = SFXGE_RX_SCALE_INITIALIZED;
2230 
2231         return (0);
2232 
2233 fail1:
2234         DTRACE_PROBE1(fail1, int, rc);
2235         mutex_destroy(&(srsp->srs_lock));
2236 
2237         return (rc);
2238 }
2239 
2240 void
2241 sfxge_rx_scale_update(void *arg)
2242 {
2243         sfxge_t *sp = arg;
2244         sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2245         sfxge_intr_t *sip;
2246         processorid_t id;
2247         unsigned int count;
2248         unsigned int *tbl;
2249         unsigned int *rating;
2250         unsigned int entry;
2251         int rc;
2252 
2253         mutex_enter(&(srsp->srs_lock));
2254 
2255         if (srsp->srs_state != SFXGE_RX_SCALE_STARTED) {
2256                 rc = EFAULT;
2257                 goto fail1;
2258         }
2259 
2260         if ((tbl = kmem_zalloc(sizeof (unsigned int) * SFXGE_RX_SCALE_MAX,
2261             KM_NOSLEEP)) == NULL) {
2262                 rc = ENOMEM;
2263                 goto fail2;
2264         }
2265 
2266         sip = &(sp->s_intr);
2267         if ((rating = kmem_zalloc(sizeof (unsigned int) * sip->si_nalloc,
2268             KM_NOSLEEP)) == NULL) {
2269                 rc = ENOMEM;
2270                 goto fail3;
2271         }
2272 
2273         mutex_enter(&cpu_lock);
2274 
2275         /*
2276          * Subtract this instance's current CPU usage from the global
2277          * contention table.
2278          */
2279         for (id = 0; id < NCPU; id++) {
2280                 ASSERT3U(sfxge_cpu[id], >=, srsp->srs_cpu[id]);
2281                 sfxge_cpu[id] -= srsp->srs_cpu[id];
2282                 srsp->srs_cpu[id] = 0;
2283         }
2284 
2285         ASSERT(srsp->srs_count != 0);
2286 
2287         /* Choose as many event queues as we need */
2288         for (count = 0; count < srsp->srs_count; count++) {
2289                 unsigned int index;
2290                 sfxge_evq_t *sep;
2291                 unsigned int choice;
2292                 unsigned int choice_rating;
2293 
2294                 bzero(rating, sizeof (unsigned int) * sip->si_nalloc);
2295 
2296                 /*
2297                  * Rate each event queue on its global level of CPU
2298                  * contention.
2299                  */
2300                 for (index = 0; index < sip->si_nalloc; index++) {
2301                         sep = sp->s_sep[index];
2302 
2303                         id = sep->se_cpu_id;
2304                         rating[index] += sfxge_cpu[id];
2305                 }
2306 
2307                 /* Choose the queue with the lowest CPU contention */
2308                 choice = 0;
2309                 choice_rating = rating[0];
2310 
2311                 for (index = 1; index < sip->si_nalloc; index++) {
2312                         if (rating[index] < choice_rating) {
2313                                 choice = index;
2314                                 choice_rating = rating[index];
2315                         }
2316                 }
2317 
2318                 /* Add our choice to the condensed RSS table */
2319                 tbl[count] = choice;
2320 
2321                 /* Add information to the global contention tables */
2322                 sep = sp->s_sep[choice];
2323 
2324                 id = sep->se_cpu_id;
2325                 srsp->srs_cpu[id]++;
2326                 sfxge_cpu[id]++;
2327         }
2328 
2329         mutex_exit(&cpu_lock);
2330 
2331         /* Build the expanded RSS table */
2332         count = 0;
2333         for (entry = 0; entry < SFXGE_RX_SCALE_MAX; entry++) {
2334                 unsigned int index;
2335 
2336                 index = tbl[count];
2337                 count = (count + 1) % srsp->srs_count;
2338 
2339                 srsp->srs_tbl[entry] = index;
2340         }
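
        /*
         * For example (a sketch): with srs_count == 3 and tbl[] == {0, 2, 5},
         * srs_tbl[] becomes 0, 2, 5, 0, 2, 5, ... across all
         * SFXGE_RX_SCALE_MAX entries, spreading the hash buckets evenly
         * over the chosen event queues.
         */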
2341 
2342         /* Program the expanded RSS table into the hardware */
2343         (void) efx_rx_scale_tbl_set(sp->s_enp, srsp->srs_tbl,
2344             SFXGE_RX_SCALE_MAX);
2345 
2346         mutex_exit(&(srsp->srs_lock));
2347         kmem_free(rating, sizeof (unsigned int) * sip->si_nalloc);
2348         kmem_free(tbl, sizeof (unsigned int) * SFXGE_RX_SCALE_MAX);
2349         return;
2350 
2351 fail3:
2352         DTRACE_PROBE(fail3);
2353         kmem_free(tbl, sizeof (unsigned int) * SFXGE_RX_SCALE_MAX);
2354 fail2:
2355         DTRACE_PROBE(fail2);
2356 fail1:
2357         DTRACE_PROBE1(fail1, int, rc);
2358 
2359         mutex_exit(&(srsp->srs_lock));
2360 }
2361 
2362 static int
2363 sfxge_rx_scale_start(sfxge_t *sp)
2364 {
2365         sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2366         int rc;
2367 
2368         mutex_enter(&(srsp->srs_lock));
2369 
2370         ASSERT3U(srsp->srs_state, ==, SFXGE_RX_SCALE_INITIALIZED);
2371 
2372         /* Clear down the RSS table */
2373         bzero(srsp->srs_tbl, sizeof (unsigned int) * SFXGE_RX_SCALE_MAX);
2374 
2375         (void) efx_rx_scale_tbl_set(sp->s_enp, srsp->srs_tbl,
2376             SFXGE_RX_SCALE_MAX);
2377 
2378         if ((rc = sfxge_toeplitz_hash_init(sp)) != 0)
2379                 goto fail1;
2380 
2381         srsp->srs_state = SFXGE_RX_SCALE_STARTED;
2382 
2383         mutex_exit(&(srsp->srs_lock));
2384 
2385         /* sfxge_t->s_state_lock held */
2386         (void) ddi_taskq_dispatch(sp->s_tqp, sfxge_rx_scale_update, sp,
2387             DDI_SLEEP);
2388 
2389         return (0);
2390 
2391 fail1:
2392         DTRACE_PROBE1(fail1, int, rc);
2393 
2394         mutex_exit(&(srsp->srs_lock));
2395 
2396         return (rc);
2397 }
2398 
2399 int
2400 sfxge_rx_scale_count_get(sfxge_t *sp, unsigned int *countp)
2401 {
2402         sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2403         int rc;
2404 
2405         mutex_enter(&(srsp->srs_lock));
2406 
2407         if (srsp->srs_state != SFXGE_RX_SCALE_INITIALIZED &&
2408             srsp->srs_state != SFXGE_RX_SCALE_STARTED) {
2409                 rc = ENOTSUP;
2410                 goto fail1;
2411         }
2412 
2413         *countp = srsp->srs_count;
2414 
2415         mutex_exit(&(srsp->srs_lock));
2416 
2417         return (0);
2418 
2419 fail1:
2420         DTRACE_PROBE1(fail1, int, rc);
2421 
2422         mutex_exit(&(srsp->srs_lock));
2423 
2424         return (rc);
2425 }
2426 
2427 int
2428 sfxge_rx_scale_count_set(sfxge_t *sp, unsigned int count)
2429 {
2430         sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2431         sfxge_intr_t *sip = &(sp->s_intr);
2432         int dispatch = 1;
2433         int rc;
2434 
2435         if (count < 1 || count > sip->si_nalloc) {
2436                 rc = EINVAL;
2437                 goto fail1;
2438         }
2439 
2440         mutex_enter(&(srsp->srs_lock));
2441 
2442         if (srsp->srs_state != SFXGE_RX_SCALE_INITIALIZED &&
2443             srsp->srs_state != SFXGE_RX_SCALE_STARTED) {
2444                 rc = ENOTSUP;
2445                 goto fail2;
2446         }
2447 
2448         srsp->srs_count = count;
2449 
2450         if (srsp->srs_state != SFXGE_RX_SCALE_STARTED)
2451                 dispatch = 0;
2452 
2453         mutex_exit(&(srsp->srs_lock));
2454 
2455         if (dispatch)
2456                 /* no locks held */
2457                 (void) ddi_taskq_dispatch(sp->s_tqp, sfxge_rx_scale_update, sp,
2458                     DDI_SLEEP);
2459 
2460         return (0);
2461 
2462 fail2:
2463         DTRACE_PROBE(fail2);
2464 
2465         mutex_exit(&(srsp->srs_lock));
2466 
2467 fail1:
2468         DTRACE_PROBE1(fail1, int, rc);
2469 
2470         return (rc);
2471 }
2472 
2473 static void
2474 sfxge_rx_scale_stop(sfxge_t *sp)
2475 {
2476         sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2477         processorid_t id;
2478 
2479         mutex_enter(&(srsp->srs_lock));
2480 
2481         ASSERT3U(srsp->srs_state, ==, SFXGE_RX_SCALE_STARTED);
2482 
2483         srsp->srs_state = SFXGE_RX_SCALE_INITIALIZED;
2484 
2485         mutex_enter(&cpu_lock);
2486 
2487         /*
2488          * Subtract this instance's current CPU usage from the global
2489          * contention table.
2490          */
2491         for (id = 0; id < NCPU; id++) {
2492                 ASSERT3U(sfxge_cpu[id], >=, srsp->srs_cpu[id]);
2493                 sfxge_cpu[id] -= srsp->srs_cpu[id];
2494                 srsp->srs_cpu[id] = 0;
2495         }
2496 
2497         mutex_exit(&cpu_lock);
2498 
2499         /* Clear down the RSS table */
2500         bzero(srsp->srs_tbl, sizeof (unsigned int) * SFXGE_RX_SCALE_MAX);
2501 
2502         (void) efx_rx_scale_tbl_set(sp->s_enp, srsp->srs_tbl,
2503             SFXGE_RX_SCALE_MAX);
2504 
2505         mutex_exit(&(srsp->srs_lock));
2506 }
2507 
2508 static void
2509 sfxge_rx_scale_fini(sfxge_t *sp)
2510 {
2511         sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2512 
2513         ASSERT3U(srsp->srs_state, ==, SFXGE_RX_SCALE_INITIALIZED);
2514 
2515         srsp->srs_state = SFXGE_RX_SCALE_UNINITIALIZED;
2516 
2517         /* Tear down the kstats */
2518         sfxge_rx_scale_kstat_fini(sp);
2519 
2520         srsp->srs_count = 0;
2521 
2522         mutex_destroy(&(srsp->srs_lock));
2523 
2524         /* Destroy tables */
2525         kmem_free(srsp->srs_cpu, sizeof (unsigned int) * NCPU);
2526         srsp->srs_cpu = NULL;
2527 
2528         sfxge_toeplitz_hash_fini(sp);
2529 }
2530 
2531 int
2532 sfxge_rx_init(sfxge_t *sp)
2533 {
2534         sfxge_intr_t *sip = &(sp->s_intr);
2535         char name[MAXNAMELEN];
2536         int index;
2537         int rc;
2538 
2539         if (sip->si_state == SFXGE_INTR_UNINITIALIZED) {
2540                 rc = EINVAL;
2541                 goto fail1;
2542         }
2543 
2544         if ((rc = sfxge_rx_scale_init(sp)) != 0)
2545                 goto fail2;
2546 
2547         (void) snprintf(name, MAXNAMELEN - 1, "%s%d_rx_packet_cache",
2548             ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));
2549 
2550         sp->s_rpc = kmem_cache_create(name, sizeof (sfxge_rx_packet_t),
2551             SFXGE_CPU_CACHE_SIZE, sfxge_rx_packet_ctor, sfxge_rx_packet_dtor,
2552             NULL, sp, NULL, 0);
2553         ASSERT(sp->s_rpc != NULL);
2554 
2555         (void) snprintf(name, MAXNAMELEN - 1, "%s%d_rxq_cache",
2556             ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));
2557 
2558         sp->s_rqc = kmem_cache_create(name, sizeof (sfxge_rxq_t),
2559             SFXGE_CPU_CACHE_SIZE, sfxge_rx_qctor, sfxge_rx_qdtor, NULL, sp,
2560             NULL, 0);
2561         ASSERT(sp->s_rqc != NULL);
2562 
2563         sp->s_rx_pkt_mem_max = ddi_prop_get_int64(DDI_DEV_T_ANY, sp->s_dip,
2564             DDI_PROP_DONTPASS, "rx_pkt_mem_max", 0); /* disabled */
2565 
2566         /* Initialize the receive queue(s) */
2567         for (index = 0; index < sip->si_nalloc; index++) {
2568                 if ((rc = sfxge_rx_qinit(sp, index)) != 0)
2569                         goto fail3;
2570         }
2571 
2572         sp->s_rx_coalesce_mode = ddi_prop_get_int(DDI_DEV_T_ANY, sp->s_dip,
2573             DDI_PROP_DONTPASS, "rx_coalesce_mode", SFXGE_RX_COALESCE_OFF);
2574 
2575         return (0);
2576 
2577 fail3:
2578         DTRACE_PROBE(fail3);
2579 
2580         /* Tear down the receive queue(s) */
2581         while (--index >= 0)
2582                 sfxge_rx_qfini(sp, index);
2583 
2584         kmem_cache_destroy(sp->s_rqc);
2585         sp->s_rqc = NULL;
2586 
2587         kmem_cache_destroy(sp->s_rpc);
2588         sp->s_rpc = NULL;
2589 
2590         sfxge_rx_scale_fini(sp);
2591 
2592 fail2:
2593         DTRACE_PROBE(fail2);
2594 fail1:
2595         DTRACE_PROBE1(fail1, int, rc);
2596 
2597         return (rc);
2598 }
2599 
2600 int
2601 sfxge_rx_start(sfxge_t *sp)
2602 {
2603         sfxge_mac_t *smp = &(sp->s_mac);
2604         sfxge_intr_t *sip;
2605         const efx_nic_cfg_t *encp;
2606         size_t hdrlen, align;
2607         int index;
2608         int rc;
2609 
2610         mutex_enter(&(smp->sm_lock));
2611 
2612         /* Calculate the receive packet buffer size and alignment */
2613         sp->s_rx_buffer_size = EFX_MAC_PDU(sp->s_mtu);
2614 
2615         encp = efx_nic_cfg_get(sp->s_enp);
2616 
2617         /* Packet buffer allocations are cache line aligned */
2618         EFSYS_ASSERT3U(encp->enc_rx_buf_align_start, <=, SFXGE_CPU_CACHE_SIZE);
2619 
2620         if (sp->s_family == EFX_FAMILY_HUNTINGTON) {
2621                 sp->s_rx_prefix_size = encp->enc_rx_prefix_size;
2622 
2623                 hdrlen = sp->s_rx_prefix_size + sizeof (struct ether_header);
2624 
2625                 /* Ensure IP headers are 32bit aligned */
2626                 sp->s_rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
2627                 sp->s_rx_buffer_size += sp->s_rx_buffer_align;
2628 
2629         } else if (encp->enc_features & EFX_FEATURE_LFSR_HASH_INSERT) {
2630                 sp->s_rx_prefix_size = encp->enc_rx_prefix_size;
2631 
2632                 /*
2633                  * Place the start of the buffer a prefix length minus 2
2634                  * before the start of a cache line. This ensures that the
2635                  * last two bytes of the prefix (which is where the LFSR hash
2636                  * is located) are in the same cache line as the headers, and
2637                  * the IP header is 32-bit aligned.
2638                  */
2639                 sp->s_rx_buffer_align =
2640                     SFXGE_CPU_CACHE_SIZE - (encp->enc_rx_prefix_size - 2);
2641                 sp->s_rx_buffer_size += sp->s_rx_buffer_align;
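
                /*
                 * For example (a sketch, assuming a 64 byte cache line and
                 * a 16 byte prefix): the buffer starts at offset 50, the
                 * final two prefix bytes land at offsets 64 and 65 in the
                 * same cache line as the headers, and the IP header begins
                 * at offset 80, which is 32-bit aligned.
                 */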
2642         } else {
2643                 sp->s_rx_prefix_size = 0;
2644 
2645                 /*
2646                  * Place the start of the buffer 2 bytes after a cache line
2647                  * boundary so that the headers fit into the cache line and
2648                  * the IP header is 32-bit aligned.
2649                  */
2650                 hdrlen = sp->s_rx_prefix_size + sizeof (struct ether_header);
2651 
2652                 sp->s_rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
2653                 sp->s_rx_buffer_size += sp->s_rx_buffer_align;
2654         }
2655 
2656         /* Align end of packet buffer for RX DMA end padding */
2657         align = MAX(1, encp->enc_rx_buf_align_end);
2658         EFSYS_ASSERT(ISP2(align));
2659         sp->s_rx_buffer_size = P2ROUNDUP(sp->s_rx_buffer_size, align);
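
        /*
         * For example (a sketch): with a 1514 byte PDU, a 50 byte start
         * offset and enc_rx_buf_align_end == 64, the buffer size grows to
         * P2ROUNDUP(1564, 64) == 1600 bytes.
         */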
2660 
2661         /* Initialize the receive module */
2662         if ((rc = efx_rx_init(sp->s_enp)) != 0)
2663                 goto fail1;
2664 
2665         mutex_exit(&(smp->sm_lock));
2666 
2667         if ((rc = sfxge_rx_scale_start(sp)) != 0)
2668                 goto fail2;
2669 
2670         /* Start the receive queue(s) */
2671         sip = &(sp->s_intr);
2672         for (index = 0; index < sip->si_nalloc; index++) {
2673                 if ((rc = sfxge_rx_qstart(sp, index)) != 0)
2674                         goto fail3;
2675         }
2676 
2677         ASSERT3U(sp->s_srp[0]->sr_state, ==, SFXGE_RXQ_STARTED);
2678         /* RX scale is started; srs_count is valid even if the table update is pending */
2679         ASSERT3U(sp->s_rx_scale.srs_state, ==, SFXGE_RX_SCALE_STARTED);
2680         rc = efx_mac_filter_default_rxq_set(sp->s_enp, sp->s_srp[0]->sr_erp,
2681             sp->s_rx_scale.srs_count > 1);
2682         if (rc != 0)
2683                 goto fail4;
2684 
2685         return (0);
2686 
2687 fail4:
2688         DTRACE_PROBE(fail4);
2689 
2690 fail3:
2691         DTRACE_PROBE(fail3);
2692 
2693         /* Stop the receive queue(s) */
2694         while (--index >= 0)
2695                 sfxge_rx_qstop(sp, index);
2696 
2697         sfxge_rx_scale_stop(sp);
2698 
2699 fail2:
2700         DTRACE_PROBE(fail2);
2701 
2702         mutex_enter(&(smp->sm_lock));
2703 
2704         /* Tear down the receive module */
2705         efx_rx_fini(sp->s_enp);
2706 
2707 fail1:
2708         DTRACE_PROBE1(fail1, int, rc);
2709 
2710         mutex_exit(&(smp->sm_lock));
2711 
2712         return (rc);
2713 }
2714 
2715 void
2716 sfxge_rx_coalesce_mode_get(sfxge_t *sp, sfxge_rx_coalesce_mode_t *modep)
2717 {
2718         *modep = sp->s_rx_coalesce_mode;
2719 }
2720 
2721 int
2722 sfxge_rx_coalesce_mode_set(sfxge_t *sp, sfxge_rx_coalesce_mode_t mode)
2723 {
2724         int rc;
2725 
2726         switch (mode) {
2727         case SFXGE_RX_COALESCE_OFF:
2728         case SFXGE_RX_COALESCE_DISALLOW_PUSH:
2729         case SFXGE_RX_COALESCE_ALLOW_PUSH:
2730                 break;
2731 
2732         default:
2733                 rc = EINVAL;
2734                 goto fail1;
2735         }
2736 
2737         sp->s_rx_coalesce_mode = mode;
2738 
2739         return (0);
2740 
2741 fail1:
2742         DTRACE_PROBE1(fail1, int, rc);
2743 
2744         return (rc);
2745 }
2746 
2747 void
2748 sfxge_rx_stop(sfxge_t *sp)
2749 {
2750         sfxge_mac_t *smp = &(sp->s_mac);
2751         sfxge_intr_t *sip = &(sp->s_intr);
2752         efx_nic_t *enp = sp->s_enp;
2753         int index;
2754 
2755         ASSERT(mutex_owned(&(sp->s_state_lock)));
2756 
2757         efx_mac_filter_default_rxq_clear(enp);
2758 
2759         /* Stop the receive queue(s) */
2760         index = sip->si_nalloc;
2761         while (--index >= 0) {
2762                 /* TBD: flush RXQs in parallel; the HW limits concurrent flushes, so retries may be needed */
2763                 sfxge_rx_qstop(sp, index);
2764         }
2765 
2766         sfxge_rx_scale_stop(sp);
2767 
2768         mutex_enter(&(smp->sm_lock));
2769 
2770         /* Tear down the receive module */
2771         efx_rx_fini(enp);
2772 
2773         sp->s_rx_buffer_align = 0;
2774         sp->s_rx_prefix_size = 0;
2775         sp->s_rx_buffer_size = 0;
2776 
2777         mutex_exit(&(smp->sm_lock));
2778 }
2779 
2780 unsigned int
2781 sfxge_rx_loaned(sfxge_t *sp)
2782 {
2783         sfxge_intr_t *sip = &(sp->s_intr);
2784         int index;
2785         unsigned int loaned;
2786 
2787         ASSERT3U(sip->si_state, ==, SFXGE_INTR_INITIALIZED);
2788 
2789         loaned = 0;
2790         for (index = 0; index < sip->si_nalloc; index++) {
2791                 sfxge_rxq_t *srp = sp->s_srp[index];
2792                 sfxge_evq_t *sep = sp->s_sep[srp->sr_index];
2793 
2794                 mutex_enter(&(sep->se_lock));
2795 
2796                 loaned += sfxge_rx_qfpp_swizzle(srp);
2797 
2798                 mutex_exit(&(sep->se_lock));
2799         }
2800 
2801         return (loaned);
2802 }
2803 
2804 void
2805 sfxge_rx_fini(sfxge_t *sp)
2806 {
2807         sfxge_intr_t *sip = &(sp->s_intr);
2808         int index;
2809 
2810         ASSERT3U(sip->si_state, ==, SFXGE_INTR_INITIALIZED);
2811 
2812         sp->s_rx_coalesce_mode = SFXGE_RX_COALESCE_OFF;
2813 
2814         /* Tear down the receive queue(s) */
2815         index = sip->si_nalloc;
2816         while (--index >= 0)
2817                 sfxge_rx_qfini(sp, index);
2818 
2819         ASSERT3U(sp->s_rx_pkt_mem_alloc, ==, 0);
2820 
2821         kmem_cache_destroy(sp->s_rqc);
2822         sp->s_rqc = NULL;
2823 
2824         kmem_cache_destroy(sp->s_rpc);
2825         sp->s_rpc = NULL;
2826 
2827         sfxge_rx_scale_fini(sp);
2828 }