1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2008-2013 Solarflare Communications Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 #include <sys/types.h>
  28 #include <sys/sysmacros.h>
  29 #include <sys/ddi.h>
  30 #include <sys/sunddi.h>
  31 #include <sys/atomic.h>
  32 #include <sys/stream.h>
  33 #include <sys/strsun.h>
  34 #include <sys/strsubr.h>
  35 #include <sys/pattr.h>
  36 #include <sys/cpu.h>
  37 
  38 #include <sys/ethernet.h>
  39 #include <inet/ip.h>
  40 
  41 #include <netinet/in.h>
  42 #include <netinet/ip.h>
  43 #include <netinet/tcp.h>
  44 
  45 #include "sfxge.h"
  46 
  47 #include "efx.h"
  48 
  49 /* TXQ flush response timeout (in microseconds) */
  50 #define SFXGE_TX_QFLUSH_USEC    (2000000)
  51 #define EVQ_0 0
  52 
  53 /* See sfxge.conf.private for descriptions */
  54 #define SFXGE_TX_DPL_GET_PKT_LIMIT_DEFAULT 4096
  55 #define SFXGE_TX_DPL_PUT_PKT_LIMIT_DEFAULT 256
  56 
  57 
  58 /* Transmit buffer DMA attributes */
  59 static ddi_device_acc_attr_t sfxge_tx_buffer_devacc = {
  60 
  61         DDI_DEVICE_ATTR_V0,     /* devacc_attr_version */
  62         DDI_NEVERSWAP_ACC,      /* devacc_attr_endian_flags */
  63         DDI_STRICTORDER_ACC     /* devacc_attr_dataorder */
  64 };
  65 
  66 static ddi_dma_attr_t sfxge_tx_buffer_dma_attr = {
  67         DMA_ATTR_V0,            /* dma_attr_version     */
  68         0,                      /* dma_attr_addr_lo     */
  69         0xffffffffffffffffull,  /* dma_attr_addr_hi     */
  70         0xffffffffffffffffull,  /* dma_attr_count_max   */
  71         SFXGE_TX_BUFFER_SIZE,   /* dma_attr_align       */
  72         0xffffffff,             /* dma_attr_burstsizes  */
  73         1,                      /* dma_attr_minxfer     */
  74         0xffffffffffffffffull,  /* dma_attr_maxxfer     */
  75         0xffffffffffffffffull,  /* dma_attr_seg         */
  76         1,                      /* dma_attr_sgllen      */
  77         1,                      /* dma_attr_granular    */
  78         0                       /* dma_attr_flags       */
  79 };
  80 
  81 /* Transmit mapping DMA attributes */
  82 static ddi_dma_attr_t sfxge_tx_mapping_dma_attr = {
  83         DMA_ATTR_V0,            /* dma_attr_version     */
  84         0,                      /* dma_attr_addr_lo     */
  85         0xffffffffffffffffull,  /* dma_attr_addr_hi     */
  86         0xffffffffffffffffull,  /* dma_attr_count_max   */
  87         1,                      /* dma_attr_align       */
  88         0xffffffff,             /* dma_attr_burstsizes  */
  89         1,                      /* dma_attr_minxfer     */
  90         0xffffffffffffffffull,  /* dma_attr_maxxfer     */
  91         0xffffffffffffffffull,  /* dma_attr_seg         */
  92         0x7fffffff,             /* dma_attr_sgllen      */
  93         1,                      /* dma_attr_granular    */
  94         0                       /* dma_attr_flags       */
  95 };
  96 
  97 /* Transmit queue DMA attributes */
  98 static ddi_device_acc_attr_t sfxge_txq_devacc = {
  99 
 100         DDI_DEVICE_ATTR_V0,     /* devacc_attr_version */
 101         DDI_NEVERSWAP_ACC,      /* devacc_attr_endian_flags */
 102         DDI_STRICTORDER_ACC     /* devacc_attr_dataorder */
 103 };
 104 
 105 static ddi_dma_attr_t sfxge_txq_dma_attr = {
 106         DMA_ATTR_V0,            /* dma_attr_version     */
 107         0,                      /* dma_attr_addr_lo     */
 108         0xffffffffffffffffull,  /* dma_attr_addr_hi     */
 109         0xffffffffffffffffull,  /* dma_attr_count_max   */
 110         EFX_BUF_SIZE,           /* dma_attr_align       */
 111         0xffffffff,             /* dma_attr_burstsizes  */
 112         1,                      /* dma_attr_minxfer     */
 113         0xffffffffffffffffull,  /* dma_attr_maxxfer     */
 114         0xffffffffffffffffull,  /* dma_attr_seg         */
 115         1,                      /* dma_attr_sgllen      */
 116         1,                      /* dma_attr_granular    */
 117         0                       /* dma_attr_flags       */
 118 };
 119 
 120 
 121 /*
  122  * An sfxge_tx_qdpl_swizzle() can happen when the DPL get list is one packet
  123  * under the limit, and must then move all packets from the DPL put list to
  124  * the get list. Hence this is the real maximum length of the TX DPL get list.
 125  */
 126 static int
 127 sfxge_tx_dpl_get_pkt_max(sfxge_txq_t *stp)
 128 {
 129         sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
 130         return (stdp->get_pkt_limit + stdp->put_pkt_limit - 1);
 131 }
 132 
 133 
 134 static int
 135 sfxge_tx_packet_ctor(void *buf, void *arg, int kmflags)
 136 {
 137         _NOTE(ARGUNUSED(arg, kmflags))
 138 
 139         bzero(buf, sizeof (sfxge_tx_packet_t));
 140 
 141         return (0);
 142 }
 143 
 144 static void
 145 sfxge_tx_packet_dtor(void *buf, void *arg)
 146 {
 147         sfxge_tx_packet_t *stpp = buf;
 148 
 149         _NOTE(ARGUNUSED(arg))
 150 
 151         SFXGE_OBJ_CHECK(stpp, sfxge_tx_packet_t);
 152 }
 153 
 154 static int
 155 sfxge_tx_buffer_ctor(void *buf, void *arg, int kmflags)
 156 {
 157         sfxge_tx_buffer_t *stbp = buf;
 158         sfxge_t *sp = arg;
 159         sfxge_dma_buffer_attr_t dma_attr;
 160         int rc;
 161 
 162         bzero(buf, sizeof (sfxge_tx_buffer_t));
 163 
 164         dma_attr.sdba_dip        = sp->s_dip;
 165         dma_attr.sdba_dattrp     = &sfxge_tx_buffer_dma_attr;
 166         dma_attr.sdba_callback   = ((kmflags == KM_SLEEP) ?
 167             DDI_DMA_SLEEP : DDI_DMA_DONTWAIT);
 168         dma_attr.sdba_length     = SFXGE_TX_BUFFER_SIZE;
 169         dma_attr.sdba_memflags   = DDI_DMA_STREAMING;
 170         dma_attr.sdba_devaccp    = &sfxge_tx_buffer_devacc;
 171         dma_attr.sdba_bindflags  = DDI_DMA_WRITE | DDI_DMA_STREAMING;
 172         dma_attr.sdba_maxcookies = 1;
 173         dma_attr.sdba_zeroinit   = B_FALSE;
 174 
 175         if ((rc = sfxge_dma_buffer_create(&(stbp->stb_esm), &dma_attr)) != 0)
 176                 goto fail1;
 177 
 178         return (0);
 179 
 180 fail1:
 181         DTRACE_PROBE1(fail1, int, rc);
 182 
 183         SFXGE_OBJ_CHECK(stbp, sfxge_tx_buffer_t);
 184 
 185         return (-1);
 186 }
 187 
 188 static void
 189 sfxge_tx_buffer_dtor(void *buf, void *arg)
 190 {
 191         sfxge_tx_buffer_t *stbp = buf;
 192 
 193         _NOTE(ARGUNUSED(arg))
 194 
 195         sfxge_dma_buffer_destroy(&(stbp->stb_esm));
 196 
 197         SFXGE_OBJ_CHECK(stbp, sfxge_tx_buffer_t);
 198 }
 199 
 200 static int
 201 sfxge_tx_mapping_ctor(void *buf, void *arg, int kmflags)
 202 {
 203         sfxge_tx_mapping_t *stmp = buf;
 204         sfxge_t *sp = arg;
 205         dev_info_t *dip = sp->s_dip;
 206         int rc;
 207 
 208         bzero(buf, sizeof (sfxge_tx_mapping_t));
 209 
 210         stmp->stm_sp = sp;
 211 
 212         /* Allocate DMA handle */
 213         rc = ddi_dma_alloc_handle(dip, &sfxge_tx_mapping_dma_attr,
 214             (kmflags == KM_SLEEP) ? DDI_DMA_SLEEP : DDI_DMA_DONTWAIT,
 215             NULL, &(stmp->stm_dma_handle));
 216         if (rc != DDI_SUCCESS)
 217                 goto fail1;
 218 
 219         return (0);
 220 
 221 fail1:
 222         DTRACE_PROBE1(fail1, int, rc);
 223 
 224         stmp->stm_sp = NULL;
 225 
 226         SFXGE_OBJ_CHECK(stmp, sfxge_tx_mapping_t);
 227 
 228         return (-1);
 229 }
 230 
 231 static void
 232 sfxge_tx_mapping_dtor(void *buf, void *arg)
 233 {
 234         sfxge_tx_mapping_t *stmp = buf;
 235 
 236         _NOTE(ARGUNUSED(arg))
 237 
 238         ASSERT3P(stmp->stm_sp, ==, arg);
 239 
 240         /* Free the DMA handle */
 241         ddi_dma_free_handle(&(stmp->stm_dma_handle));
 242         stmp->stm_dma_handle = NULL;
 243 
 244         stmp->stm_sp = NULL;
 245 
 246         SFXGE_OBJ_CHECK(stmp, sfxge_tx_mapping_t);
 247 }
 248 
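/*
 * kmem cache constructor for transmit queues: allocates the descriptor ring
 * DMA memory, buffer table entries and the per-descriptor context arrays.
 */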
 249 static int
 250 sfxge_tx_qctor(void *buf, void *arg, int kmflags)
 251 {
 252         sfxge_txq_t *stp = buf;
 253         efsys_mem_t *esmp = &(stp->st_mem);
 254         sfxge_t *sp = arg;
 255         sfxge_dma_buffer_attr_t dma_attr;
 256         sfxge_tx_dpl_t *stdp;
 257         int rc;
 258 
 259         /* Compile-time structure layout checks */
 260         EFX_STATIC_ASSERT(sizeof (stp->__st_u1.__st_s1) <=
 261             sizeof (stp->__st_u1.__st_pad));
 262         EFX_STATIC_ASSERT(sizeof (stp->__st_u2.__st_s2) <=
 263             sizeof (stp->__st_u2.__st_pad));
 264         EFX_STATIC_ASSERT(sizeof (stp->__st_u3.__st_s3) <=
 265             sizeof (stp->__st_u3.__st_pad));
 266         EFX_STATIC_ASSERT(sizeof (stp->__st_u4.__st_s4) <=
 267             sizeof (stp->__st_u4.__st_pad));
 268 
 269         bzero(buf, sizeof (sfxge_txq_t));
 270 
 271         stp->st_sp = sp;
 272 
 273         dma_attr.sdba_dip        = sp->s_dip;
 274         dma_attr.sdba_dattrp     = &sfxge_txq_dma_attr;
 275         dma_attr.sdba_callback   = DDI_DMA_SLEEP;
 276         dma_attr.sdba_length     = EFX_TXQ_SIZE(SFXGE_TX_NDESCS);
 277         dma_attr.sdba_memflags   = DDI_DMA_CONSISTENT;
 278         dma_attr.sdba_devaccp    = &sfxge_txq_devacc;
 279         dma_attr.sdba_bindflags  = DDI_DMA_READ | DDI_DMA_CONSISTENT;
 280         dma_attr.sdba_maxcookies = EFX_TXQ_NBUFS(SFXGE_TX_NDESCS);
 281         dma_attr.sdba_zeroinit   = B_FALSE;
 282 
 283         if ((rc = sfxge_dma_buffer_create(esmp, &dma_attr)) != 0)
 284                 goto fail1;
 285 
 286         /* Allocate some buffer table entries */
 287         if ((rc = sfxge_sram_buf_tbl_alloc(sp, EFX_TXQ_NBUFS(SFXGE_TX_NDESCS),
 288             &(stp->st_id))) != 0)
 289                 goto fail2;
 290 
 291         /* Allocate the descriptor array */
 292         if ((stp->st_eb = kmem_zalloc(sizeof (efx_buffer_t) *
 293             EFX_TXQ_LIMIT(SFXGE_TX_NDESCS), kmflags)) == NULL) {
 294                 rc = ENOMEM;
 295                 goto fail3;
 296         }
 297 
 298         /* Allocate the context arrays */
 299         if ((stp->st_stmp = kmem_zalloc(sizeof (sfxge_tx_mapping_t *) *
 300             SFXGE_TX_NDESCS, kmflags)) == NULL) {
 301                 rc = ENOMEM;
 302                 goto fail4;
 303         }
 304 
 305         if ((stp->st_stbp = kmem_zalloc(sizeof (sfxge_tx_buffer_t *) *
 306             SFXGE_TX_NDESCS, kmflags)) == NULL) {
 307                 rc = ENOMEM;
 308                 goto fail5;
 309         }
 310 
 311         if ((stp->st_mp = kmem_zalloc(sizeof (mblk_t *) *
 312             SFXGE_TX_NDESCS, kmflags)) == NULL) {
 313                 rc = ENOMEM;
 314                 goto fail6;
 315         }
 316 
 317         /* Initialize the deferred packet list */
 318         stdp = &(stp->st_dpl);
 319         stdp->std_getp = &(stdp->std_get);
 320 
 321         stp->st_unblock = SFXGE_TXQ_NOT_BLOCKED;
 322 
 323         return (0);
 324 
 325 fail6:
 326         DTRACE_PROBE(fail6);
 327 
 328         kmem_free(stp->st_stbp, sizeof (sfxge_tx_buffer_t *) * SFXGE_TX_NDESCS);
 329         stp->st_stbp = NULL;
 330 
 331 fail5:
 332         DTRACE_PROBE(fail5);
 333 
 334         kmem_free(stp->st_stmp,
 335             sizeof (sfxge_tx_mapping_t *) * SFXGE_TX_NDESCS);
 336         stp->st_stmp = NULL;
 337 
 338 fail4:
 339         DTRACE_PROBE(fail4);
 340 
 341         /* Free the descriptor array */
 342         kmem_free(stp->st_eb, sizeof (efx_buffer_t) *
 343             EFX_TXQ_LIMIT(SFXGE_TX_NDESCS));
 344         stp->st_eb = NULL;
 345 
 346 fail3:
 347         DTRACE_PROBE(fail3);
 348 
 349         /* Free the buffer table entries */
 350         sfxge_sram_buf_tbl_free(sp, stp->st_id, EFX_TXQ_NBUFS(SFXGE_TX_NDESCS));
 351         stp->st_id = 0;
 352 
 353 fail2:
 354         DTRACE_PROBE(fail2);
 355 
 356         /* Tear down DMA setup */
 357         sfxge_dma_buffer_destroy(esmp);
 358 
 359 fail1:
 360         DTRACE_PROBE1(fail1, int, rc);
 361 
 362         stp->st_sp = NULL;
 363 
 364         SFXGE_OBJ_CHECK(stp, sfxge_txq_t);
 365 
 366         return (-1);
 367 }
 368 
 369 static void
 370 sfxge_tx_qdtor(void *buf, void *arg)
 371 {
 372         sfxge_txq_t *stp = buf;
 373         efsys_mem_t *esmp = &(stp->st_mem);
 374         sfxge_t *sp = stp->st_sp;
 375         sfxge_tx_dpl_t *stdp;
 376 
 377         _NOTE(ARGUNUSED(arg))
 378 
 379         stp->st_unblock = 0;
 380 
 381         /* Tear down the deferred packet list */
 382         stdp = &(stp->st_dpl);
 383         ASSERT3P(stdp->std_getp, ==, &(stdp->std_get));
 384         stdp->std_getp = NULL;
 385 
 386         /* Free the context arrays */
 387         kmem_free(stp->st_mp, sizeof (mblk_t *) * SFXGE_TX_NDESCS);
 388         stp->st_mp = NULL;
 389 
 390         kmem_free(stp->st_stbp, sizeof (sfxge_tx_buffer_t *) * SFXGE_TX_NDESCS);
 391         stp->st_stbp = NULL;
 392 
 393         kmem_free(stp->st_stmp,
 394             sizeof (sfxge_tx_mapping_t *) * SFXGE_TX_NDESCS);
 395         stp->st_stmp = NULL;
 396 
 397         /* Free the descriptor array */
 398         kmem_free(stp->st_eb, sizeof (efx_buffer_t) *
 399             EFX_TXQ_LIMIT(SFXGE_TX_NDESCS));
 400         stp->st_eb = NULL;
 401 
 402         /* Free the buffer table entries */
 403         sfxge_sram_buf_tbl_free(sp, stp->st_id, EFX_TXQ_NBUFS(SFXGE_TX_NDESCS));
 404         stp->st_id = 0;
 405 
  406         /* Tear down DMA setup */
 407         sfxge_dma_buffer_destroy(esmp);
 408 
 409         stp->st_sp = NULL;
 410 
 411         SFXGE_OBJ_CHECK(stp, sfxge_txq_t);
 412 }
 413 
 414 static void
 415 sfxge_tx_packet_destroy(sfxge_t *sp, sfxge_tx_packet_t *stpp)
 416 {
 417         kmem_cache_free(sp->s_tpc, stpp);
 418 }
 419 
 420 static sfxge_tx_packet_t *
 421 sfxge_tx_packet_create(sfxge_t *sp)
 422 {
 423         sfxge_tx_packet_t *stpp;
 424 
 425         stpp = kmem_cache_alloc(sp->s_tpc, KM_NOSLEEP);
 426 
 427         return (stpp);
 428 }
 429 
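/*
 * Free packet pool (FPP): a per-queue list of recycled packet objects,
 * protected by the queue lock and bounded by SFXGE_TX_FPP_MAX entries.
 */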
 430 static inline int
 431 sfxge_tx_qfpp_put(sfxge_txq_t *stp, sfxge_tx_packet_t *stpp)
 432 {
 433         sfxge_tx_fpp_t *stfp = &(stp->st_fpp);
 434 
 435         ASSERT(mutex_owned(&(stp->st_lock)));
 436 
 437         ASSERT3P(stpp->stp_next, ==, NULL);
 438         ASSERT3P(stpp->stp_mp, ==, NULL);
 439         ASSERT3P(stpp->stp_etherhp, ==, NULL);
 440         ASSERT3P(stpp->stp_iphp, ==, NULL);
 441         ASSERT3P(stpp->stp_thp, ==, NULL);
 442         ASSERT3U(stpp->stp_off, ==, 0);
 443         ASSERT3U(stpp->stp_size, ==, 0);
 444         ASSERT3U(stpp->stp_mss, ==, 0);
 445         ASSERT3U(stpp->stp_dpl_put_len, ==, 0);
 446 
 447         if (stfp->stf_count < SFXGE_TX_FPP_MAX) {
 448                 /* Add to the start of the list */
 449                 stpp->stp_next = stfp->stf_stpp;
 450                 stfp->stf_stpp = stpp;
 451                 stfp->stf_count++;
 452 
 453                 return (0);
 454         }
 455 
 456         DTRACE_PROBE(fpp_full);
 457         return (ENOSPC);
 458 }
 459 
 460 static inline sfxge_tx_packet_t *
 461 sfxge_tx_qfpp_get(sfxge_txq_t *stp)
 462 {
 463         sfxge_tx_packet_t *stpp;
 464         sfxge_tx_fpp_t *stfp = &(stp->st_fpp);
 465 
 466         ASSERT(mutex_owned(&(stp->st_lock)));
 467 
 468         stpp = stfp->stf_stpp;
 469         if (stpp == NULL) {
 470                 ASSERT3U(stfp->stf_count, ==, 0);
 471                 return (NULL);
 472         }
 473 
 474         /* Remove item from the head of the list */
 475         stfp->stf_stpp = stpp->stp_next;
 476         stpp->stp_next = NULL;
 477 
 478         ASSERT3U(stfp->stf_count, >, 0);
 479         stfp->stf_count--;
 480 
 481         if (stfp->stf_count != 0) {
 482                 ASSERT(stfp->stf_stpp != NULL);
 483                 prefetch_read_many(stfp->stf_stpp);
 484         }
 485         return (stpp);
 486 }
 487 
 488 static void
 489 sfxge_tx_qfpp_empty(sfxge_txq_t *stp)
 490 {
 491         sfxge_t *sp = stp->st_sp;
 492         sfxge_tx_fpp_t *stfp = &(stp->st_fpp);
 493         sfxge_tx_packet_t *stpp;
 494 
 495         mutex_enter(&(stp->st_lock));
 496 
 497         stpp = stfp->stf_stpp;
 498         stfp->stf_stpp = NULL;
 499 
 500         while (stpp != NULL) {
 501                 sfxge_tx_packet_t *next;
 502 
 503                 next = stpp->stp_next;
 504                 stpp->stp_next = NULL;
 505 
 506                 ASSERT3U(stfp->stf_count, >, 0);
 507                 stfp->stf_count--;
 508 
 509                 sfxge_tx_packet_destroy(sp, stpp);
 510 
 511                 stpp = next;
 512         }
 513         ASSERT3U(stfp->stf_count, ==, 0);
 514 
 515         mutex_exit(&(stp->st_lock));
 516 }
 517 
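/* Free buffer pool (FBP): a per-queue list of recycled copy buffers */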
 518 static inline void
 519 sfxge_tx_qfbp_put(sfxge_txq_t *stp, sfxge_tx_buffer_t *stbp)
 520 {
 521         sfxge_tx_fbp_t *stfp = &(stp->st_fbp);
 522 
 523         ASSERT3P(stbp->stb_next, ==, NULL);
 524         ASSERT3U(stbp->stb_off, ==, 0);
 525         ASSERT3U(stbp->stb_esm.esm_size, ==, 0);
 526 
 527         stbp->stb_next = stfp->stf_stbp;
 528         stfp->stf_stbp = stbp;
 529         stfp->stf_count++;
 530 }
 531 
 532 
 533 static inline sfxge_tx_buffer_t *
 534 sfxge_tx_qfbp_get(sfxge_txq_t *stp)
 535 {
 536         sfxge_tx_buffer_t *stbp;
 537         sfxge_tx_fbp_t *stfp = &(stp->st_fbp);
 538 
 539         stbp = stfp->stf_stbp;
 540         if (stbp == NULL) {
 541                 ASSERT3U(stfp->stf_count, ==, 0);
 542                 return (NULL);
 543         }
 544 
 545         stfp->stf_stbp = stbp->stb_next;
 546         stbp->stb_next = NULL;
 547 
 548         ASSERT3U(stfp->stf_count, >, 0);
 549         stfp->stf_count--;
 550 
 551         if (stfp->stf_count != 0) {
 552                 ASSERT(stfp->stf_stbp != NULL);
 553                 prefetch_read_many(stfp->stf_stbp);
 554         }
 555 
 556         return (stbp);
 557 }
 558 
 559 static void
 560 sfxge_tx_qfbp_empty(sfxge_txq_t *stp)
 561 {
 562         sfxge_t *sp = stp->st_sp;
 563         sfxge_tx_fbp_t *stfp = &(stp->st_fbp);
 564         sfxge_tx_buffer_t *stbp;
 565 
 566         mutex_enter(&(stp->st_lock));
 567 
 568         stbp = stfp->stf_stbp;
 569         stfp->stf_stbp = NULL;
 570 
 571         while (stbp != NULL) {
 572                 sfxge_tx_buffer_t *next;
 573 
 574                 next = stbp->stb_next;
 575                 stbp->stb_next = NULL;
 576 
 577                 ASSERT3U(stfp->stf_count, >, 0);
 578                 stfp->stf_count--;
 579 
 580                 kmem_cache_free(sp->s_tbc, stbp);
 581 
 582                 stbp = next;
 583         }
 584         ASSERT3U(stfp->stf_count, ==, 0);
 585 
 586         mutex_exit(&(stp->st_lock));
 587 }
 588 
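/* Free mapping pool (FMP): a per-queue list of recycled DMA mappings */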
 589 static inline void
 590 sfxge_tx_qfmp_put(sfxge_txq_t *stp, sfxge_tx_mapping_t *stmp)
 591 {
 592         sfxge_tx_fmp_t *stfp = &(stp->st_fmp);
 593 
 594         ASSERT3P(stmp->stm_next, ==, NULL);
 595         ASSERT3P(stmp->stm_mp, ==, NULL);
 596         ASSERT3P(stmp->stm_base, ==, NULL);
 597         ASSERT3U(stmp->stm_off, ==, 0);
 598         ASSERT3U(stmp->stm_size, ==, 0);
 599 
 600         stmp->stm_next = stfp->stf_stmp;
 601         stfp->stf_stmp = stmp;
 602         stfp->stf_count++;
 603 }
 604 
 605 static inline sfxge_tx_mapping_t *
 606 sfxge_tx_qfmp_get(sfxge_txq_t *stp)
 607 {
 608         sfxge_tx_mapping_t *stmp;
 609         sfxge_tx_fmp_t *stfp = &(stp->st_fmp);
 610 
 611         stmp = stfp->stf_stmp;
 612         if (stmp == NULL) {
 613                 ASSERT3U(stfp->stf_count, ==, 0);
 614                 return (NULL);
 615         }
 616 
 617         stfp->stf_stmp = stmp->stm_next;
 618         stmp->stm_next = NULL;
 619 
 620         ASSERT3U(stfp->stf_count, >, 0);
 621         stfp->stf_count--;
 622 
 623         if (stfp->stf_count != 0) {
 624                 ASSERT(stfp->stf_stmp != NULL);
 625                 prefetch_read_many(stfp->stf_stmp);
 626         }
 627         return (stmp);
 628 }
 629 
 630 static void
 631 sfxge_tx_qfmp_empty(sfxge_txq_t *stp)
 632 {
 633         sfxge_t *sp = stp->st_sp;
 634         sfxge_tx_fmp_t *stfp = &(stp->st_fmp);
 635         sfxge_tx_mapping_t *stmp;
 636 
 637         mutex_enter(&(stp->st_lock));
 638 
 639         stmp = stfp->stf_stmp;
 640         stfp->stf_stmp = NULL;
 641 
 642         while (stmp != NULL) {
 643                 sfxge_tx_mapping_t *next;
 644 
 645                 next = stmp->stm_next;
 646                 stmp->stm_next = NULL;
 647 
 648                 ASSERT3U(stfp->stf_count, >, 0);
 649                 stfp->stf_count--;
 650 
 651                 kmem_cache_free(sp->s_tmc, stmp);
 652 
 653                 stmp = next;
 654         }
 655         ASSERT3U(stfp->stf_count, ==, 0);
 656 
 657         mutex_exit(&(stp->st_lock));
 658 }
 659 
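/* Unbind a STREAMS data block from its DMA mapping and clear the address array */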
 660 static void
 661 sfxge_tx_msgb_unbind(sfxge_tx_mapping_t *stmp)
 662 {
 663         bzero(stmp->stm_addr, sizeof (uint64_t) * SFXGE_TX_MAPPING_NADDR);
 664         stmp->stm_off = 0;
 665 
 666         (void) ddi_dma_unbind_handle(stmp->stm_dma_handle);
 667 
 668         stmp->stm_size = 0;
 669         stmp->stm_base = NULL;
 670 
 671         stmp->stm_mp = NULL;
 672 }
 673 
 674 #define SFXGE_TX_DESCSHIFT      12
 675 #define SFXGE_TX_DESCSIZE       (1 << 12)
 676 
 677 #define SFXGE_TX_DESCOFFSET     (SFXGE_TX_DESCSIZE - 1)
 678 #define SFXGE_TX_DESCMASK       (~SFXGE_TX_DESCOFFSET)
 679 
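/*
 * Bind a STREAMS data block to a DMA mapping, recording the descriptor-page
 * aligned addresses and the offset into the first page.
 */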
 680 static int
 681 sfxge_tx_msgb_bind(mblk_t *mp, sfxge_tx_mapping_t *stmp)
 682 {
 683         ddi_dma_cookie_t dmac;
 684         unsigned int ncookies;
 685         size_t size;
 686         unsigned int n;
 687         int rc;
 688 
 689         ASSERT(mp != NULL);
 690         ASSERT3U(DB_TYPE(mp), ==, M_DATA);
 691 
 692         ASSERT(stmp->stm_mp == NULL);
 693         stmp->stm_mp = mp;
 694 
 695         stmp->stm_base = (caddr_t)(mp->b_rptr);
 696         stmp->stm_size = MBLKL(mp);
 697 
 698         /* Bind the STREAMS block to the mapping */
 699         rc = ddi_dma_addr_bind_handle(stmp->stm_dma_handle, NULL,
 700             stmp->stm_base, stmp->stm_size, DDI_DMA_WRITE | DDI_DMA_STREAMING,
 701             DDI_DMA_DONTWAIT, NULL, &dmac, &ncookies);
 702         if (rc != DDI_DMA_MAPPED)
 703                 goto fail1;
 704 
 705         ASSERT3U(ncookies, <=, SFXGE_TX_MAPPING_NADDR);
 706 
 707         /*
 708          * Construct an array of addresses and an initial
 709          * offset.
 710          */
 711         n = 0;
 712         stmp->stm_addr[n++] = dmac.dmac_laddress & SFXGE_TX_DESCMASK;
 713         DTRACE_PROBE1(addr, uint64_t, dmac.dmac_laddress & SFXGE_TX_DESCMASK);
 714 
 715         stmp->stm_off = dmac.dmac_laddress & SFXGE_TX_DESCOFFSET;
 716 
 717         size = MIN(SFXGE_TX_DESCSIZE - stmp->stm_off, dmac.dmac_size);
 718         dmac.dmac_laddress += size;
 719         dmac.dmac_size -= size;
 720 
 721         for (;;) {
 722                 ASSERT3U(n, <, SFXGE_TX_MAPPING_NADDR);
 723 
 724                 if (dmac.dmac_size == 0) {
 725                         if (--ncookies == 0)
 726                                 break;
 727 
 728                         ddi_dma_nextcookie(stmp->stm_dma_handle, &dmac);
 729                 }
 730 
 731                 ASSERT((dmac.dmac_laddress & SFXGE_TX_DESCMASK) != 0);
 732                 ASSERT((dmac.dmac_laddress & SFXGE_TX_DESCOFFSET) == 0);
 733                 stmp->stm_addr[n++] = dmac.dmac_laddress;
 734                 DTRACE_PROBE1(addr, uint64_t, dmac.dmac_laddress);
 735 
 736                 size = MIN(SFXGE_TX_DESCSIZE, dmac.dmac_size);
 737                 dmac.dmac_laddress += size;
 738                 dmac.dmac_size -= size;
 739         }
 740         ASSERT3U(n, <=, SFXGE_TX_MAPPING_NADDR);
 741 
 742         return (0);
 743 
 744 fail1:
 745         DTRACE_PROBE1(fail1, int, rc);
 746 
 747         stmp->stm_size = 0;
 748         stmp->stm_base = NULL;
 749 
 750         stmp->stm_mp = NULL;
 751 
 752         return (-1);
 753 }
 754 
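/*
 * Reap completed descriptors, returning their DMA mappings and copy buffers
 * to the per-queue free pools.
 */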
 755 static void
 756 sfxge_tx_qreap(sfxge_txq_t *stp)
 757 {
 758         unsigned int reaped;
 759 
 760         ASSERT(mutex_owned(&(stp->st_lock)));
 761 
 762         reaped = stp->st_reaped;
 763         while (reaped != stp->st_completed) {
 764                 unsigned int id;
 765                 sfxge_tx_mapping_t *stmp;
 766                 sfxge_tx_buffer_t *stbp;
 767 
 768                 id = reaped++ & (SFXGE_TX_NDESCS - 1);
 769 
 770                 ASSERT3P(stp->st_mp[id], ==, NULL);
 771 
 772                 if ((stmp = stp->st_stmp[id]) != NULL) {
 773                         stp->st_stmp[id] = NULL;
 774 
 775                         /* Free all the mappings */
 776                         do {
 777                                 sfxge_tx_mapping_t *next;
 778 
 779                                 next = stmp->stm_next;
 780                                 stmp->stm_next = NULL;
 781 
 782                                 sfxge_tx_qfmp_put(stp, stmp);
 783 
 784                                 stmp = next;
 785                         } while (stmp != NULL);
 786                 }
 787 
 788                 if ((stbp = stp->st_stbp[id]) != NULL) {
 789                         stp->st_stbp[id] = NULL;
 790 
 791                         /* Free all the buffers */
 792                         do {
 793                                 sfxge_tx_buffer_t *next;
 794 
 795                                 next = stbp->stb_next;
 796                                 stbp->stb_next = NULL;
 797 
 798                                 stbp->stb_esm.esm_size = 0;
 799                                 stbp->stb_off = 0;
 800 
 801                                 sfxge_tx_qfbp_put(stp, stbp);
 802 
 803                                 stbp = next;
 804                         } while (stbp != NULL);
 805                 }
 806         }
 807         stp->st_reaped = reaped;
 808 }
 809 
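/*
 * Abort the current fragment list, unbinding and freeing any mappings,
 * buffers and message blocks associated with it.
 */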
 810 static void
 811 sfxge_tx_qlist_abort(sfxge_txq_t *stp)
 812 {
 813         unsigned int id;
 814         sfxge_tx_mapping_t *stmp;
 815         sfxge_tx_buffer_t *stbp;
 816         mblk_t *mp;
 817 
 818         ASSERT(mutex_owned(&(stp->st_lock)));
 819 
 820         id = stp->st_added & (SFXGE_TX_NDESCS - 1);
 821 
 822         /* Clear the completion information */
 823         stmp = stp->st_stmp[id];
 824         stp->st_stmp[id] = NULL;
 825 
 826         /* Free any mappings that were used */
 827         while (stmp != NULL) {
 828                 sfxge_tx_mapping_t *next;
 829 
 830                 next = stmp->stm_next;
 831                 stmp->stm_next = NULL;
 832 
 833                 if (stmp->stm_mp != NULL)
 834                         sfxge_tx_msgb_unbind(stmp);
 835 
 836                 sfxge_tx_qfmp_put(stp, stmp);
 837 
 838                 stmp = next;
 839         }
 840 
 841         stbp = stp->st_stbp[id];
 842         stp->st_stbp[id] = NULL;
 843 
 844         /* Free any buffers that were used */
 845         while (stbp != NULL) {
 846                 sfxge_tx_buffer_t *next;
 847 
 848                 next = stbp->stb_next;
 849                 stbp->stb_next = NULL;
 850 
 851                 stbp->stb_off = 0;
 852                 stbp->stb_esm.esm_size = 0;
 853 
 854                 sfxge_tx_qfbp_put(stp, stbp);
 855 
 856                 stbp = next;
 857         }
 858 
 859         mp = stp->st_mp[id];
 860         stp->st_mp[id] = NULL;
 861 
 862         if (mp != NULL)
 863                 freemsg(mp);
 864 
 865         /* Clear the fragment list */
 866         stp->st_n = 0;
 867 }
 868 
  869 /* Push descriptors to the TX ring, marking the queue blocked if there is no space */
 870 static void
 871 sfxge_tx_qlist_post(sfxge_txq_t *stp)
 872 {
 873         unsigned int id;
 874         unsigned int level;
 875         unsigned int available;
 876         int rc;
 877 
 878         ASSERT(mutex_owned(&(stp->st_lock)));
 879 
 880         ASSERT(stp->st_n != 0);
 881 
 882 again:
 883         level = stp->st_added - stp->st_reaped;
 884         available = EFX_TXQ_LIMIT(SFXGE_TX_NDESCS) - level;
 885 
 886         id = stp->st_added & (SFXGE_TX_NDESCS - 1);
 887 
 888         if (available < stp->st_n) {
 889                 rc = ENOSPC;
 890                 goto fail1;
 891         }
 892 
 893         ASSERT3U(available, >=, stp->st_n);
 894 
 895         /* Post the fragment list */
 896         if ((rc = efx_tx_qpost(stp->st_etp, stp->st_eb, stp->st_n,
 897             stp->st_reaped, &(stp->st_added))) != 0)
 898                 goto fail2;
 899 
 900         /*
 901          * If the list took more than a single descriptor then we need to
  902          * move the completion information so it is referenced by the last
 903          * descriptor.
 904          */
 905         if (((stp->st_added - 1) & (SFXGE_TX_NDESCS - 1)) != id) {
 906                 sfxge_tx_mapping_t *stmp;
 907                 sfxge_tx_buffer_t *stbp;
 908                 mblk_t *mp;
 909 
 910                 stmp = stp->st_stmp[id];
 911                 stp->st_stmp[id] = NULL;
 912 
 913                 stbp = stp->st_stbp[id];
 914                 stp->st_stbp[id] = NULL;
 915 
 916                 mp = stp->st_mp[id];
 917                 stp->st_mp[id] = NULL;
 918 
 919                 id = (stp->st_added - 1) & (SFXGE_TX_NDESCS - 1);
 920 
 921                 ASSERT(stp->st_stmp[id] == NULL);
 922                 stp->st_stmp[id] = stmp;
 923 
 924                 ASSERT(stp->st_stbp[id] == NULL);
 925                 stp->st_stbp[id] = stbp;
 926 
 927                 ASSERT(stp->st_mp[id] == NULL);
 928                 stp->st_mp[id] = mp;
 929         }
 930 
 931         /* Make the descriptors visible to the hardware */
 932         (void) ddi_dma_sync(stp->st_mem.esm_dma_handle,
 933             0,
 934             EFX_TXQ_SIZE(SFXGE_TX_NDESCS),
 935             DDI_DMA_SYNC_FORDEV);
 936 
 937         /* Clear the list */
 938         stp->st_n = 0;
 939 
 940         ASSERT3U(stp->st_unblock, ==, SFXGE_TXQ_NOT_BLOCKED);
 941         return;
 942 
 943 fail2:
 944         DTRACE_PROBE(fail2);
 945 fail1:
 946         DTRACE_PROBE1(fail1, int, rc);
 947 
 948         ASSERT(rc == ENOSPC);
 949 
 950         level = stp->st_added - stp->st_completed;
 951         available = EFX_TXQ_LIMIT(SFXGE_TX_NDESCS) - level;
 952 
 953         /*
  954          * If there would be enough space after we have reaped any completed
  955          * mappings and buffers, and reaping would recover at least a batch of
  956          * descriptors, then reap now and try posting again.
 957          */
 958         if (stp->st_n <= available &&
 959             stp->st_completed - stp->st_reaped >= SFXGE_TX_BATCH) {
 960                 sfxge_tx_qreap(stp);
 961 
 962                 goto again;
 963         }
 964 
 965         /* Set the unblock level */
 966         if (stp->st_unblock == SFXGE_TXQ_NOT_BLOCKED) {
 967                 stp->st_unblock = SFXGE_TXQ_UNBLOCK_LEVEL1;
 968         } else {
 969                 ASSERT(stp->st_unblock == SFXGE_TXQ_UNBLOCK_LEVEL1);
 970 
 971                 stp->st_unblock = SFXGE_TXQ_UNBLOCK_LEVEL2;
 972         }
 973 
 974         /*
 975          * Avoid a race with completion interrupt handling that could leave the
 976          * queue blocked.
 977          *
 978          * NOTE: The use of st_pending rather than st_completed is intentional
 979          *       as st_pending is updated per-event rather than per-batch and
 980          *       therefore avoids needless deferring.
 981          */
 982         if (stp->st_pending == stp->st_added) {
 983                 sfxge_tx_qreap(stp);
 984 
 985                 stp->st_unblock = SFXGE_TXQ_NOT_BLOCKED;
 986                 goto again;
 987         }
 988 
 989         ASSERT(stp->st_unblock != SFXGE_TXQ_NOT_BLOCKED);
 990 }
 991 
 992 static int
 993 sfxge_tx_kstat_update(kstat_t *ksp, int rw)
 994 {
 995         sfxge_txq_t *stp = ksp->ks_private;
 996         sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
 997         kstat_named_t *knp;
 998         int rc;
 999 
1000         ASSERT(mutex_owned(&(stp->st_lock)));
1001 
1002         if (rw != KSTAT_READ) {
1003                 rc = EACCES;
1004                 goto fail1;
1005         }
1006 
1007         if (stp->st_state != SFXGE_TXQ_STARTED)
1008                 goto done;
1009 
1010         efx_tx_qstats_update(stp->st_etp, stp->st_stat);
1011         knp = (kstat_named_t *)ksp->ks_data + TX_NQSTATS;
1012         knp->value.ui64 = stdp->get_pkt_limit;
1013         knp++;
1014         knp->value.ui64 = stdp->put_pkt_limit;
1015         knp++;
1016         knp->value.ui64 = stdp->get_full_count;
1017         knp++;
1018         knp->value.ui64 = stdp->put_full_count;
1019 
1020 done:
1021         return (0);
1022 
1023 fail1:
1024         DTRACE_PROBE1(fail1, int, rc);
1025 
1026         return (rc);
1027 }
1028 
1029 static int
1030 sfxge_tx_kstat_init(sfxge_txq_t *stp)
1031 {
1032         sfxge_t *sp = stp->st_sp;
1033         unsigned int index = stp->st_index;
1034         dev_info_t *dip = sp->s_dip;
1035         kstat_t *ksp;
1036         kstat_named_t *knp;
1037         char name[MAXNAMELEN];
1038         unsigned int id;
1039         int rc;
1040 
1041         /* Create the set */
1042         (void) snprintf(name, MAXNAMELEN - 1, "%s_txq%04d",
1043             ddi_driver_name(dip), index);
1044 
1045         if ((ksp = kstat_create((char *)ddi_driver_name(dip),
1046             ddi_get_instance(dip), name, "queue", KSTAT_TYPE_NAMED,
1047             TX_NQSTATS + 4, 0)) == NULL) {
1048                 rc = ENOMEM;
1049                 goto fail1;
1050         }
1051 
1052         stp->st_ksp = ksp;
1053 
1054         ksp->ks_update = sfxge_tx_kstat_update;
1055         ksp->ks_private = stp;
1056         ksp->ks_lock = &(stp->st_lock);
1057 
 1058         /* Initialize the named stats */
1059         stp->st_stat = knp = ksp->ks_data;
1060         for (id = 0; id < TX_NQSTATS; id++) {
1061                 kstat_named_init(knp, (char *)efx_tx_qstat_name(sp->s_enp, id),
1062                     KSTAT_DATA_UINT64);
1063                 knp++;
1064         }
1065         kstat_named_init(knp, "dpl_get_pkt_limit", KSTAT_DATA_UINT64);
1066         knp++;
1067         kstat_named_init(knp, "dpl_put_pkt_limit", KSTAT_DATA_UINT64);
1068         knp++;
1069         kstat_named_init(knp, "dpl_get_full_count", KSTAT_DATA_UINT64);
1070         knp++;
1071         kstat_named_init(knp, "dpl_put_full_count", KSTAT_DATA_UINT64);
1072 
1073         kstat_install(ksp);
1074         return (0);
1075 
1076 fail1:
1077         DTRACE_PROBE1(fail1, int, rc);
1078 
1079         return (rc);
1080 }
1081 
1082 static void
1083 sfxge_tx_kstat_fini(sfxge_txq_t *stp)
1084 {
1085         /* Destroy the set */
1086         kstat_delete(stp->st_ksp);
1087         stp->st_ksp = NULL;
1088         stp->st_stat = NULL;
1089 }
1090 
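/*
 * Initialize a transmit queue: allocate the queue object, set up kstats and
 * the deferred packet list limits, and attach the queue to the driver.
 */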
1091 static int
1092 sfxge_tx_qinit(sfxge_t *sp, unsigned int index, sfxge_txq_type_t type,
1093     unsigned int evq)
1094 {
1095         sfxge_txq_t *stp;
1096         sfxge_tx_dpl_t *stdp;
1097         int rc;
1098 
1099         ASSERT3U(index, <, SFXGE_TXQ_NTYPES + SFXGE_RX_SCALE_MAX);
1100         ASSERT3U(type, <, SFXGE_TXQ_NTYPES);
1101         ASSERT3U(evq, <, SFXGE_RX_SCALE_MAX);
1102 
1103         stp = kmem_cache_alloc(sp->s_tqc, KM_SLEEP);
1104         stdp = &(stp->st_dpl);
1105 
1106         ASSERT3U(stp->st_state, ==, SFXGE_TXQ_UNINITIALIZED);
1107 
1108         stp->st_index = index;
1109         stp->st_type = type;
1110         stp->st_evq = evq;
1111 
1112         mutex_init(&(stp->st_lock), NULL, MUTEX_DRIVER,
1113             DDI_INTR_PRI(sp->s_intr.si_intr_pri));
1114 
1115         /* Initialize the statistics */
1116         if ((rc = sfxge_tx_kstat_init(stp)) != 0)
1117                 goto fail1;
1118 
1119         stdp->get_pkt_limit = ddi_prop_get_int(DDI_DEV_T_ANY, sp->s_dip,
1120             DDI_PROP_DONTPASS, "tx_dpl_get_pkt_limit",
1121             SFXGE_TX_DPL_GET_PKT_LIMIT_DEFAULT);
1122 
1123         stdp->put_pkt_limit = ddi_prop_get_int(DDI_DEV_T_ANY, sp->s_dip,
1124             DDI_PROP_DONTPASS, "tx_dpl_put_pkt_limit",
1125             SFXGE_TX_DPL_PUT_PKT_LIMIT_DEFAULT);
1126 
1127         stp->st_state = SFXGE_TXQ_INITIALIZED;
1128 
1129         /* Attach the TXQ to the driver */
1130         ASSERT3P(sp->s_stp[index], ==, NULL);
1131         sp->s_stp[index] = stp;
1132         sp->s_tx_qcount++;
1133 
1134         return (0);
1135 
1136 fail1:
1137         DTRACE_PROBE1(fail1, int, rc);
1138 
1139         stp->st_evq = 0;
1140         stp->st_type = 0;
1141         stp->st_index = 0;
1142 
1143         mutex_destroy(&(stp->st_lock));
1144 
1145         kmem_cache_free(sp->s_tqc, stp);
1146 
1147         return (rc);
1148 }
1149 
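/*
 * Start a transmit queue: program the buffer table, then create and enable
 * the hardware TXQ with the appropriate checksum offload flags.
 */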
1150 static int
1151 sfxge_tx_qstart(sfxge_t *sp, unsigned int index)
1152 {
1153         sfxge_txq_t *stp = sp->s_stp[index];
1154         efx_nic_t *enp = sp->s_enp;
1155         efsys_mem_t *esmp;
1156         sfxge_evq_t *sep;
1157         unsigned int evq;
1158         unsigned int flags;
1159         int rc;
1160 
1161         mutex_enter(&(stp->st_lock));
1162 
1163         esmp = &(stp->st_mem);
1164         evq = stp->st_evq;
1165         sep = sp->s_sep[evq];
1166 
1167         ASSERT3U(stp->st_state, ==, SFXGE_TXQ_INITIALIZED);
1168         ASSERT3U(sep->se_state, ==, SFXGE_EVQ_STARTED);
1169 
1170         /* Zero the memory */
1171         (void) memset(esmp->esm_base, 0, EFX_TXQ_SIZE(SFXGE_TX_NDESCS));
1172 
1173         /* Program the buffer table */
1174         if ((rc = sfxge_sram_buf_tbl_set(sp, stp->st_id, esmp,
1175             EFX_TXQ_NBUFS(SFXGE_TX_NDESCS))) != 0)
1176                 goto fail1;
1177 
1178         switch (stp->st_type) {
1179         case SFXGE_TXQ_NON_CKSUM:
1180                 flags = 0;
1181                 break;
1182 
1183         case SFXGE_TXQ_IP_CKSUM:
1184                 flags = EFX_CKSUM_IPV4;
1185                 break;
1186 
1187         case SFXGE_TXQ_IP_TCP_UDP_CKSUM:
1188                 flags = EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP;
1189                 break;
1190 
1191         default:
1192                 ASSERT(B_FALSE);
1193 
1194                 flags = 0;
1195                 break;
1196         }
1197 
1198         /* Create the transmit queue */
1199         if ((rc = efx_tx_qcreate(enp, index, index, esmp, SFXGE_TX_NDESCS,
1200             stp->st_id, flags, sep->se_eep, &(stp->st_etp))) != 0)
1201                 goto fail2;
1202 
1203         /* Enable the transmit queue */
1204         efx_tx_qenable(stp->st_etp);
1205 
1206         stp->st_state = SFXGE_TXQ_STARTED;
1207 
1208         mutex_exit(&(stp->st_lock));
1209 
1210         return (0);
1211 
1212 fail2:
1213         DTRACE_PROBE(fail2);
1214 
1215         /* Clear entries from the buffer table */
1216         sfxge_sram_buf_tbl_clear(sp, stp->st_id,
1217             EFX_TXQ_NBUFS(SFXGE_TX_NDESCS));
1218 
1219 fail1:
1220         DTRACE_PROBE1(fail1, int, rc);
1221 
1222         mutex_exit(&(stp->st_lock));
1223 
1224         return (rc);
1225 }
1226 
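/*
 * Add descriptors for a DMA-mapped data block to the fragment list, splitting
 * at descriptor page boundaries and honouring the remaining size limit.
 */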
1227 static inline int
1228 sfxge_tx_qmapping_add(sfxge_txq_t *stp, sfxge_tx_mapping_t *stmp,
 1229     size_t *offp, size_t *limitp)
1230 {
1231         mblk_t *mp;
1232         size_t mapping_off;
1233         size_t mapping_size;
1234         int rc;
1235 
1236         ASSERT3U(*offp, <, stmp->stm_size);
1237         ASSERT(*limitp != 0);
1238 
1239         mp = stmp->stm_mp;
1240 
1241         ASSERT3P(stmp->stm_base, ==, mp->b_rptr);
1242         ASSERT3U(stmp->stm_size, ==, MBLKL(mp));
1243 
1244         mapping_off = stmp->stm_off + *offp;
1245         mapping_size = stmp->stm_size - *offp;
1246 
1247         while (mapping_size != 0 && *limitp != 0) {
1248                 size_t page =
1249                     mapping_off >> SFXGE_TX_DESCSHIFT;
1250                 size_t page_off =
1251                     mapping_off & SFXGE_TX_DESCOFFSET;
1252                 size_t page_size =
1253                     SFXGE_TX_DESCSIZE - page_off;
1254                 efx_buffer_t *ebp;
1255 
1256                 ASSERT3U(page, <, SFXGE_TX_MAPPING_NADDR);
1257                 ASSERT((stmp->stm_addr[page] &
1258                         SFXGE_TX_DESCMASK) != 0);
1259 
1260                 page_size = MIN(page_size, mapping_size);
1261                 page_size = MIN(page_size, *limitp);
1262 
1263                 ASSERT3U(stp->st_n, <=,
1264                     EFX_TXQ_LIMIT(SFXGE_TX_NDESCS));
1265                 if (stp->st_n ==
1266                     EFX_TXQ_LIMIT(SFXGE_TX_NDESCS)) {
1267                         rc = ENOSPC;
1268                         goto fail1;
1269                 }
1270 
1271                 ebp = &(stp->st_eb[stp->st_n++]);
1272                 ebp->eb_addr = stmp->stm_addr[page] +
1273                     page_off;
1274                 ebp->eb_size = page_size;
1275 
1276                 *offp += page_size;
1277                 *limitp -= page_size;
1278 
1279                 mapping_off += page_size;
1280                 mapping_size -= page_size;
1281 
1282                 ebp->eb_eop = (*limitp == 0 ||
1283                     (mapping_size == 0 && mp->b_cont == NULL));
1284 
1285                 DTRACE_PROBE5(tx_mapping_add,
1286                     unsigned int, stp->st_index,
1287                     unsigned int, stp->st_n - 1,
1288                     uint64_t, ebp->eb_addr,
1289                     size_t, ebp->eb_size,
1290                     boolean_t, ebp->eb_eop);
1291         }
1292 
1293         ASSERT3U(*offp, <=, stmp->stm_size);
1294 
1295         return (0);
1296 
1297 fail1:
1298         DTRACE_PROBE1(fail1, int, rc);
1299 
1300         return (rc);
1301 }
1302 
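/*
 * Add a descriptor for the unposted portion of a copy buffer to the
 * fragment list and sync it for the device.
 */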
1303 static inline int
1304 sfxge_tx_qbuffer_add(sfxge_txq_t *stp, sfxge_tx_buffer_t *stbp, boolean_t eop)
1305 {
1306         efx_buffer_t *ebp;
1307         int rc;
1308 
1309         ASSERT3U(stp->st_n, <=,
1310             EFX_TXQ_LIMIT(SFXGE_TX_NDESCS));
1311         if (stp->st_n == EFX_TXQ_LIMIT(SFXGE_TX_NDESCS)) {
1312                 rc = ENOSPC;
1313                 goto fail1;
1314         }
1315 
1316         ebp = &(stp->st_eb[stp->st_n++]);
1317         ebp->eb_addr = stbp->stb_esm.esm_addr + stbp->stb_off;
1318         ebp->eb_size = stbp->stb_esm.esm_size - stbp->stb_off;
1319         ebp->eb_eop = eop;
1320 
1321         (void) ddi_dma_sync(stbp->stb_esm.esm_dma_handle,
1322             stbp->stb_off, ebp->eb_size,
1323             DDI_DMA_SYNC_FORDEV);
1324 
1325         stbp->stb_off = stbp->stb_esm.esm_size;
1326 
1327         DTRACE_PROBE5(tx_buffer_add,
1328             unsigned int, stp->st_index,
1329             unsigned int, stp->st_n - 1,
1330             uint64_t, ebp->eb_addr, size_t, ebp->eb_size,
1331             boolean_t, ebp->eb_eop);
1332 
1333         return (0);
1334 
1335 fail1:
1336         DTRACE_PROBE1(fail1, int, rc);
1337 
1338         return (rc);
1339 }
1340 
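/*
 * Copy as much of a data block as will fit into the buffer (and within the
 * remaining size limit), returning B_TRUE if this completes the packet.
 */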
1341 static inline boolean_t
1342 sfxge_tx_msgb_copy(mblk_t *mp, sfxge_tx_buffer_t *stbp, size_t *offp,
1343     size_t *limitp)
1344 {
1345         size_t data_off;
1346         size_t data_size;
1347         size_t copy_off;
1348         size_t copy_size;
1349         boolean_t eop;
1350 
1351         ASSERT3U(*offp, <=, MBLKL(mp));
1352         ASSERT(*limitp != 0);
1353 
1354         data_off = *offp;
1355         data_size = MBLKL(mp) - *offp;
1356 
1357         copy_off = stbp->stb_esm.esm_size;
1358         copy_size = SFXGE_TX_BUFFER_SIZE - copy_off;
1359 
1360         copy_size = MIN(copy_size, data_size);
1361         copy_size = MIN(copy_size, *limitp);
1362 
1363         bcopy(mp->b_rptr + data_off,
1364             stbp->stb_esm.esm_base + copy_off, copy_size);
1365 
1366         stbp->stb_esm.esm_size += copy_size;
1367         ASSERT3U(stbp->stb_esm.esm_size, <=,
1368             SFXGE_TX_BUFFER_SIZE);
1369 
1370         *offp += copy_size;
1371         *limitp -= copy_size;
1372 
1373         data_off += copy_size;
1374         data_size -= copy_size;
1375 
1376         eop = (*limitp == 0 ||
1377             (data_size == 0 && mp->b_cont == NULL));
1378 
1379         ASSERT3U(*offp, <=, MBLKL(mp));
1380 
1381         return (eop);
1382 }
1383 
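/*
 * Build fragment list entries for the packet payload, mapping data blocks
 * for DMA where possible and falling back to copying into buffers.
 */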
1384 static int
1385 sfxge_tx_qpayload_fragment(sfxge_txq_t *stp, unsigned int id, mblk_t **mpp,
1386     size_t *offp, size_t size, boolean_t copy)
1387 {
1388         sfxge_t *sp = stp->st_sp;
1389         mblk_t *mp = *mpp;
1390         size_t off = *offp;
1391         sfxge_tx_buffer_t *stbp;
1392         sfxge_tx_mapping_t *stmp;
1393         int rc;
1394 
1395         stbp = stp->st_stbp[id];
1396         ASSERT(stbp == NULL || (stbp->stb_esm.esm_size == stbp->stb_off));
1397 
1398         stmp = stp->st_stmp[id];
1399 
1400         while (size != 0) {
1401                 boolean_t eop;
1402 
1403                 ASSERT(mp != NULL);
1404 
1405                 if (mp->b_cont != NULL)
1406                         prefetch_read_many(mp->b_cont);
1407 
1408                 ASSERT3U(off, <, MBLKL(mp));
1409 
1410                 if (copy)
1411                         goto copy;
1412 
1413                 /*
1414                  * Check whether we have already mapped this data block for
1415                  * DMA.
1416                  */
1417                 if (stmp == NULL || stmp->stm_mp != mp) {
1418                         /*
1419                          * If we are part way through copying a data block then
1420                          * there's no point in trying to map it for DMA.
1421                          */
1422                         if (off != 0)
1423                                 goto copy;
1424 
1425                         /*
1426                          * If the data block is too short then the cost of
1427                          * mapping it for DMA would outweigh the cost of
1428                          * copying it.
1429                          */
1430                         if (MBLKL(mp) < SFXGE_TX_COPY_THRESHOLD)
1431                                 goto copy;
1432 
1433                         /* Try to grab a transmit mapping from the pool */
1434                         stmp = sfxge_tx_qfmp_get(stp);
1435                         if (stmp == NULL) {
1436                                 /*
1437                                  * The pool was empty so allocate a new
1438                                  * mapping.
1439                                  */
1440                                 if ((stmp = kmem_cache_alloc(sp->s_tmc,
1441                                     KM_NOSLEEP)) == NULL)
1442                                         goto copy;
1443                         }
1444 
1445                         /* Add the DMA mapping to the list */
1446                         stmp->stm_next = stp->st_stmp[id];
1447                         stp->st_stmp[id] = stmp;
1448 
1449                         /* Try to bind the data block to the mapping */
1450                         if (sfxge_tx_msgb_bind(mp, stmp) != 0)
1451                                 goto copy;
1452                 }
1453                 ASSERT3P(stmp->stm_mp, ==, mp);
1454 
1455                 /*
1456                  * If we have a partially filled buffer then we must add it to
1457                  * the fragment list before adding the mapping.
1458                  */
1459                 if (stbp != NULL && (stbp->stb_esm.esm_size > stbp->stb_off)) {
1460                         rc = sfxge_tx_qbuffer_add(stp, stbp, B_FALSE);
1461                         if (rc != 0)
1462                                 goto fail1;
1463                 }
1464 
1465                 /* Add the mapping to the fragment list */
1466                 rc = sfxge_tx_qmapping_add(stp, stmp, &off, &size);
1467                 if (rc != 0)
1468                         goto fail2;
1469 
1470                 ASSERT(off == MBLKL(mp) || size == 0);
1471 
1472                 /*
 1473                  * If the data block has been exhausted then skip over the
1474                  * control block and advance to the next data block.
1475                  */
1476                 if (off == MBLKL(mp)) {
1477                         mp = mp->b_cont;
1478                         off = 0;
1479                 }
1480 
1481                 continue;
1482 
1483 copy:
1484                 if (stbp == NULL ||
1485                     stbp->stb_esm.esm_size == SFXGE_TX_BUFFER_SIZE) {
1486                         /* Try to grab a buffer from the pool */
1487                         stbp = sfxge_tx_qfbp_get(stp);
1488                         if (stbp == NULL) {
1489                                 /*
1490                                  * The pool was empty so allocate a new
1491                                  * buffer.
1492                                  */
1493                                 if ((stbp = kmem_cache_alloc(sp->s_tbc,
1494                                     KM_NOSLEEP)) == NULL) {
1495                                         rc = ENOMEM;
1496                                         goto fail3;
1497                                 }
1498                         }
1499 
1500                         /* Add it to the list */
1501                         stbp->stb_next = stp->st_stbp[id];
1502                         stp->st_stbp[id] = stbp;
1503                 }
1504 
1505                 /* Copy as much of the data block as we can into the buffer */
1506                 eop = sfxge_tx_msgb_copy(mp, stbp, &off, &size);
1507 
1508                 ASSERT(off == MBLKL(mp) || size == 0 ||
1509                     stbp->stb_esm.esm_size == SFXGE_TX_BUFFER_SIZE);
1510 
1511                 /*
1512                  * If we have reached the end of the packet, or the buffer is
1513                  * full, then add the buffer to the fragment list.
1514                  */
1515                 if (stbp->stb_esm.esm_size == SFXGE_TX_BUFFER_SIZE || eop) {
1516                         rc = sfxge_tx_qbuffer_add(stp, stbp, eop);
1517                         if (rc != 0)
1518                                 goto fail4;
1519                 }
1520 
1521                 /*
 1522                  * If the data block has been exhausted then advance to the next
1523                  * one.
1524                  */
1525                 if (off == MBLKL(mp)) {
1526                         mp = mp->b_cont;
1527                         off = 0;
1528                 }
1529         }
1530 
1531         *mpp = mp;
1532         *offp = off;
1533 
1534         return (0);
1535 
1536 fail4:
1537         DTRACE_PROBE(fail4);
1538 fail3:
1539         DTRACE_PROBE(fail3);
1540 fail2:
1541         DTRACE_PROBE(fail2);
1542 fail1:
1543         DTRACE_PROBE1(fail1, int, rc);
1544 
1545         return (rc);
1546 }
1547 
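/*
 * Fragment an LSO packet in software: for each MSS-sized segment, copy in an
 * adjusted copy of the headers and then add the payload to the fragment list.
 */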
1548 static int
1549 sfxge_tx_qlso_fragment(sfxge_txq_t *stp, sfxge_tx_packet_t *stpp,
1550     boolean_t copy)
1551 {
1552         sfxge_t *sp = stp->st_sp;
1553         mblk_t *mp = stpp->stp_mp;
1554         struct ether_header *etherhp = stpp->stp_etherhp;
1555         struct ip *iphp = stpp->stp_iphp;
1556         struct tcphdr *thp = stpp->stp_thp;
1557         size_t size = stpp->stp_size;
1558         size_t off = stpp->stp_off;
1559         size_t mss = stpp->stp_mss;
1560         unsigned int id;
1561         caddr_t hp;
1562         size_t ehs, hs;
1563         uint16_t start_len;
1564         uint16_t start_id;
1565         uint16_t ip_id;
1566         uint8_t start_flags;
1567         uint32_t start_seq;
1568         uint32_t th_seq;
1569         size_t lss;
1570         sfxge_tx_buffer_t *stbp;
1571         int rc;
1572 
1573         ASSERT(mutex_owned(&(stp->st_lock)));
1574 
1575         if ((DB_LSOFLAGS(mp) & HW_LSO) == 0) {
1576                 rc = EINVAL;
1577                 goto fail1;
1578         }
1579 
1580         id = stp->st_added & (SFXGE_TX_NDESCS - 1);
1581 
1582         ASSERT(stp->st_n == 0);
1583         ASSERT(stp->st_stbp[id] == NULL);
1584         ASSERT(stp->st_stmp[id] == NULL);
1585 
1586         ehs = (etherhp->ether_type == htons(ETHERTYPE_VLAN)) ?
1587             sizeof (struct ether_vlan_header) :
1588             sizeof (struct ether_header);
1589         if (msgdsize(mp) != ehs + ntohs(iphp->ip_len)) {
1590                 rc = EINVAL;
1591                 goto fail2;
1592         }
1593 
1594         /* The payload offset is equivalent to the size of the headers */
1595         hp = (caddr_t)(mp->b_rptr);
1596         hs = off;
1597 
1598         /*
1599          * If the initial data block only contains the headers then advance
1600          * to the next one.
1601          */
1602         if (hs > MBLKL(mp)) {
1603                 rc = EINVAL;
1604                 goto fail3;
1605         }
1606         mp->b_rptr += hs;
1607 
1608         if (MBLKL(mp) == 0)
1609                 mp = mp->b_cont;
1610 
1611         off = 0;
1612 
1613         /* Check IP and TCP headers are suitable for LSO */
1614         if (((iphp->ip_off & ~htons(IP_DF)) != 0) ||
1615             ((thp->th_flags & (TH_URG | TH_SYN)) != 0) ||
1616             (thp->th_urp != 0)) {
1617                 rc = EINVAL;
1618                 goto fail4;
1619         }
1620 
1621         if (size + (thp->th_off << 2) + (iphp->ip_hl << 2) !=
1622             ntohs(iphp->ip_len)) {
1623                 rc = EINVAL;
1624                 goto fail4;
1625         }
1626 
1627         /*
 1628          * Get the base IP id. The stack leaves enough of a gap in the id space
 1629          * for us to increment it for each segment we send out.
1630          */
1631         start_len = ntohs(iphp->ip_len);
1632         start_id = ip_id = ntohs(iphp->ip_id);
1633 
1634         /* Get the base TCP sequence number and flags */
1635         start_flags = thp->th_flags;
1636         start_seq = th_seq = ntohl(thp->th_seq);
1637 
1638         /* Adjust the header for interim segments */
1639         iphp->ip_len = htons((iphp->ip_hl << 2) + (thp->th_off << 2) + mss);
1640         thp->th_flags = start_flags & ~(TH_PUSH | TH_FIN);
1641 
1642         lss = size;
1643         if ((lss / mss) >= (EFX_TXQ_LIMIT(SFXGE_TX_NDESCS) / 2)) {
1644                 rc = EINVAL;
1645                 goto fail5;
1646         }
1647 
1648         stbp = NULL;
1649         while (lss != 0) {
1650                 size_t ss = MIN(lss, mss);
1651                 boolean_t eol = (ss == lss);
1652 
1653                 /* Adjust the header for this segment */
1654                 iphp->ip_id = htons(ip_id);
1655                 ip_id++;
1656 
1657                 thp->th_seq = htonl(th_seq);
1658                 th_seq += ss;
1659 
1660                 /* If this is the final segment then do some extra adjustment */
1661                 if (eol) {
1662                         iphp->ip_len = htons((iphp->ip_hl << 2) +
1663                             (thp->th_off << 2) + ss);
1664                         thp->th_flags = start_flags;
1665                 }
1666 
1667                 if (stbp == NULL ||
1668                     stbp->stb_esm.esm_size + hs > SFXGE_TX_BUFFER_SIZE) {
1669                         /* Try to grab a buffer from the pool */
1670                         stbp = sfxge_tx_qfbp_get(stp);
1671                         if (stbp == NULL) {
1672                                 /*
1673                                  * The pool was empty so allocate a new
1674                                  * buffer.
1675                                  */
1676                                 if ((stbp = kmem_cache_alloc(sp->s_tbc,
1677                                     KM_NOSLEEP)) == NULL) {
1678                                         rc = ENOMEM;
1679                                         goto fail6;
1680                                 }
1681                         }
1682 
1683                         /* Add it to the list */
1684                         stbp->stb_next = stp->st_stbp[id];
1685                         stp->st_stbp[id] = stbp;
1686                 }
1687 
1688                 /* Copy in the headers */
1689                 ASSERT3U(stbp->stb_off, ==, stbp->stb_esm.esm_size);
1690                 bcopy(hp, stbp->stb_esm.esm_base + stbp->stb_off, hs);
1691                 stbp->stb_esm.esm_size += hs;
1692 
1693                 /* Add the buffer to the fragment list */
1694                 rc = sfxge_tx_qbuffer_add(stp, stbp, B_FALSE);
1695                 if (rc != 0)
1696                         goto fail7;
1697 
1698                 /* Add the payload to the fragment list */
1699                 if ((rc = sfxge_tx_qpayload_fragment(stp, id, &mp, &off,
1700                     ss, copy)) != 0)
1701                         goto fail8;
1702 
1703                 lss -= ss;
1704         }
1705         ASSERT3U(off, ==, 0);
1706         ASSERT3P(mp, ==, NULL);
1707 
1708         ASSERT3U(th_seq - start_seq, ==, size);
1709 
1710         /*
1711          * If no part of the packet has been mapped for DMA then we can free
1712          * it now, otherwise it can only be freed on completion.
1713          */
1714         if (stp->st_stmp[id] == NULL)
1715                 freemsg(stpp->stp_mp);
1716         else
1717                 stp->st_mp[id] = stpp->stp_mp;
1718 
1719         stpp->stp_mp = NULL;
1720 
1721         return (0);
1722 
1723 fail8:
1724         DTRACE_PROBE(fail8);
1725 fail7:
1726         DTRACE_PROBE(fail7);
1727 fail6:
1728         DTRACE_PROBE(fail6);
1729 fail5:
1730         DTRACE_PROBE(fail5);
1731 
1732         /* Restore the header */
1733         thp->th_seq = htonl(start_seq);
1734         thp->th_flags = start_flags;
1735 
1736         iphp->ip_len = htons(start_len);
1737         iphp->ip_id = htons(start_id);
1738 
1739 fail4:
1740         DTRACE_PROBE(fail4);
1741 
1742         mp = stpp->stp_mp;
1743         mp->b_rptr -= hs;
1744 
1745         ASSERT3U(((etherhp->ether_type == htons(ETHERTYPE_VLAN)) ?
1746             sizeof (struct ether_vlan_header) :
1747             sizeof (struct ether_header)) +
1748             ntohs(iphp->ip_len), ==, msgdsize(mp));
1749 
1750         ASSERT(stp->st_mp[id] == NULL);
1751 
1752 fail3:
1753         DTRACE_PROBE(fail3);
1754 fail2:
1755         DTRACE_PROBE(fail2);
1756 fail1:
1757         DTRACE_PROBE1(fail1, int, rc);
1758 
1759         return (rc);
1760 }
1761 
1762 static int
1763 sfxge_tx_qpacket_fragment(sfxge_txq_t *stp, sfxge_tx_packet_t *stpp,
1764     boolean_t copy)
1765 {
1766         sfxge_t *sp = stp->st_sp;
1767         mblk_t *mp = stpp->stp_mp;
1768         unsigned int id;
1769         size_t off;
1770         size_t size;
1771         sfxge_tx_mapping_t *stmp;
1772         sfxge_tx_buffer_t *stbp;
1773         int rc;
1774 
1775         ASSERT(mutex_owned(&(stp->st_lock)));
1776 
1777         ASSERT(stp->st_n == 0);
1778 
1779         id = stp->st_added & (SFXGE_TX_NDESCS - 1);
1780 
1781         ASSERT(stp->st_stbp[id] == NULL);
1782         ASSERT(stp->st_stmp[id] == NULL);
1783 
1784         off = 0;
1785         size = LONG_MAX;        /* must be larger than the packet */
1786 
1787         stbp = NULL;
1788         stmp = NULL;
1789 
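             /*
              * Walk the mblk chain. Data blocks of at least
              * SFXGE_TX_COPY_THRESHOLD bytes are DMA-mapped in place where
              * possible; shorter blocks, blocks that fail to map, and forced
              * copies ('copy' set) are copied into transmit buffers instead.
              */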
1790         while (mp != NULL) {
1791                 boolean_t eop;
1792 
1793                 ASSERT(mp != NULL);
1794 
1795                 if (mp->b_cont != NULL)
1796                         prefetch_read_many(mp->b_cont);
1797 
1798                 ASSERT(stmp == NULL || stmp->stm_mp != mp);
1799 
1800                 if (copy)
1801                         goto copy;
1802 
1803                 /*
1804                  * If we are part way through copying a data block then there's
1805                  * no point in trying to map it for DMA.
1806                  */
1807                 if (off != 0)
1808                         goto copy;
1809 
1810                 /*
1811                  * If the data block is too short then the cost of mapping it
1812                  * for DMA would outweigh the cost of copying it.
1813                  *
1814                  * TX copy break
1815                  */
1816                 if (MBLKL(mp) < SFXGE_TX_COPY_THRESHOLD)
1817                         goto copy;
1818 
1819                 /* Try to grab a transmit mapping from the pool */
1820                 stmp = sfxge_tx_qfmp_get(stp);
1821                 if (stmp == NULL) {
1822                         /*
1823                          * The pool was empty so allocate a new
1824                          * mapping.
1825                          */
1826                         if ((stmp = kmem_cache_alloc(sp->s_tmc,
1827                             KM_NOSLEEP)) == NULL)
1828                                 goto copy;
1829                 }
1830 
1831                 /* Add the DMA mapping to the list */
1832                 stmp->stm_next = stp->st_stmp[id];
1833                 stp->st_stmp[id] = stmp;
1834 
1835                 /* Try to bind the data block to the mapping */
1836                 if (sfxge_tx_msgb_bind(mp, stmp) != 0)
1837                         goto copy;
1838 
1839                 /*
1840                  * If we have a partially filled buffer then we must add it to
1841                  * the fragment list before adding the mapping.
1842                  */
1843                 if (stbp != NULL && (stbp->stb_esm.esm_size > stbp->stb_off)) {
1844                         rc = sfxge_tx_qbuffer_add(stp, stbp, B_FALSE);
1845                         if (rc != 0)
1846                                 goto fail1;
1847                 }
1848 
1849                 /* Add the mapping to the fragment list */
1850                 rc = sfxge_tx_qmapping_add(stp, stmp, &off, &size);
1851                 if (rc != 0)
1852                         goto fail2;
1853 
1854                 ASSERT3U(off, ==, MBLKL(mp));
1855 
1856                 /* Advance to the next data block */
1857                 mp = mp->b_cont;
1858                 off = 0;
1859                 continue;
1860 
1861 copy:
1862                 if (stbp == NULL ||
1863                     stbp->stb_esm.esm_size == SFXGE_TX_BUFFER_SIZE) {
1864                         /* Try to grab a buffer from the pool */
1865                         stbp = sfxge_tx_qfbp_get(stp);
1866                         if (stbp == NULL) {
1867                                 /*
1868                                  * The pool was empty so allocate a new
1869                                  * buffer.
1870                                  */
1871                                 if ((stbp = kmem_cache_alloc(sp->s_tbc,
1872                                     KM_NOSLEEP)) == NULL) {
1873                                         rc = ENOMEM;
1874                                         goto fail3;
1875                                 }
1876                         }
1877 
1878                         /* Add it to the list */
1879                         stbp->stb_next = stp->st_stbp[id];
1880                         stp->st_stbp[id] = stbp;
1881                 }
1882 
1883                 /* Copy as much of the data block as we can into the buffer */
1884                 eop = sfxge_tx_msgb_copy(mp, stbp, &off, &size);
1885 
1886                 ASSERT(off == MBLKL(mp) ||
1887                     stbp->stb_esm.esm_size == SFXGE_TX_BUFFER_SIZE);
1888 
1889                 /*
1890                  * If we have reached the end of the packet, or the buffer is
1891                  * full, then add the buffer to the fragment list.
1892                  */
1893                 if (stbp->stb_esm.esm_size == SFXGE_TX_BUFFER_SIZE || eop) {
1894                         rc = sfxge_tx_qbuffer_add(stp, stbp, eop);
1895                         if (rc != 0)
1896                                 goto fail4;
1897                 }
1898 
1899                 /*
1900                  * If the data block has been exhausted then advance to the next
1901                  * one.
1902                  */
1903                 if (off == MBLKL(mp)) {
1904                         mp = mp->b_cont;
1905                         off = 0;
1906                 }
1907         }
1908         ASSERT3U(off, ==, 0);
1909         ASSERT3P(mp, ==, NULL);
1910         ASSERT3U(size, !=, 0);
1911 
1912         /*
1913          * If no part of the packet has been mapped for DMA then we can free
1914          * it now, otherwise it can only be freed on completion.
1915          */
1916         if (stp->st_stmp[id] == NULL)
1917                 freemsg(stpp->stp_mp);
1918         else
1919                 stp->st_mp[id] = stpp->stp_mp;
1920 
1921         stpp->stp_mp = NULL;
1922 
1923         return (0);
1924 
1925 fail4:
1926         DTRACE_PROBE(fail4);
1927 fail3:
1928         DTRACE_PROBE(fail3);
1929 fail2:
1930         DTRACE_PROBE(fail2);
1931 fail1:
1932         DTRACE_PROBE1(fail1, int, rc);
1933 
1934         ASSERT(stp->st_stmp[id] == NULL);
1935 
1936         return (rc);
1937 }
1938 
1939 
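     /* True if packets are waiting on the (lock-free) DPL put list */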
1940 #define SFXGE_TX_QDPL_PUT_PENDING(_stp)                                 \
1941         ((_stp)->st_dpl.std_put != 0)
1942 
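     /*
      * Move the contents of the (lock-free) DPL put list onto the tail of the
      * DPL get list. The put list is built head-first by producers, so it is
      * reversed here to preserve the order in which packets were queued.
      */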
1943 static void
1944 sfxge_tx_qdpl_swizzle(sfxge_txq_t *stp)
1945 {
1946         sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
1947         volatile uintptr_t *putp;
1948         uintptr_t put;
1949         sfxge_tx_packet_t *stpp;
1950         sfxge_tx_packet_t *p;
1951         sfxge_tx_packet_t **pp;
1952         unsigned int count;
1953 
1954         ASSERT(mutex_owned(&(stp->st_lock)));
1955 
1956         /*
1957          * In-flight TX packets guarantee more TX completions and hence more
1958          * swizzles, so the put list can safely wait if the get list is full.
1959          */
1960         ASSERT3U(stdp->std_count, <=, sfxge_tx_dpl_get_pkt_max(stp));
1961         if (stdp->std_count >= stdp->get_pkt_limit)
1962                 return;
1963 
1964         /* Acquire the put list - replacing with an empty list */
1965         putp = &(stdp->std_put);
1966         put = atomic_swap_ulong(putp, 0);
1967         stpp = (void *)put;
1968 
1969         if (stpp == NULL)
1970                 return;
1971 
1972         /* Reverse the list */
1973         pp = &(stpp->stp_next);
1974         p = NULL;
1975 
1976         count = 0;
1977         do {
1978                 sfxge_tx_packet_t *next;
1979 
1980                 next = stpp->stp_next;
1981 
1982                 stpp->stp_next = p;
1983                 p = stpp;
1984 
1985                 count++;
1986                 stpp = next;
1987         } while (stpp != NULL);
1988 
1989         /* Add it to the tail of the get list */
1990         ASSERT3P(*pp, ==, NULL);
1991 
1992         *(stdp->std_getp) = p;
1993         stdp->std_getp = pp;
1994         stdp->std_count += count;
1995         ASSERT3U(stdp->std_count, <=, sfxge_tx_dpl_get_pkt_max(stp));
1996 
1997         DTRACE_PROBE2(dpl_counts, int, stdp->std_count, int, count);
1998 }
1999 
2000 
2001 /*
2002  * If the TXQ is locked, swizzle the TX DPL put list onto the TX DPL get
2003  * list and add this packet to the tail of the get list.
2004  * If the TXQ is unlocked, atomically add this packet to the TX DPL put list.
2005  *
2006  * The only possible error is ENOSPC (used for TX backpressure), returned
2007  * when the TX DPL put or get list has become full. In both cases there must
2008  * be future TX completions (as represented by the packets on the DPL get
2009  * list); this ensures that in the future mac_tx_update() will be called from
2010  * sfxge_tx_qcomplete()
2011  */
2012 static inline int
2013 sfxge_tx_qdpl_add(sfxge_txq_t *stp, sfxge_tx_packet_t *stpp, int locked)
2014 {
2015         sfxge_tx_dpl_t *stdp = &stp->st_dpl;
2016 
2017         ASSERT3P(stpp->stp_next, ==, NULL);
2018 
2019         if (locked) {
2020                 ASSERT(mutex_owned(&stp->st_lock));
2021 
2022                 if (stdp->std_count >= stdp->get_pkt_limit) {
2023                         stdp->get_full_count++;
2024                         return (ENOSPC);
2025                 }
2026 
2027                 /* Reverse the put list onto the get list */
2028                 sfxge_tx_qdpl_swizzle(stp);
2029 
2030                 /* Add to the tail of the get list */
2031                 *(stdp->std_getp) = stpp;
2032                 stdp->std_getp = &stpp->stp_next;
2033                 stdp->std_count++;
2034                 ASSERT3U(stdp->std_count, <=, sfxge_tx_dpl_get_pkt_max(stp));
2035 
2036         } else {
2037                 volatile uintptr_t *putp;
2038                 uintptr_t old;
2039                 uintptr_t new;
2040                 sfxge_tx_packet_t *old_pkt;
2041 
2042                 putp = &(stdp->std_put);
2043                 new = (uintptr_t)stpp;
2044 
2045                 /* Add to the head of the put list, tracking its length */
2046                 do {
2047                         old = *putp;
2048                         old_pkt =  (sfxge_tx_packet_t *)old;
2049 
2050                         stpp->stp_dpl_put_len = old ?
2051                             old_pkt->stp_dpl_put_len + 1 : 1;
2052 
2053                         if (stpp->stp_dpl_put_len >= stdp->put_pkt_limit) {
2054                                 stpp->stp_next = NULL;
2055                                 stpp->stp_dpl_put_len = 0;
2056                                 stdp->put_full_count++;
2057                                 return (ENOSPC);
2058                         }
2059 
2060                         stpp->stp_next = (void *)old;
2061                 } while (atomic_cas_ulong(putp, old, new) != old);
2062         }
2063         return (0);
2064 }
2065 
2066 
2067 /* Take all packets from DPL get list and try to send to HW */
2068 static void
2069 sfxge_tx_qdpl_drain(sfxge_txq_t *stp)
2070 {
2071         sfxge_t *sp = stp->st_sp;
2072         sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
2073         unsigned int pushed = stp->st_added;
2074         sfxge_tx_packet_t *stpp;
2075         unsigned int count;
2076 
2077         ASSERT(mutex_owned(&(stp->st_lock)));
2078 
2079         prefetch_read_many(sp->s_enp);
2080         prefetch_read_many(stp->st_etp);
2081 
2082         stpp = stdp->std_get;
2083         count = stdp->std_count;
2084 
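             /*
              * For each packet on the get list: fragment it (LSO or plain),
              * post the fragment list to the descriptor ring, and push to the
              * hardware in batches of SFXGE_TX_BATCH. ENOSPC from
              * fragmentation triggers a single retry with copying; any other
              * error (or a stopped queue) causes the packet to be dropped.
              */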
2085         while (count != 0) {
2086                 sfxge_tx_packet_t *next;
2087                 boolean_t copy;
2088                 int rc;
2089 
2090                 ASSERT(stpp != NULL);
2091 
2092                 /* Split stpp off */
2093                 next = stpp->stp_next;
2094                 stpp->stp_next = NULL;
2095 
2096                 if (next != NULL)
2097                         prefetch_read_many(next);
2098 
2099                 if (stp->st_state != SFXGE_TXQ_STARTED)
2100                         goto reject;
2101 
2102                 copy = B_FALSE;
2103 
2104 again:
2105                 /* Fragment the packet */
2106                 if (stpp->stp_mss != 0) {
2107                         rc = sfxge_tx_qlso_fragment(stp, stpp, copy);
2108                 } else {
2109                         rc = sfxge_tx_qpacket_fragment(stp, stpp, copy);
2110                 }
2111 
2112                 switch (rc) {
2113                 case 0:
2114                         break;
2115 
2116                 case ENOSPC:
2117                         if (!copy)
2118                                 goto copy;
2119 
2120                 /*FALLTHRU*/
2121                 default:
2122                         goto reject;
2123                 }
2124 
2125                 /* Free the packet structure */
2126                 stpp->stp_etherhp = NULL;
2127                 stpp->stp_iphp = NULL;
2128                 stpp->stp_thp = NULL;
2129                 stpp->stp_off = 0;
2130                 stpp->stp_size = 0;
2131                 stpp->stp_mss = 0;
2132                 stpp->stp_dpl_put_len = 0;
2133 
2134                 ASSERT3P(stpp->stp_mp, ==, NULL);
2135 
2136                 if (sfxge_tx_qfpp_put(stp, stpp) != 0) {
2137                         sfxge_tx_packet_destroy(sp, stpp);
2138                         stpp = NULL;
2139                 }
2140 
2141                 --count;
2142                 stpp = next;
2143 
2144                 /* Post the packet */
2145                 sfxge_tx_qlist_post(stp);
2146 
2147                 if (stp->st_unblock != SFXGE_TXQ_NOT_BLOCKED)
2148                         goto defer;
2149 
2150                 if (stp->st_added - pushed >= SFXGE_TX_BATCH) {
2151                         efx_tx_qpush(stp->st_etp, stp->st_added);
2152                         pushed = stp->st_added;
2153                 }
2154 
2155                 continue;
2156 
2157 copy:
2158                 /* Abort the current fragment list */
2159                 sfxge_tx_qlist_abort(stp);
2160 
2161                 /* Try copying the packet to flatten it */
2162                 ASSERT(!copy);
2163                 copy = B_TRUE;
2164 
2165                 goto again;
2166 
2167 reject:
2168                 /* Abort the current fragment list */
2169                 sfxge_tx_qlist_abort(stp);
2170 
2171                 /* Discard the packet */
2172                 freemsg(stpp->stp_mp);
2173                 stpp->stp_mp = NULL;
2174 
2175                 /* Free the packet structure */
2176                 stpp->stp_etherhp = NULL;
2177                 stpp->stp_iphp = NULL;
2178                 stpp->stp_thp = NULL;
2179                 stpp->stp_off = 0;
2180                 stpp->stp_size = 0;
2181                 stpp->stp_mss = 0;
2182                 stpp->stp_dpl_put_len = 0;
2183 
2184                 if (sfxge_tx_qfpp_put(stp, stpp) != 0) {
2185                         sfxge_tx_packet_destroy(sp, stpp);
2186                         stpp = NULL;
2187                 }
2188 
2189                 --count;
2190                 stpp = next;
2191                 continue;
2192 defer:
2193                 DTRACE_PROBE1(defer, unsigned int, stp->st_index);
2194                 break;
2195         }
2196 
2197         if (count == 0) {
2198                 /* New empty get list */
2199                 ASSERT3P(stpp, ==, NULL);
2200                 stdp->std_get = NULL;
2201                 stdp->std_count = 0;
2202 
2203                 stdp->std_getp = &(stdp->std_get);
2204         } else {
2205                 /* Shorten the list by moving the head */
2206                 stdp->std_get = stpp;
2207                 stdp->std_count = count;
2208                 ASSERT3U(stdp->std_count, <=, sfxge_tx_dpl_get_pkt_max(stp));
2209         }
2210 
2211         if (stp->st_added != pushed)
2212                 efx_tx_qpush(stp->st_etp, stp->st_added);
2213 
2214         ASSERT(stp->st_unblock != SFXGE_TXQ_NOT_BLOCKED ||
2215             stdp->std_count == 0);
2216 }
2217 
2218 /* Swizzle deferred packet list, try and push to HW */
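     /*
      * Packets may be added to the put list while the queue is being drained,
      * so the put list is re-checked after the lock is dropped; if it is
      * non-empty and the lock can be re-acquired, the queue is drained again.
      */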
2219 static inline void
2220 sfxge_tx_qdpl_service(sfxge_txq_t *stp)
2221 {
2222         do {
2223                 ASSERT(mutex_owned(&(stp->st_lock)));
2224 
2225                 if (SFXGE_TX_QDPL_PUT_PENDING(stp))
2226                         sfxge_tx_qdpl_swizzle(stp);
2227 
2228                 if (stp->st_unblock == SFXGE_TXQ_NOT_BLOCKED)
2229                         sfxge_tx_qdpl_drain(stp);
2230 
2231                 mutex_exit(&(stp->st_lock));
2232 
2233                 if (!SFXGE_TX_QDPL_PUT_PENDING(stp))
2234                         break;
2235         } while (mutex_tryenter(&(stp->st_lock)));
2236 }
2237 
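     /* Discard every packet on the deferred packet (put and get) lists */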
2238 static void
2239 sfxge_tx_qdpl_flush_locked(sfxge_txq_t *stp)
2240 {
2241         sfxge_t *sp = stp->st_sp;
2242         sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
2243         sfxge_tx_packet_t *stpp;
2244         unsigned int count;
2245 
2246         ASSERT(mutex_owned(&(stp->st_lock)));
2247 
2248         /* Swizzle put list to the get list */
2249         sfxge_tx_qdpl_swizzle(stp);
2250 
2251         stpp = stdp->std_get;
2252         count = stdp->std_count;
2253 
2254         while (count != 0) {
2255                 sfxge_tx_packet_t *next;
2256 
2257                 next = stpp->stp_next;
2258                 stpp->stp_next = NULL;
2259 
2260                 /* Discard the packet */
2261                 freemsg(stpp->stp_mp);
2262                 stpp->stp_mp = NULL;
2263 
2264                 /* Free the packet structure */
2265                 stpp->stp_etherhp = NULL;
2266                 stpp->stp_iphp = NULL;
2267                 stpp->stp_thp = NULL;
2268                 stpp->stp_off = 0;
2269                 stpp->stp_size = 0;
2270                 stpp->stp_mss = 0;
2271                 stpp->stp_dpl_put_len = 0;
2272 
2273                 sfxge_tx_packet_destroy(sp, stpp);
2274 
2275                 --count;
2276                 stpp = next;
2277         }
2278 
2279         ASSERT3P(stpp, ==, NULL);
2280 
2281         /* Empty list */
2282         stdp->std_get = NULL;
2283         stdp->std_count = 0;
2284         stdp->std_getp = &(stdp->std_get);
2285 }
2286 
2287 
2288 void
2289 sfxge_tx_qdpl_flush(sfxge_txq_t *stp)
2290 {
2291         mutex_enter(&(stp->st_lock));
2292         sfxge_tx_qdpl_flush_locked(stp);
2293         mutex_exit(&(stp->st_lock));
2294 }
2295 
2296 
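     /*
      * Unblock the queue once enough descriptors have completed, reposting any
      * pending fragment list. Called with the event queue lock held.
      */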
2297 static void
2298 sfxge_tx_qunblock(sfxge_txq_t *stp)
2299 {
2300         sfxge_t *sp = stp->st_sp;
2301         unsigned int evq = stp->st_evq;
2302         sfxge_evq_t *sep = sp->s_sep[evq];
2303 
2304         ASSERT(mutex_owned(&(sep->se_lock)));
2305 
2306         if (stp->st_state != SFXGE_TXQ_STARTED)
2307                 return;
2308 
2309         mutex_enter(&(stp->st_lock));
2310 
2311         if (stp->st_unblock != SFXGE_TXQ_NOT_BLOCKED) {
2312                 unsigned int level;
2313 
2314                 level = stp->st_added - stp->st_completed;
2315                 if (level <= stp->st_unblock) {
2316                         stp->st_unblock = SFXGE_TXQ_NOT_BLOCKED;
2317                         sfxge_tx_qlist_post(stp);
2318                 }
2319         }
2320 
2321         sfxge_tx_qdpl_service(stp);
2322         /* lock has been dropped */
2323 }
2324 
2325 void
2326 sfxge_tx_qcomplete(sfxge_txq_t *stp)
2327 {
2328         sfxge_t *sp = stp->st_sp;
2329         sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
2330         unsigned int evq = stp->st_evq;
2331         sfxge_evq_t *sep = sp->s_sep[evq];
2332         unsigned int completed;
2333 
2334         ASSERT(mutex_owned(&(sep->se_lock)));
2335 
2336         completed = stp->st_completed;
2337         while (completed != stp->st_pending) {
2338                 unsigned int id;
2339                 sfxge_tx_mapping_t *stmp;
2340 
2341                 id = completed++ & (SFXGE_TX_NDESCS - 1);
2342 
2343                 if ((stmp = stp->st_stmp[id]) != NULL) {
2344                         mblk_t *mp;
2345 
2346                         /* Unbind all the mappings */
2347                         do {
2348                                 ASSERT(stmp->stm_mp != NULL);
2349                                 sfxge_tx_msgb_unbind(stmp);
2350 
2351                                 stmp = stmp->stm_next;
2352                         } while (stmp != NULL);
2353 
2354                         /*
2355                          * Now that the packet is no longer mapped for DMA it
2356                          * can be freed.
2357                          */
2358                         mp = stp->st_mp[id];
2359                         stp->st_mp[id] = NULL;
2360 
2361                         ASSERT(mp != NULL);
2362                         freemsg(mp);
2363                 }
2364         }
2365         stp->st_completed = completed;
2366 
2367         /* Check whether we need to unblock the queue */
2368         if (stp->st_unblock != SFXGE_TXQ_NOT_BLOCKED) {
2369                 unsigned int level;
2370 
2371                 level = stp->st_added - stp->st_completed;
2372                 if (level <= stp->st_unblock)
2373                         sfxge_tx_qunblock(stp);
2374         }
2375 
2376         /* Release TX backpressure from the TX DPL put/get list being full */
2377         if (stdp->std_count < stdp->get_pkt_limit)
2378                 mac_tx_update(sp->s_mh);
2379 }
2380 
2381 void
2382 sfxge_tx_qflush_done(sfxge_txq_t *stp)
2383 {
2384         sfxge_t *sp = stp->st_sp;
2385 
2386         ASSERT(mutex_owned(&(sp->s_sep[stp->st_evq]->se_lock)));
2387 
2388         mutex_enter(&(stp->st_lock));
2389 
2390         if (stp->st_flush == SFXGE_FLUSH_PENDING)
2391                 stp->st_flush = SFXGE_FLUSH_DONE;
2392 
2393         mutex_exit(&(stp->st_lock));
2394 
2395         mutex_enter(&(sp->s_tx_flush_lock));
2396         sp->s_tx_flush_pending--;
2397         if (sp->s_tx_flush_pending <= 0) {
2398                 /* All queues flushed: wakeup sfxge_tx_stop() */
2399                 cv_signal(&(sp->s_tx_flush_kv));
2400         }
2401         mutex_exit(&(sp->s_tx_flush_lock));
2402 }
2403 
2404 static void
2405 sfxge_tx_qflush(sfxge_t *sp, unsigned int index, boolean_t do_flush)
2406 {
2407         sfxge_txq_t *stp = sp->s_stp[index];
2408 
2409         ASSERT(mutex_owned(&(sp->s_state_lock)));
2410 
2411         mutex_enter(&(stp->st_lock));
2412 
2413         /* Prepare to flush and stop the queue */
2414         if (stp->st_state == SFXGE_TXQ_STARTED)
2415                 stp->st_state = SFXGE_TXQ_INITIALIZED;
2416         else
2417                 do_flush = B_FALSE; /* No hardware ring, so don't flush */
2418 
2419         if (do_flush)
2420                 stp->st_flush = SFXGE_FLUSH_PENDING;
2421         else
2422                 stp->st_flush = SFXGE_FLUSH_INACTIVE;
2423 
2424         mutex_exit(&(stp->st_lock));
2425 
2426         /* Flush the transmit queue */
2427         if (do_flush)
2428                 efx_tx_qflush(stp->st_etp);
2429 }
2430 
2431 static void
2432 sfxge_tx_qstop(sfxge_t *sp, unsigned int index)
2433 {
2434         sfxge_txq_t *stp = sp->s_stp[index];
2435         unsigned int evq = stp->st_evq;
2436         sfxge_evq_t *sep = sp->s_sep[evq];
2437 
2438         mutex_enter(&(sep->se_lock));
2439         mutex_enter(&(stp->st_lock));
2440         ASSERT3U(stp->st_state, ==, SFXGE_TXQ_INITIALIZED);
2441 
2442         /* All queues should have been flushed */
2443         ASSERT3S(stp->st_sp->s_tx_flush_pending, ==, 0);
2444         ASSERT(stp->st_flush != SFXGE_FLUSH_FAILED);
2445 
2446         /* Mark the flush as done, even in the case of a TX flush timeout */
2447         stp->st_flush = SFXGE_FLUSH_DONE;
2448 
2449         /* Destroy the transmit queue */
2450         efx_tx_qdestroy(stp->st_etp);
2451         stp->st_etp = NULL;
2452 
2453         /* Clear entries from the buffer table */
2454         sfxge_sram_buf_tbl_clear(sp, stp->st_id,
2455             EFX_TXQ_NBUFS(SFXGE_TX_NDESCS));
2456 
2457         sfxge_tx_qlist_abort(stp);
2458         ASSERT3U(stp->st_n, ==, 0);
2459 
2460         stp->st_unblock = SFXGE_TXQ_NOT_BLOCKED;
2461 
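             /*
              * Treat all added descriptors as pending and then complete them,
              * so that any mblks still mapped for DMA are unbound and freed.
              */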
2462         stp->st_pending = stp->st_added;
2463 
2464         sfxge_tx_qcomplete(stp);
2465         ASSERT3U(stp->st_completed, ==, stp->st_pending);
2466 
2467         sfxge_tx_qreap(stp);
2468         ASSERT3U(stp->st_reaped, ==, stp->st_completed);
2469 
2470         /*
2471          * Ensure the deferred packet list is cleared
2472          * Can race with sfxge_tx_packet_add() adding to the put list
2473          */
2474         sfxge_tx_qdpl_flush_locked(stp);
2475 
2476         stp->st_added = 0;
2477         stp->st_pending = 0;
2478         stp->st_completed = 0;
2479         stp->st_reaped = 0;
2480 
2481         mutex_exit(&(stp->st_lock));
2482         mutex_exit(&(sep->se_lock));
2483 }
2484 
2485 static void
2486 sfxge_tx_qfini(sfxge_t *sp, unsigned int index)
2487 {
2488         sfxge_txq_t *stp = sp->s_stp[index];
2489         sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
2490 
2491         /* Detach the TXQ from the driver */
2492         sp->s_stp[index] = NULL;
2493         ASSERT(sp->s_tx_qcount > 0);
2494         sp->s_tx_qcount--;
2495 
2496         ASSERT3U(stp->st_state, ==, SFXGE_TXQ_INITIALIZED);
2497         stp->st_state = SFXGE_TXQ_UNINITIALIZED;
2498 
2499         /* Tear down the statistics */
2500         sfxge_tx_kstat_fini(stp);
2501 
2502         /* Ensure the deferred packet list is empty */
2503         ASSERT3U(stdp->std_count, ==, 0);
2504         ASSERT3P(stdp->std_get, ==, NULL);
2505         ASSERT3U(stdp->std_put, ==, 0);
2506 
2507         /* Clear the free buffer pool */
2508         sfxge_tx_qfbp_empty(stp);
2509 
2510         /* Clear the free mapping pool */
2511         sfxge_tx_qfmp_empty(stp);
2512 
2513         /* Clear the free packet pool */
2514         sfxge_tx_qfpp_empty(stp);
2515 
2516         mutex_destroy(&(stp->st_lock));
2517 
2518         stp->st_evq = 0;
2519         stp->st_type = 0;
2520         stp->st_index = 0;
2521 
2522         kmem_cache_free(sp->s_tqc, stp);
2523 }
2524 
2525 int
2526 sfxge_tx_init(sfxge_t *sp)
2527 {
2528         sfxge_intr_t *sip = &(sp->s_intr);
2529         char name[MAXNAMELEN];
2530         int index;
2531         int rc;
2532 
2533         (void) snprintf(name, MAXNAMELEN - 1, "%s%d_tx_packet_cache",
2534             ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));
2535 
2536         sp->s_tpc = kmem_cache_create(name, sizeof (sfxge_tx_packet_t),
2537             SFXGE_CPU_CACHE_SIZE, sfxge_tx_packet_ctor, sfxge_tx_packet_dtor,
2538             NULL, sp, NULL, 0);
2539         ASSERT(sp->s_tpc != NULL);
2540 
2541         (void) snprintf(name, MAXNAMELEN - 1, "%s%d_tx_buffer_cache",
2542             ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));
2543 
2544         sp->s_tbc = kmem_cache_create(name, sizeof (sfxge_tx_buffer_t),
2545             SFXGE_CPU_CACHE_SIZE, sfxge_tx_buffer_ctor, sfxge_tx_buffer_dtor,
2546             NULL, sp, NULL, 0);
2547         ASSERT(sp->s_tbc != NULL);
2548 
2549         (void) snprintf(name, MAXNAMELEN - 1, "%s%d_tx_mapping_cache",
2550             ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));
2551 
2552         sp->s_tmc = kmem_cache_create(name, sizeof (sfxge_tx_mapping_t),
2553             SFXGE_CPU_CACHE_SIZE, sfxge_tx_mapping_ctor, sfxge_tx_mapping_dtor,
2554             NULL, sp, NULL, 0);
2555         ASSERT(sp->s_tmc != NULL);
2556 
2557         (void) snprintf(name, MAXNAMELEN - 1, "%s%d_txq_cache",
2558             ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));
2559 
2560         sp->s_tqc = kmem_cache_create(name, sizeof (sfxge_txq_t),
2561             SFXGE_CPU_CACHE_SIZE, sfxge_tx_qctor, sfxge_tx_qdtor, NULL, sp,
2562             NULL, 0);
2563         ASSERT(sp->s_tqc != NULL);
2564 
2565         /* Initialize the special non-checksummed transmit queues */
2566 
2567         /* NB sfxge_ev_qinit() is sensitive to using EVQ_0 */
2568         if ((rc = sfxge_tx_qinit(sp, SFXGE_TXQ_NON_CKSUM,
2569                     SFXGE_TXQ_NON_CKSUM, EVQ_0)) != 0)
2570                 goto fail1;
2571 
2572         /* NB sfxge_ev_qinit() is sensitive to using EVQ_0 */
2573         if ((rc = sfxge_tx_qinit(sp, SFXGE_TXQ_IP_CKSUM,
2574             SFXGE_TXQ_IP_CKSUM, EVQ_0)) != 0)
2575                 goto fail2;
2576 
2577         /* Initialize the normal transmit queues */
2578         for (index = 0; index < sip->si_nalloc; index++) {
2579                 if ((rc = sfxge_tx_qinit(sp, SFXGE_TXQ_IP_TCP_UDP_CKSUM + index,
2580                     SFXGE_TXQ_IP_TCP_UDP_CKSUM, index)) != 0)
2581                         goto fail3;
2582         }
2583 
2584         return (0);
2585 
2586 fail3:
2587         DTRACE_PROBE(fail3);
2588 
2589         while (--index >= 0)
2590                 sfxge_tx_qfini(sp, SFXGE_TXQ_IP_TCP_UDP_CKSUM + index);
2591 
2592         sfxge_tx_qfini(sp, SFXGE_TXQ_IP_CKSUM);
2593 
2594 fail2:
2595         DTRACE_PROBE(fail2);
2596 
2597         sfxge_tx_qfini(sp, SFXGE_TXQ_NON_CKSUM);
2598 
2599 fail1:
2600         DTRACE_PROBE1(fail1, int, rc);
2601 
2602         kmem_cache_destroy(sp->s_tqc);
2603         sp->s_tqc = NULL;
2604 
2605         kmem_cache_destroy(sp->s_tmc);
2606         sp->s_tmc = NULL;
2607 
2608         kmem_cache_destroy(sp->s_tbc);
2609         sp->s_tbc = NULL;
2610 
2611         kmem_cache_destroy(sp->s_tpc);
2612         sp->s_tpc = NULL;
2613 
2614         return (rc);
2615 }
2616 
2617 int
2618 sfxge_tx_start(sfxge_t *sp)
2619 {
2620         efx_nic_t *enp = sp->s_enp;
2621         int index;
2622         int rc;
2623 
2624         /* Initialize the transmit module */
2625         if ((rc = efx_tx_init(enp)) != 0)
2626                 goto fail1;
2627 
2628         for (index = 0; index < sp->s_tx_qcount; index++) {
2629                 if ((rc = sfxge_tx_qstart(sp, index)) != 0)
2630                         goto fail2;
2631         }
2632 
2633         return (0);
2634 
2635 fail2:
2636         DTRACE_PROBE(fail2);
2637 
2638         while (--index >= 0)
2639                 sfxge_tx_qstop(sp, index);
2640 
2641         /* Tear down the transmit module */
2642         efx_tx_fini(enp);
2643 
2644 fail1:
2645         DTRACE_PROBE1(fail1, int, rc);
2646 
2647         return (rc);
2648 }
2649 
2650 
2651 /*
2652  * Add a packet to the TX Deferred Packet List and if the TX queue lock
2653  * can be acquired then call sfxge_tx_qdpl_service() to fragment and push
2654  * to the H/W transmit descriptor ring
2655  *
2656  * If ENOSPC is returned then the DPL is full or the packet create failed, but
2657  * the mblk isn't freed so that the caller can return this mblk from mc_tx() to
2658  * back-pressure the OS stack.
2659  *
2660  * For all other errors the mblk is freed
2661  */
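     /*
      * Illustrative sketch only (not part of the driver): a GLDv3 mc_tx entry
      * point would typically walk the mblk chain, returning the unsent
      * remainder to the stack on ENOSPC so that it can apply back-pressure:
      *
      *     while (mp != NULL) {
      *             mblk_t *next = mp->b_next;
      *             mp->b_next = NULL;
      *             if (sfxge_tx_packet_add(sp, mp) == ENOSPC) {
      *                     mp->b_next = next;
      *                     return (mp);
      *             }
      *             mp = next;
      *     }
      *     return (NULL);
      */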
2662 int
2663 sfxge_tx_packet_add(sfxge_t *sp, mblk_t *mp)
2664 {
2665         struct ether_header *etherhp;
2666         struct ip *iphp;
2667         struct tcphdr *thp;
2668         size_t off;
2669         size_t size;
2670         size_t mss;
2671         sfxge_txq_t *stp;
2672         boolean_t locked;
2673         sfxge_tx_packet_t *stpp;
2674         int rc = 0;
2675 
2676         ASSERT3P(mp->b_next, ==, NULL);
2677         ASSERT(!(DB_CKSUMFLAGS(mp) & HCK_PARTIALCKSUM));
2678 
2679         /*
2680          * Do not enqueue packets during startup/shutdown;
2681          *
2682          * NOTE: This access to the state is NOT protected by the state lock. It
2683          * is an imperfect test and anything further getting onto the get/put
2684          * deferred packet lists is cleaned up in (possibly repeated) calls to
2685          * sfxge_can_destroy().
2686          */
2687         if (sp->s_state != SFXGE_STARTED) {
2688                 rc = EINVAL;
2689                 goto fail1;
2690         }
2691 
2692         etherhp = NULL;
2693         iphp = NULL;
2694         thp = NULL;
2695         off = 0;
2696         size = 0;
2697         mss = 0;
2698 
2699         /* Check whether we need the header pointers for LSO segmentation */
2700         if (DB_LSOFLAGS(mp) & HW_LSO) {
2701                 /* LSO segmentation relies on hardware checksum offload */
2702                 DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM;
2703 
2704                 if ((mss = DB_LSOMSS(mp)) == 0) {
2705                         rc = EINVAL;
2706                         goto fail1;
2707                 }
2708 
2709                 sfxge_tcp_parse(mp, &etherhp, &iphp, &thp, &off, &size);
2710 
2711                 if (etherhp == NULL ||
2712                     iphp == NULL ||
2713                     thp == NULL ||
2714                     off == 0) {
2715                         rc = EINVAL;
2716                         goto fail2;
2717                 }
2718         }
2719 
2720         /* Choose the appropriate transmit queue */
2721         if (DB_CKSUMFLAGS(mp) & HCK_FULLCKSUM) {
2722                 sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2723 
2724                 if (srsp->srs_state == SFXGE_RX_SCALE_STARTED) {
2725                         uint16_t hash;
2726                         int index;
2727 
2728                         if (srsp->srs_count > 1) {
2729                                 /*
2730                                  * If we have not already parsed the headers
2731                                  * for LSO segmentation then we need to do it
2732                                  * now so we can calculate the hash.
2733                                  */
2734                                 if (thp == NULL)
2735                                         sfxge_tcp_parse(mp, &etherhp, &iphp,
2736                                             &thp, &off, &size);
2737 
2738                                 if (thp != NULL) {
2739                                         SFXGE_TCP_HASH(
2740                                             ntohl(iphp->ip_dst.s_addr),
2741                                             ntohs(thp->th_dport),
2742                                             ntohl(iphp->ip_src.s_addr),
2743                                             ntohs(thp->th_sport), hash);
2744 
2745                                         index = srsp->srs_tbl[hash %
2746                                             SFXGE_RX_SCALE_MAX];
2747                                 } else {
2748                                         /*
2749                                          * Non-TCP traffic always goes to
2750                                          * the queue in the zero-th entry of
2751                                          * the RSS table.
2752                                          */
2753                                         index = srsp->srs_tbl[0];
2754                                 }
2755                         } else {
2756                                 /*
2757                                  * It does not matter what the hash is
2758                                  * because all the RSS table entries will be
2759                                  * the same.
2760                                  */
2761                                 index = srsp->srs_tbl[0];
2762                         }
2763 
2764                         /*
2765                          * Use the transmit queue bound to the event queue
2766                          * selected by the hash in the RSS table.
2767                          */
2768                         stp = sp->s_stp[SFXGE_TXQ_IP_TCP_UDP_CKSUM + index];
2769                         ASSERT3U(stp->st_evq, ==, index);
2770                 } else {
2771                         stp = sp->s_stp[SFXGE_TXQ_IP_TCP_UDP_CKSUM];
2772                 }
2773         } else if (DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) {
2774                 stp = sp->s_stp[SFXGE_TXQ_IP_CKSUM];
2775         } else {
2776                 if ((stp = sp->s_stp[SFXGE_TXQ_NON_CKSUM]) == NULL)
2777                         stp = sp->s_stp[SFXGE_TXQ_IP_CKSUM];
2778         }
2779         ASSERT(stp != NULL);
2780 
2781         ASSERT(mss == 0 || (DB_LSOFLAGS(mp) & HW_LSO));
2782 
2783         /* Try to grab the lock */
2784         locked = mutex_tryenter(&(stp->st_lock));
2785 
2786         if (locked) {
2787                 /* Try to grab a packet from the pool */
2788                 stpp = sfxge_tx_qfpp_get(stp);
2789         } else {
2790                 stpp = NULL;
2791         }
2792 
2793         if (stpp == NULL) {
2794                 /*
2795                  * Either the pool was empty or we don't have the lock so
2796                  * allocate a new packet.
2797                  */
2798                 if ((stpp = sfxge_tx_packet_create(sp)) == NULL) {
2799                         rc = ENOSPC;
2800                         goto fail3;
2801                 }
2802         }
2803 
2804         stpp->stp_mp = mp;
2805         stpp->stp_etherhp = etherhp;
2806         stpp->stp_iphp = iphp;
2807         stpp->stp_thp = thp;
2808         stpp->stp_off = off;
2809         stpp->stp_size = size;
2810         stpp->stp_mss = mss;
2811         stpp->stp_dpl_put_len = 0;
2812 
2813         rc = sfxge_tx_qdpl_add(stp, stpp, locked);
2814         if (rc != 0) {
2815                 /* ENOSPC can happen if the DPL get or put list is full */
2816                 ASSERT3U(rc, ==, ENOSPC);
2817 
2818                 /*
2819                  * Note: if this is the unlocked DPL put list full case there is
2820                  * no need to worry about a race with locked
2821                  * sfxge_tx_qdpl_swizzle() as we know that the TX DPL put list
2822                  * was full and would have been swizzled to the TX DPL get
2823                  * list; hence guaranteeing future TX completions and calls
2824                  * to mac_tx_update() via sfxge_tx_qcomplete()
2825                  */
2826                 goto fail4;
2827         }
2828 
2829         /* Try to grab the lock again */
2830         if (!locked)
2831                 locked = mutex_tryenter(&(stp->st_lock));
2832 
2833         if (locked) {
2834                 /* Try to service the list */
2835                 sfxge_tx_qdpl_service(stp);
2836                 /* lock has been dropped */
2837         }
2838 
2839         return (0);
2840 
2841 fail4:
2842         DTRACE_PROBE(fail4);
2843         sfxge_tx_packet_destroy(sp, stpp);
2844 fail3:
2845         DTRACE_PROBE(fail3);
2846         if (locked)
2847                 mutex_exit(&(stp->st_lock));
2848 fail2:
2849         DTRACE_PROBE(fail2);
2850 fail1:
2851         DTRACE_PROBE1(fail1, int, rc);
2852 
2853         if (rc != ENOSPC)
2854                 freemsg(mp);
2855         return (rc);
2856 }
2857 
2858 int
2859 sfxge_tx_loopback(sfxge_t *sp, unsigned int count)
2860 {
2861         uint8_t unicst[ETHERADDRL];
2862         size_t mtu;
2863         mblk_t *mp;
2864         struct ether_header *etherhp;
2865         unsigned int byte;
2866         int rc;
2867 
2868         if (count == 0) {
2869                 rc = EINVAL;
2870                 goto fail1;
2871         }
2872 
2873         rc = sfxge_mac_unicst_get(sp, SFXGE_UNICST_LAA, unicst);
2874 
2875         if (rc == ENOENT)
2876                 rc = sfxge_mac_unicst_get(sp, SFXGE_UNICST_BIA, unicst);
2877 
2878         if (rc != 0)
2879                 goto fail2;
2880 
2881         mtu = sp->s_mtu;
2882 
2883         if ((mp = allocb(sizeof (struct ether_header) + mtu,
2884             BPRI_HI)) == NULL) {
2885                 rc = ENOMEM;
2886                 goto fail3;
2887         }
2888 
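             /*
              * Build a loopback test frame: broadcast destination, our own
              * station address as source, the driver's loopback Ethertype and
              * a short alternating 0x55/0xaa test pattern as payload.
              */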
2889         mp->b_wptr = mp->b_rptr + sizeof (struct ether_header);
2890         bzero(mp->b_rptr, MBLKL(mp));
2891 
2892         /*LINTED*/
2893         etherhp = (struct ether_header *)(mp->b_rptr);
2894         bcopy(sfxge_brdcst, &(etherhp->ether_dhost), ETHERADDRL);
2895         bcopy(unicst, &(etherhp->ether_shost), ETHERADDRL);
2896         etherhp->ether_type = htons(SFXGE_ETHERTYPE_LOOPBACK);
2897 
2898         for (byte = 0; byte < 30; byte++)
2899                 *(mp->b_wptr++) = (byte & 1) ? 0xaa : 0x55;
2900 
2901         do {
2902                 mblk_t *nmp;
2903 
2904                 if ((nmp = dupb(mp)) == NULL) {
2905                         rc = ENOMEM;
2906                         goto fail4;
2907                 }
2908 
2909                 rc = sfxge_tx_packet_add(sp, nmp);
2910                 if (rc != 0) {
2911                         freeb(nmp);
2912                         goto fail5;
2913                 }
2914 
2915         } while (--count != 0);
2916 
2917         freeb(mp);
2918         return (0);
2919 
2920 fail5:
2921         DTRACE_PROBE(fail5);
2922 fail4:
2923         DTRACE_PROBE(fail4);
2924 
2925         freeb(mp);
2926 
2927 fail3:
2928         DTRACE_PROBE(fail3);
2929 fail2:
2930         DTRACE_PROBE(fail2);
2931 fail1:
2932         DTRACE_PROBE1(fail1, int, rc);
2933 
2934         return (rc);
2935 }
2936 
2937 int
2938 sfxge_tx_ioctl(sfxge_t *sp, sfxge_tx_ioc_t *stip)
2939 {
2940         int rc;
2941 
2942         switch (stip->sti_op) {
2943         case SFXGE_TX_OP_LOOPBACK: {
2944                 unsigned int count = stip->sti_data;
2945 
2946                 if ((rc = sfxge_tx_loopback(sp, count)) != 0)
2947                         goto fail1;
2948 
2949                 break;
2950         }
2951         default:
2952                 rc = ENOTSUP;
2953                 goto fail1;
2954         }
2955 
2956         return (0);
2957 
2958 fail1:
2959         DTRACE_PROBE1(fail1, int, rc);
2960 
2961         return (rc);
2962 }
2963 
2964 void
2965 sfxge_tx_stop(sfxge_t *sp)
2966 {
2967         efx_nic_t *enp = sp->s_enp;
2968         clock_t timeout;
2969         boolean_t do_flush;
2970         int index;
2971 
2972         ASSERT(mutex_owned(&(sp->s_state_lock)));
2973 
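             /*
              * Flush handshake: s_tx_flush_pending is set to the number of
              * queues to flush; sfxge_tx_qflush_done() decrements it for each
              * flushed queue and signals s_tx_flush_kv when none remain.
              */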
2974         mutex_enter(&(sp->s_tx_flush_lock));
2975 
2976         /* Flush all the queues */
2977         if (sp->s_hw_err == SFXGE_HW_OK) {
2978                 sp->s_tx_flush_pending = sp->s_tx_qcount;
2979                 do_flush = B_TRUE;
2980         } else {
2981                 sp->s_tx_flush_pending = 0;
2982                 do_flush = B_FALSE;
2983         }
2984 
2985         /* Prepare queues to stop and flush the hardware ring */
2986         for (index = 0; index < sp->s_tx_qcount; index++)
2987                 sfxge_tx_qflush(sp, index, do_flush);
2988 
2989         if (do_flush == B_FALSE)
2990                 goto flush_done;
2991 
2992         /* Wait up to 2 seconds for queue flushing to complete */
2993         timeout = ddi_get_lbolt() + drv_usectohz(SFXGE_TX_QFLUSH_USEC);
2994 
2995         while (sp->s_tx_flush_pending > 0) {
2996                 if (cv_timedwait(&(sp->s_tx_flush_kv), &(sp->s_tx_flush_lock),
2997                         timeout) < 0) {
2998                         /* Timeout waiting for queues to flush */
2999                         dev_info_t *dip = sp->s_dip;
3000 
3001                         DTRACE_PROBE(timeout);
3002                         cmn_err(CE_NOTE,
3003                             SFXGE_CMN_ERR "[%s%d] tx qflush timeout",
3004                             ddi_driver_name(dip), ddi_get_instance(dip));
3005                         break;
3006                 }
3007         }
3008         sp->s_tx_flush_pending = 0;
3009 
3010 flush_done:
3011         mutex_exit(&(sp->s_tx_flush_lock));
3012 
3013         /* Stop all the queues */
3014         for (index = 0; index < sp->s_tx_qcount; index++)
3015                 sfxge_tx_qstop(sp, index);
3016 
3017         /* Tear down the transmit module */
3018         efx_tx_fini(enp);
3019 }
3020 
3021 void
3022 sfxge_tx_fini(sfxge_t *sp)
3023 {
3024         int index;
3025 
3026         index = sp->s_tx_qcount;
3027         while (--index >= 0)
3028                 sfxge_tx_qfini(sp, index);
3029 
3030         kmem_cache_destroy(sp->s_tqc);
3031         sp->s_tqc = NULL;
3032 
3033         kmem_cache_destroy(sp->s_tmc);
3034         sp->s_tmc = NULL;
3035 
3036         kmem_cache_destroy(sp->s_tbc);
3037         sp->s_tbc = NULL;
3038 
3039         kmem_cache_destroy(sp->s_tpc);
3040         sp->s_tpc = NULL;
3041 }