/*
 * Copyright (c) 2008-2015 Solarflare Communications Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are
 * those of the authors and should not be interpreted as representing official
 * policies, either expressed or implied, of the FreeBSD Project.
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/atomic.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/pattr.h>
#include <sys/cpu.h>

#include <sys/ethernet.h>
#include <inet/ip.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

#include "sfxge.h"

#include "efx.h"

/* TXQ flush response timeout (in microseconds) */
#define	SFXGE_TX_QFLUSH_USEC	(2000000)

/* See sfxge.conf.private for descriptions */
#define	SFXGE_TX_DPL_GET_PKT_LIMIT_DEFAULT 4096
#define	SFXGE_TX_DPL_PUT_PKT_LIMIT_DEFAULT 256
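
/*
 * These can be tuned without a rebuild; for example, a line along the
 * following lines in the driver's .conf file would halve the get list
 * limit (value illustrative):
 *
 *	tx_dpl_get_pkt_limit=2048;
 */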

/* Transmit buffer DMA attributes */
static ddi_device_acc_attr_t sfxge_tx_buffer_devacc = {
	DDI_DEVICE_ATTR_V0,	/* devacc_attr_version */
	DDI_NEVERSWAP_ACC,	/* devacc_attr_endian_flags */
	DDI_STRICTORDER_ACC	/* devacc_attr_dataorder */
};

static ddi_dma_attr_t sfxge_tx_buffer_dma_attr = {
	DMA_ATTR_V0,		/* dma_attr_version	*/
	0,			/* dma_attr_addr_lo	*/
	0xffffffffffffffffull,	/* dma_attr_addr_hi	*/
	0xffffffffffffffffull,	/* dma_attr_count_max	*/
	SFXGE_TX_BUFFER_SIZE,	/* dma_attr_align	*/
	0xffffffff,		/* dma_attr_burstsizes	*/
	1,			/* dma_attr_minxfer	*/
	0xffffffffffffffffull,	/* dma_attr_maxxfer	*/
	0xffffffffffffffffull,	/* dma_attr_seg		*/
	1,			/* dma_attr_sgllen	*/
	1,			/* dma_attr_granular	*/
	0			/* dma_attr_flags	*/
};

/* Transmit mapping DMA attributes */
static ddi_dma_attr_t sfxge_tx_mapping_dma_attr = {
	DMA_ATTR_V0,		/* dma_attr_version	*/
	0,			/* dma_attr_addr_lo	*/
	0xffffffffffffffffull,	/* dma_attr_addr_hi	*/
	0xffffffffffffffffull,	/* dma_attr_count_max	*/
	1,			/* dma_attr_align	*/
	0xffffffff,		/* dma_attr_burstsizes	*/
	1,			/* dma_attr_minxfer	*/
	0xffffffffffffffffull,	/* dma_attr_maxxfer	*/
	0xffffffffffffffffull,	/* dma_attr_seg		*/
	0x7fffffff,		/* dma_attr_sgllen	*/
	1,			/* dma_attr_granular	*/
	0			/* dma_attr_flags	*/
};
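
/*
 * Note the contrast with sfxge_tx_buffer_dma_attr above: bounce buffers are
 * driver-allocated and must resolve to a single DMA cookie (dma_attr_sgllen
 * of 1, aligned to SFXGE_TX_BUFFER_SIZE), whereas mappings bind arbitrary
 * caller-supplied mblks and therefore accept an effectively unlimited
 * scatter/gather list.
 */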

/* Transmit queue DMA attributes */
static ddi_device_acc_attr_t sfxge_txq_devacc = {
	DDI_DEVICE_ATTR_V0,	/* devacc_attr_version */
	DDI_NEVERSWAP_ACC,	/* devacc_attr_endian_flags */
	DDI_STRICTORDER_ACC	/* devacc_attr_dataorder */
};

static ddi_dma_attr_t sfxge_txq_dma_attr = {
	DMA_ATTR_V0,		/* dma_attr_version	*/
	0,			/* dma_attr_addr_lo	*/
	0xffffffffffffffffull,	/* dma_attr_addr_hi	*/
	0xffffffffffffffffull,	/* dma_attr_count_max	*/
	EFX_BUF_SIZE,		/* dma_attr_align	*/
	0xffffffff,		/* dma_attr_burstsizes	*/
	1,			/* dma_attr_minxfer	*/
	0xffffffffffffffffull,	/* dma_attr_maxxfer	*/
	0xffffffffffffffffull,	/* dma_attr_seg		*/
	1,			/* dma_attr_sgllen	*/
	1,			/* dma_attr_granular	*/
	0			/* dma_attr_flags	*/
};

/*
 * An sfxge_tx_qdpl_swizzle() can happen when the DPL get list is one packet
 * under the limit and must then move all packets from the DPL put list to
 * the get list. Hence this is the real maximum length of the TX DPL get
 * list.
 */
static int
sfxge_tx_dpl_get_pkt_max(sfxge_txq_t *stp)
{
	sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
	return (stdp->get_pkt_limit + stdp->put_pkt_limit - 1);
}
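
/*
 * With the default limits above this evaluates to 4096 + 256 - 1 = 4351
 * packets on the get list immediately after such a swizzle.
 */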

static int
sfxge_tx_packet_ctor(void *buf, void *arg, int kmflags)
{
	_NOTE(ARGUNUSED(arg, kmflags))

	bzero(buf, sizeof (sfxge_tx_packet_t));

	return (0);
}

static void
sfxge_tx_packet_dtor(void *buf, void *arg)
{
	sfxge_tx_packet_t *stpp = buf;

	_NOTE(ARGUNUSED(arg))

	SFXGE_OBJ_CHECK(stpp, sfxge_tx_packet_t);
}
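
/*
 * The ctor/dtor pairs in this file are plugged into kmem object caches.
 * The kmem_cache_create() calls live elsewhere in the driver, but would be
 * shaped roughly as follows (cache name illustrative):
 *
 *	sp->s_tpc = kmem_cache_create("sfxge_tx_packet",
 *	    sizeof (sfxge_tx_packet_t), 0, sfxge_tx_packet_ctor,
 *	    sfxge_tx_packet_dtor, NULL, sp, NULL, 0);
 */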

static int
sfxge_tx_buffer_ctor(void *buf, void *arg, int kmflags)
{
	sfxge_tx_buffer_t *stbp = buf;
	sfxge_t *sp = arg;
	sfxge_dma_buffer_attr_t dma_attr;
	int rc;

	bzero(buf, sizeof (sfxge_tx_buffer_t));

	dma_attr.sdba_dip	 = sp->s_dip;
	dma_attr.sdba_dattrp	 = &sfxge_tx_buffer_dma_attr;
	dma_attr.sdba_callback	 = ((kmflags == KM_SLEEP) ?
	    DDI_DMA_SLEEP : DDI_DMA_DONTWAIT);
	dma_attr.sdba_length	 = SFXGE_TX_BUFFER_SIZE;
	dma_attr.sdba_memflags	 = DDI_DMA_STREAMING;
	dma_attr.sdba_devaccp	 = &sfxge_tx_buffer_devacc;
	dma_attr.sdba_bindflags	 = DDI_DMA_WRITE | DDI_DMA_STREAMING;
	dma_attr.sdba_maxcookies = 1;
	dma_attr.sdba_zeroinit	 = B_FALSE;

	if ((rc = sfxge_dma_buffer_create(&(stbp->stb_esm), &dma_attr)) != 0)
		goto fail1;

	return (0);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	SFXGE_OBJ_CHECK(stbp, sfxge_tx_buffer_t);

	return (-1);
}

static void
sfxge_tx_buffer_dtor(void *buf, void *arg)
{
	sfxge_tx_buffer_t *stbp = buf;

	_NOTE(ARGUNUSED(arg))

	sfxge_dma_buffer_destroy(&(stbp->stb_esm));

	SFXGE_OBJ_CHECK(stbp, sfxge_tx_buffer_t);
}

static int
sfxge_tx_mapping_ctor(void *buf, void *arg, int kmflags)
{
	sfxge_tx_mapping_t *stmp = buf;
	sfxge_t *sp = arg;
	dev_info_t *dip = sp->s_dip;
	int rc;

	bzero(buf, sizeof (sfxge_tx_mapping_t));

	stmp->stm_sp = sp;

	/* Allocate DMA handle */
	rc = ddi_dma_alloc_handle(dip, &sfxge_tx_mapping_dma_attr,
	    (kmflags == KM_SLEEP) ? DDI_DMA_SLEEP : DDI_DMA_DONTWAIT,
	    NULL, &(stmp->stm_dma_handle));
	if (rc != DDI_SUCCESS)
		goto fail1;

	return (0);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	stmp->stm_sp = NULL;

	SFXGE_OBJ_CHECK(stmp, sfxge_tx_mapping_t);

	return (-1);
}

static void
sfxge_tx_mapping_dtor(void *buf, void *arg)
{
	sfxge_tx_mapping_t *stmp = buf;

	ASSERT3P(stmp->stm_sp, ==, arg);

	/* Free the DMA handle */
	ddi_dma_free_handle(&(stmp->stm_dma_handle));
	stmp->stm_dma_handle = NULL;

	stmp->stm_sp = NULL;

	SFXGE_OBJ_CHECK(stmp, sfxge_tx_mapping_t);
}

static int
sfxge_tx_qctor(void *buf, void *arg, int kmflags)
{
	sfxge_txq_t *stp = buf;
	efsys_mem_t *esmp = &(stp->st_mem);
	sfxge_t *sp = arg;
	sfxge_dma_buffer_attr_t dma_attr;
	sfxge_tx_dpl_t *stdp;
	int rc;

	/* Compile-time structure layout checks */
	EFX_STATIC_ASSERT(sizeof (stp->__st_u1.__st_s1) <=
	    sizeof (stp->__st_u1.__st_pad));
	EFX_STATIC_ASSERT(sizeof (stp->__st_u2.__st_s2) <=
	    sizeof (stp->__st_u2.__st_pad));
	EFX_STATIC_ASSERT(sizeof (stp->__st_u3.__st_s3) <=
	    sizeof (stp->__st_u3.__st_pad));
	EFX_STATIC_ASSERT(sizeof (stp->__st_u4.__st_s4) <=
	    sizeof (stp->__st_u4.__st_pad));

	bzero(buf, sizeof (sfxge_txq_t));

	stp->st_sp = sp;

	dma_attr.sdba_dip	 = sp->s_dip;
	dma_attr.sdba_dattrp	 = &sfxge_txq_dma_attr;
	dma_attr.sdba_callback	 = DDI_DMA_SLEEP;
	dma_attr.sdba_length	 = EFX_TXQ_SIZE(SFXGE_TX_NDESCS);
	dma_attr.sdba_memflags	 = DDI_DMA_CONSISTENT;
	dma_attr.sdba_devaccp	 = &sfxge_txq_devacc;
	dma_attr.sdba_bindflags	 = DDI_DMA_READ | DDI_DMA_CONSISTENT;
	dma_attr.sdba_maxcookies = EFX_TXQ_NBUFS(SFXGE_TX_NDESCS);
	dma_attr.sdba_zeroinit	 = B_FALSE;

	if ((rc = sfxge_dma_buffer_create(esmp, &dma_attr)) != 0)
		goto fail1;

	/* Allocate some buffer table entries */
	if ((rc = sfxge_sram_buf_tbl_alloc(sp, EFX_TXQ_NBUFS(SFXGE_TX_NDESCS),
	    &(stp->st_id))) != 0)
		goto fail2;

	/* Allocate the descriptor array */
	if ((stp->st_eb = kmem_zalloc(sizeof (efx_buffer_t) *
	    EFX_TXQ_LIMIT(SFXGE_TX_NDESCS), kmflags)) == NULL) {
		rc = ENOMEM;
		goto fail3;
	}

	/* Allocate the context arrays */
	if ((stp->st_stmp = kmem_zalloc(sizeof (sfxge_tx_mapping_t *) *
	    SFXGE_TX_NDESCS, kmflags)) == NULL) {
		rc = ENOMEM;
		goto fail4;
	}

	if ((stp->st_stbp = kmem_zalloc(sizeof (sfxge_tx_buffer_t *) *
	    SFXGE_TX_NDESCS, kmflags)) == NULL) {
		rc = ENOMEM;
		goto fail5;
	}

	if ((stp->st_mp = kmem_zalloc(sizeof (mblk_t *) *
	    SFXGE_TX_NDESCS, kmflags)) == NULL) {
		rc = ENOMEM;
		goto fail6;
	}

	/* Initialize the deferred packet list */
	stdp = &(stp->st_dpl);
	stdp->std_getp = &(stdp->std_get);

	stp->st_unblock = SFXGE_TXQ_NOT_BLOCKED;

	return (0);

fail6:
	DTRACE_PROBE(fail6);

	kmem_free(stp->st_stbp, sizeof (sfxge_tx_buffer_t *) * SFXGE_TX_NDESCS);
	stp->st_stbp = NULL;

fail5:
	DTRACE_PROBE(fail5);

	kmem_free(stp->st_stmp,
	    sizeof (sfxge_tx_mapping_t *) * SFXGE_TX_NDESCS);
	stp->st_stmp = NULL;

fail4:
	DTRACE_PROBE(fail4);

	/* Free the descriptor array */
	kmem_free(stp->st_eb, sizeof (efx_buffer_t) *
	    EFX_TXQ_LIMIT(SFXGE_TX_NDESCS));
	stp->st_eb = NULL;

fail3:
	DTRACE_PROBE(fail3);

	/* Free the buffer table entries */
	sfxge_sram_buf_tbl_free(sp, stp->st_id, EFX_TXQ_NBUFS(SFXGE_TX_NDESCS));
	stp->st_id = 0;

fail2:
	DTRACE_PROBE(fail2);

	/* Tear down DMA setup */
	sfxge_dma_buffer_destroy(esmp);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	stp->st_sp = NULL;

	SFXGE_OBJ_CHECK(stp, sfxge_txq_t);

	return (-1);
}

static void
sfxge_tx_qdtor(void *buf, void *arg)
{
	sfxge_txq_t *stp = buf;
	efsys_mem_t *esmp = &(stp->st_mem);
	sfxge_t *sp = stp->st_sp;
	sfxge_tx_dpl_t *stdp;

	_NOTE(ARGUNUSED(arg))

	stp->st_unblock = 0;

	/* Tear down the deferred packet list */
	stdp = &(stp->st_dpl);
	ASSERT3P(stdp->std_getp, ==, &(stdp->std_get));
	stdp->std_getp = NULL;

	/* Free the context arrays */
	kmem_free(stp->st_mp, sizeof (mblk_t *) * SFXGE_TX_NDESCS);
	stp->st_mp = NULL;

	kmem_free(stp->st_stbp, sizeof (sfxge_tx_buffer_t *) * SFXGE_TX_NDESCS);
	stp->st_stbp = NULL;

	kmem_free(stp->st_stmp,
	    sizeof (sfxge_tx_mapping_t *) * SFXGE_TX_NDESCS);
	stp->st_stmp = NULL;

	/* Free the descriptor array */
	kmem_free(stp->st_eb, sizeof (efx_buffer_t) *
	    EFX_TXQ_LIMIT(SFXGE_TX_NDESCS));
	stp->st_eb = NULL;

	/* Free the buffer table entries */
	sfxge_sram_buf_tbl_free(sp, stp->st_id, EFX_TXQ_NBUFS(SFXGE_TX_NDESCS));
	stp->st_id = 0;

	/* Tear down DMA setup */
	sfxge_dma_buffer_destroy(esmp);

	stp->st_sp = NULL;

	SFXGE_OBJ_CHECK(stp, sfxge_txq_t);
}

static void
sfxge_tx_packet_destroy(sfxge_t *sp, sfxge_tx_packet_t *stpp)
{
	kmem_cache_free(sp->s_tpc, stpp);
}

static sfxge_tx_packet_t *
sfxge_tx_packet_create(sfxge_t *sp)
{
	sfxge_tx_packet_t *stpp;

	stpp = kmem_cache_alloc(sp->s_tpc, KM_NOSLEEP);

	return (stpp);
}

static inline int
sfxge_tx_qfpp_put(sfxge_txq_t *stp, sfxge_tx_packet_t *stpp)
{
	sfxge_tx_fpp_t *stfp = &(stp->st_fpp);

	ASSERT(mutex_owned(&(stp->st_lock)));

	ASSERT3P(stpp->stp_next, ==, NULL);
	ASSERT3P(stpp->stp_mp, ==, NULL);
	ASSERT3P(stpp->stp_etherhp, ==, NULL);
	ASSERT3P(stpp->stp_iphp, ==, NULL);
	ASSERT3P(stpp->stp_thp, ==, NULL);
	ASSERT3U(stpp->stp_off, ==, 0);
	ASSERT3U(stpp->stp_size, ==, 0);
	ASSERT3U(stpp->stp_mss, ==, 0);
	ASSERT3U(stpp->stp_dpl_put_len, ==, 0);

	if (stfp->stf_count < SFXGE_TX_FPP_MAX) {
		/* Add to the start of the list */
		stpp->stp_next = stfp->stf_stpp;
		stfp->stf_stpp = stpp;
		stfp->stf_count++;

		return (0);
	}

	DTRACE_PROBE(fpp_full);
	return (ENOSPC);
}

static inline sfxge_tx_packet_t *
sfxge_tx_qfpp_get(sfxge_txq_t *stp)
{
	sfxge_tx_packet_t *stpp;
	sfxge_tx_fpp_t *stfp = &(stp->st_fpp);

	ASSERT(mutex_owned(&(stp->st_lock)));

	stpp = stfp->stf_stpp;
	if (stpp == NULL) {
		ASSERT3U(stfp->stf_count, ==, 0);
		return (NULL);
	}

	/* Remove item from the head of the list */
	stfp->stf_stpp = stpp->stp_next;
	stpp->stp_next = NULL;

	ASSERT3U(stfp->stf_count, >, 0);
	stfp->stf_count--;

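	/*
	 * Prefetch the packet now at the head of the list so that an
	 * immediately following get finds it warm in the cache.
	 */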
	if (stfp->stf_count != 0) {
		ASSERT(stfp->stf_stpp != NULL);
		prefetch_read_many(stfp->stf_stpp);
	}
	return (stpp);
}

static void
sfxge_tx_qfpp_empty(sfxge_txq_t *stp)
{
	sfxge_t *sp = stp->st_sp;
	sfxge_tx_fpp_t *stfp = &(stp->st_fpp);
	sfxge_tx_packet_t *stpp;

	mutex_enter(&(stp->st_lock));

	stpp = stfp->stf_stpp;
	stfp->stf_stpp = NULL;

	while (stpp != NULL) {
		sfxge_tx_packet_t *next;

		next = stpp->stp_next;
		stpp->stp_next = NULL;

		ASSERT3U(stfp->stf_count, >, 0);
		stfp->stf_count--;

		sfxge_tx_packet_destroy(sp, stpp);

		stpp = next;
	}
	ASSERT3U(stfp->stf_count, ==, 0);

	mutex_exit(&(stp->st_lock));
}

static inline void
sfxge_tx_qfbp_put(sfxge_txq_t *stp, sfxge_tx_buffer_t *stbp)
{
	sfxge_tx_fbp_t *stfp = &(stp->st_fbp);

	ASSERT3P(stbp->stb_next, ==, NULL);
	ASSERT3U(stbp->stb_off, ==, 0);
	ASSERT3U(stbp->stb_esm.esm_used, ==, 0);

	stbp->stb_next = stfp->stf_stbp;
	stfp->stf_stbp = stbp;
	stfp->stf_count++;
}

static inline sfxge_tx_buffer_t *
sfxge_tx_qfbp_get(sfxge_txq_t *stp)
{
	sfxge_tx_buffer_t *stbp;
	sfxge_tx_fbp_t *stfp = &(stp->st_fbp);

	stbp = stfp->stf_stbp;
	if (stbp == NULL) {
		ASSERT3U(stfp->stf_count, ==, 0);
		return (NULL);
	}

	stfp->stf_stbp = stbp->stb_next;
	stbp->stb_next = NULL;

	ASSERT3U(stfp->stf_count, >, 0);
	stfp->stf_count--;

	if (stfp->stf_count != 0) {
		ASSERT(stfp->stf_stbp != NULL);
		prefetch_read_many(stfp->stf_stbp);
	}

	return (stbp);
}

static void
sfxge_tx_qfbp_empty(sfxge_txq_t *stp)
{
	sfxge_t *sp = stp->st_sp;
	sfxge_tx_fbp_t *stfp = &(stp->st_fbp);
	sfxge_tx_buffer_t *stbp;

	mutex_enter(&(stp->st_lock));

	stbp = stfp->stf_stbp;
	stfp->stf_stbp = NULL;

	while (stbp != NULL) {
		sfxge_tx_buffer_t *next;

		next = stbp->stb_next;
		stbp->stb_next = NULL;

		ASSERT3U(stfp->stf_count, >, 0);
		stfp->stf_count--;

		kmem_cache_free(sp->s_tbc, stbp);

		stbp = next;
	}
	ASSERT3U(stfp->stf_count, ==, 0);

	mutex_exit(&(stp->st_lock));
}

static inline void
sfxge_tx_qfmp_put(sfxge_txq_t *stp, sfxge_tx_mapping_t *stmp)
{
	sfxge_tx_fmp_t *stfp = &(stp->st_fmp);

	ASSERT3P(stmp->stm_next, ==, NULL);
	ASSERT3P(stmp->stm_mp, ==, NULL);
	ASSERT3P(stmp->stm_base, ==, NULL);
	ASSERT3U(stmp->stm_off, ==, 0);
	ASSERT3U(stmp->stm_size, ==, 0);

	stmp->stm_next = stfp->stf_stmp;
	stfp->stf_stmp = stmp;
	stfp->stf_count++;
}

static inline sfxge_tx_mapping_t *
sfxge_tx_qfmp_get(sfxge_txq_t *stp)
{
	sfxge_tx_mapping_t *stmp;
	sfxge_tx_fmp_t *stfp = &(stp->st_fmp);

	stmp = stfp->stf_stmp;
	if (stmp == NULL) {
		ASSERT3U(stfp->stf_count, ==, 0);
		return (NULL);
	}

	stfp->stf_stmp = stmp->stm_next;
	stmp->stm_next = NULL;

	ASSERT3U(stfp->stf_count, >, 0);
	stfp->stf_count--;

	if (stfp->stf_count != 0) {
		ASSERT(stfp->stf_stmp != NULL);
		prefetch_read_many(stfp->stf_stmp);
	}
	return (stmp);
}

static void
sfxge_tx_qfmp_empty(sfxge_txq_t *stp)
{
	sfxge_t *sp = stp->st_sp;
	sfxge_tx_fmp_t *stfp = &(stp->st_fmp);
	sfxge_tx_mapping_t *stmp;

	mutex_enter(&(stp->st_lock));

	stmp = stfp->stf_stmp;
	stfp->stf_stmp = NULL;

	while (stmp != NULL) {
		sfxge_tx_mapping_t *next;

		next = stmp->stm_next;
		stmp->stm_next = NULL;

		ASSERT3U(stfp->stf_count, >, 0);
		stfp->stf_count--;

		kmem_cache_free(sp->s_tmc, stmp);

		stmp = next;
	}
	ASSERT3U(stfp->stf_count, ==, 0);

	mutex_exit(&(stp->st_lock));
}

static void
sfxge_tx_msgb_unbind(sfxge_tx_mapping_t *stmp)
{
	bzero(stmp->stm_addr, sizeof (uint64_t) * SFXGE_TX_MAPPING_NADDR);
	stmp->stm_off = 0;

	(void) ddi_dma_unbind_handle(stmp->stm_dma_handle);

	stmp->stm_size = 0;
	stmp->stm_base = NULL;

	stmp->stm_mp = NULL;
}

#define	SFXGE_TX_DESCSHIFT	12
#define	SFXGE_TX_DESCSIZE	(1 << SFXGE_TX_DESCSHIFT)

#define	SFXGE_TX_DESCOFFSET	(SFXGE_TX_DESCSIZE - 1)
#define	SFXGE_TX_DESCMASK	(~SFXGE_TX_DESCOFFSET)
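
/*
 * For example, a DMA address of 0x12345abc splits into a descriptor page
 * base of 0x12345000 (address & SFXGE_TX_DESCMASK) and an offset of 0xabc
 * (address & SFXGE_TX_DESCOFFSET) within that 4KB page.
 */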

static int
sfxge_tx_msgb_bind(mblk_t *mp, sfxge_tx_mapping_t *stmp)
{
	ddi_dma_cookie_t dmac;
	unsigned int ncookies;
	size_t size;
	unsigned int n;
	int rc;

	ASSERT(mp != NULL);
	ASSERT3U(DB_TYPE(mp), ==, M_DATA);

	ASSERT(stmp->stm_mp == NULL);
	stmp->stm_mp = mp;

	stmp->stm_base = (caddr_t)(mp->b_rptr);
	stmp->stm_size = MBLKL(mp);

	/* Bind the STREAMS block to the mapping */
	rc = ddi_dma_addr_bind_handle(stmp->stm_dma_handle, NULL,
	    stmp->stm_base, stmp->stm_size, DDI_DMA_WRITE | DDI_DMA_STREAMING,
	    DDI_DMA_DONTWAIT, NULL, &dmac, &ncookies);
	if (rc != DDI_DMA_MAPPED)
		goto fail1;

	ASSERT3U(ncookies, <=, SFXGE_TX_MAPPING_NADDR);

	/* Construct an array of addresses and an initial offset */
	n = 0;
	stmp->stm_addr[n++] = dmac.dmac_laddress & SFXGE_TX_DESCMASK;
	DTRACE_PROBE1(addr, uint64_t, dmac.dmac_laddress & SFXGE_TX_DESCMASK);

	stmp->stm_off = dmac.dmac_laddress & SFXGE_TX_DESCOFFSET;

	size = MIN(SFXGE_TX_DESCSIZE - stmp->stm_off, dmac.dmac_size);
	dmac.dmac_laddress += size;
	dmac.dmac_size -= size;

	for (;;) {
		ASSERT3U(n, <, SFXGE_TX_MAPPING_NADDR);

		if (dmac.dmac_size == 0) {
			if (--ncookies == 0)
				break;

			ddi_dma_nextcookie(stmp->stm_dma_handle, &dmac);
		}

		ASSERT((dmac.dmac_laddress & SFXGE_TX_DESCMASK) != 0);
		ASSERT((dmac.dmac_laddress & SFXGE_TX_DESCOFFSET) == 0);
		stmp->stm_addr[n++] = dmac.dmac_laddress;
		DTRACE_PROBE1(addr, uint64_t, dmac.dmac_laddress);

		size = MIN(SFXGE_TX_DESCSIZE, dmac.dmac_size);
		dmac.dmac_laddress += size;
		dmac.dmac_size -= size;
	}
	ASSERT3U(n, <=, SFXGE_TX_MAPPING_NADDR);

	return (0);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	stmp->stm_size = 0;
	stmp->stm_base = NULL;

	stmp->stm_mp = NULL;

	return (-1);
}

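/*
 * Reap completed descriptors: walk the ring from st_reaped up to
 * st_completed, returning the mappings and buffers recorded against each
 * descriptor to the per-queue free pools for reuse.
 */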
static void
sfxge_tx_qreap(sfxge_txq_t *stp)
{
	unsigned int reaped;

	ASSERT(mutex_owned(&(stp->st_lock)));

	reaped = stp->st_reaped;
	while (reaped != stp->st_completed) {
		unsigned int id;
		sfxge_tx_mapping_t *stmp;
		sfxge_tx_buffer_t *stbp;

		id = reaped++ & (SFXGE_TX_NDESCS - 1);

		ASSERT3P(stp->st_mp[id], ==, NULL);

		if ((stmp = stp->st_stmp[id]) != NULL) {
			stp->st_stmp[id] = NULL;

			/* Free all the mappings */
			do {
				sfxge_tx_mapping_t *next;

				next = stmp->stm_next;
				stmp->stm_next = NULL;

				sfxge_tx_qfmp_put(stp, stmp);

				stmp = next;
			} while (stmp != NULL);
		}

		if ((stbp = stp->st_stbp[id]) != NULL) {
			stp->st_stbp[id] = NULL;

			/* Free all the buffers */
			do {
				sfxge_tx_buffer_t *next;

				next = stbp->stb_next;
				stbp->stb_next = NULL;

				stbp->stb_esm.esm_used = 0;
				stbp->stb_off = 0;

				sfxge_tx_qfbp_put(stp, stbp);

				stbp = next;
			} while (stbp != NULL);
		}
	}
	stp->st_reaped = reaped;
}

static void
sfxge_tx_qlist_abort(sfxge_txq_t *stp)
{
	unsigned int id;
	sfxge_tx_mapping_t *stmp;
	sfxge_tx_buffer_t *stbp;
	mblk_t *mp;

	ASSERT(mutex_owned(&(stp->st_lock)));

	id = stp->st_added & (SFXGE_TX_NDESCS - 1);

	/* Clear the completion information */
	stmp = stp->st_stmp[id];
	stp->st_stmp[id] = NULL;

	/* Free any mappings that were used */
	while (stmp != NULL) {
		sfxge_tx_mapping_t *next;

		next = stmp->stm_next;
		stmp->stm_next = NULL;

		if (stmp->stm_mp != NULL)
			sfxge_tx_msgb_unbind(stmp);

		sfxge_tx_qfmp_put(stp, stmp);

		stmp = next;
	}

	stbp = stp->st_stbp[id];
	stp->st_stbp[id] = NULL;

	/* Free any buffers that were used */
	while (stbp != NULL) {
		sfxge_tx_buffer_t *next;

		next = stbp->stb_next;
		stbp->stb_next = NULL;

		stbp->stb_off = 0;
		stbp->stb_esm.esm_used = 0;

		sfxge_tx_qfbp_put(stp, stbp);

		stbp = next;
	}

	mp = stp->st_mp[id];
	stp->st_mp[id] = NULL;

	if (mp != NULL)
		freemsg(mp);

	/* Clear the fragment list */
	stp->st_n = 0;
}

/*
 * Push descriptors to the TX ring, marking the queue blocked if there is
 * insufficient space.
 */
static void
sfxge_tx_qlist_post(sfxge_txq_t *stp)
{
	unsigned int id;
	unsigned int level;
	unsigned int available;
	int rc;

	ASSERT(mutex_owned(&(stp->st_lock)));

	ASSERT(stp->st_n != 0);

again:
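	/*
	 * st_added, st_reaped and friends are free-running counters, so
	 * unsigned subtraction gives the ring fill level even across
	 * wraparound; masking with (SFXGE_TX_NDESCS - 1) to form an index
	 * relies on the ring size being a power of two.
	 */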
	level = stp->st_added - stp->st_reaped;
	available = EFX_TXQ_LIMIT(SFXGE_TX_NDESCS) - level;

	id = stp->st_added & (SFXGE_TX_NDESCS - 1);

	if (available < stp->st_n) {
		rc = ENOSPC;
		goto fail1;
	}

	ASSERT3U(available, >=, stp->st_n);

	/* Post the fragment list */
	if ((rc = efx_tx_qpost(stp->st_etp, stp->st_eb, stp->st_n,
	    stp->st_reaped, &(stp->st_added))) != 0)
		goto fail2;

	/*
	 * If the list took more than a single descriptor then we need to
	 * move the completion information so it is referenced by the last
	 * descriptor.
	 */
	if (((stp->st_added - 1) & (SFXGE_TX_NDESCS - 1)) != id) {
		sfxge_tx_mapping_t *stmp;
		sfxge_tx_buffer_t *stbp;
		mblk_t *mp;

		stmp = stp->st_stmp[id];
		stp->st_stmp[id] = NULL;

		stbp = stp->st_stbp[id];
		stp->st_stbp[id] = NULL;

		mp = stp->st_mp[id];
		stp->st_mp[id] = NULL;

		id = (stp->st_added - 1) & (SFXGE_TX_NDESCS - 1);

		ASSERT(stp->st_stmp[id] == NULL);
		stp->st_stmp[id] = stmp;

		ASSERT(stp->st_stbp[id] == NULL);
		stp->st_stbp[id] = stbp;

		ASSERT(stp->st_mp[id] == NULL);
		stp->st_mp[id] = mp;
	}

	/* Clear the list */
	stp->st_n = 0;

	ASSERT3U(stp->st_unblock, ==, SFXGE_TXQ_NOT_BLOCKED);
	return;

fail2:
	DTRACE_PROBE(fail2);
fail1:
	DTRACE_PROBE1(fail1, int, rc);

	ASSERT(rc == ENOSPC);

	level = stp->st_added - stp->st_completed;
	available = EFX_TXQ_LIMIT(SFXGE_TX_NDESCS) - level;

	/*
	 * If there would be enough space after we've reaped any completed
	 * mappings and buffers, and we gain sufficient queue space by doing
	 * so, then reap now and try posting again.
	 */
	if (stp->st_n <= available &&
	    stp->st_completed - stp->st_reaped >= SFXGE_TX_BATCH) {
		sfxge_tx_qreap(stp);

		goto again;
	}

	/*
	 * Set the unblock level: the first failed post marks the queue
	 * blocked at LEVEL1; a further failure while already blocked
	 * escalates it to LEVEL2.
	 */
	if (stp->st_unblock == SFXGE_TXQ_NOT_BLOCKED) {
		stp->st_unblock = SFXGE_TXQ_UNBLOCK_LEVEL1;
	} else {
		ASSERT(stp->st_unblock == SFXGE_TXQ_UNBLOCK_LEVEL1);

		stp->st_unblock = SFXGE_TXQ_UNBLOCK_LEVEL2;
	}

	/*
	 * Avoid a race with completion interrupt handling that could leave the
	 * queue blocked.
	 *
	 * NOTE: The use of st_pending rather than st_completed is intentional
	 *	 as st_pending is updated per-event rather than per-batch and
	 *	 therefore avoids needless deferring.
	 */
	if (stp->st_pending == stp->st_added) {
		sfxge_tx_qreap(stp);

		stp->st_unblock = SFXGE_TXQ_NOT_BLOCKED;
		goto again;
	}

	ASSERT(stp->st_unblock != SFXGE_TXQ_NOT_BLOCKED);
}

static int
sfxge_tx_kstat_update(kstat_t *ksp, int rw)
{
	sfxge_txq_t *stp = ksp->ks_private;
	sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
	kstat_named_t *knp;
	int rc;

	ASSERT(mutex_owned(&(stp->st_lock)));

	if (rw != KSTAT_READ) {
		rc = EACCES;
		goto fail1;
	}

	if (stp->st_state != SFXGE_TXQ_STARTED)
		goto done;

	efx_tx_qstats_update(stp->st_etp, stp->st_stat);
	knp = (kstat_named_t *)ksp->ks_data + TX_NQSTATS;
	knp->value.ui64 = stdp->get_pkt_limit;
	knp++;
	knp->value.ui64 = stdp->put_pkt_limit;
	knp++;
	knp->value.ui64 = stdp->get_full_count;
	knp++;
	knp->value.ui64 = stdp->put_full_count;

done:
	return (0);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	return (rc);
}

static int
sfxge_tx_kstat_init(sfxge_txq_t *stp)
{
	sfxge_t *sp = stp->st_sp;
	unsigned int index = stp->st_index;
	dev_info_t *dip = sp->s_dip;
	kstat_t *ksp;
	kstat_named_t *knp;
	char name[MAXNAMELEN];
	unsigned int id;
	int rc;

	/* Create the set */
	(void) snprintf(name, MAXNAMELEN - 1, "%s_txq%04d",
	    ddi_driver_name(dip), index);

	if ((ksp = kstat_create((char *)ddi_driver_name(dip),
	    ddi_get_instance(dip), name, "queue", KSTAT_TYPE_NAMED,
	    TX_NQSTATS + 4, 0)) == NULL) {
		rc = ENOMEM;
		goto fail1;
	}

	stp->st_ksp = ksp;

	ksp->ks_update = sfxge_tx_kstat_update;
	ksp->ks_private = stp;
	ksp->ks_lock = &(stp->st_lock);

	/* Initialize the named stats */
	stp->st_stat = knp = ksp->ks_data;
	for (id = 0; id < TX_NQSTATS; id++) {
		kstat_named_init(knp, (char *)efx_tx_qstat_name(sp->s_enp, id),
		    KSTAT_DATA_UINT64);
		knp++;
	}
	kstat_named_init(knp, "dpl_get_pkt_limit", KSTAT_DATA_UINT64);
	knp++;
	kstat_named_init(knp, "dpl_put_pkt_limit", KSTAT_DATA_UINT64);
	knp++;
	kstat_named_init(knp, "dpl_get_full_count", KSTAT_DATA_UINT64);
	knp++;
	kstat_named_init(knp, "dpl_put_full_count", KSTAT_DATA_UINT64);

	kstat_install(ksp);
	return (0);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	return (rc);
}
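
/*
 * The set installed above should then be visible from userland via
 * kstat(1M), for instance (queue number illustrative):
 *
 *	# kstat -m sfxge -n sfxge_txq0000
 */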

static void
sfxge_tx_kstat_fini(sfxge_txq_t *stp)
{
	/* Destroy the set */
	kstat_delete(stp->st_ksp);
	stp->st_ksp = NULL;
	stp->st_stat = NULL;
}

static int
sfxge_tx_qinit(sfxge_t *sp, unsigned int index, sfxge_txq_type_t type,
    unsigned int evq)
{
	sfxge_txq_t *stp;
	sfxge_tx_dpl_t *stdp;
	int rc;

	ASSERT3U(index, <, EFX_ARRAY_SIZE(sp->s_stp));
	ASSERT3U(type, <, SFXGE_TXQ_NTYPES);
	ASSERT3U(evq, <, EFX_ARRAY_SIZE(sp->s_sep));

	if ((stp = kmem_cache_alloc(sp->s_tqc, KM_SLEEP)) == NULL) {
		rc = ENOMEM;
		goto fail1;
	}
	ASSERT3U(stp->st_state, ==, SFXGE_TXQ_UNINITIALIZED);

	stdp = &(stp->st_dpl);

	stp->st_index = index;
	stp->st_type = type;
	stp->st_evq = evq;

	mutex_init(&(stp->st_lock), NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(sp->s_intr.si_intr_pri));

	/* Initialize the statistics */
	if ((rc = sfxge_tx_kstat_init(stp)) != 0)
		goto fail2;

	stdp->get_pkt_limit = ddi_prop_get_int(DDI_DEV_T_ANY, sp->s_dip,
	    DDI_PROP_DONTPASS, "tx_dpl_get_pkt_limit",
	    SFXGE_TX_DPL_GET_PKT_LIMIT_DEFAULT);

	stdp->put_pkt_limit = ddi_prop_get_int(DDI_DEV_T_ANY, sp->s_dip,
	    DDI_PROP_DONTPASS, "tx_dpl_put_pkt_limit",
	    SFXGE_TX_DPL_PUT_PKT_LIMIT_DEFAULT);

	/* Allocate a per-EVQ label for events from this TXQ */
	if ((rc = sfxge_ev_txlabel_alloc(sp, evq, stp, &(stp->st_label))) != 0)
		goto fail2;

	stp->st_state = SFXGE_TXQ_INITIALIZED;

	/* Attach the TXQ to the driver */
	ASSERT3P(sp->s_stp[index], ==, NULL);
	sp->s_stp[index] = stp;
	sp->s_tx_qcount++;

	return (0);

fail2:
	DTRACE_PROBE(fail2);

	sfxge_tx_kstat_fini(stp);

	stp->st_evq = 0;
	stp->st_type = 0;
	stp->st_index = 0;

	mutex_destroy(&(stp->st_lock));

	kmem_cache_free(sp->s_tqc, stp);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	return (rc);
}

static int
sfxge_tx_qstart(sfxge_t *sp, unsigned int index)
{
	sfxge_txq_t *stp = sp->s_stp[index];
	efx_nic_t *enp = sp->s_enp;
	efsys_mem_t *esmp;
	sfxge_evq_t *sep;
	unsigned int evq;
	unsigned int flags;
	unsigned int desc_index;
	int rc;

	mutex_enter(&(stp->st_lock));

	esmp = &(stp->st_mem);
	evq = stp->st_evq;
	sep = sp->s_sep[evq];

	ASSERT3U(stp->st_state, ==, SFXGE_TXQ_INITIALIZED);
	ASSERT3U(sep->se_state, ==, SFXGE_EVQ_STARTED);

	/* Zero the memory */
	bzero(esmp->esm_base, EFX_TXQ_SIZE(SFXGE_TX_NDESCS));

	/* Program the buffer table */
	if ((rc = sfxge_sram_buf_tbl_set(sp, stp->st_id, esmp,
	    EFX_TXQ_NBUFS(SFXGE_TX_NDESCS))) != 0)
		goto fail1;

	switch (stp->st_type) {
	case SFXGE_TXQ_NON_CKSUM:
		flags = 0;
		break;

	case SFXGE_TXQ_IP_CKSUM:
		flags = EFX_TXQ_CKSUM_IPV4;
		break;

	case SFXGE_TXQ_IP_TCP_UDP_CKSUM:
		flags = EFX_TXQ_CKSUM_IPV4 | EFX_TXQ_CKSUM_TCPUDP;
		break;

	default:
		ASSERT(B_FALSE);

		flags = 0;
		break;
	}

	/* Create the transmit queue */
	if ((rc = efx_tx_qcreate(enp, index, stp->st_label, esmp,
	    SFXGE_TX_NDESCS, stp->st_id, flags, sep->se_eep,
	    &(stp->st_etp), &desc_index)) != 0)
		goto fail2;

	/* Initialize queue descriptor indexes */
	stp->st_added = desc_index;
	stp->st_pending = desc_index;
	stp->st_completed = desc_index;
	stp->st_reaped = desc_index;

	/* Enable the transmit queue */
	efx_tx_qenable(stp->st_etp);

	stp->st_state = SFXGE_TXQ_STARTED;

	mutex_exit(&(stp->st_lock));

	return (0);

fail2:
	DTRACE_PROBE(fail2);

	/* Clear entries from the buffer table */
	sfxge_sram_buf_tbl_clear(sp, stp->st_id,
	    EFX_TXQ_NBUFS(SFXGE_TX_NDESCS));

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	mutex_exit(&(stp->st_lock));

	return (rc);
}

static inline int
sfxge_tx_qmapping_add(sfxge_txq_t *stp, sfxge_tx_mapping_t *stmp,
    size_t *offp, size_t *limitp)
{
	mblk_t *mp;
	size_t mapping_off;
	size_t mapping_size;
	int rc;

	ASSERT3U(*offp, <, stmp->stm_size);
	ASSERT(*limitp != 0);

	mp = stmp->stm_mp;

	ASSERT3P(stmp->stm_base, ==, mp->b_rptr);
	ASSERT3U(stmp->stm_size, ==, MBLKL(mp));

	mapping_off = stmp->stm_off + *offp;
	mapping_size = stmp->stm_size - *offp;

	while (mapping_size != 0 && *limitp != 0) {
		size_t page = mapping_off >> SFXGE_TX_DESCSHIFT;
		size_t page_off = mapping_off & SFXGE_TX_DESCOFFSET;
		size_t page_size = SFXGE_TX_DESCSIZE - page_off;
		efx_buffer_t *ebp;

		ASSERT3U(page, <, SFXGE_TX_MAPPING_NADDR);
		ASSERT((stmp->stm_addr[page] & SFXGE_TX_DESCMASK) != 0);

		page_size = MIN(page_size, mapping_size);
		page_size = MIN(page_size, *limitp);

		ASSERT3U(stp->st_n, <=, EFX_TXQ_LIMIT(SFXGE_TX_NDESCS));
		if (stp->st_n == EFX_TXQ_LIMIT(SFXGE_TX_NDESCS)) {
			rc = ENOSPC;
			goto fail1;
		}

		ebp = &(stp->st_eb[stp->st_n++]);
		ebp->eb_addr = stmp->stm_addr[page] + page_off;
		ebp->eb_size = page_size;

		*offp += page_size;
		*limitp -= page_size;

		mapping_off += page_size;
		mapping_size -= page_size;

		ebp->eb_eop = (*limitp == 0 ||
		    (mapping_size == 0 && mp->b_cont == NULL));

		DTRACE_PROBE5(tx_mapping_add,
		    unsigned int, stp->st_index,
		    unsigned int, stp->st_n - 1,
		    uint64_t, ebp->eb_addr,
		    size_t, ebp->eb_size,
		    boolean_t, ebp->eb_eop);
	}

	ASSERT3U(*offp, <=, stmp->stm_size);

	return (0);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	return (rc);
}

static inline int
sfxge_tx_qbuffer_add(sfxge_txq_t *stp, sfxge_tx_buffer_t *stbp, boolean_t eop)
{
	efx_buffer_t *ebp;
	int rc;

	ASSERT3U(stp->st_n, <=, EFX_TXQ_LIMIT(SFXGE_TX_NDESCS));
	if (stp->st_n == EFX_TXQ_LIMIT(SFXGE_TX_NDESCS)) {
		rc = ENOSPC;
		goto fail1;
	}

	ebp = &(stp->st_eb[stp->st_n++]);
	ebp->eb_addr = stbp->stb_esm.esm_addr + stbp->stb_off;
	ebp->eb_size = stbp->stb_esm.esm_used - stbp->stb_off;
	ebp->eb_eop = eop;

	(void) ddi_dma_sync(stbp->stb_esm.esm_dma_handle,
	    stbp->stb_off, ebp->eb_size, DDI_DMA_SYNC_FORDEV);

	stbp->stb_off = stbp->stb_esm.esm_used;

	DTRACE_PROBE5(tx_buffer_add,
	    unsigned int, stp->st_index,
	    unsigned int, stp->st_n - 1,
	    uint64_t, ebp->eb_addr, size_t, ebp->eb_size,
	    boolean_t, ebp->eb_eop);

	return (0);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	return (rc);
}

static inline boolean_t
sfxge_tx_msgb_copy(mblk_t *mp, sfxge_tx_buffer_t *stbp, size_t *offp,
    size_t *limitp)
{
	size_t data_off;
	size_t data_size;
	size_t copy_off;
	size_t copy_size;
	boolean_t eop;

	ASSERT3U(*offp, <=, MBLKL(mp));
	ASSERT(*limitp != 0);

	data_off = *offp;
	data_size = MBLKL(mp) - *offp;

	copy_off = stbp->stb_esm.esm_used;
	copy_size = SFXGE_TX_BUFFER_SIZE - copy_off;

	copy_size = MIN(copy_size, data_size);
	copy_size = MIN(copy_size, *limitp);

	bcopy(mp->b_rptr + data_off,
	    stbp->stb_esm.esm_base + copy_off, copy_size);

	stbp->stb_esm.esm_used += copy_size;
	ASSERT3U(stbp->stb_esm.esm_used, <=, SFXGE_TX_BUFFER_SIZE);

	*offp += copy_size;
	*limitp -= copy_size;

	data_off += copy_size;
	data_size -= copy_size;

	eop = (*limitp == 0 ||
	    (data_size == 0 && mp->b_cont == NULL));

	ASSERT3U(*offp, <=, MBLKL(mp));

	return (eop);
}

static int
sfxge_tx_qpayload_fragment(sfxge_txq_t *stp, unsigned int id, mblk_t **mpp,
    size_t *offp, size_t size, boolean_t copy)
{
	sfxge_t *sp = stp->st_sp;
	mblk_t *mp = *mpp;
	size_t off = *offp;
	sfxge_tx_buffer_t *stbp;
	sfxge_tx_mapping_t *stmp;
	int rc;

	stbp = stp->st_stbp[id];
	ASSERT(stbp == NULL || (stbp->stb_esm.esm_used == stbp->stb_off));

	stmp = stp->st_stmp[id];

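	/*
	 * Strategy: DMA-bind data blocks that are large enough to justify the
	 * mapping cost (at least SFXGE_TX_COPY_THRESHOLD bytes, and only when
	 * starting at offset zero); otherwise, or whenever binding fails,
	 * copy the data into pooled bounce buffers instead.
	 */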
	while (size != 0) {
		boolean_t eop;

		ASSERT(mp != NULL);

		if (mp->b_cont != NULL)
			prefetch_read_many(mp->b_cont);

		ASSERT3U(off, <, MBLKL(mp));

		if (copy)
			goto copy;

		/*
		 * Check whether we have already mapped this data block for
		 * DMA.
		 */
		if (stmp == NULL || stmp->stm_mp != mp) {
			/*
			 * If we are part way through copying a data block then
			 * there's no point in trying to map it for DMA.
			 */
			if (off != 0)
				goto copy;

			/*
			 * If the data block is too short then the cost of
			 * mapping it for DMA would outweigh the cost of
			 * copying it.
			 */
			if (MBLKL(mp) < SFXGE_TX_COPY_THRESHOLD)
				goto copy;

			/* Try to grab a transmit mapping from the pool */
			stmp = sfxge_tx_qfmp_get(stp);
			if (stmp == NULL) {
				/*
				 * The pool was empty so allocate a new
				 * mapping.
				 */
				if ((stmp = kmem_cache_alloc(sp->s_tmc,
				    KM_NOSLEEP)) == NULL)
					goto copy;
			}

			/* Add the DMA mapping to the list */
			stmp->stm_next = stp->st_stmp[id];
			stp->st_stmp[id] = stmp;

			/* Try to bind the data block to the mapping */
			if (sfxge_tx_msgb_bind(mp, stmp) != 0)
				goto copy;
		}
		ASSERT3P(stmp->stm_mp, ==, mp);

		/*
		 * If we have a partially filled buffer then we must add it to
		 * the fragment list before adding the mapping.
		 */
		if (stbp != NULL && (stbp->stb_esm.esm_used > stbp->stb_off)) {
			rc = sfxge_tx_qbuffer_add(stp, stbp, B_FALSE);
			if (rc != 0)
				goto fail1;
		}

		/* Add the mapping to the fragment list */
		rc = sfxge_tx_qmapping_add(stp, stmp, &off, &size);
		if (rc != 0)
			goto fail2;

		ASSERT(off == MBLKL(mp) || size == 0);

		/*
		 * If the data block has been exhausted then skip over the
		 * control block and advance to the next data block.
		 */
		if (off == MBLKL(mp)) {
			mp = mp->b_cont;
			off = 0;
		}

		continue;

copy:
		if (stbp == NULL ||
		    stbp->stb_esm.esm_used == SFXGE_TX_BUFFER_SIZE) {
			/* Try to grab a buffer from the pool */
			stbp = sfxge_tx_qfbp_get(stp);
			if (stbp == NULL) {
				/*
				 * The pool was empty so allocate a new
				 * buffer.
				 */
				if ((stbp = kmem_cache_alloc(sp->s_tbc,
				    KM_NOSLEEP)) == NULL) {
					rc = ENOMEM;
					goto fail3;
				}
			}

			/* Add it to the list */
			stbp->stb_next = stp->st_stbp[id];
			stp->st_stbp[id] = stbp;
		}

		/* Copy as much of the data block as we can into the buffer */
		eop = sfxge_tx_msgb_copy(mp, stbp, &off, &size);

		ASSERT(off == MBLKL(mp) || size == 0 ||
		    stbp->stb_esm.esm_used == SFXGE_TX_BUFFER_SIZE);

		/*
		 * If we have reached the end of the packet, or the buffer is
		 * full, then add the buffer to the fragment list.
		 */
		if (stbp->stb_esm.esm_used == SFXGE_TX_BUFFER_SIZE || eop) {
			rc = sfxge_tx_qbuffer_add(stp, stbp, eop);
			if (rc != 0)
				goto fail4;
		}

		/*
		 * If the data block has been exhausted then advance to the
		 * next one.
		 */
		if (off == MBLKL(mp)) {
			mp = mp->b_cont;
			off = 0;
		}
	}

	*mpp = mp;
	*offp = off;

	return (0);

fail4:
	DTRACE_PROBE(fail4);
fail3:
	DTRACE_PROBE(fail3);
fail2:
	DTRACE_PROBE(fail2);
fail1:
	DTRACE_PROBE1(fail1, int, rc);

	return (rc);
}

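/*
 * Software LSO: carve a large TCP payload into MSS-sized segments, rewriting
 * the IP id, IP length, TCP sequence number and TCP flags for each segment,
 * and posting a copied header plus payload fragments for each one.
 */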
static int
sfxge_tx_qlso_fragment(sfxge_txq_t *stp, sfxge_tx_packet_t *stpp,
    boolean_t copy)
{
	sfxge_t *sp = stp->st_sp;
	mblk_t *mp = stpp->stp_mp;
	struct ether_header *etherhp = stpp->stp_etherhp;
	struct ip *iphp = stpp->stp_iphp;
	struct tcphdr *thp = stpp->stp_thp;
	size_t size = stpp->stp_size;
	size_t off = stpp->stp_off;
	size_t mss = stpp->stp_mss;
	unsigned int id;
	caddr_t hp;
	size_t ehs, hs;
	uint16_t start_len;
	uint16_t start_id;
	uint16_t ip_id;
	uint8_t start_flags;
	uint32_t start_seq;
	uint32_t th_seq;
	size_t lss;
	sfxge_tx_buffer_t *stbp;
	int rc;

	ASSERT(mutex_owned(&(stp->st_lock)));

	if ((DB_LSOFLAGS(mp) & HW_LSO) == 0) {
		rc = EINVAL;
		goto fail1;
	}

	id = stp->st_added & (SFXGE_TX_NDESCS - 1);

	ASSERT(stp->st_n == 0);
	ASSERT(stp->st_stbp[id] == NULL);
	ASSERT(stp->st_stmp[id] == NULL);

	ehs = (etherhp->ether_type == htons(ETHERTYPE_VLAN)) ?
	    sizeof (struct ether_vlan_header) :
	    sizeof (struct ether_header);
	if (msgdsize(mp) != ehs + ntohs(iphp->ip_len)) {
		rc = EINVAL;
		goto fail2;
	}

	/* The payload offset is equivalent to the size of the headers */
	hp = (caddr_t)(mp->b_rptr);
	hs = off;

	/*
	 * The headers must be contained entirely within the initial data
	 * block; skip over them, and if that leaves the initial block empty
	 * advance to the next one.
	 */
	if (hs > MBLKL(mp)) {
		rc = EINVAL;
		goto fail3;
	}
	mp->b_rptr += hs;

	if (MBLKL(mp) == 0)
		mp = mp->b_cont;

	off = 0;

	/* Check IP and TCP headers are suitable for LSO */
	if (((iphp->ip_off & ~htons(IP_DF)) != 0) ||
	    ((thp->th_flags & (TH_URG | TH_SYN)) != 0) ||
	    (thp->th_urp != 0)) {
		rc = EINVAL;
		goto fail4;
	}

	if (size + (thp->th_off << 2) + (iphp->ip_hl << 2) !=
	    ntohs(iphp->ip_len)) {
		rc = EINVAL;
		goto fail4;
	}

	/*
	 * Get the base IP id. The stack leaves enough of a gap in id space
	 * for us to increment this for each segment we send out.
	 */
	start_len = ntohs(iphp->ip_len);
	start_id = ip_id = ntohs(iphp->ip_id);

	/* Get the base TCP sequence number and flags */
	start_flags = thp->th_flags;
	start_seq = th_seq = ntohl(thp->th_seq);

	/* Adjust the header for interim segments */
	iphp->ip_len = htons((iphp->ip_hl << 2) + (thp->th_off << 2) + mss);
	thp->th_flags = start_flags & ~(TH_PUSH | TH_FIN);

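	/*
	 * Bound the number of segments: each segment consumes at least two
	 * descriptors (a header buffer plus at least one payload fragment),
	 * so more than EFX_TXQ_LIMIT / 2 segments could never fit in the
	 * ring at once.
	 */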
	lss = size;
	if ((lss / mss) >= (EFX_TXQ_LIMIT(SFXGE_TX_NDESCS) / 2)) {
		rc = EINVAL;
		goto fail5;
	}
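
	/*
	 * For example, a 4000 byte payload with an MSS of 1460 yields three
	 * segments of 1460, 1460 and 1080 bytes; the first two carry the
	 * interim header (PUSH and FIN cleared), the last has the original
	 * flags restored.
	 */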
1662 
1663         stbp = NULL;
1664         while (lss != 0) {
1665                 size_t ss = MIN(lss, mss);
1666                 boolean_t eol = (ss == lss);
1667 
1668                 /* Adjust the header for this segment */
1669                 iphp->ip_id = htons(ip_id);
1670                 ip_id++;
1671 
1672                 thp->th_seq = htonl(th_seq);
1673                 th_seq += ss;
1674 
                /* The final segment needs the residual length and flags */
                if (eol) {
                        iphp->ip_len = htons((iphp->ip_hl << 2) +
                            (thp->th_off << 2) + ss);
                        thp->th_flags = start_flags;
                }

                if (stbp == NULL ||
                    stbp->stb_esm.esm_used + hs > SFXGE_TX_BUFFER_SIZE) {
                        /* Try to grab a buffer from the pool */
                        stbp = sfxge_tx_qfbp_get(stp);
                        if (stbp == NULL) {
                                /*
                                 * The pool was empty so allocate a new
                                 * buffer.
                                 */
                                if ((stbp = kmem_cache_alloc(sp->s_tbc,
                                    KM_NOSLEEP)) == NULL) {
                                        rc = ENOMEM;
                                        goto fail6;
                                }
                        }

                        /* Add it to the list */
                        stbp->stb_next = stp->st_stbp[id];
                        stp->st_stbp[id] = stbp;
                }

                /* Copy in the headers */
                ASSERT3U(stbp->stb_off, ==, stbp->stb_esm.esm_used);
                bcopy(hp, stbp->stb_esm.esm_base + stbp->stb_off, hs);
                stbp->stb_esm.esm_used += hs;

                /* Add the buffer to the fragment list */
                rc = sfxge_tx_qbuffer_add(stp, stbp, B_FALSE);
                if (rc != 0)
                        goto fail7;

                /* Add the payload to the fragment list */
                if ((rc = sfxge_tx_qpayload_fragment(stp, id, &mp, &off,
                    ss, copy)) != 0)
                        goto fail8;

                lss -= ss;
        }
        ASSERT3U(off, ==, 0);
        ASSERT3P(mp, ==, NULL);

        ASSERT3U(th_seq - start_seq, ==, size);

        /*
         * If no part of the packet has been mapped for DMA then we can free
         * it now, otherwise it can only be freed on completion.
         */
        if (stp->st_stmp[id] == NULL)
                freemsg(stpp->stp_mp);
        else
                stp->st_mp[id] = stpp->stp_mp;

        stpp->stp_mp = NULL;

        return (0);

fail8:
        DTRACE_PROBE(fail8);
fail7:
        DTRACE_PROBE(fail7);
fail6:
        DTRACE_PROBE(fail6);
fail5:
        DTRACE_PROBE(fail5);

        /* Restore the header */
        thp->th_seq = htonl(start_seq);
        thp->th_flags = start_flags;

        iphp->ip_len = htons(start_len);
        iphp->ip_id = htons(start_id);

fail4:
        DTRACE_PROBE(fail4);

        mp = stpp->stp_mp;
        mp->b_rptr -= hs;

        ASSERT3U(((etherhp->ether_type == htons(ETHERTYPE_VLAN)) ?
            sizeof (struct ether_vlan_header) :
            sizeof (struct ether_header)) +
            ntohs(iphp->ip_len), ==, msgdsize(mp));

        ASSERT(stp->st_mp[id] == NULL);

fail3:
        DTRACE_PROBE(fail3);
fail2:
        DTRACE_PROBE(fail2);
fail1:
        DTRACE_PROBE1(fail1, int, rc);

        return (rc);
}
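
/*
 * Illustrative sketch only, not driver code: the per-segment header
 * arithmetic that sfxge_tx_qlso_fragment() above performs in place on the
 * real headers, reduced to plain C. The lso_seg_t type, its field names and
 * example_lso_walk() are hypothetical; the driver never builds such an
 * array. Each segment consumes one IP id from the gap the stack leaves and
 * advances the TCP sequence number by the payload it carries.
 */
typedef struct lso_seg_s {
        uint16_t        ls_ip_id;       /* IP id for the segment (host order) */
        uint32_t        ls_th_seq;      /* TCP sequence number (host order) */
        size_t          ls_len;         /* TCP payload bytes in the segment */
        boolean_t       ls_last;        /* final segment? */
} lso_seg_t;

static unsigned int
example_lso_walk(uint16_t start_id, uint32_t start_seq, size_t payload,
    size_t mss, lso_seg_t *segs)
{
        unsigned int n = 0;

        while (payload != 0) {
                size_t ss = MIN(payload, mss);

                segs[n].ls_ip_id = start_id++;  /* one id per segment */
                segs[n].ls_th_seq = start_seq;  /* advances by bytes sent */
                segs[n].ls_len = ss;
                segs[n].ls_last = (ss == payload);

                start_seq += ss;
                payload -= ss;
                n++;
        }
        return (n);
}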

static int
sfxge_tx_qpacket_fragment(sfxge_txq_t *stp, sfxge_tx_packet_t *stpp,
    boolean_t copy)
{
        sfxge_t *sp = stp->st_sp;
        mblk_t *mp = stpp->stp_mp;
        unsigned int id;
        size_t off;
        size_t size;
        sfxge_tx_mapping_t *stmp;
        sfxge_tx_buffer_t *stbp;
        int rc;

        ASSERT(mutex_owned(&(stp->st_lock)));

        ASSERT(stp->st_n == 0);

        id = stp->st_added & (SFXGE_TX_NDESCS - 1);

        ASSERT(stp->st_stbp[id] == NULL);
        ASSERT(stp->st_stmp[id] == NULL);

        off = 0;
        size = LONG_MAX;        /* must be larger than the packet */

        stbp = NULL;
        stmp = NULL;

        while (mp != NULL) {
                boolean_t eop;

                ASSERT(mp != NULL);

                if (mp->b_cont != NULL)
                        prefetch_read_many(mp->b_cont);

                ASSERT(stmp == NULL || stmp->stm_mp != mp);

                if (copy)
                        goto copy;

                /*
                 * If we are part way through copying a data block then there's
                 * no point in trying to map it for DMA.
                 */
                if (off != 0)
                        goto copy;

                /*
                 * If the data block is too short then the cost of mapping it
                 * for DMA would outweigh the cost of copying it: this is the
                 * TX copy break (see the standalone sketch after this
                 * function).
                 */
                if (MBLKL(mp) < SFXGE_TX_COPY_THRESHOLD)
                        goto copy;

                /* Try to grab a transmit mapping from the pool */
                stmp = sfxge_tx_qfmp_get(stp);
                if (stmp == NULL) {
                        /*
                         * The pool was empty so allocate a new
                         * mapping.
                         */
                        if ((stmp = kmem_cache_alloc(sp->s_tmc,
                            KM_NOSLEEP)) == NULL)
                                goto copy;
                }

                /* Add the DMA mapping to the list */
                stmp->stm_next = stp->st_stmp[id];
                stp->st_stmp[id] = stmp;

                /* Try to bind the data block to the mapping */
                if (sfxge_tx_msgb_bind(mp, stmp) != 0)
                        goto copy;

                /*
                 * If we have a partially filled buffer then we must add it to
                 * the fragment list before adding the mapping.
                 */
                if (stbp != NULL && (stbp->stb_esm.esm_used > stbp->stb_off)) {
                        rc = sfxge_tx_qbuffer_add(stp, stbp, B_FALSE);
                        if (rc != 0)
                                goto fail1;
                }

                /* Add the mapping to the fragment list */
                rc = sfxge_tx_qmapping_add(stp, stmp, &off, &size);
                if (rc != 0)
                        goto fail2;

                ASSERT3U(off, ==, MBLKL(mp));

                /* Advance to the next data block */
                mp = mp->b_cont;
                off = 0;
                continue;

copy:
                if (stbp == NULL ||
                    stbp->stb_esm.esm_used == SFXGE_TX_BUFFER_SIZE) {
                        /* Try to grab a buffer from the pool */
                        stbp = sfxge_tx_qfbp_get(stp);
                        if (stbp == NULL) {
                                /*
                                 * The pool was empty so allocate a new
                                 * buffer.
                                 */
                                if ((stbp = kmem_cache_alloc(sp->s_tbc,
                                    KM_NOSLEEP)) == NULL) {
                                        rc = ENOMEM;
                                        goto fail3;
                                }
                        }

                        /* Add it to the list */
                        stbp->stb_next = stp->st_stbp[id];
                        stp->st_stbp[id] = stbp;
                }

                /* Copy as much of the data block as we can into the buffer */
                eop = sfxge_tx_msgb_copy(mp, stbp, &off, &size);

                ASSERT(off == MBLKL(mp) ||
                    stbp->stb_esm.esm_used == SFXGE_TX_BUFFER_SIZE);

                /*
                 * If we have reached the end of the packet, or the buffer is
                 * full, then add the buffer to the fragment list.
                 */
                if (stbp->stb_esm.esm_used == SFXGE_TX_BUFFER_SIZE || eop) {
                        rc = sfxge_tx_qbuffer_add(stp, stbp, eop);
                        if (rc != 0)
                                goto fail4;
                }

                /*
                 * If the data block has been exhausted then advance to the
                 * next one.
                 */
                if (off == MBLKL(mp)) {
                        mp = mp->b_cont;
                        off = 0;
                }
        }
        ASSERT3U(off, ==, 0);
        ASSERT3P(mp, ==, NULL);
        ASSERT3U(size, !=, 0);

        /*
         * If no part of the packet has been mapped for DMA then we can free
         * it now, otherwise it can only be freed on completion.
         */
        if (stp->st_stmp[id] == NULL)
                freemsg(stpp->stp_mp);
        else
                stp->st_mp[id] = stpp->stp_mp;

        stpp->stp_mp = NULL;

        return (0);

fail4:
        DTRACE_PROBE(fail4);
fail3:
        DTRACE_PROBE(fail3);
fail2:
        DTRACE_PROBE(fail2);
fail1:
        DTRACE_PROBE1(fail1, int, rc);

        ASSERT(stp->st_stmp[id] == NULL);

        return (rc);
}
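
/*
 * Illustrative sketch only, not driver code: the copy-vs-map decision (the
 * "TX copy break") applied by sfxge_tx_qpacket_fragment() above. Binding a
 * fragment for DMA has a fixed per-mapping cost while copying costs per
 * byte, so short fragments are copied into a pre-mapped buffer and long
 * ones are bound. SFXGE_TX_COPY_THRESHOLD is the driver's own constant;
 * example_should_copy() and its parameters are hypothetical.
 */
static boolean_t
example_should_copy(size_t frag_len, size_t frag_off)
{
        /* A fragment we have started to copy must be finished by copying */
        if (frag_off != 0)
                return (B_TRUE);

        /* Below the break-even point, copying is cheaper than mapping */
        return (frag_len < SFXGE_TX_COPY_THRESHOLD);
}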


#define SFXGE_TX_QDPL_PUT_PENDING(_stp)                                 \
        ((_stp)->st_dpl.std_put != 0)

static void
sfxge_tx_qdpl_swizzle(sfxge_txq_t *stp)
{
        sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
        volatile uintptr_t *putp;
        uintptr_t put;
        sfxge_tx_packet_t *stpp;
        sfxge_tx_packet_t *p;
        sfxge_tx_packet_t **pp;
        unsigned int count;

        ASSERT(mutex_owned(&(stp->st_lock)));

        /*
         * In-flight TX packets are guaranteed to generate further TX
         * completions, and hence further swizzles, so it is safe to leave
         * the put list alone while the get list is at its limit.
         */
        ASSERT3U(stdp->std_count, <=, sfxge_tx_dpl_get_pkt_max(stp));
        if (stdp->std_count >= stdp->get_pkt_limit)
                return;

        /* Acquire the put list, replacing it with an empty list */
        putp = &(stdp->std_put);
        put = atomic_swap_ulong(putp, 0);
        stpp = (void *)put;

        if (stpp == NULL)
                return;

        /* Reverse the list */
        pp = &(stpp->stp_next);
        p = NULL;

        count = 0;
        do {
                sfxge_tx_packet_t *next;

                next = stpp->stp_next;

                stpp->stp_next = p;
                p = stpp;

                count++;
                stpp = next;
        } while (stpp != NULL);

        /* Add it to the tail of the get list */
        ASSERT3P(*pp, ==, NULL);

        *(stdp->std_getp) = p;
        stdp->std_getp = pp;
        stdp->std_count += count;
        ASSERT3U(stdp->std_count, <=, sfxge_tx_dpl_get_pkt_max(stp));

        DTRACE_PROBE2(dpl_counts, int, stdp->std_count, int, count);
}
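
/*
 * Illustrative sketch only, not driver code: the consumer half of the
 * lock-free hand-off used by sfxge_tx_qdpl_swizzle() above. The whole put
 * list is detached with a single atomic exchange and then reversed, since
 * producers push onto the head but the get list must preserve arrival
 * order. The ex_node_t type and example_take_and_reverse() are
 * hypothetical.
 */
typedef struct ex_node_s {
        struct ex_node_s *en_next;
} ex_node_t;

static ex_node_t *
example_take_and_reverse(volatile uintptr_t *putp)
{
        ex_node_t *head = (void *)atomic_swap_ulong(putp, 0);
        ex_node_t *reversed = NULL;

        while (head != NULL) {
                ex_node_t *next = head->en_next;

                head->en_next = reversed;       /* relink in arrival order */
                reversed = head;
                head = next;
        }
        return (reversed);
}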


/*
 * If the TXQ is locked, swizzle the TX DPL put list onto the get list and
 * then add this packet to the tail of the get list.
 * If the TXQ is unlocked, atomically add this packet to the TX DPL put list
 * (standalone sketches of both halves of this protocol appear nearby).
 *
 * The only possible error is ENOSPC (used for TX backpressure), returned when
 * the TX DPL put or get list is full. In either case there must be future TX
 * completions (as represented by the packets on the DPL get lists), which
 * ensures that mac_tx_update() will eventually be called from
 * sfxge_tx_qcomplete().
 */
static inline int
sfxge_tx_qdpl_add(sfxge_txq_t *stp, sfxge_tx_packet_t *stpp, int locked)
{
        sfxge_tx_dpl_t *stdp = &stp->st_dpl;

        ASSERT3P(stpp->stp_next, ==, NULL);

        if (locked) {
                ASSERT(mutex_owned(&stp->st_lock));

                if (stdp->std_count >= stdp->get_pkt_limit) {
                        stdp->get_full_count++;
                        return (ENOSPC);
                }

                /* Reverse the put list onto the get list */
                sfxge_tx_qdpl_swizzle(stp);

                /* Add to the tail of the get list */
                *(stdp->std_getp) = stpp;
                stdp->std_getp = &stpp->stp_next;
                stdp->std_count++;
                ASSERT3U(stdp->std_count, <=, sfxge_tx_dpl_get_pkt_max(stp));

        } else {
                volatile uintptr_t *putp;
                uintptr_t old;
                uintptr_t new;
                sfxge_tx_packet_t *old_pkt;

                putp = &(stdp->std_put);
                new = (uintptr_t)stpp;

                /*
                 * Add to the head of the put list, keeping a count of the
                 * list length.
                 */
                do {
                        old = *putp;
                        old_pkt = (sfxge_tx_packet_t *)old;

                        stpp->stp_dpl_put_len = old ?
                            old_pkt->stp_dpl_put_len + 1 : 1;

                        if (stpp->stp_dpl_put_len >= stdp->put_pkt_limit) {
                                stpp->stp_next = NULL;
                                stpp->stp_dpl_put_len = 0;
                                stdp->put_full_count++;
                                return (ENOSPC);
                        }

                        stpp->stp_next = (void *)old;
                } while (atomic_cas_ulong(putp, old, new) != old);
        }
        return (0);
}
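
/*
 * Illustrative sketch only, not driver code: the producer half of the
 * protocol, matching the unlocked path of sfxge_tx_qdpl_add() above. Each
 * node records the length the list will have once it is pushed, so a
 * producer can refuse the push (ENOSPC, i.e. back-pressure) without taking
 * any lock. The ex_put_node_t type and example_put_push() are hypothetical.
 */
typedef struct ex_put_node_s {
        struct ex_put_node_s *ep_next;
        unsigned int ep_len;            /* list length including this node */
} ex_put_node_t;

static int
example_put_push(volatile uintptr_t *putp, ex_put_node_t *node,
    unsigned int limit)
{
        uintptr_t old;
        uintptr_t new = (uintptr_t)node;
        ex_put_node_t *head;

        do {
                old = *putp;
                head = (ex_put_node_t *)old;

                /* Work out how long the list would be after this push */
                node->ep_len = (head != NULL) ? head->ep_len + 1 : 1;
                if (node->ep_len >= limit)
                        return (ENOSPC);

                node->ep_next = head;
        } while (atomic_cas_ulong(putp, old, new) != old);

        return (0);
}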


/* Take all packets from the DPL get list and try to send them to the HW */
static void
sfxge_tx_qdpl_drain(sfxge_txq_t *stp)
{
        sfxge_t *sp = stp->st_sp;
        sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
        unsigned int pushed = stp->st_added;
        sfxge_tx_packet_t *stpp;
        unsigned int count;

        ASSERT(mutex_owned(&(stp->st_lock)));

        prefetch_read_many(sp->s_enp);
        prefetch_read_many(stp->st_etp);

        stpp = stdp->std_get;
        count = stdp->std_count;

        while (count != 0) {
                sfxge_tx_packet_t *next;
                boolean_t copy;
                int rc;

                ASSERT(stpp != NULL);

                /* Split stpp off */
                next = stpp->stp_next;
                stpp->stp_next = NULL;

                if (next != NULL)
                        prefetch_read_many(next);

                if (stp->st_state != SFXGE_TXQ_STARTED)
                        goto reject;

                copy = B_FALSE;

again:
                /* Fragment the packet */
                if (stpp->stp_mss != 0) {
                        rc = sfxge_tx_qlso_fragment(stp, stpp, copy);
                } else {
                        rc = sfxge_tx_qpacket_fragment(stp, stpp, copy);
                }

                switch (rc) {
                case 0:
                        break;

                case ENOSPC:
                        if (!copy)
                                goto copy;

                /*FALLTHRU*/
                default:
                        goto reject;
                }

                /* Free the packet structure */
                stpp->stp_etherhp = NULL;
                stpp->stp_iphp = NULL;
                stpp->stp_thp = NULL;
                stpp->stp_off = 0;
                stpp->stp_size = 0;
                stpp->stp_mss = 0;
                stpp->stp_dpl_put_len = 0;

                ASSERT3P(stpp->stp_mp, ==, NULL);

                if (sfxge_tx_qfpp_put(stp, stpp) != 0) {
                        sfxge_tx_packet_destroy(sp, stpp);
                        stpp = NULL;
                }

                --count;
                stpp = next;

                /* Post the packet */
                sfxge_tx_qlist_post(stp);

                if (stp->st_unblock != SFXGE_TXQ_NOT_BLOCKED)
                        goto defer;

                if (stp->st_added - pushed >= SFXGE_TX_BATCH) {
                        efx_tx_qpush(stp->st_etp, stp->st_added, pushed);
                        pushed = stp->st_added;
                }

                continue;

copy:
                /* Abort the current fragment list */
                sfxge_tx_qlist_abort(stp);

                /* Try copying the packet to flatten it */
                ASSERT(!copy);
                copy = B_TRUE;

                goto again;

reject:
                /* Abort the current fragment list */
                sfxge_tx_qlist_abort(stp);

                /* Discard the packet */
                freemsg(stpp->stp_mp);
                stpp->stp_mp = NULL;

                /* Free the packet structure */
                stpp->stp_etherhp = NULL;
                stpp->stp_iphp = NULL;
                stpp->stp_thp = NULL;
                stpp->stp_off = 0;
                stpp->stp_size = 0;
                stpp->stp_mss = 0;
                stpp->stp_dpl_put_len = 0;

                if (sfxge_tx_qfpp_put(stp, stpp) != 0) {
                        sfxge_tx_packet_destroy(sp, stpp);
                        stpp = NULL;
                }

                --count;
                stpp = next;
                continue;
defer:
                DTRACE_PROBE1(defer, unsigned int, stp->st_index);
                break;
        }

        if (count == 0) {
                /* New empty get list */
                ASSERT3P(stpp, ==, NULL);
                stdp->std_get = NULL;
                stdp->std_count = 0;

                stdp->std_getp = &(stdp->std_get);
        } else {
                /* Shorten the list by moving the head */
                stdp->std_get = stpp;
                stdp->std_count = count;
                ASSERT3U(stdp->std_count, <=, sfxge_tx_dpl_get_pkt_max(stp));
        }

        if (stp->st_added != pushed)
                efx_tx_qpush(stp->st_etp, stp->st_added, pushed);

        ASSERT(stp->st_unblock != SFXGE_TXQ_NOT_BLOCKED ||
            stdp->std_count == 0);
}
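
/*
 * Illustrative sketch only, not driver code: the doorbell batching shape
 * used by sfxge_tx_qdpl_drain() above. Ringing the hardware doorbell is
 * comparatively expensive, so it is rung once per SFXGE_TX_BATCH added
 * descriptors and once more for any remainder at the end. The ring()
 * callback stands in for efx_tx_qpush(); example_push_batched() and its
 * parameters are hypothetical.
 */
static void
example_push_batched(unsigned int *addedp, unsigned int ndescs,
    void (*ring)(unsigned int added, unsigned int pushed))
{
        unsigned int pushed = *addedp;

        while (ndescs-- != 0) {
                (*addedp)++;            /* one more descriptor queued */

                if (*addedp - pushed >= SFXGE_TX_BATCH) {
                        ring(*addedp, pushed);
                        pushed = *addedp;
                }
        }

        /* Push any remainder */
        if (*addedp != pushed)
                ring(*addedp, pushed);
}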

/* Swizzle the deferred packet list and try to push it to the hardware */
static inline void
sfxge_tx_qdpl_service(sfxge_txq_t *stp)
{
        do {
                ASSERT(mutex_owned(&(stp->st_lock)));

                if (SFXGE_TX_QDPL_PUT_PENDING(stp))
                        sfxge_tx_qdpl_swizzle(stp);

                if (stp->st_unblock == SFXGE_TXQ_NOT_BLOCKED)
                        sfxge_tx_qdpl_drain(stp);

                mutex_exit(&(stp->st_lock));

                if (!SFXGE_TX_QDPL_PUT_PENDING(stp))
                        break;
        } while (mutex_tryenter(&(stp->st_lock)));
}
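
/*
 * Illustrative sketch only, not driver code: the race-closing loop shape of
 * sfxge_tx_qdpl_service() above. A producer may push to the put list just
 * after the lock is dropped, so the consumer re-checks and re-enters only
 * opportunistically: either this thread sees the new work, or whichever
 * thread now holds the lock will. example_service_loop() is hypothetical
 * and, like the real function, expects the lock to be held on entry and
 * drops it before returning.
 */
static void
example_service_loop(kmutex_t *lockp, volatile uintptr_t *putp)
{
        do {
                /* ... drain deferred work while holding lockp ... */

                mutex_exit(lockp);

                /* If nothing arrived while we were draining, we are done */
                if (*putp == 0)
                        break;
        } while (mutex_tryenter(lockp));
}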

static void
sfxge_tx_qdpl_flush_locked(sfxge_txq_t *stp)
{
        sfxge_t *sp = stp->st_sp;
        sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
        sfxge_tx_packet_t *stpp;
        unsigned int count;

        ASSERT(mutex_owned(&(stp->st_lock)));

        /* Swizzle put list to the get list */
        sfxge_tx_qdpl_swizzle(stp);

        stpp = stdp->std_get;
        count = stdp->std_count;

        while (count != 0) {
                sfxge_tx_packet_t *next;

                next = stpp->stp_next;
                stpp->stp_next = NULL;

                /* Discard the packet */
                freemsg(stpp->stp_mp);
                stpp->stp_mp = NULL;

                /* Free the packet structure */
                stpp->stp_etherhp = NULL;
                stpp->stp_iphp = NULL;
                stpp->stp_thp = NULL;
                stpp->stp_off = 0;
                stpp->stp_size = 0;
                stpp->stp_mss = 0;
                stpp->stp_dpl_put_len = 0;

                sfxge_tx_packet_destroy(sp, stpp);

                --count;
                stpp = next;
        }

        ASSERT3P(stpp, ==, NULL);

        /* Empty list */
        stdp->std_get = NULL;
        stdp->std_count = 0;
        stdp->std_getp = &(stdp->std_get);
}


void
sfxge_tx_qdpl_flush(sfxge_txq_t *stp)
{
        mutex_enter(&(stp->st_lock));
        sfxge_tx_qdpl_flush_locked(stp);
        mutex_exit(&(stp->st_lock));
}


static void
sfxge_tx_qunblock(sfxge_txq_t *stp)
{
        sfxge_t *sp = stp->st_sp;
        unsigned int evq = stp->st_evq;
        sfxge_evq_t *sep = sp->s_sep[evq];

        ASSERT(mutex_owned(&(sep->se_lock)));

        mutex_enter(&(stp->st_lock));

        if (stp->st_state != SFXGE_TXQ_STARTED) {
                mutex_exit(&(stp->st_lock));
                return;
        }

        if (stp->st_unblock != SFXGE_TXQ_NOT_BLOCKED) {
                unsigned int level;

                level = stp->st_added - stp->st_completed;
                if (level <= stp->st_unblock) {
                        stp->st_unblock = SFXGE_TXQ_NOT_BLOCKED;
                        sfxge_tx_qlist_post(stp);
                }
        }

        sfxge_tx_qdpl_service(stp);
        /* lock has been dropped */
}

void
sfxge_tx_qcomplete(sfxge_txq_t *stp)
{
        sfxge_t *sp = stp->st_sp;
        sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
        unsigned int evq = stp->st_evq;
        sfxge_evq_t *sep = sp->s_sep[evq];
        unsigned int completed;

        ASSERT(mutex_owned(&(sep->se_lock)));

        completed = stp->st_completed;
        while (completed != stp->st_pending) {
                unsigned int id;
                sfxge_tx_mapping_t *stmp;

                id = completed++ & (SFXGE_TX_NDESCS - 1);

                if ((stmp = stp->st_stmp[id]) != NULL) {
                        mblk_t *mp;

                        /* Unbind all the mappings */
                        do {
                                ASSERT(stmp->stm_mp != NULL);
                                sfxge_tx_msgb_unbind(stmp);

                                stmp = stmp->stm_next;
                        } while (stmp != NULL);

                        /*
                         * Now that the packet is no longer mapped for DMA it
                         * can be freed.
                         */
                        mp = stp->st_mp[id];
                        stp->st_mp[id] = NULL;

                        ASSERT(mp != NULL);
                        freemsg(mp);
                }
        }
        stp->st_completed = completed;

        /* Check whether we need to unblock the queue */
        if (stp->st_unblock != SFXGE_TXQ_NOT_BLOCKED) {
                unsigned int level;

                level = stp->st_added - stp->st_completed;
                if (level <= stp->st_unblock)
                        sfxge_tx_qunblock(stp);
        }

        /*
         * Release TX backpressure that was applied when the TX DPL put/get
         * list was full.
         */
        if (stdp->std_count < stdp->get_pkt_limit)
                mac_tx_update(sp->s_mh);
}
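
/*
 * Illustrative sketch only, not driver code: the free-running counter and
 * ring-index arithmetic used by sfxge_tx_qcomplete() above. Because
 * SFXGE_TX_NDESCS is a power of two, a counter can be masked down to a ring
 * slot, and the fill level is a plain difference that remains correct even
 * after the counters wrap. example_ring_level() is hypothetical.
 */
static unsigned int
example_ring_level(unsigned int added, unsigned int completed,
    unsigned int *slotp)
{
        /* Ring slot of the next descriptor to complete */
        *slotp = completed & (SFXGE_TX_NDESCS - 1);

        /* Descriptors still in flight (valid across counter wrap) */
        return (added - completed);
}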

void
sfxge_tx_qflush_done(sfxge_txq_t *stp)
{
        sfxge_t *sp = stp->st_sp;
        boolean_t flush_pending = B_FALSE;

        ASSERT(mutex_owned(&(sp->s_sep[stp->st_evq]->se_lock)));

        mutex_enter(&(stp->st_lock));

        switch (stp->st_state) {
        case SFXGE_TXQ_INITIALIZED:
                /* Ignore flush event after TxQ destroyed */
                break;

        case SFXGE_TXQ_FLUSH_PENDING:
                flush_pending = B_TRUE;
                stp->st_state = SFXGE_TXQ_FLUSH_DONE;
                break;

        case SFXGE_TXQ_FLUSH_FAILED:
                /* MC may have rebooted before handling the flush request */
                stp->st_state = SFXGE_TXQ_FLUSH_DONE;
                break;

        case SFXGE_TXQ_STARTED:
                /*
                 * The MC initiated the flush, either on MC reboot or because
                 * of a bad Tx descriptor.
                 */
                stp->st_state = SFXGE_TXQ_FLUSH_DONE;
                break;

        case SFXGE_TXQ_FLUSH_DONE:
                /* Ignore unexpected extra flush event */
                ASSERT(B_FALSE);
                break;

        default:
                ASSERT(B_FALSE);
        }

        mutex_exit(&(stp->st_lock));

        if (flush_pending == B_FALSE) {
                /* Flush was not pending */
                return;
        }

        mutex_enter(&(sp->s_tx_flush_lock));
        sp->s_tx_flush_pending--;
        if (sp->s_tx_flush_pending <= 0) {
                /* All queues flushed: wake up sfxge_tx_stop() */
                cv_signal(&(sp->s_tx_flush_kv));
        }
        mutex_exit(&(sp->s_tx_flush_lock));
}

static void
sfxge_tx_qflush(sfxge_t *sp, unsigned int index, boolean_t wait_for_flush)
{
        sfxge_txq_t *stp = sp->s_stp[index];
        int rc;

        ASSERT(mutex_owned(&(sp->s_state_lock)));
        ASSERT(mutex_owned(&(sp->s_tx_flush_lock)));

        mutex_enter(&(stp->st_lock));

        /* Prepare to flush and stop the queue */
        if (stp->st_state == SFXGE_TXQ_STARTED) {
                /* Flush the transmit queue */
                if ((rc = efx_tx_qflush(stp->st_etp)) == EALREADY) {
                        /* Already flushed; may have been initiated by the MC */
                        stp->st_state = SFXGE_TXQ_FLUSH_DONE;
                } else if (rc != 0) {
                        /* Unexpected error */
                        stp->st_state = SFXGE_TXQ_FLUSH_FAILED;
                } else if (wait_for_flush) {
                        stp->st_state = SFXGE_TXQ_FLUSH_PENDING;
                        sp->s_tx_flush_pending++;
                } else {
                        /* Assume the flush is done */
                        stp->st_state = SFXGE_TXQ_FLUSH_DONE;
                }
        }

        mutex_exit(&(stp->st_lock));
}

static void
sfxge_tx_qstop(sfxge_t *sp, unsigned int index)
{
        sfxge_txq_t *stp = sp->s_stp[index];
        unsigned int evq = stp->st_evq;
        sfxge_evq_t *sep = sp->s_sep[evq];

        mutex_enter(&(sep->se_lock));
        mutex_enter(&(stp->st_lock));

        if (stp->st_state == SFXGE_TXQ_INITIALIZED)
                goto done;

        ASSERT(stp->st_state == SFXGE_TXQ_FLUSH_PENDING ||
            stp->st_state == SFXGE_TXQ_FLUSH_DONE ||
            stp->st_state == SFXGE_TXQ_FLUSH_FAILED);

        /* All queues should have been flushed */
        if (stp->st_sp->s_tx_flush_pending != 0) {
                dev_err(sp->s_dip, CE_NOTE,
                    SFXGE_CMN_ERR "txq[%d] stop with flush_pending=%d",
                    index, stp->st_sp->s_tx_flush_pending);
        }
        if (stp->st_state == SFXGE_TXQ_FLUSH_FAILED) {
                dev_err(sp->s_dip, CE_NOTE,
                    SFXGE_CMN_ERR "txq[%d] flush failed", index);
        }

        /* Destroy the transmit queue */
        efx_tx_qdestroy(stp->st_etp);
        stp->st_etp = NULL;

        /* Clear entries from the buffer table */
        sfxge_sram_buf_tbl_clear(sp, stp->st_id,
            EFX_TXQ_NBUFS(SFXGE_TX_NDESCS));

        sfxge_tx_qlist_abort(stp);
        ASSERT3U(stp->st_n, ==, 0);

        stp->st_unblock = SFXGE_TXQ_NOT_BLOCKED;

        stp->st_pending = stp->st_added;

        sfxge_tx_qcomplete(stp);
        ASSERT3U(stp->st_completed, ==, stp->st_pending);

        sfxge_tx_qreap(stp);
        ASSERT3U(stp->st_reaped, ==, stp->st_completed);

        /*
         * Ensure the deferred packet list is cleared.
         * This can race with sfxge_tx_packet_add() adding to the put list.
         */
        sfxge_tx_qdpl_flush_locked(stp);

        stp->st_added = 0;
        stp->st_pending = 0;
        stp->st_completed = 0;
        stp->st_reaped = 0;

        stp->st_state = SFXGE_TXQ_INITIALIZED;

done:
        mutex_exit(&(stp->st_lock));
        mutex_exit(&(sep->se_lock));
}

static void
sfxge_tx_qfini(sfxge_t *sp, unsigned int index)
{
        sfxge_txq_t *stp = sp->s_stp[index];
        sfxge_tx_dpl_t *stdp = &(stp->st_dpl);

        ASSERT3U(stp->st_state, ==, SFXGE_TXQ_INITIALIZED);
        stp->st_state = SFXGE_TXQ_UNINITIALIZED;

        /* Detach the TXQ from the driver */
        sp->s_stp[index] = NULL;
        ASSERT(sp->s_tx_qcount > 0);
        sp->s_tx_qcount--;

        /* Free the EVQ label for events from this TXQ */
        (void) sfxge_ev_txlabel_free(sp, stp->st_evq, stp, stp->st_label);
        stp->st_label = 0;

        /* Tear down the statistics */
        sfxge_tx_kstat_fini(stp);

        /* Ensure the deferred packet list is empty */
        ASSERT3U(stdp->std_count, ==, 0);
        ASSERT3P(stdp->std_get, ==, NULL);
        ASSERT3U(stdp->std_put, ==, 0);

        /* Clear the free buffer pool */
        sfxge_tx_qfbp_empty(stp);

        /* Clear the free mapping pool */
        sfxge_tx_qfmp_empty(stp);

        /* Clear the free packet pool */
        sfxge_tx_qfpp_empty(stp);

        mutex_destroy(&(stp->st_lock));

        stp->st_evq = 0;
        stp->st_type = 0;
        stp->st_index = 0;

        kmem_cache_free(sp->s_tqc, stp);
}

int
sfxge_tx_init(sfxge_t *sp)
{
        sfxge_intr_t *sip = &(sp->s_intr);
        char name[MAXNAMELEN];
        sfxge_txq_type_t qtype;
        unsigned int txq, evq;
        int index;
        int rc;

        (void) snprintf(name, MAXNAMELEN - 1, "%s%d_tx_packet_cache",
            ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));

        sp->s_tpc = kmem_cache_create(name, sizeof (sfxge_tx_packet_t),
            SFXGE_CPU_CACHE_SIZE, sfxge_tx_packet_ctor, sfxge_tx_packet_dtor,
            NULL, sp, NULL, 0);
        ASSERT(sp->s_tpc != NULL);

        (void) snprintf(name, MAXNAMELEN - 1, "%s%d_tx_buffer_cache",
            ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));

        sp->s_tbc = kmem_cache_create(name, sizeof (sfxge_tx_buffer_t),
            SFXGE_CPU_CACHE_SIZE, sfxge_tx_buffer_ctor, sfxge_tx_buffer_dtor,
            NULL, sp, NULL, 0);
        ASSERT(sp->s_tbc != NULL);

        (void) snprintf(name, MAXNAMELEN - 1, "%s%d_tx_mapping_cache",
            ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));

        sp->s_tmc = kmem_cache_create(name, sizeof (sfxge_tx_mapping_t),
            SFXGE_CPU_CACHE_SIZE, sfxge_tx_mapping_ctor, sfxge_tx_mapping_dtor,
            NULL, sp, NULL, 0);
        ASSERT(sp->s_tmc != NULL);

        (void) snprintf(name, MAXNAMELEN - 1, "%s%d_txq_cache",
            ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));

        sp->s_tqc = kmem_cache_create(name, sizeof (sfxge_txq_t),
            SFXGE_CPU_CACHE_SIZE, sfxge_tx_qctor, sfxge_tx_qdtor, NULL, sp,
            NULL, 0);
        ASSERT(sp->s_tqc != NULL);

        /* Initialize the transmit queues. */
        sp->s_tx_scale_max[SFXGE_TXQ_NON_CKSUM]              = sip->si_nalloc;
        sp->s_tx_scale_max[SFXGE_TXQ_IP_CKSUM]               = 1;
        sp->s_tx_scale_max[SFXGE_TXQ_IP_TCP_UDP_CKSUM]       = sip->si_nalloc;

        /* Ensure minimum queue counts required by sfxge_tx_packet_add(). */
        if (sp->s_tx_scale_max[SFXGE_TXQ_NON_CKSUM] < 1)
                sp->s_tx_scale_max[SFXGE_TXQ_NON_CKSUM] = 1;

        if (sp->s_tx_scale_max[SFXGE_TXQ_IP_CKSUM] < 1)
                sp->s_tx_scale_max[SFXGE_TXQ_IP_CKSUM] = 1;

        txq = 0;
        for (qtype = 0; qtype < SFXGE_TXQ_NTYPES; qtype++) {
                unsigned int tx_scale = sp->s_tx_scale_max[qtype];

                if (txq + tx_scale > EFX_ARRAY_SIZE(sp->s_stp)) {
                        rc = EINVAL;
                        goto fail1;
                }

                sp->s_tx_scale_base[qtype] = txq;

                for (evq = 0; evq < tx_scale; evq++) {
                        if ((rc = sfxge_tx_qinit(sp, txq, qtype, evq)) != 0) {
                                goto fail2;
                        }
                        txq++;
                }
                ASSERT3U(txq, <=, EFX_ARRAY_SIZE(sp->s_stp));
        }

        return (0);

fail2:
        DTRACE_PROBE(fail2);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        index = EFX_ARRAY_SIZE(sp->s_stp);
        while (--index >= 0) {
                if (sp->s_stp[index] != NULL)
                        sfxge_tx_qfini(sp, index);
        }

        kmem_cache_destroy(sp->s_tqc);
        sp->s_tqc = NULL;

        kmem_cache_destroy(sp->s_tmc);
        sp->s_tmc = NULL;

        kmem_cache_destroy(sp->s_tbc);
        sp->s_tbc = NULL;

        kmem_cache_destroy(sp->s_tpc);
        sp->s_tpc = NULL;

        return (rc);
}

int
sfxge_tx_start(sfxge_t *sp)
{
        efx_nic_t *enp = sp->s_enp;
        int index;
        int rc;

        /* Initialize the transmit module */
        if ((rc = efx_tx_init(enp)) != 0)
                goto fail1;

        for (index = 0; index < EFX_ARRAY_SIZE(sp->s_stp); index++) {
                if (sp->s_stp[index] != NULL)
                        if ((rc = sfxge_tx_qstart(sp, index)) != 0)
                                goto fail2;
        }

        return (0);

fail2:
        DTRACE_PROBE(fail2);

        sfxge_tx_stop(sp);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        return (rc);
}


/*
 * Add a packet to the TX deferred packet list (DPL) and, if the TX queue
 * lock can be acquired, call sfxge_tx_qdpl_service() to fragment the packet
 * and push it to the hardware transmit descriptor ring.
 *
 * If ENOSPC is returned then either the DPL is full or packet creation
 * failed; the mblk is not freed so that the caller can return it from
 * mc_tx() to back-pressure the OS stack.
 *
 * For all other errors the mblk is freed.
 */
int
sfxge_tx_packet_add(sfxge_t *sp, mblk_t *mp)
{
        struct ether_header *etherhp;
        struct ip *iphp;
        struct tcphdr *thp;
        size_t off;
        size_t size;
        size_t mss;
        sfxge_txq_t *stp;
        unsigned int txq;
        int index;
        boolean_t locked;
        sfxge_tx_packet_t *stpp;
        sfxge_packet_type_t pkt_type;
        uint16_t sport, dport;
        int rc = 0;

        ASSERT3P(mp->b_next, ==, NULL);
        ASSERT(!(DB_CKSUMFLAGS(mp) & HCK_PARTIALCKSUM));

        /*
         * Do not enqueue packets during startup/shutdown.
         *
         * NOTE: This access to the state is NOT protected by the state lock.
         * It is an imperfect test; anything that slips onto the deferred
         * packet get/put lists is cleaned up by (possibly repeated) calls to
         * sfxge_can_destroy().
         */
        if (sp->s_state != SFXGE_STARTED) {
                rc = EINVAL;
                goto fail1;
        }

        etherhp = NULL;
        iphp = NULL;
        thp = NULL;
        off = 0;
        size = 0;
        mss = 0;

        /* Check whether we need the header pointers for LSO segmentation */
        if (DB_LSOFLAGS(mp) & HW_LSO) {
                /* LSO segmentation relies on hardware checksum offload */
                DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM;

                if ((mss = DB_LSOMSS(mp)) == 0) {
                        rc = EINVAL;
                        goto fail1;
                }

                pkt_type = sfxge_pkthdr_parse(mp, &etherhp, &iphp, &thp,
                    &off, &size, &sport, &dport);

                if (pkt_type != SFXGE_PACKET_TYPE_IPV4_TCP ||
                    etherhp == NULL ||
                    iphp == NULL ||
                    thp == NULL ||
                    off == 0) {
                        rc = EINVAL;
                        goto fail2;
                }
        }

        /* Choose the appropriate transmit queue */
        if (DB_CKSUMFLAGS(mp) & HCK_FULLCKSUM) {
                sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);

                if (srsp->srs_state == SFXGE_RX_SCALE_STARTED) {
                        uint32_t hash;

                        if (srsp->srs_count > 1) {
                                /*
                                 * If we have not already parsed the headers
                                 * for LSO segmentation then we need to do it
                                 * now so we can calculate the hash.
                                 */
                                if (thp == NULL) {
                                        (void) sfxge_pkthdr_parse(mp, &etherhp,
                                            &iphp, &thp, &off, &size,
                                            &sport, &dport);
                                }

                                if (thp != NULL) {
                                        SFXGE_TCP_HASH(sp,
                                            &iphp->ip_dst.s_addr,
                                            thp->th_dport,
                                            &iphp->ip_src.s_addr,
                                            thp->th_sport, hash);

                                        index = srsp->srs_tbl[hash %
                                            SFXGE_RX_SCALE_MAX];
                                } else if (iphp != NULL) {
                                        /*
                                         * Calculate IPv4 4-tuple hash, with
                                         * TCP/UDP/SCTP src/dest ports. Ports
                                         * are zero for other IPv4 protocols.
                                         */
                                        SFXGE_IP_HASH(sp,
                                            &iphp->ip_dst.s_addr, dport,
                                            &iphp->ip_src.s_addr, sport, hash);

                                        index = srsp->srs_tbl[hash %
                                            SFXGE_RX_SCALE_MAX];
                                } else {
                                        /*
                                         * Other traffic always goes to the
                                         * queue in the zero-th entry of the
                                         * RSS table.
                                         */
                                        index = srsp->srs_tbl[0];
                                }
                        } else {
                                /*
                                 * It does not matter what the hash is
                                 * because all the RSS table entries will be
                                 * the same.
                                 */
                                index = srsp->srs_tbl[0];
                        }

                        /*
                         * Use the transmit queue that shares an event queue
                         * with the RX queue selected by the RSS table entry.
                         */
                        txq = sp->s_tx_scale_base[SFXGE_TXQ_IP_TCP_UDP_CKSUM] +
                            index;
                        stp = sp->s_stp[txq];
                        ASSERT3U(stp->st_evq, ==, index);
                } else {
                        index = 0;
                        txq = sp->s_tx_scale_base[SFXGE_TXQ_IP_TCP_UDP_CKSUM] +
                            index;
                        stp = sp->s_stp[txq];
                }
        } else if (DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) {
                ASSERT3U(sp->s_tx_scale_max[SFXGE_TXQ_IP_CKSUM], >=, 1);
                index = 0;
                txq = sp->s_tx_scale_base[SFXGE_TXQ_IP_CKSUM] + index;
                stp = sp->s_stp[txq];
        } else {
                /*
                 * No hardware checksum offload requested.
                 */
                sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);

                if (srsp->srs_state == SFXGE_RX_SCALE_STARTED) {
                        uint32_t hash = 0;

                        if (srsp->srs_count > 1) {
                                if (iphp == NULL) {
                                        (void) sfxge_pkthdr_parse(mp, &etherhp,
                                            &iphp, &thp, &off, &size,
                                            &sport, &dport);
                                }

                                if (iphp != NULL) {
                                        /*
                                         * Calculate IPv4 4-tuple hash, with
                                         * TCP/UDP/SCTP src/dest ports. Ports
                                         * are zero for other IPv4 protocols.
                                         */
                                        SFXGE_IP_HASH(sp,
                                            &iphp->ip_dst.s_addr, dport,
                                            &iphp->ip_src.s_addr, sport, hash);

                                        hash = hash % SFXGE_RX_SCALE_MAX;
                                }
                        }
                        index = srsp->srs_tbl[hash];

                        /*
                         * The RSS table (indexed by hash) gives the RXQ index
                         * (mapped 1:1 with EVQs). Find the TXQ that results
                         * in using the same EVQ as for the RX data path.
                         */
                        ASSERT3U(sp->s_tx_scale_max[SFXGE_TXQ_NON_CKSUM],
                            >, index);
                        txq = sp->s_tx_scale_base[SFXGE_TXQ_NON_CKSUM] + index;
                        stp = sp->s_stp[txq];
                        ASSERT3U(stp->st_evq, ==, index);
                } else {
                        ASSERT3U(sp->s_tx_scale_max[SFXGE_TXQ_NON_CKSUM], >, 0);
                        index = 0;
                        txq = sp->s_tx_scale_base[SFXGE_TXQ_NON_CKSUM] + index;
                        stp = sp->s_stp[txq];
                }
        }
        ASSERT(stp != NULL);

        ASSERT(mss == 0 || (DB_LSOFLAGS(mp) & HW_LSO));

        /* Try to grab the lock */
        locked = mutex_tryenter(&(stp->st_lock));

        if (locked) {
                /* Try to grab a packet from the pool */
                stpp = sfxge_tx_qfpp_get(stp);
        } else {
                stpp = NULL;
        }

        if (stpp == NULL) {
                /*
                 * Either the pool was empty or we don't have the lock so
                 * allocate a new packet.
                 */
                if ((stpp = sfxge_tx_packet_create(sp)) == NULL) {
                        rc = ENOSPC;
                        goto fail3;
                }
        }

        stpp->stp_mp = mp;
        stpp->stp_etherhp = etherhp;
        stpp->stp_iphp = iphp;
        stpp->stp_thp = thp;
        stpp->stp_off = off;
        stpp->stp_size = size;
        stpp->stp_mss = mss;
        stpp->stp_dpl_put_len = 0;

        rc = sfxge_tx_qdpl_add(stp, stpp, locked);
        if (rc != 0) {
                /* ENOSPC can happen when the DPL get or put list is full */
                ASSERT3U(rc, ==, ENOSPC);

                /*
                 * Note: if this is the unlocked DPL-put-list-full case, there
                 * is no need to worry about a race with a locked
                 * sfxge_tx_qdpl_swizzle(); we know that the TX DPL put list
                 * was full and would have been swizzled onto the TX DPL get
                 * list, guaranteeing future TX completions and calls to
                 * mac_tx_update() via sfxge_tx_qcomplete().
                 */
                goto fail4;
        }

        /* Try to grab the lock again */
        if (!locked)
                locked = mutex_tryenter(&(stp->st_lock));

        if (locked) {
                /* Try to service the list */
                sfxge_tx_qdpl_service(stp);
                /* lock has been dropped */
        }

        return (0);

fail4:
        DTRACE_PROBE(fail4);
        sfxge_tx_packet_destroy(sp, stpp);
fail3:
        DTRACE_PROBE(fail3);
        if (locked)
                mutex_exit(&(stp->st_lock));
fail2:
        DTRACE_PROBE(fail2);
fail1:
        DTRACE_PROBE1(fail1, int, rc);

        if (rc != ENOSPC)
                freemsg(mp);
        return (rc);
}
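
/*
 * Illustrative sketch only, not driver code: the core of the TXQ selection
 * performed by sfxge_tx_packet_add() above. A flow hash indexes the RSS
 * table to pick an event queue, and the TXQ of the required checksum-offload
 * type that shares that event queue is chosen, so TX completions for a flow
 * are handled on the same event queue (and hence CPU) as its RX traffic.
 * example_select_txq() and its parameters are hypothetical.
 */
static unsigned int
example_select_txq(const unsigned int *rss_tbl, size_t rss_entries,
    uint32_t flow_hash, unsigned int txq_base)
{
        unsigned int evq = rss_tbl[flow_hash % rss_entries];

        /* TXQs of a given type are laid out contiguously, one per EVQ */
        return (txq_base + evq);
}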

void
sfxge_tx_stop(sfxge_t *sp)
{
        efx_nic_t *enp = sp->s_enp;
        clock_t timeout;
        boolean_t wait_for_flush;
        int index;

        ASSERT(mutex_owned(&(sp->s_state_lock)));

        mutex_enter(&(sp->s_tx_flush_lock));

        /* Flush all the queues */
        if (sp->s_hw_err == SFXGE_HW_OK) {
                wait_for_flush = B_TRUE;
        } else {
                /*
                 * Flag indicates possible hardware failure.
                 * Attempt flush but do not wait for it to complete.
                 */
                wait_for_flush = B_FALSE;
        }

        /* Prepare queues to stop and flush the hardware ring */
        index = EFX_ARRAY_SIZE(sp->s_stp);
        while (--index >= 0) {
                if (sp->s_stp[index] != NULL)
                        sfxge_tx_qflush(sp, index, wait_for_flush);
        }

        if (wait_for_flush == B_FALSE)
                goto flush_done;

        /* Wait up to 2 seconds for queue flushing to complete */
        timeout = ddi_get_lbolt() + drv_usectohz(SFXGE_TX_QFLUSH_USEC);

        while (sp->s_tx_flush_pending > 0) {
                if (cv_timedwait(&(sp->s_tx_flush_kv), &(sp->s_tx_flush_lock),
                    timeout) < 0) {
                        /* Timeout waiting for queues to flush */
                        dev_info_t *dip = sp->s_dip;

                        DTRACE_PROBE(timeout);
                        dev_err(dip, CE_NOTE,
                            SFXGE_CMN_ERR "tx qflush timeout");
                        break;
                }
        }

flush_done:
        sp->s_tx_flush_pending = 0;
        mutex_exit(&(sp->s_tx_flush_lock));

        /* Stop all the queues */
        index = EFX_ARRAY_SIZE(sp->s_stp);
        while (--index >= 0) {
                if (sp->s_stp[index] != NULL)
                        sfxge_tx_qstop(sp, index);
        }

        /* Tear down the transmit module */
        efx_tx_fini(enp);
}
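
/*
 * Illustrative sketch only, not driver code: the absolute-deadline wait used
 * by sfxge_tx_stop() above. cv_timedwait() takes an absolute lbolt value and
 * returns -1 on timeout, so computing the deadline once before the loop
 * bounds the total wait even across spurious wakeups. The lock protecting
 * the counter must be held on entry. example_wait_until_zero() is
 * hypothetical.
 */
static boolean_t
example_wait_until_zero(kcondvar_t *cvp, kmutex_t *lockp,
    volatile unsigned int *counterp, clock_t usec)
{
        clock_t deadline = ddi_get_lbolt() + drv_usectohz(usec);

        while (*counterp > 0) {
                if (cv_timedwait(cvp, lockp, deadline) < 0)
                        return (B_FALSE);       /* timed out */
        }
        return (B_TRUE);
}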

void
sfxge_tx_fini(sfxge_t *sp)
{
        int index;

        index = EFX_ARRAY_SIZE(sp->s_stp);
        while (--index >= 0) {
                if (sp->s_stp[index] != NULL)
                        sfxge_tx_qfini(sp, index);
        }

        kmem_cache_destroy(sp->s_tqc);
        sp->s_tqc = NULL;

        kmem_cache_destroy(sp->s_tmc);
        sp->s_tmc = NULL;

        kmem_cache_destroy(sp->s_tbc);
        sp->s_tbc = NULL;

        kmem_cache_destroy(sp->s_tpc);
        sp->s_tpc = NULL;
}