1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright(c) 2007-2010 Intel Corporation. All rights reserved.
  24  */
  25 
  26 /*
  27  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  28  * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
  29  * Copyright 2016 OmniTI Computer Consulting, Inc. All rights reserved.
  30  */
  31 
  32 #include "ixgbe_sw.h"
  33 
  34 static int ixgbe_tx_copy(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
  35     uint32_t, boolean_t);
  36 static int ixgbe_tx_bind(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
  37     uint32_t);
  38 static int ixgbe_tx_fill_ring(ixgbe_tx_ring_t *, link_list_t *,
  39     ixgbe_tx_context_t *, size_t);
  40 static void ixgbe_save_desc(tx_control_block_t *, uint64_t, size_t);
  41 static tx_control_block_t *ixgbe_get_free_list(ixgbe_tx_ring_t *);
  42 
  43 static int ixgbe_get_context(mblk_t *, ixgbe_tx_context_t *);
  44 static boolean_t ixgbe_check_context(ixgbe_tx_ring_t *,
  45     ixgbe_tx_context_t *);
  46 static void ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *,
  47     ixgbe_tx_context_t *);
  48 
  49 #ifndef IXGBE_DEBUG
  50 #pragma inline(ixgbe_save_desc)
  51 #pragma inline(ixgbe_get_context)
  52 #pragma inline(ixgbe_check_context)
  53 #pragma inline(ixgbe_fill_context)
  54 #endif
  55 
/*
 * ixgbe_ring_tx
 *
 * Transmit one mblk through the specified tx ring.
 *
 * An mblk can consist of several fragments; each fragment is
 * processed differently depending on its size. Fragments smaller
 * than the bcopy threshold are copied with bcopy; larger fragments
 * are bound with DMA.
 *
 * To process the mblk, a tx control block is taken from the
 * free list. A tx control block contains one tx buffer, which is
 * used to copy mblk fragments' data, and one tx DMA handle, which
 * is used to bind an mblk fragment to DMA resources.
 *
 * Several small mblk fragments can be copied into one tx control
 * block's buffer; the buffer is then transmitted with a single
 * tx descriptor.
 *
 * A large fragment binds to exactly one tx control block's DMA
 * handle, but may span several tx descriptors.
 *
 * So transmitting one packet (mblk) may use several tx control
 * blocks. After processing, those tx control blocks are put on
 * the work list.
 */
  83 mblk_t *
  84 ixgbe_ring_tx(void *arg, mblk_t *mp)
  85 {
  86         ixgbe_tx_ring_t *tx_ring = (ixgbe_tx_ring_t *)arg;
  87         ixgbe_t *ixgbe = tx_ring->ixgbe;
  88         tx_type_t current_flag, next_flag;
  89         uint32_t current_len, next_len;
  90         uint32_t desc_total;
  91         size_t mbsize;
  92         int desc_num;
  93         boolean_t copy_done, eop;
  94         mblk_t *current_mp, *next_mp, *nmp, *pull_mp = NULL;
  95         tx_control_block_t *tcb;
  96         ixgbe_tx_context_t tx_context, *ctx;
  97         link_list_t pending_list;
  98         uint32_t len, hdr_frag_len, hdr_len;
  99         uint32_t copy_thresh;
 100         mblk_t *hdr_new_mp = NULL;
 101         mblk_t *hdr_pre_mp = NULL;
 102         mblk_t *hdr_nmp = NULL;
 103 
 104         ASSERT(mp->b_next == NULL);
 105 
 106         if ((ixgbe->ixgbe_state & IXGBE_SUSPENDED) ||
 107             (ixgbe->ixgbe_state & IXGBE_ERROR) ||
 108             (ixgbe->ixgbe_state & IXGBE_OVERTEMP) ||
 109             !(ixgbe->ixgbe_state & IXGBE_STARTED) ||
 110             ixgbe->link_state != LINK_STATE_UP) {
 111                 freemsg(mp);
 112                 return (NULL);
 113         }
 114 
 115         copy_thresh = ixgbe->tx_copy_thresh;
 116 
 117         /* Get the mblk size */
 118         mbsize = 0;
 119         for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
 120                 mbsize += MBLKL(nmp);
 121         }
 122 
 123         if (ixgbe->tx_hcksum_enable) {
 124                 /*
 125                  * Retrieve checksum context information from the mblk
 126                  * that will be used to decide whether/how to fill the
 127                  * context descriptor.
 128                  */
 129                 ctx = &tx_context;
 130                 if (ixgbe_get_context(mp, ctx) < 0) {
 131                         freemsg(mp);
 132                         return (NULL);
 133                 }
 134 
 135                 /*
 136                  * If the mblk size exceeds the max size ixgbe could
 137                  * process, then discard this mblk, and return NULL.
 138                  */
 139                 if ((ctx->lso_flag &&
 140                     ((mbsize - ctx->mac_hdr_len) > IXGBE_LSO_MAXLEN)) ||
 141                     (!ctx->lso_flag &&
 142                     (mbsize > (ixgbe->max_frame_size - ETHERFCSL)))) {
 143                         freemsg(mp);
 144                         IXGBE_DEBUGLOG_0(ixgbe, "ixgbe_tx: packet oversize");
 145                         return (NULL);
 146                 }
 147         } else {
 148                 ctx = NULL;
 149         }
 150 
 151         /*
 152          * Check and recycle tx descriptors.
 153          * The recycle threshold here should be selected carefully
 154          */
 155         if (tx_ring->tbd_free < ixgbe->tx_recycle_thresh) {
 156                 tx_ring->tx_recycle(tx_ring);
 157         }
 158 
        /*
         * After the recycling, if tbd_free is still below the
         * overload threshold, declare overload and return mp so
         * that the transmit will be rescheduled later.
         */
 164         if (tx_ring->tbd_free < ixgbe->tx_overload_thresh) {
 165                 tx_ring->reschedule = B_TRUE;
 166                 IXGBE_DEBUG_STAT(tx_ring->stat_overload);
 167                 return (mp);
 168         }
 169 
        /*
         * The pending_list is a linked list used to save the tx
         * control blocks whose packet data have been processed but
         * not yet placed on the tx descriptor ring. It is used to
         * reduce contention on tx_lock.
         */
 176         LINK_LIST_INIT(&pending_list);
 177         desc_num = 0;
 178         desc_total = 0;
 179 
        /*
         * The software must guarantee that the LSO packet header
         * (MAC + IP + TCP) fits within one descriptor. Reallocate
         * and refill the header here if it is not physically
         * contiguous.
         */
 185         if ((ctx != NULL) && ctx->lso_flag) {
 186                 /* find the last fragment of the header */
 187                 len = MBLKL(mp);
 188                 ASSERT(len > 0);
 189                 hdr_nmp = mp;
 190                 hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len + ctx->l4_hdr_len;
 191                 while (len < hdr_len) {
 192                         hdr_pre_mp = hdr_nmp;
 193                         hdr_nmp = hdr_nmp->b_cont;
 194                         len += MBLKL(hdr_nmp);
 195                 }
                /*
                 * If the header ends exactly on an mblk boundary (it
                 * does not share an mblk with the payload), no
                 * reallocation is needed; just make sure the header
                 * is sent using bcopy (see adjust_threshold).
                 */
 201                 if (len == hdr_len)
 202                         goto adjust_threshold;
 203 
 204                 hdr_frag_len = hdr_len - (len - MBLKL(hdr_nmp));
                /*
                 * There are two cases in which we need to reallocate
                 * an mblk for the last header fragment:
                 * 1. the header spans multiple mblks and its last
                 *    fragment shares an mblk with the payload;
                 * 2. the header is in a single mblk shared with the
                 *    payload, but it is not physically contiguous
                 *    (it crosses a page boundary).
                 */
 213                 if ((hdr_nmp != mp) ||
 214                     (P2NPHASE((uintptr_t)hdr_nmp->b_rptr, ixgbe->sys_page_size)
 215                     < hdr_len)) {
 216                         IXGBE_DEBUG_STAT(tx_ring->stat_lso_header_fail);
                        /*
                         * Reallocate an mblk for the last header
                         * fragment so that it can later be bcopied
                         * into the pre-allocated, page-aligned buffer.
                         */
                        hdr_new_mp = allocb(hdr_frag_len, 0);
                        if (hdr_new_mp == NULL)
                                return (mp);
 225                         bcopy(hdr_nmp->b_rptr, hdr_new_mp->b_rptr,
 226                             hdr_frag_len);
 227                         /* link the new header fragment with the other parts */
 228                         hdr_new_mp->b_wptr = hdr_new_mp->b_rptr + hdr_frag_len;
 229                         hdr_new_mp->b_cont = hdr_nmp;
 230                         if (hdr_pre_mp)
 231                                 hdr_pre_mp->b_cont = hdr_new_mp;
 232                         else
 233                                 mp = hdr_new_mp;
 234                         hdr_nmp->b_rptr += hdr_frag_len;
 235                 }
 236 adjust_threshold:
                /*
                 * Adjust the bcopy threshold so that the header is
                 * guaranteed to be processed with bcopy.
                 */
 241                 if (copy_thresh < hdr_len)
 242                         copy_thresh = hdr_len;
 243         }
 244 
 245         current_mp = mp;
 246         current_len = MBLKL(current_mp);
 247         /*
 248          * Decide which method to use for the first fragment
 249          */
 250         current_flag = (current_len <= copy_thresh) ?
 251             USE_COPY : USE_DMA;
        /*
         * If the mblk includes several contiguous small fragments,
         * they may be copied into one buffer. The copy_done flag
         * indicates whether the current copy processing is complete:
         *
         * If B_TRUE, no copy is pending, so a new tx control block
         * is needed to process the next fragment, using either copy
         * or DMA binding.
         *
         * If B_FALSE, the next fragment will be copied into the tx
         * buffer maintained by the current tx control block; no new
         * tx control block is needed.
         */
 266         copy_done = B_TRUE;
 267         while (current_mp) {
 268                 next_mp = current_mp->b_cont;
 269                 eop = (next_mp == NULL); /* Last fragment of the packet? */
 270                 next_len = eop ? 0: MBLKL(next_mp);
 271 
                /*
                 * If the current fragment is empty but a copy into the
                 * current tx buffer is still pending, we cannot skip it
                 * here; the pending copy has to be completed, so this
                 * empty fragment is handled in the tx_copy routine.
                 *
                 * If the copy processing has completed, or a DMA
                 * binding has just completed, the empty fragment can
                 * simply be skipped.
                 */
 283                 if ((current_len == 0) && (copy_done)) {
 284                         current_mp = next_mp;
 285                         current_len = next_len;
 286                         current_flag = (current_len <= copy_thresh) ?
 287                             USE_COPY : USE_DMA;
 288                         continue;
 289                 }
 290 
 291                 if (copy_done) {
 292                         /*
 293                          * Get a new tx control block from the free list
 294                          */
 295                         tcb = ixgbe_get_free_list(tx_ring);
 296 
 297                         if (tcb == NULL) {
 298                                 IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
 299                                 goto tx_failure;
 300                         }
 301 
 302                         /*
 303                          * Push the tx control block to the pending list
 304                          * to avoid using lock too early
 305                          */
 306                         LIST_PUSH_TAIL(&pending_list, &tcb->link);
 307                 }
 308 
 309                 if (current_flag == USE_COPY) {
 310                         /*
 311                          * Check whether to use bcopy or DMA binding to process
 312                          * the next fragment, and if using bcopy, whether we
 313                          * need to continue copying the next fragment into the
 314                          * current tx buffer.
 315                          */
 316                         ASSERT((tcb->tx_buf.len + current_len) <=
 317                             tcb->tx_buf.size);
 318 
 319                         if (eop) {
 320                                 /*
 321                                  * This is the last fragment of the packet, so
 322                                  * the copy processing will be completed with
 323                                  * this fragment.
 324                                  */
 325                                 next_flag = USE_NONE;
 326                                 copy_done = B_TRUE;
 327                         } else if ((tcb->tx_buf.len + current_len + next_len) >
 328                             tcb->tx_buf.size) {
 329                                 /*
 330                                  * If the next fragment is too large to be
 331                                  * copied to the current tx buffer, we need
 332                                  * to complete the current copy processing.
 333                                  */
 334                                 next_flag = (next_len > copy_thresh) ?
 335                                     USE_DMA: USE_COPY;
 336                                 copy_done = B_TRUE;
 337                         } else if (next_len > copy_thresh) {
                                /*
                                 * The next fragment needs to be processed with
                                 * DMA binding, so the copy processing will be
                                 * completed with the current fragment.
                                 */
 343                                 next_flag = USE_DMA;
 344                                 copy_done = B_TRUE;
 345                         } else {
 346                                 /*
 347                                  * Continue to copy the next fragment to the
 348                                  * current tx buffer.
 349                                  */
 350                                 next_flag = USE_COPY;
 351                                 copy_done = B_FALSE;
 352                         }
 353 
 354                         desc_num = ixgbe_tx_copy(tx_ring, tcb, current_mp,
 355                             current_len, copy_done);
 356                 } else {
 357                         /*
 358                          * Check whether to use bcopy or DMA binding to process
 359                          * the next fragment.
 360                          */
 361                         next_flag = (next_len > copy_thresh) ?
 362                             USE_DMA: USE_COPY;
 363                         ASSERT(copy_done == B_TRUE);
 364 
 365                         desc_num = ixgbe_tx_bind(tx_ring, tcb, current_mp,
 366                             current_len);
 367                 }
 368 
 369                 if (desc_num > 0)
 370                         desc_total += desc_num;
 371                 else if (desc_num < 0)
 372                         goto tx_failure;
 373 
 374                 current_mp = next_mp;
 375                 current_len = next_len;
 376                 current_flag = next_flag;
 377         }
 378 
 379         /*
 380          * Attach the mblk to the last tx control block
 381          */
 382         ASSERT(tcb);
 383         ASSERT(tcb->mp == NULL);
 384         tcb->mp = mp;
 385 
        /*
         * The 82598/82599 chipsets have a limitation that no more
         * than 32 tx descriptors can be transmitted at one time.
         *
         * Here is a workaround: pull up the mblk and send it out
         * using DMA binding, so that no more than MAX_COOKIE (18)
         * descriptors are needed.
         */
 393          */
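        /*
         * As with the tbd_free checks later in this function, the
         * extra descriptor accounted for here is the context
         * descriptor that may be added in ixgbe_tx_fill_ring().
         */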
 394         if (desc_total + 1 > IXGBE_TX_DESC_LIMIT) {
 395                 IXGBE_DEBUG_STAT(tx_ring->stat_break_tbd_limit);
 396 
 397                 /*
 398                  * Discard the mblk and free the used resources
 399                  */
 400                 tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
 401                 while (tcb) {
 402                         tcb->mp = NULL;
 403                         ixgbe_free_tcb(tcb);
 404                         tcb = (tx_control_block_t *)
 405                             LIST_GET_NEXT(&pending_list, &tcb->link);
 406                 }
 407 
 408                 /*
 409                  * Return the tx control blocks in the pending list to
 410                  * the free list.
 411                  */
 412                 ixgbe_put_free_list(tx_ring, &pending_list);
 413 
                /*
                 * Pull up the mblk and send it out using DMA binding.
                 */
 417                 if ((pull_mp = msgpullup(mp, -1)) == NULL) {
 418                         tx_ring->reschedule = B_TRUE;
 419 
                        /*
                         * If a new mblk was allocated for the last
                         * header fragment of an LSO packet, restore
                         * the original (modified) mp.
                         */
 425                         if (hdr_new_mp) {
 426                                 hdr_new_mp->b_cont = NULL;
 427                                 freeb(hdr_new_mp);
 428                                 hdr_nmp->b_rptr -= hdr_frag_len;
 429                                 if (hdr_pre_mp)
 430                                         hdr_pre_mp->b_cont = hdr_nmp;
 431                                 else
 432                                         mp = hdr_nmp;
 433                         }
 434                         return (mp);
 435                 }
 436 
 437                 LINK_LIST_INIT(&pending_list);
 438                 desc_total = 0;
 439 
                /*
                 * If the packet is an LSO packet, transmit the
                 * header in one descriptor using bcopy.
                 */
 444                 if ((ctx != NULL) && ctx->lso_flag) {
 445                         hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len +
 446                             ctx->l4_hdr_len;
 447 
 448                         tcb = ixgbe_get_free_list(tx_ring);
 449                         if (tcb == NULL) {
 450                                 IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
 451                                 goto tx_failure;
 452                         }
 453                         desc_num = ixgbe_tx_copy(tx_ring, tcb, pull_mp,
 454                             hdr_len, B_TRUE);
 455                         LIST_PUSH_TAIL(&pending_list, &tcb->link);
                        desc_total += desc_num;
 457 
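                        /*
                         * Advance the read pointer past the header
                         * that was just copied, so the DMA binding
                         * below covers only the payload.
                         */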
 458                         pull_mp->b_rptr += hdr_len;
 459                 }
 460 
 461                 tcb = ixgbe_get_free_list(tx_ring);
 462                 if (tcb == NULL) {
 463                         IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
 464                         goto tx_failure;
 465                 }
 466                 if ((ctx != NULL) && ctx->lso_flag) {
 467                         desc_num = ixgbe_tx_bind(tx_ring, tcb, pull_mp,
 468                             mbsize - hdr_len);
 469                 } else {
 470                         desc_num = ixgbe_tx_bind(tx_ring, tcb, pull_mp,
 471                             mbsize);
 472                 }
 473                 if (desc_num < 0) {
 474                         goto tx_failure;
 475                 }
 476                 LIST_PUSH_TAIL(&pending_list, &tcb->link);
 477 
 478                 desc_total += desc_num;
 479                 tcb->mp = pull_mp;
 480         }
 481 
        /*
         * Before filling the tx descriptor ring with the data, ensure
         * there are enough free descriptors for the transmit (including
         * one context descriptor). Do not use up all the tx descriptors;
         * otherwise tx recycling will fail, causing a false hang.
         */
 489         if (tx_ring->tbd_free <= (desc_total + 1)) {
 490                 tx_ring->tx_recycle(tx_ring);
 491         }
 492 
 493         mutex_enter(&tx_ring->tx_lock);
        /*
         * If the number of free tx descriptors is not enough for the
         * transmit, return mp.
         *
         * Note: this check must be done under mutex protection to
         * ensure correctness when multiple threads access it in
         * parallel.
         */
 502         if (tx_ring->tbd_free <= (desc_total + 1)) {
 503                 IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tbd);
 504                 mutex_exit(&tx_ring->tx_lock);
 505                 goto tx_failure;
 506         }
 507 
 508         desc_num = ixgbe_tx_fill_ring(tx_ring, &pending_list, ctx,
 509             mbsize);
 510 
 511         ASSERT((desc_num == desc_total) || (desc_num == (desc_total + 1)));
 512 
 513         tx_ring->stat_obytes += mbsize;
        tx_ring->stat_opackets++;
 515 
 516         mutex_exit(&tx_ring->tx_lock);
 517 
        /*
         * Now that the transmission has succeeded, free the original
         * mp if the pulled-up mblk was used for the transmission.
         */
 522         if (pull_mp) {
 523                 freemsg(mp);
 524         }
 525 
 526         return (NULL);
 527 
 528 tx_failure:
        /*
         * The transmission failed; free the pulled-up mblk if one
         * was allocated.
         */
 532         if (pull_mp) {
 533                 freemsg(pull_mp);
 534         }
 535 
        /*
         * If a new mblk was allocated for the last header fragment
         * of an LSO packet, restore the original (modified) mp.
         */
 541         if (hdr_new_mp) {
 542                 hdr_new_mp->b_cont = NULL;
 543                 freeb(hdr_new_mp);
 544                 hdr_nmp->b_rptr -= hdr_frag_len;
 545                 if (hdr_pre_mp)
 546                         hdr_pre_mp->b_cont = hdr_nmp;
 547                 else
 548                         mp = hdr_nmp;
 549         }
 550         /*
 551          * Discard the mblk and free the used resources
 552          */
 553         tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
 554         while (tcb) {
 555                 tcb->mp = NULL;
 556 
 557                 ixgbe_free_tcb(tcb);
 558 
 559                 tcb = (tx_control_block_t *)
 560                     LIST_GET_NEXT(&pending_list, &tcb->link);
 561         }
 562 
 563         /*
 564          * Return the tx control blocks in the pending list to the free list.
 565          */
 566         ixgbe_put_free_list(tx_ring, &pending_list);
 567 
        /* Transmit failed; do not drop the mblk, reschedule the transmit */
 569         tx_ring->reschedule = B_TRUE;
 570 
 571         return (mp);
 572 }
 573 
 574 /*
 575  * ixgbe_tx_copy
 576  *
 577  * Copy the mblk fragment to the pre-allocated tx buffer
 578  */
 579 static int
 580 ixgbe_tx_copy(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
 581     uint32_t len, boolean_t copy_done)
 582 {
 583         dma_buffer_t *tx_buf;
 584         uint32_t desc_num;
 585         _NOTE(ARGUNUSED(tx_ring));
 586 
 587         tx_buf = &tcb->tx_buf;
 588 
 589         /*
 590          * Copy the packet data of the mblk fragment into the
 591          * pre-allocated tx buffer, which is maintained by the
 592          * tx control block.
 593          *
 594          * Several mblk fragments can be copied into one tx buffer.
 595          * The destination address of the current copied fragment in
 596          * the tx buffer is next to the end of the previous copied
 597          * fragment.
 598          */
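        /*
         * len can be zero here for an empty fragment whose copy
         * processing was still pending in ixgbe_ring_tx(); there is
         * nothing to copy, but the pending copy may still be
         * completed below when copy_done is set.
         */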
 599         if (len > 0) {
 600                 bcopy(mp->b_rptr, tx_buf->address + tx_buf->len, len);
 601 
 602                 tx_buf->len += len;
 603                 tcb->frag_num++;
 604         }
 605 
 606         desc_num = 0;
 607 
        /*
         * If this is the last fragment copied to the current tx buffer
         * (there is no remaining fragment, or the remaining fragment
         * requires a new tx control block), complete the current copy
         * processing by syncing the DMA buffer and saving the
         * descriptor data.
         */
 615         if (copy_done) {
 616                 /*
 617                  * Sync the DMA buffer of the packet data
 618                  */
 619                 DMA_SYNC(tx_buf, DDI_DMA_SYNC_FORDEV);
 620 
 621                 tcb->tx_type = USE_COPY;
 622 
 623                 /*
 624                  * Save the address and length to the private data structure
 625                  * of the tx control block, which will be used to fill the
 626                  * tx descriptor ring after all the fragments are processed.
 627                  */
 628                 ixgbe_save_desc(tcb, tx_buf->dma_address, tx_buf->len);
 629                 desc_num++;
 630         }
 631 
 632         return (desc_num);
 633 }
 634 
 635 /*
 636  * ixgbe_tx_bind
 637  *
 638  * Bind the mblk fragment with DMA
 639  */
 640 static int
 641 ixgbe_tx_bind(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
 642     uint32_t len)
 643 {
 644         int status, i;
 645         ddi_dma_cookie_t dma_cookie;
 646         uint_t ncookies;
 647         int desc_num;
 648 
 649         /*
 650          * Use DMA binding to process the mblk fragment
 651          */
 652         status = ddi_dma_addr_bind_handle(tcb->tx_dma_handle, NULL,
 653             (caddr_t)mp->b_rptr, len,
 654             DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
 655             0, &dma_cookie, &ncookies);
 656 
 657         if (status != DDI_DMA_MAPPED) {
 658                 IXGBE_DEBUG_STAT(tx_ring->stat_fail_dma_bind);
 659                 return (-1);
 660         }
 661 
 662         tcb->frag_num++;
 663         tcb->tx_type = USE_DMA;
        /*
         * Each fragment can span several cookies; each cookie is
         * transmitted with one tx descriptor.
         */
 668         desc_num = 0;
 669         for (i = ncookies; i > 0; i--) {
 670                 /*
 671                  * Save the address and length to the private data structure
 672                  * of the tx control block, which will be used to fill the
 673                  * tx descriptor ring after all the fragments are processed.
 674                  */
 675                 ixgbe_save_desc(tcb,
 676                     dma_cookie.dmac_laddress,
 677                     dma_cookie.dmac_size);
 678 
 679                 desc_num++;
 680 
 681                 if (i > 1)
 682                         ddi_dma_nextcookie(tcb->tx_dma_handle, &dma_cookie);
 683         }
 684 
 685         return (desc_num);
 686 }
 687 
 688 /*
 689  * ixgbe_get_context
 690  *
 691  * Get the context information from the mblk
 692  */
 693 static int
 694 ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
 695 {
 696         uint32_t start;
 697         uint32_t hckflags;
 698         uint32_t lsoflags;
 699         uint32_t mss;
 700         uint32_t len;
 701         uint32_t size;
 702         uint32_t offset;
 703         unsigned char *pos;
 704         ushort_t etype;
 705         uint32_t mac_hdr_len;
 706         uint32_t l4_proto;
 707         uint32_t l4_hdr_len;
 708 
 709         ASSERT(mp != NULL);
 710 
 711         mac_hcksum_get(mp, &start, NULL, NULL, NULL, &hckflags);
 712         bzero(ctx, sizeof (ixgbe_tx_context_t));
 713 
 714         if (hckflags == 0) {
 715                 return (0);
 716         }
 717 
 718         ctx->hcksum_flags = hckflags;
 719 
 720         mac_lso_get(mp, &mss, &lsoflags);
 721         ctx->mss = mss;
 722         ctx->lso_flag = (lsoflags == HW_LSO);
 723 
        /*
         * LSO relies on tx h/w checksum, so the packet is dropped
         * here if the h/w checksum flags are not set.
         */
 728         if (ctx->lso_flag) {
 729                 if (!((ctx->hcksum_flags & HCK_PARTIALCKSUM) &&
 730                     (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM))) {
 731                         IXGBE_DEBUGLOG_0(NULL, "ixgbe_tx: h/w "
 732                             "checksum flags are not specified when doing LSO");
 733                         return (-1);
 734                 }
 735         }
 736 
 737         etype = 0;
 738         mac_hdr_len = 0;
 739         l4_proto = 0;
 740 
        /*
         * First get the position of the ether_type/ether_tpid.
         * We do not assume the ether (VLAN) header is fully contained
         * in one mblk fragment, so we walk through the fragments to
         * parse the ether type.
         */
 747         size = len = MBLKL(mp);
 748         offset = offsetof(struct ether_header, ether_type);
 749         while (size <= offset) {
 750                 mp = mp->b_cont;
 751                 ASSERT(mp != NULL);
 752                 len = MBLKL(mp);
 753                 size += len;
 754         }
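        /*
         * size is the number of packet bytes covered through the end
         * of the current mblk and len is the length of that mblk, so
         * the field at packet offset 'offset' starts (size - offset)
         * bytes before the end of this mblk, i.e. at
         * b_rptr + offset + len - size.
         */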
 755         pos = mp->b_rptr + offset + len - size;
 756 
 757         etype = ntohs(*(ushort_t *)(uintptr_t)pos);
 758         if (etype == ETHERTYPE_VLAN) {
 759                 /*
 760                  * Get the position of the ether_type in VLAN header
 761                  */
 762                 offset = offsetof(struct ether_vlan_header, ether_type);
 763                 while (size <= offset) {
 764                         mp = mp->b_cont;
 765                         ASSERT(mp != NULL);
 766                         len = MBLKL(mp);
 767                         size += len;
 768                 }
 769                 pos = mp->b_rptr + offset + len - size;
 770 
 771                 etype = ntohs(*(ushort_t *)(uintptr_t)pos);
 772                 mac_hdr_len = sizeof (struct ether_vlan_header);
 773         } else {
 774                 mac_hdr_len = sizeof (struct ether_header);
 775         }
 776 
 777         /*
 778          * Here we don't assume the IP(V6) header is fully included in
 779          * one mblk fragment.
 780          */
 781         switch (etype) {
 782         case ETHERTYPE_IP:
 783                 if (ctx->lso_flag) {
 784                         offset = offsetof(ipha_t, ipha_length) + mac_hdr_len;
 785                         while (size <= offset) {
 786                                 mp = mp->b_cont;
 787                                 ASSERT(mp != NULL);
 788                                 len = MBLKL(mp);
 789                                 size += len;
 790                         }
 791                         pos = mp->b_rptr + offset + len - size;
 792                         *((uint16_t *)(uintptr_t)(pos)) = 0;
 793 
 794                         offset = offsetof(ipha_t, ipha_hdr_checksum) +
 795                             mac_hdr_len;
 796                         while (size <= offset) {
 797                                 mp = mp->b_cont;
 798                                 ASSERT(mp != NULL);
 799                                 len = MBLKL(mp);
 800                                 size += len;
 801                         }
 802                         pos = mp->b_rptr + offset + len - size;
 803                         *((uint16_t *)(uintptr_t)(pos)) = 0;
 804 
                        /*
                         * To perform ixgbe LSO, the TCP checksum field
                         * of the packet also needs to hold the
                         * pseudo-header checksum computed over
                         * (ip_source_addr, ip_destination_addr,
                         * l4_proto). The TCP/IP stack has already
                         * done this for us.
                         */
 812                 }
 813 
 814                 offset = offsetof(ipha_t, ipha_protocol) + mac_hdr_len;
 815                 while (size <= offset) {
 816                         mp = mp->b_cont;
 817                         ASSERT(mp != NULL);
 818                         len = MBLKL(mp);
 819                         size += len;
 820                 }
 821                 pos = mp->b_rptr + offset + len - size;
 822 
 823                 l4_proto = *(uint8_t *)pos;
 824                 break;
 825         case ETHERTYPE_IPV6:
 826                 offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len;
 827                 while (size <= offset) {
 828                         mp = mp->b_cont;
 829                         ASSERT(mp != NULL);
 830                         len = MBLKL(mp);
 831                         size += len;
 832                 }
 833                 pos = mp->b_rptr + offset + len - size;
 834 
 835                 l4_proto = *(uint8_t *)pos;
 836                 break;
 837         default:
 838                 /* Unrecoverable error */
 839                 IXGBE_DEBUGLOG_0(NULL, "Ether type error with tx hcksum");
 840                 return (-2);
 841         }
 842 
 843         if (ctx->lso_flag) {
 844                 offset = mac_hdr_len + start;
 845                 while (size <= offset) {
 846                         mp = mp->b_cont;
 847                         ASSERT(mp != NULL);
 848                         len = MBLKL(mp);
 849                         size += len;
 850                 }
 851                 pos = mp->b_rptr + offset + len - size;
 852 
 853                 l4_hdr_len = TCP_HDR_LENGTH((tcph_t *)pos);
 854         } else {
 855                 /*
 856                  * l4 header length is only required for LSO
 857                  */
 858                 l4_hdr_len = 0;
 859         }
 860 
 861         ctx->mac_hdr_len = mac_hdr_len;
 862         ctx->ip_hdr_len = start;
 863         ctx->l4_proto = l4_proto;
 864         ctx->l4_hdr_len = l4_hdr_len;
 865 
 866         return (0);
 867 }
 868 
 869 /*
 870  * ixgbe_check_context
 871  *
 872  * Check if a new context descriptor is needed
 873  */
 874 static boolean_t
 875 ixgbe_check_context(ixgbe_tx_ring_t *tx_ring, ixgbe_tx_context_t *ctx)
 876 {
 877         ixgbe_tx_context_t *last;
 878 
 879         if (ctx == NULL)
 880                 return (B_FALSE);
 881 
        /*
         * Compare the context data retrieved from the mblk with the
         * stored data of the last context descriptor. The fields to
         * check are:
         *      hcksum_flags
         *      l4_proto
         *      mac_hdr_len
         *      ip_hdr_len
         *      lso_flag
         *      mss (only checked for LSO)
         *      l4_hdr_len (only checked for LSO)
         * If any of these fields changes, a new context descriptor
         * is needed.
         */
 896         last = &tx_ring->tx_context;
 897 
 898         if ((ctx->hcksum_flags != last->hcksum_flags) ||
 899             (ctx->l4_proto != last->l4_proto) ||
 900             (ctx->mac_hdr_len != last->mac_hdr_len) ||
 901             (ctx->ip_hdr_len != last->ip_hdr_len) ||
 902             (ctx->lso_flag != last->lso_flag) ||
 903             (ctx->lso_flag && ((ctx->mss != last->mss) ||
 904             (ctx->l4_hdr_len != last->l4_hdr_len)))) {
 905                 return (B_TRUE);
 906         }
 907 
 908         return (B_FALSE);
 909 }
 910 
/*
 * ixgbe_fill_context
 *
 * Fill the context descriptor with hardware checksum information
 */
 916 static void
 917 ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *ctx_tbd,
 918     ixgbe_tx_context_t *ctx)
 919 {
 920         /*
 921          * Fill the context descriptor with the checksum
 922          * context information we've got.
 923          */
 924         ctx_tbd->vlan_macip_lens = ctx->ip_hdr_len;
 925         ctx_tbd->vlan_macip_lens |= ctx->mac_hdr_len <<
 926             IXGBE_ADVTXD_MACLEN_SHIFT;
 927 
 928         ctx_tbd->type_tucmd_mlhl =
 929             IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT;
 930 
 931         if (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM)
 932                 ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
 933 
 934         if (ctx->hcksum_flags & HCK_PARTIALCKSUM) {
 935                 switch (ctx->l4_proto) {
 936                 case IPPROTO_TCP:
 937                         ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP;
 938                         break;
 939                 case IPPROTO_UDP:
 940                         /*
 941                          * We don't have to explicitly set:
 942                          *      ctx_tbd->type_tucmd_mlhl |=
 943                          *          IXGBE_ADVTXD_TUCMD_L4T_UDP;
 944                          * Because IXGBE_ADVTXD_TUCMD_L4T_UDP == 0b
 945                          */
 946                         break;
 947                 default:
 948                         /* Unrecoverable error */
 949                         IXGBE_DEBUGLOG_0(NULL, "L4 type error with tx hcksum");
 950                         break;
 951                 }
 952         }
 953 
 954         ctx_tbd->seqnum_seed = 0;
 955 
 956         if (ctx->lso_flag) {
 957                 ctx_tbd->mss_l4len_idx =
 958                     (ctx->l4_hdr_len << IXGBE_ADVTXD_L4LEN_SHIFT) |
 959                     (ctx->mss << IXGBE_ADVTXD_MSS_SHIFT);
 960         } else {
 961                 ctx_tbd->mss_l4len_idx = 0;
 962         }
 963 }
 964 
 965 /*
 966  * ixgbe_tx_fill_ring
 967  *
 968  * Fill the tx descriptor ring with the data
 969  */
 970 static int
 971 ixgbe_tx_fill_ring(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list,
 972     ixgbe_tx_context_t *ctx, size_t mbsize)
 973 {
 974         struct ixgbe_hw *hw = &tx_ring->ixgbe->hw;
 975         boolean_t load_context;
 976         uint32_t index, tcb_index, desc_num;
 977         union ixgbe_adv_tx_desc *tbd, *first_tbd;
 978         tx_control_block_t *tcb, *first_tcb;
 979         uint32_t hcksum_flags;
 980         int i;
 981 
 982         ASSERT(mutex_owned(&tx_ring->tx_lock));
 983 
 984         tbd = NULL;
 985         first_tbd = NULL;
 986         first_tcb = NULL;
 987         desc_num = 0;
 988         hcksum_flags = 0;
 989         load_context = B_FALSE;
 990 
 991         /*
 992          * Get the index of the first tx descriptor that will be filled,
 993          * and the index of the first work list item that will be attached
 994          * with the first used tx control block in the pending list.
 995          * Note: the two indexes are the same.
 996          */
 997         index = tx_ring->tbd_tail;
 998         tcb_index = tx_ring->tbd_tail;
 999 
1000         if (ctx != NULL) {
1001                 hcksum_flags = ctx->hcksum_flags;
1002 
1003                 /*
1004                  * Check if a new context descriptor is needed for this packet
1005                  */
1006                 load_context = ixgbe_check_context(tx_ring, ctx);
1007 
1008                 if (load_context) {
1009                         tbd = &tx_ring->tbd_ring[index];
1010 
                        /*
                         * Fill the context descriptor with the
                         * hardware checksum offload information.
                         */
1015                         ixgbe_fill_context(
1016                             (struct ixgbe_adv_tx_context_desc *)tbd, ctx);
1017 
1018                         index = NEXT_INDEX(index, 1, tx_ring->ring_size);
1019                         desc_num++;
1020 
1021                         /*
1022                          * Store the checksum context data if
1023                          * a new context descriptor is added
1024                          */
1025                         tx_ring->tx_context = *ctx;
1026                 }
1027         }
1028 
1029         first_tbd = &tx_ring->tbd_ring[index];
1030 
1031         /*
1032          * Fill tx data descriptors with the data saved in the pending list.
1033          * The tx control blocks in the pending list are added to the work list
1034          * at the same time.
1035          *
1036          * The work list is strictly 1:1 corresponding to the descriptor ring.
1037          * One item of the work list corresponds to one tx descriptor. Because
1038          * one tx control block can span multiple tx descriptors, the tx
1039          * control block will be added to the first work list item that
1040          * corresponds to the first tx descriptor generated from that tx
1041          * control block.
1042          */
1043         tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1044         first_tcb = tcb;
1045         while (tcb != NULL) {
1046 
1047                 for (i = 0; i < tcb->desc_num; i++) {
1048                         tbd = &tx_ring->tbd_ring[index];
1049 
1050                         tbd->read.buffer_addr = tcb->desc[i].address;
1051                         tbd->read.cmd_type_len = tcb->desc[i].length;
1052 
1053                         tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_DEXT
1054                             | IXGBE_ADVTXD_DTYP_DATA;
1055 
1056                         tbd->read.olinfo_status = 0;
1057 
1058                         index = NEXT_INDEX(index, 1, tx_ring->ring_size);
1059                         desc_num++;
1060                 }
1061 
1062                 /*
1063                  * Add the tx control block to the work list
1064                  */
1065                 ASSERT(tx_ring->work_list[tcb_index] == NULL);
1066                 tx_ring->work_list[tcb_index] = tcb;
1067 
1068                 tcb_index = index;
1069                 tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1070         }
1071 
1072         if (load_context) {
1073                 /*
1074                  * Count the context descriptor for
1075                  * the first tx control block.
1076                  */
1077                 first_tcb->desc_num++;
1078         }
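        /*
         * Record the ring index of the last descriptor used by this
         * packet; the recycle routine uses it to check the Descriptor
         * Done bit at packet granularity.
         */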
1079         first_tcb->last_index = PREV_INDEX(index, 1, tx_ring->ring_size);
1080 
        /*
         * The Insert Ethernet CRC (IFCS) bit and the checksum fields
         * are only valid in the first descriptor of the packet.
         * PAYLEN is set in the first descriptor in all cases: 82599,
         * X540 and X550 require the packet length in the PAYLEN field
         * with or without LSO, and 82598 ignores it in non-LSO mode.
         */
1088         ASSERT(first_tbd != NULL);
1089         first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_IFCS;
1090 
1091         switch (hw->mac.type) {
1092         case ixgbe_mac_82598EB:
1093                 if (ctx != NULL && ctx->lso_flag) {
1094                         first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
1095                         first_tbd->read.olinfo_status |=
1096                             (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
1097                             - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
1098                 }
1099                 break;
1100 
1101         case ixgbe_mac_82599EB:
1102         case ixgbe_mac_X540:
1103         case ixgbe_mac_X550:
1104         case ixgbe_mac_X550EM_x:
1105                 if (ctx != NULL && ctx->lso_flag) {
1106                         first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
1107                         first_tbd->read.olinfo_status |=
1108                             (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
1109                             - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
1110                 } else {
1111                         first_tbd->read.olinfo_status |=
1112                             (mbsize << IXGBE_ADVTXD_PAYLEN_SHIFT);
1113                 }
1114                 break;
1115 
1116         default:
1117                 break;
1118         }
1119 
1120         /* Set hardware checksum bits */
1121         if (hcksum_flags != 0) {
1122                 if (hcksum_flags & HCK_IPV4_HDRCKSUM)
1123                         first_tbd->read.olinfo_status |=
1124                             IXGBE_ADVTXD_POPTS_IXSM;
1125                 if (hcksum_flags & HCK_PARTIALCKSUM)
1126                         first_tbd->read.olinfo_status |=
1127                             IXGBE_ADVTXD_POPTS_TXSM;
1128         }
1129 
        /*
         * The last descriptor of the packet needs the End Of Packet
         * (EOP) and Report Status (RS) bits set.
         */
1134         ASSERT(tbd != NULL);
1135         tbd->read.cmd_type_len |=
1136             IXGBE_ADVTXD_DCMD_EOP | IXGBE_ADVTXD_DCMD_RS;
1137 
1138         /*
1139          * Sync the DMA buffer of the tx descriptor ring
1140          */
1141         DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORDEV);
1142 
        /*
         * Update the number of free tx descriptors.
         * The mutual exclusion between the transmission and the recycling
         * (for the tx descriptor ring and the work list) is implemented
         * with the atomic operation on the number of free tx descriptors.
         *
         * Note: tbd_free must always be decremented before the hardware
         * TDT pointer is advanced, to avoid the race where the hardware
         * finishes transmitting the descriptors and tx recycling
         * increments tbd_free before we have decremented it.
         */
1155         i = ixgbe_atomic_reserve(&tx_ring->tbd_free, desc_num);
1156         ASSERT(i >= 0);
1157 
1158         tx_ring->tbd_tail = index;
1159 
1160         /*
1161          * Advance the hardware TDT pointer of the tx descriptor ring
1162          */
1163         IXGBE_WRITE_REG(hw, IXGBE_TDT(tx_ring->index), index);
1164 
1165         if (ixgbe_check_acc_handle(tx_ring->ixgbe->osdep.reg_handle) !=
1166             DDI_FM_OK) {
1167                 ddi_fm_service_impact(tx_ring->ixgbe->dip,
1168                     DDI_SERVICE_DEGRADED);
1169                 atomic_or_32(&tx_ring->ixgbe->ixgbe_state, IXGBE_ERROR);
1170         }
1171 
1172         return (desc_num);
1173 }
1174 
1175 /*
1176  * ixgbe_save_desc
1177  *
1178  * Save the address/length pair to the private array
1179  * of the tx control block. The address/length pairs
1180  * will be filled into the tx descriptor ring later.
1181  */
1182 static void
1183 ixgbe_save_desc(tx_control_block_t *tcb, uint64_t address, size_t length)
1184 {
1185         sw_desc_t *desc;
1186 
1187         desc = &tcb->desc[tcb->desc_num];
1188         desc->address = address;
1189         desc->length = length;
1190 
1191         tcb->desc_num++;
1192 }
1193 
1194 /*
1195  * ixgbe_tx_recycle_legacy
1196  *
1197  * Recycle the tx descriptors and tx control blocks.
1198  *
1199  * The work list is traversed to check if the corresponding
1200  * tx descriptors have been transmitted. If so, the resources
1201  * bound to the tx control blocks will be freed, and those
1202  * tx control blocks will be returned to the free list.
1203  */
1204 uint32_t
1205 ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring)
1206 {
1207         uint32_t index, last_index, prev_index;
1208         int desc_num;
1209         boolean_t desc_done;
1210         tx_control_block_t *tcb;
1211         link_list_t pending_list;
1212         ixgbe_t *ixgbe = tx_ring->ixgbe;
1213 
1214         mutex_enter(&tx_ring->recycle_lock);
1215 
1216         ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1217 
1218         if (tx_ring->tbd_free == tx_ring->ring_size) {
1219                 tx_ring->recycle_fail = 0;
1220                 tx_ring->stall_watchdog = 0;
1221                 if (tx_ring->reschedule) {
1222                         tx_ring->reschedule = B_FALSE;
1223                         mac_tx_ring_update(ixgbe->mac_hdl,
1224                             tx_ring->ring_handle);
1225                 }
1226                 mutex_exit(&tx_ring->recycle_lock);
1227                 return (0);
1228         }
1229 
1230         /*
1231          * Sync the DMA buffer of the tx descriptor ring
1232          */
1233         DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1234 
1235         if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1236                 mutex_exit(&tx_ring->recycle_lock);
1237                 ddi_fm_service_impact(ixgbe->dip, DDI_SERVICE_DEGRADED);
1238                 atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
1239                 return (0);
1240         }
1241 
1242         LINK_LIST_INIT(&pending_list);
1243         desc_num = 0;
1244         index = tx_ring->tbd_head;   /* Index of next tbd/tcb to recycle */
1245 
1246         tcb = tx_ring->work_list[index];
1247         ASSERT(tcb != NULL);
1248 
1249         while (tcb != NULL) {
                /*
                 * Get the last tx descriptor of this packet.
                 * If that descriptor is done, we can recycle all
                 * descriptors of the packet, which usually includes
                 * several tx control blocks.
                 * For 82599, LSO descriptors cannot be recycled until
                 * the whole packet's transmission is done; that is why
                 * packet-level recycling is used here.
                 * For 82598, there is no such limit.
                 */
1260                 last_index = tcb->last_index;
                /*
                 * MAX_TX_RING_SIZE is used as a sentinel to indicate
                 * that the index is not valid.
                 */
1265                 if (last_index == MAX_TX_RING_SIZE)
1266                         break;
1267 
1268                 /*
1269                  * Check if the Descriptor Done bit is set
1270                  */
1271                 desc_done = tx_ring->tbd_ring[last_index].wb.status &
1272                     IXGBE_TXD_STAT_DD;
1273                 if (desc_done) {
1274                         /*
1275                          * recycle all descriptors of the packet
1276                          */
1277                         while (tcb != NULL) {
1278                                 /*
1279                                  * Strip off the tx control block from
1280                                  * the work list, and add it to the
1281                                  * pending list.
1282                                  */
1283                                 tx_ring->work_list[index] = NULL;
1284                                 LIST_PUSH_TAIL(&pending_list, &tcb->link);
1285 
1286                                 /*
1287                                  * Count the total number of the tx
1288                                  * descriptors recycled
1289                                  */
1290                                 desc_num += tcb->desc_num;
1291 
1292                                 index = NEXT_INDEX(index, tcb->desc_num,
1293                                     tx_ring->ring_size);
1294 
1295                                 tcb = tx_ring->work_list[index];
1296 
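                                /*
                                 * Stop once the last descriptor of
                                 * this packet has been passed; index
                                 * now points at the first descriptor
                                 * of the next packet, if any.
                                 */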
1297                                 prev_index = PREV_INDEX(index, 1,
1298                                     tx_ring->ring_size);
1299                                 if (prev_index == last_index)
1300                                         break;
1301                         }
1302                 } else {
1303                         break;
1304                 }
1305         }
1306 
1307         /*
1308          * If no tx descriptors are recycled, no need to do more processing
1309          */
1310         if (desc_num == 0) {
1311                 tx_ring->recycle_fail++;
1312                 mutex_exit(&tx_ring->recycle_lock);
1313                 return (0);
1314         }
1315 
1316         tx_ring->recycle_fail = 0;
1317         tx_ring->stall_watchdog = 0;
1318 
1319         /*
1320          * Update the head index of the tx descriptor ring
1321          */
1322         tx_ring->tbd_head = index;
1323 
1324         /*
1325          * Update the number of the free tx descriptors with atomic operations
1326          */
1327         atomic_add_32(&tx_ring->tbd_free, desc_num);
1328 
1329         if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
1330             (tx_ring->reschedule)) {
1331                 tx_ring->reschedule = B_FALSE;
1332                 mac_tx_ring_update(ixgbe->mac_hdl,
1333                     tx_ring->ring_handle);
1334         }
1335         mutex_exit(&tx_ring->recycle_lock);
1336 
1337         /*
1338          * Free the resources used by the tx control blocks
1339          * in the pending list
1340          */
1341         tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
1342         while (tcb != NULL) {
1343                 /*
1344                  * Release the resources occupied by the tx control block
1345                  */
1346                 ixgbe_free_tcb(tcb);
1347 
1348                 tcb = (tx_control_block_t *)
1349                     LIST_GET_NEXT(&pending_list, &tcb->link);
1350         }
1351 
1352         /*
1353          * Add the tx control blocks in the pending list to the free list.
1354          */
1355         ixgbe_put_free_list(tx_ring, &pending_list);
1356 
1357         return (desc_num);
1358 }
1359 
1360 /*
1361  * ixgbe_tx_recycle_head_wb
1362  *
1363  * Check the head write-back, and recycle all the transmitted
1364  * tx descriptors and tx control blocks.
1365  */
1366 uint32_t
1367 ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring)
1368 {
1369         uint32_t index;
1370         uint32_t head_wb;
1371         int desc_num;
1372         tx_control_block_t *tcb;
1373         link_list_t pending_list;
1374         ixgbe_t *ixgbe = tx_ring->ixgbe;
1375 
1376         mutex_enter(&tx_ring->recycle_lock);
1377 
1378         ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1379 
1380         if (tx_ring->tbd_free == tx_ring->ring_size) {
1381                 tx_ring->recycle_fail = 0;
1382                 tx_ring->stall_watchdog = 0;
1383                 if (tx_ring->reschedule) {
1384                         tx_ring->reschedule = B_FALSE;
1385                         mac_tx_ring_update(ixgbe->mac_hdl,
1386                             tx_ring->ring_handle);
1387                 }
1388                 mutex_exit(&tx_ring->recycle_lock);
1389                 return (0);
1390         }
1391 
1392         /*
1393          * Sync the DMA buffer of the tx descriptor ring
1394          *
1395          * Note: In head write-back mode the tx descriptors themselves
1396          * are not written back; instead, the hardware stores the head
1397          * value in the extra tbd appended to the end of the DMA area.
1398          * Only that head write-back word needs to be synced for the
1399          * kernel, so the partial sync below is used instead of a full
1400          * DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1401          */
1402         (void) ddi_dma_sync(tx_ring->tbd_area.dma_handle,
1403             sizeof (union ixgbe_adv_tx_desc) * tx_ring->ring_size,
1404             sizeof (uint32_t),
1405             DDI_DMA_SYNC_FORKERNEL);
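             /*
              * A minimal sketch of the layout the partial sync above assumes
              * (the real setup lives in the descriptor ring allocation code;
              * the initialization shown here is illustrative):
              *
              *     size_t offset = sizeof (union ixgbe_adv_tx_desc) *
              *         tx_ring->ring_size;
              *     tx_ring->tbd_head_wb = (uint32_t *)
              *         ((uintptr_t)tx_ring->tbd_area.address + offset);
              *
              * i.e. the head write-back word is the uint32_t that immediately
              * follows the last real descriptor, which is exactly the
              * (offset, length) window passed to ddi_dma_sync() above.
              */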
1406 
1407         if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1408                 mutex_exit(&tx_ring->recycle_lock);
1409                 ddi_fm_service_impact(ixgbe->dip,
1410                     DDI_SERVICE_DEGRADED);
1411                 atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
1412                 return (0);
1413         }
1414 
1415         LINK_LIST_INIT(&pending_list);
1416         desc_num = 0;
1417         index = tx_ring->tbd_head;   /* Next index to clean */
1418 
1419         /*
1420          * Get the value of head write-back
1421          */
1422         head_wb = *tx_ring->tbd_head_wb;
1423         while (index != head_wb) {
1424                 tcb = tx_ring->work_list[index];
1425                 ASSERT(tcb != NULL);
1426 
1427                 if (OFFSET(index, head_wb, tx_ring->ring_size) <
1428                     tcb->desc_num) {
1429                         /*
1430                          * The current tx control block is not
1431                          * completely transmitted, stop recycling
1432                          */
1433                         break;
1434                 }
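                     /*
                      * OFFSET() is assumed to yield the forward distance from
                      * index to head_wb around the ring.  For example, with a
                      * ring_size of 1024, index 1020 and head_wb 2, the
                      * distance is (2 + 1024 - 1020) = 6; a tcb that used
                      * desc_num = 8 is then only partially written back, which
                      * is why recycling stops at the check above in that case.
                      */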
1435 
1436                 /*
1437                  * Strip off the tx control block from the work list,
1438                  * and add it to the pending list.
1439                  */
1440                 tx_ring->work_list[index] = NULL;
1441                 LIST_PUSH_TAIL(&pending_list, &tcb->link);
1442 
1443                 /*
1444                  * Advance the index of the tx descriptor ring
1445                  */
1446                 index = NEXT_INDEX(index, tcb->desc_num, tx_ring->ring_size);
1447 
1448                 /*
1449                  * Count the total number of the tx descriptors recycled
1450                  */
1451                 desc_num += tcb->desc_num;
1452         }
1453 
1454         /*
1455          * If no tx descriptors are recycled, no need to do more processing
1456          */
1457         if (desc_num == 0) {
1458                 tx_ring->recycle_fail++;
1459                 mutex_exit(&tx_ring->recycle_lock);
1460                 return (0);
1461         }
1462 
1463         tx_ring->recycle_fail = 0;
1464         tx_ring->stall_watchdog = 0;
1465 
1466         /*
1467          * Update the head index of the tx descriptor ring
1468          */
1469         tx_ring->tbd_head = index;
1470 
1471         /*
1472          * Update the number of the free tx descriptors with atomic operations
1473          */
1474         atomic_add_32(&tx_ring->tbd_free, desc_num);
1475 
1476         if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
1477             (tx_ring->reschedule)) {
1478                 tx_ring->reschedule = B_FALSE;
1479                 mac_tx_ring_update(ixgbe->mac_hdl,
1480                     tx_ring->ring_handle);
1481         }
1482         mutex_exit(&tx_ring->recycle_lock);
1483 
1484         /*
1485          * Free the resources used by the tx control blocks
1486          * in the pending list
1487          */
1488         tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
1489         while (tcb != NULL) {
1490                 /*
1491                  * Release the resources occupied by the tx control block
1492                  */
1493                 ixgbe_free_tcb(tcb);
1494 
1495                 tcb = (tx_control_block_t *)
1496                     LIST_GET_NEXT(&pending_list, &tcb->link);
1497         }
1498 
1499         /*
1500          * Add the tx control blocks in the pending list to the free list.
1501          */
1502         ixgbe_put_free_list(tx_ring, &pending_list);
1503 
1504         return (desc_num);
1505 }
1506 
1507 /*
1508  * ixgbe_free_tcb - free up the tx control block
1509  *
1510  * Free the resources held by the tx control block, including
1511  * unbinding the previously bound DMA handle, freeing the attached
1512  * mblk, and resetting the other control fields.
1513  */
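     /*
      * The two cleanup cases differ because of how the buffers are set up:
      * the copy buffer is presumably pre-allocated and kept DMA-bound for
      * reuse, so only its fill length needs to be reset, while the bind
      * path maps the caller's mblk memory and therefore must be unbound
      * before the tcb can be reused.
      */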
1514 void
1515 ixgbe_free_tcb(tx_control_block_t *tcb)
1516 {
1517         switch (tcb->tx_type) {
1518         case USE_COPY:
1519                 /*
1520                  * Reset the buffer length that is used for copy
1521                  */
1522                 tcb->tx_buf.len = 0;
1523                 break;
1524         case USE_DMA:
1525                 /*
1526                  * Release the DMA resource that is used for
1527                  * DMA binding.
1528                  */
1529                 (void) ddi_dma_unbind_handle(tcb->tx_dma_handle);
1530                 break;
1531         default:
1532                 break;
1533         }
1534 
1535         /*
1536          * Free the mblk
1537          */
1538         if (tcb->mp != NULL) {
1539                 freemsg(tcb->mp);
1540                 tcb->mp = NULL;
1541         }
1542 
1543         tcb->tx_type = USE_NONE;
1544         tcb->last_index = MAX_TX_RING_SIZE;
1545         tcb->frag_num = 0;
1546         tcb->desc_num = 0;
1547 }
1548 
1549 /*
1550  * ixgbe_get_free_list - Get a free tx control block from the free list
1551  *
1552  * The atomic operation on the count of available tx control blocks
1553  * in the free list is used to keep this routine mutually exclusive
1554  * with the routine ixgbe_put_free_list.
1555  */
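     /*
      * A minimal sketch of the reservation this relies on, assuming
      * ixgbe_atomic_reserve() is a compare-and-swap loop that only
      * decrements the counter while enough is left (the helper name and
      * exact signature here are illustrative, not the driver's own):
      *
      *     int32_t
      *     atomic_reserve_sketch(uint32_t *count, uint32_t n)
      *     {
      *             uint32_t oldval, newval;
      *
      *             do {
      *                     oldval = *count;
      *                     if (oldval < n)
      *                             return (-1);
      *                     newval = oldval - n;
      *             } while (atomic_cas_32(count, oldval, newval) != oldval);
      *
      *             return (newval);
      *     }
      *
      * A negative return means no tcb could be reserved, so this routine
      * simply returns NULL to its caller.
      */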
1556 static tx_control_block_t *
1557 ixgbe_get_free_list(ixgbe_tx_ring_t *tx_ring)
1558 {
1559         tx_control_block_t *tcb;
1560 
1561         /*
1562          * Check and update the number of free tx control blocks
1563          * in the free list.
1564          */
1565         if (ixgbe_atomic_reserve(&tx_ring->tcb_free, 1) < 0)
1566                 return (NULL);
1567 
1568         mutex_enter(&tx_ring->tcb_head_lock);
1569 
1570         tcb = tx_ring->free_list[tx_ring->tcb_head];
1571         ASSERT(tcb != NULL);
1572         tx_ring->free_list[tx_ring->tcb_head] = NULL;
1573         tx_ring->tcb_head = NEXT_INDEX(tx_ring->tcb_head, 1,
1574             tx_ring->free_list_size);
1575 
1576         mutex_exit(&tx_ring->tcb_head_lock);
1577 
1578         return (tcb);
1579 }
1580 
1581 /*
1582  * ixgbe_put_free_list
1583  *
1584  * Put a list of used tx control blocks back to the free list
1585  *
1586  * A mutex is used here to serialize concurrent callers, while the mutual
1587  * exclusion between ixgbe_get_free_list and ixgbe_put_free_list is
1588  * implemented with the atomic operation on the counter tcb_free.
1589  */
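     /*
      * Put differently, getters touch only tcb_head (under tcb_head_lock)
      * and putters touch only tcb_tail (under tcb_tail_lock); tcb_free is
      * what keeps the two ends apart.  It is only incremented after the
      * returned tcbs have been stored back in the free list, so a
      * concurrent getter can never reserve a slot whose tcb pointer has
      * not been written yet.
      */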
1590 void
1591 ixgbe_put_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list)
1592 {
1593         uint32_t index;
1594         int tcb_num;
1595         tx_control_block_t *tcb;
1596 
1597         mutex_enter(&tx_ring->tcb_tail_lock);
1598 
1599         index = tx_ring->tcb_tail;
1600 
1601         tcb_num = 0;
1602         tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1603         while (tcb != NULL) {
1604                 ASSERT(tx_ring->free_list[index] == NULL);
1605                 tx_ring->free_list[index] = tcb;
1606 
1607                 tcb_num++;
1608 
1609                 index = NEXT_INDEX(index, 1, tx_ring->free_list_size);
1610 
1611                 tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1612         }
1613 
1614         tx_ring->tcb_tail = index;
1615 
1616         /*
1617          * Update the number of free tx control blocks
1618          * in the free list. This update must be done while
1619          * the lock is still held.
1620          */
1621         atomic_add_32(&tx_ring->tcb_free, tcb_num);
1622 
1623         mutex_exit(&tx_ring->tcb_tail_lock);
1624 }