1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright(c) 2007-2010 Intel Corporation. All rights reserved.
  24  */
  25 
  26 /*
  27  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  28  * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
  29  */
  30 
  31 #include "ixgbe_sw.h"
  32 
  33 static int ixgbe_tx_copy(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
  34     uint32_t, boolean_t);
  35 static int ixgbe_tx_bind(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
  36     uint32_t);
  37 static int ixgbe_tx_fill_ring(ixgbe_tx_ring_t *, link_list_t *,
  38     ixgbe_tx_context_t *, size_t);
  39 static void ixgbe_save_desc(tx_control_block_t *, uint64_t, size_t);
  40 static tx_control_block_t *ixgbe_get_free_list(ixgbe_tx_ring_t *);
  41 
  42 static int ixgbe_get_context(mblk_t *, ixgbe_tx_context_t *);
  43 static boolean_t ixgbe_check_context(ixgbe_tx_ring_t *,
  44     ixgbe_tx_context_t *);
  45 static void ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *,
  46     ixgbe_tx_context_t *);
  47 
  48 #ifndef IXGBE_DEBUG
  49 #pragma inline(ixgbe_save_desc)
  50 #pragma inline(ixgbe_get_context)
  51 #pragma inline(ixgbe_check_context)
  52 #pragma inline(ixgbe_fill_context)
  53 #endif
  54 
  55 /*
  56  * ixgbe_ring_tx
  57  *
  58  * Transmit one mblk through the specified tx ring.
  59  *
  60  * One mblk can consist of several fragments; each fragment is
  61  * processed differently based on its size. Fragments smaller
  62  * than the bcopy threshold are copied into a pre-allocated tx
  63  * buffer with bcopy; larger fragments are processed with DMA
  64  * binding.
  65  *
  66  * To process the mblk, a tx control block is taken from the
  67  * free list. Each tx control block contains one tx buffer, which
  68  * is used to copy mblk fragments' data, and one tx DMA handle,
  69  * which is used to bind an mblk fragment to DMA resources.
  70  *
  71  * Several small mblk fragments can be copied into one tx control
  72  * block's buffer, and the buffer is then transmitted with a
  73  * single tx descriptor.
  74  *
  75  * A large fragment binds to only one tx control block's DMA
  76  * handle, but it can span several tx descriptors when transmitted.
  77  *
  78  * Transmitting one packet (mblk) may therefore use several tx
  79  * control blocks. After processing, those tx control blocks are
  80  * put on the work list.
  81  */
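     /*
      * Illustrative example (assumed values, not driver logic): suppose a
      * packet arrives as three fragments of 64, 32 and 1400 bytes and the
      * bcopy threshold is 512.  The 64- and 32-byte fragments are copied
      * into the buffer of a single tx control block and together consume
      * one tx descriptor; the 1400-byte fragment is handled by a second
      * tx control block, DMA-bound, and consumes one descriptor per DMA
      * cookie.  Both control blocks end up on the work list once the
      * descriptors have been filled.
      */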
  82 mblk_t *
  83 ixgbe_ring_tx(void *arg, mblk_t *mp)
  84 {
  85         ixgbe_tx_ring_t *tx_ring = (ixgbe_tx_ring_t *)arg;
  86         ixgbe_t *ixgbe = tx_ring->ixgbe;
  87         tx_type_t current_flag, next_flag;
  88         uint32_t current_len, next_len;
  89         uint32_t desc_total;
  90         size_t mbsize;
  91         int desc_num;
  92         boolean_t copy_done, eop;
  93         mblk_t *current_mp, *next_mp, *nmp, *pull_mp = NULL;
  94         tx_control_block_t *tcb;
  95         ixgbe_tx_context_t tx_context, *ctx;
  96         link_list_t pending_list;
  97         uint32_t len, hdr_frag_len, hdr_len;
  98         uint32_t copy_thresh;
  99         mblk_t *hdr_new_mp = NULL;
 100         mblk_t *hdr_pre_mp = NULL;
 101         mblk_t *hdr_nmp = NULL;
 102 
 103         ASSERT(mp->b_next == NULL);
 104 
 105         if ((ixgbe->ixgbe_state & IXGBE_SUSPENDED) ||
 106             (ixgbe->ixgbe_state & IXGBE_ERROR) ||
 107             (ixgbe->ixgbe_state & IXGBE_OVERTEMP) ||
 108             !(ixgbe->ixgbe_state & IXGBE_STARTED) ||
 109             ixgbe->link_state != LINK_STATE_UP) {
 110                 freemsg(mp);
 111                 return (NULL);
 112         }
 113 
 114         copy_thresh = ixgbe->tx_copy_thresh;
 115 
 116         /* Get the mblk size */
 117         mbsize = 0;
 118         for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
 119                 mbsize += MBLKL(nmp);
 120         }
 121 
 122         if (ixgbe->tx_hcksum_enable) {
 123                 /*
 124                  * Retrieve checksum context information from the mblk
 125                  * that will be used to decide whether/how to fill the
 126                  * context descriptor.
 127                  */
 128                 ctx = &tx_context;
 129                 if (ixgbe_get_context(mp, ctx) < 0) {
 130                         freemsg(mp);
 131                         return (NULL);
 132                 }
 133 
 134                 /*
 135                  * If the mblk size exceeds the maximum size ixgbe can
 136                  * process, discard this mblk and return NULL.
 137                  */
 138                 if ((ctx->lso_flag &&
 139                     ((mbsize - ctx->mac_hdr_len) > IXGBE_LSO_MAXLEN)) ||
 140                     (!ctx->lso_flag &&
 141                     (mbsize > (ixgbe->max_frame_size - ETHERFCSL)))) {
 142                         freemsg(mp);
 143                         IXGBE_DEBUGLOG_0(ixgbe, "ixgbe_tx: packet oversize");
 144                         return (NULL);
 145                 }
 146         } else {
 147                 ctx = NULL;
 148         }
 149 
 150         /*
 151          * Check and recycle tx descriptors.
 152          * The recycle threshold here should be selected carefully
 153          */
 154         if (tx_ring->tbd_free < ixgbe->tx_recycle_thresh) {
 155                 tx_ring->tx_recycle(tx_ring);
 156         }
 157 
 158         /*
 159          * After the recycling, if tbd_free is still less than the
 160          * overload threshold, assert overload and return mp; the
 161          * transmit will then need to be rescheduled.
 162          */
 163         if (tx_ring->tbd_free < ixgbe->tx_overload_thresh) {
 164                 tx_ring->reschedule = B_TRUE;
 165                 IXGBE_DEBUG_STAT(tx_ring->stat_overload);
 166                 return (mp);
 167         }
 168 
 169         /*
 170          * The pending_list is a linked list used to hold the tx
 171          * control blocks whose packet data has been processed but
 172          * not yet placed on the tx descriptor ring. It is used to
 173          * reduce contention on the tx_lock.
 174          */
 175         LINK_LIST_INIT(&pending_list);
 176         desc_num = 0;
 177         desc_total = 0;
 178 
 179         /*
 180          * The software should guarantee that the LSO packet header
 181          * (MAC+IP+TCP) fits within one descriptor. Here we reallocate
 182          * and refill the header if it is not physically contiguous.
 183          */
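             /*
              * Worked example (illustrative values): for an untagged TCP/IPv4
              * LSO packet, hdr_len = 14 (MAC) + 20 (IP) + 20 (TCP) = 54.  If
              * the first mblk holds 40 bytes and the second holds the rest of
              * the header plus payload, the loop below stops at the second
              * mblk and hdr_frag_len = 54 - 40 = 14; those 14 bytes are then
              * moved into their own mblk so that the whole header can be
              * bcopied into one physically contiguous buffer.
              */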
 184         if ((ctx != NULL) && ctx->lso_flag) {
 185                 /* find the last fragment of the header */
 186                 len = MBLKL(mp);
 187                 ASSERT(len > 0);
 188                 hdr_nmp = mp;
 189                 hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len + ctx->l4_hdr_len;
 190                 while (len < hdr_len) {
 191                         hdr_pre_mp = hdr_nmp;
 192                         hdr_nmp = hdr_nmp->b_cont;
 193                         len += MBLKL(hdr_nmp);
 194                 }
 195                 /*
 196                  * If the header and the payload are in different mblks,
 197                  * we simply force the header to be copied into the
 198                  * pre-allocated, page-aligned buffer.
 199                  */
 200                 if (len == hdr_len)
 201                         goto adjust_threshold;
 202 
 203                 hdr_frag_len = hdr_len - (len - MBLKL(hdr_nmp));
 204                 /*
 205                  * There are two cases where we need to reallocate an mblk
 206                  * for the last header fragment:
 207                  * 1. the header is in multiple mblks and the last fragment
 208                  *    shares its mblk with the payload
 209                  * 2. the header is in a single mblk shared with the payload
 210                  *    and the header is not physically contiguous
 211                  */
 212                 if ((hdr_nmp != mp) ||
 213                     (P2NPHASE((uintptr_t)hdr_nmp->b_rptr, ixgbe->sys_page_size)
 214                     < hdr_len)) {
 215                         IXGBE_DEBUG_STAT(tx_ring->stat_lso_header_fail);
 216                         /*
 217                          * reallocate the mblk for the last header fragment,
 218                          * expecting it to be bcopied into the pre-allocated
 219                          * page-aligned buffer
 220                          */
 221                         hdr_new_mp = allocb(hdr_frag_len, NULL);
 222                         if (!hdr_new_mp)
 223                                 return (mp);
 224                         bcopy(hdr_nmp->b_rptr, hdr_new_mp->b_rptr,
 225                             hdr_frag_len);
 226                         /* link the new header fragment with the other parts */
 227                         hdr_new_mp->b_wptr = hdr_new_mp->b_rptr + hdr_frag_len;
 228                         hdr_new_mp->b_cont = hdr_nmp;
 229                         if (hdr_pre_mp)
 230                                 hdr_pre_mp->b_cont = hdr_new_mp;
 231                         else
 232                                 mp = hdr_new_mp;
 233                         hdr_nmp->b_rptr += hdr_frag_len;
 234                 }
 235 adjust_threshold:
 236                 /*
 237                  * adjust the bcopy threshold to guarantee that
 238                  * the header is processed with bcopy
 239                  */
 240                 if (copy_thresh < hdr_len)
 241                         copy_thresh = hdr_len;
 242         }
 243 
 244         current_mp = mp;
 245         current_len = MBLKL(current_mp);
 246         /*
 247          * Decide which method to use for the first fragment
 248          */
 249         current_flag = (current_len <= copy_thresh) ?
 250             USE_COPY : USE_DMA;
 251         /*
 252          * If the mblk includes several contiguous small fragments,
 253          * they may be copied into one buffer. This flag is used to
 254          * indicate whether there are pending fragments that need to
 255          * be copied to the current tx buffer.
 256          *
 257          * If this flag is B_TRUE, it indicates that a new tx control
 258          * block is needed to process the next fragment using either
 259          * copy or DMA binding.
 260          *
 261          * Otherwise, it indicates that the next fragment will be
 262          * copied to the current tx buffer that is maintained by the
 263          * current tx control block. No new tx control block is needed.
 264          */
 265         copy_done = B_TRUE;
 266         while (current_mp) {
 267                 next_mp = current_mp->b_cont;
 268                 eop = (next_mp == NULL); /* Last fragment of the packet? */
 269                 next_len = eop ? 0: MBLKL(next_mp);
 270 
 271                 /*
 272                  * When the current fragment is empty but the next fragment
 273                  * will still be copied into the current tx buffer, we cannot
 274                  * skip this fragment here, because the copy processing for
 275                  * the current tx buffer is still pending completion; the
 276                  * empty fragment has to be passed through the tx_copy routine.
 277                  *
 278                  * If the copy processing has completed, or a DMA binding
 279                  * has just completed, the empty fragment can simply be
 280                  * skipped.
 281                  */
 282                 if ((current_len == 0) && (copy_done)) {
 283                         current_mp = next_mp;
 284                         current_len = next_len;
 285                         current_flag = (current_len <= copy_thresh) ?
 286                             USE_COPY : USE_DMA;
 287                         continue;
 288                 }
 289 
 290                 if (copy_done) {
 291                         /*
 292                          * Get a new tx control block from the free list
 293                          */
 294                         tcb = ixgbe_get_free_list(tx_ring);
 295 
 296                         if (tcb == NULL) {
 297                                 IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
 298                                 goto tx_failure;
 299                         }
 300 
 301                         /*
 302                          * Push the tx control block to the pending list
 303                          * to avoid taking the lock too early
 304                          */
 305                         LIST_PUSH_TAIL(&pending_list, &tcb->link);
 306                 }
 307 
 308                 if (current_flag == USE_COPY) {
 309                         /*
 310                          * Check whether to use bcopy or DMA binding to process
 311                          * the next fragment, and if using bcopy, whether we
 312                          * need to continue copying the next fragment into the
 313                          * current tx buffer.
 314                          */
 315                         ASSERT((tcb->tx_buf.len + current_len) <=
 316                             tcb->tx_buf.size);
 317 
 318                         if (eop) {
 319                                 /*
 320                                  * This is the last fragment of the packet, so
 321                                  * the copy processing will be completed with
 322                                  * this fragment.
 323                                  */
 324                                 next_flag = USE_NONE;
 325                                 copy_done = B_TRUE;
 326                         } else if ((tcb->tx_buf.len + current_len + next_len) >
 327                             tcb->tx_buf.size) {
 328                                 /*
 329                                  * If the next fragment is too large to be
 330                                  * copied to the current tx buffer, we need
 331                                  * to complete the current copy processing.
 332                                  */
 333                                 next_flag = (next_len > copy_thresh) ?
 334                                     USE_DMA: USE_COPY;
 335                                 copy_done = B_TRUE;
 336                         } else if (next_len > copy_thresh) {
 337                                 /*
 338                                  * The next fragment needs to be processed with
 339                                  * DMA binding. So the copy processing will be
 340                                  * completed with the current fragment.
 341                                  */
 342                                 next_flag = USE_DMA;
 343                                 copy_done = B_TRUE;
 344                         } else {
 345                                 /*
 346                                  * Continue to copy the next fragment to the
 347                                  * current tx buffer.
 348                                  */
 349                                 next_flag = USE_COPY;
 350                                 copy_done = B_FALSE;
 351                         }
 352 
 353                         desc_num = ixgbe_tx_copy(tx_ring, tcb, current_mp,
 354                             current_len, copy_done);
 355                 } else {
 356                         /*
 357                          * Check whether to use bcopy or DMA binding to process
 358                          * the next fragment.
 359                          */
 360                         next_flag = (next_len > copy_thresh) ?
 361                             USE_DMA: USE_COPY;
 362                         ASSERT(copy_done == B_TRUE);
 363 
 364                         desc_num = ixgbe_tx_bind(tx_ring, tcb, current_mp,
 365                             current_len);
 366                 }
 367 
 368                 if (desc_num > 0)
 369                         desc_total += desc_num;
 370                 else if (desc_num < 0)
 371                         goto tx_failure;
 372 
 373                 current_mp = next_mp;
 374                 current_len = next_len;
 375                 current_flag = next_flag;
 376         }
 377 
 378         /*
 379          * Attach the mblk to the last tx control block
 380          */
 381         ASSERT(tcb);
 382         ASSERT(tcb->mp == NULL);
 383         tcb->mp = mp;
 384 
 385         /*
 386          * The 82598/82599 chipsets have a limitation that no more than
 387          * 32 tx descriptors can be transmitted at one time.
 388          *
 389          * The workaround: pull up the mblk and then send it out with
 390          * DMA binding. By doing so, no more than MAX_COOKIE (18)
 391          * descriptors are needed.
 392          */
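             /*
              * Example (illustrative): a long fragment chain in which many
              * fragments are DMA-bound, or keep overflowing successive copy
              * buffers, can accumulate more than IXGBE_TX_DESC_LIMIT
              * descriptors.  After msgpullup() the packet is one contiguous
              * mblk, and a single DMA bind of it produces at most MAX_COOKIE
              * cookies, i.e. at most MAX_COOKIE data descriptors.
              */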
 393         if (desc_total + 1 > IXGBE_TX_DESC_LIMIT) {
 394                 IXGBE_DEBUG_STAT(tx_ring->stat_break_tbd_limit);
 395 
 396                 /*
 397                  * Free the resources used so far (the mblk is kept for retry)
 398                  */
 399                 tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
 400                 while (tcb) {
 401                         tcb->mp = NULL;
 402                         ixgbe_free_tcb(tcb);
 403                         tcb = (tx_control_block_t *)
 404                             LIST_GET_NEXT(&pending_list, &tcb->link);
 405                 }
 406 
 407                 /*
 408                  * Return the tx control blocks in the pending list to
 409                  * the free list.
 410                  */
 411                 ixgbe_put_free_list(tx_ring, &pending_list);
 412 
 413                 /*
 414                  * pull up the mblk and send it out with DMA binding
 415                  */
 416                 if ((pull_mp = msgpullup(mp, -1)) == NULL) {
 417                         tx_ring->reschedule = B_TRUE;
 418 
 419                         /*
 420                          * If a new mblk has been allocated for the last header
 421                          * fragment of an LSO packet, we should restore the
 422                          * modified mp.
 423                          */
 424                         if (hdr_new_mp) {
 425                                 hdr_new_mp->b_cont = NULL;
 426                                 freeb(hdr_new_mp);
 427                                 hdr_nmp->b_rptr -= hdr_frag_len;
 428                                 if (hdr_pre_mp)
 429                                         hdr_pre_mp->b_cont = hdr_nmp;
 430                                 else
 431                                         mp = hdr_nmp;
 432                         }
 433                         return (mp);
 434                 }
 435 
 436                 LINK_LIST_INIT(&pending_list);
 437                 desc_total = 0;
 438 
 439                 /*
 440                  * if the packet is an LSO packet, we simply transmit
 441                  * the header in one descriptor using bcopy
 442                  */
 443                 if ((ctx != NULL) && ctx->lso_flag) {
 444                         hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len +
 445                             ctx->l4_hdr_len;
 446 
 447                         tcb = ixgbe_get_free_list(tx_ring);
 448                         if (tcb == NULL) {
 449                                 IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
 450                                 goto tx_failure;
 451                         }
 452                         desc_num = ixgbe_tx_copy(tx_ring, tcb, pull_mp,
 453                             hdr_len, B_TRUE);
 454                         LIST_PUSH_TAIL(&pending_list, &tcb->link);
 455                         desc_total  += desc_num;
 456 
 457                         pull_mp->b_rptr += hdr_len;
 458                 }
 459 
 460                 tcb = ixgbe_get_free_list(tx_ring);
 461                 if (tcb == NULL) {
 462                         IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
 463                         goto tx_failure;
 464                 }
 465                 if ((ctx != NULL) && ctx->lso_flag) {
 466                         desc_num = ixgbe_tx_bind(tx_ring, tcb, pull_mp,
 467                             mbsize - hdr_len);
 468                 } else {
 469                         desc_num = ixgbe_tx_bind(tx_ring, tcb, pull_mp,
 470                             mbsize);
 471                 }
 472                 if (desc_num < 0) {
 473                         goto tx_failure;
 474                 }
 475                 LIST_PUSH_TAIL(&pending_list, &tcb->link);
 476 
 477                 desc_total += desc_num;
 478                 tcb->mp = pull_mp;
 479         }
 480 
 481         /*
 482          * Before filling the tx descriptor ring with the data, we need to
 483          * ensure there are adequate free descriptors for the transmit
 484          * (including one context descriptor).
 485          * Do not use up all the tx descriptors.
 486          * Otherwise tx recycling will fail and cause a false hang.
 487          */
 488         if (tx_ring->tbd_free <= (desc_total + 1)) {
 489                 tx_ring->tx_recycle(tx_ring);
 490         }
 491 
 492         mutex_enter(&tx_ring->tx_lock);
 493         /*
 494          * If the number of free tx descriptors is not enough for the
 495          * transmit, return mp.
 496          *
 497          * Note: we must put this check under the mutex protection to
 498          * ensure correctness when multiple threads access it in
 499          * parallel.
 500          */
 501         if (tx_ring->tbd_free <= (desc_total + 1)) {
 502                 IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tbd);
 503                 mutex_exit(&tx_ring->tx_lock);
 504                 goto tx_failure;
 505         }
 506 
 507         desc_num = ixgbe_tx_fill_ring(tx_ring, &pending_list, ctx,
 508             mbsize);
 509 
 510         ASSERT((desc_num == desc_total) || (desc_num == (desc_total + 1)));
 511 
 512         tx_ring->stat_obytes += mbsize;
 513         tx_ring->stat_opackets++;
 514 
 515         mutex_exit(&tx_ring->tx_lock);
 516 
 517         /*
 518          * Now that the transmission has succeeded, free the original
 519          * mp if the pulled-up mblk was used for the transmission.
 520          */
 521         if (pull_mp) {
 522                 freemsg(mp);
 523         }
 524 
 525         return (NULL);
 526 
 527 tx_failure:
 528         /*
 529          * If the transmission fails, free the pulled-up mblk.
 530          */
 531         if (pull_mp) {
 532                 freemsg(pull_mp);
 533         }
 534 
 535         /*
 536          * If a new mblk has been allocated for the last header
 537          * fragment of an LSO packet, we should restore the
 538          * modified mp.
 539          */
 540         if (hdr_new_mp) {
 541                 hdr_new_mp->b_cont = NULL;
 542                 freeb(hdr_new_mp);
 543                 hdr_nmp->b_rptr -= hdr_frag_len;
 544                 if (hdr_pre_mp)
 545                         hdr_pre_mp->b_cont = hdr_nmp;
 546                 else
 547                         mp = hdr_nmp;
 548         }
 549         /*
 550          * Free the resources used by the tx control blocks
 551          */
 552         tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
 553         while (tcb) {
 554                 tcb->mp = NULL;
 555 
 556                 ixgbe_free_tcb(tcb);
 557 
 558                 tcb = (tx_control_block_t *)
 559                     LIST_GET_NEXT(&pending_list, &tcb->link);
 560         }
 561 
 562         /*
 563          * Return the tx control blocks in the pending list to the free list.
 564          */
 565         ixgbe_put_free_list(tx_ring, &pending_list);
 566 
 567         /* Transmit failed; do not drop the mblk, reschedule the transmit */
 568         tx_ring->reschedule = B_TRUE;
 569 
 570         return (mp);
 571 }
 572 
 573 /*
 574  * ixgbe_tx_copy
 575  *
 576  * Copy the mblk fragment to the pre-allocated tx buffer
 577  */
 578 static int
 579 ixgbe_tx_copy(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
 580     uint32_t len, boolean_t copy_done)
 581 {
 582         dma_buffer_t *tx_buf;
 583         uint32_t desc_num;
 584         _NOTE(ARGUNUSED(tx_ring));
 585 
 586         tx_buf = &tcb->tx_buf;
 587 
 588         /*
 589          * Copy the packet data of the mblk fragment into the
 590          * pre-allocated tx buffer, which is maintained by the
 591          * tx control block.
 592          *
 593          * Several mblk fragments can be copied into one tx buffer.
 594          * The destination address of the current copied fragment in
 595          * the tx buffer is next to the end of the previous copied
 596          * fragment.
 597          */
 598         if (len > 0) {
 599                 bcopy(mp->b_rptr, tx_buf->address + tx_buf->len, len);
 600 
 601                 tx_buf->len += len;
 602                 tcb->frag_num++;
 603         }
 604 
 605         desc_num = 0;
 606 
 607         /*
 608          * If it is the last fragment copied to the current tx buffer,
 609          * in other words, if there's no remaining fragment or the remaining
 610          * fragment requires a new tx control block to process, we need to
 611          * complete the current copy processing by syncing up the current
 612          * DMA buffer and saving the descriptor data.
 613          */
 614         if (copy_done) {
 615                 /*
 616                  * Sync the DMA buffer of the packet data
 617                  */
 618                 DMA_SYNC(tx_buf, DDI_DMA_SYNC_FORDEV);
 619 
 620                 tcb->tx_type = USE_COPY;
 621 
 622                 /*
 623                  * Save the address and length to the private data structure
 624                  * of the tx control block, which will be used to fill the
 625                  * tx descriptor ring after all the fragments are processed.
 626                  */
 627                 ixgbe_save_desc(tcb, tx_buf->dma_address, tx_buf->len);
 628                 desc_num++;
 629         }
 630 
 631         return (desc_num);
 632 }
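     /*
      * Usage sketch for ixgbe_tx_copy() (illustrative): when copy_done is
      * B_FALSE the caller keeps appending fragments to the same tcb, so
      * three 100-byte fragments result in three calls, with tx_buf->len
      * growing 100 -> 200 -> 300; only the final call (copy_done == B_TRUE)
      * syncs the DMA buffer and saves a single 300-byte descriptor.
      */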
 633 
 634 /*
 635  * ixgbe_tx_bind
 636  *
 637  * Bind the mblk fragment with DMA
 638  */
 639 static int
 640 ixgbe_tx_bind(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
 641     uint32_t len)
 642 {
 643         int status, i;
 644         ddi_dma_cookie_t dma_cookie;
 645         uint_t ncookies;
 646         int desc_num;
 647 
 648         /*
 649          * Use DMA binding to process the mblk fragment
 650          */
 651         status = ddi_dma_addr_bind_handle(tcb->tx_dma_handle, NULL,
 652             (caddr_t)mp->b_rptr, len,
 653             DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
 654             0, &dma_cookie, &ncookies);
 655 
 656         if (status != DDI_DMA_MAPPED) {
 657                 IXGBE_DEBUG_STAT(tx_ring->stat_fail_dma_bind);
 658                 return (-1);
 659         }
 660 
 661         tcb->frag_num++;
 662         tcb->tx_type = USE_DMA;
 663         /*
 664          * Each fragment can span several cookies. Each cookie is
 665          * transmitted with one tx descriptor.
 666          */
 667         desc_num = 0;
 668         for (i = ncookies; i > 0; i--) {
 669                 /*
 670                  * Save the address and length to the private data structure
 671                  * of the tx control block, which will be used to fill the
 672                  * tx descriptor ring after all the fragments are processed.
 673                  */
 674                 ixgbe_save_desc(tcb,
 675                     dma_cookie.dmac_laddress,
 676                     dma_cookie.dmac_size);
 677 
 678                 desc_num++;
 679 
 680                 if (i > 1)
 681                         ddi_dma_nextcookie(tcb->tx_dma_handle, &dma_cookie);
 682         }
 683 
 684         return (desc_num);
 685 }
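     /*
      * Example (illustrative): binding a fragment that the DMA engine maps
      * as three cookies (say 4096 + 4096 + 808 bytes) saves three
      * address/length pairs via ixgbe_save_desc() and returns 3, so that
      * fragment will occupy three tx data descriptors.
      */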
 686 
 687 /*
 688  * ixgbe_get_context
 689  *
 690  * Get the context information from the mblk
 691  */
 692 static int
 693 ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
 694 {
 695         uint32_t start;
 696         uint32_t hckflags;
 697         uint32_t lsoflags;
 698         uint32_t mss;
 699         uint32_t len;
 700         uint32_t size;
 701         uint32_t offset;
 702         unsigned char *pos;
 703         ushort_t etype;
 704         uint32_t mac_hdr_len;
 705         uint32_t l4_proto;
 706         uint32_t l4_hdr_len;
 707 
 708         ASSERT(mp != NULL);
 709 
 710         mac_hcksum_get(mp, &start, NULL, NULL, NULL, &hckflags);
 711         bzero(ctx, sizeof (ixgbe_tx_context_t));
 712 
 713         if (hckflags == 0) {
 714                 return (0);
 715         }
 716 
 717         ctx->hcksum_flags = hckflags;
 718 
 719         mac_lso_get(mp, &mss, &lsoflags);
 720         ctx->mss = mss;
 721         ctx->lso_flag = (lsoflags == HW_LSO);
 722 
 723         /*
 724          * LSO relies on tx h/w checksum, so drop the packet here
 725          * if the h/w checksum flags are not declared.
 726          */
 727         if (ctx->lso_flag) {
 728                 if (!((ctx->hcksum_flags & HCK_PARTIALCKSUM) &&
 729                     (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM))) {
 730                         IXGBE_DEBUGLOG_0(NULL, "ixgbe_tx: h/w "
 731                             "checksum flags are not specified when doing LSO");
 732                         return (-1);
 733                 }
 734         }
 735 
 736         etype = 0;
 737         mac_hdr_len = 0;
 738         l4_proto = 0;
 739 
 740         /*
 741          * First get the position of the ether_type/ether_tpid.
 742          * Here we don't assume the ether (VLAN) header is fully included
 743          * in one mblk fragment, so we go through the fragments to parse
 744          * the ether type.
 745          */
 746         size = len = MBLKL(mp);
 747         offset = offsetof(struct ether_header, ether_type);
 748         while (size <= offset) {
 749                 mp = mp->b_cont;
 750                 ASSERT(mp != NULL);
 751                 len = MBLKL(mp);
 752                 size += len;
 753         }
 754         pos = mp->b_rptr + offset + len - size;
 755 
 756         etype = ntohs(*(ushort_t *)(uintptr_t)pos);
 757         if (etype == ETHERTYPE_VLAN) {
 758                 /*
 759                  * Get the position of the ether_type in VLAN header
 760                  */
 761                 offset = offsetof(struct ether_vlan_header, ether_type);
 762                 while (size <= offset) {
 763                         mp = mp->b_cont;
 764                         ASSERT(mp != NULL);
 765                         len = MBLKL(mp);
 766                         size += len;
 767                 }
 768                 pos = mp->b_rptr + offset + len - size;
 769 
 770                 etype = ntohs(*(ushort_t *)(uintptr_t)pos);
 771                 mac_hdr_len = sizeof (struct ether_vlan_header);
 772         } else {
 773                 mac_hdr_len = sizeof (struct ether_header);
 774         }
 775 
 776         /*
 777          * Here we don't assume the IP(V6) header is fully included in
 778          * one mblk fragment.
 779          */
 780         switch (etype) {
 781         case ETHERTYPE_IP:
 782                 if (ctx->lso_flag) {
 783                         offset = offsetof(ipha_t, ipha_length) + mac_hdr_len;
 784                         while (size <= offset) {
 785                                 mp = mp->b_cont;
 786                                 ASSERT(mp != NULL);
 787                                 len = MBLKL(mp);
 788                                 size += len;
 789                         }
 790                         pos = mp->b_rptr + offset + len - size;
 791                         *((uint16_t *)(uintptr_t)(pos)) = 0;
 792 
 793                         offset = offsetof(ipha_t, ipha_hdr_checksum) +
 794                             mac_hdr_len;
 795                         while (size <= offset) {
 796                                 mp = mp->b_cont;
 797                                 ASSERT(mp != NULL);
 798                                 len = MBLKL(mp);
 799                                 size += len;
 800                         }
 801                         pos = mp->b_rptr + offset + len - size;
 802                         *((uint16_t *)(uintptr_t)(pos)) = 0;
 803 
 804                         /*
 805                          * To perform ixgbe LSO, the tcp checksum field of the
 806                          * packet also needs to be filled with the pseudo-header
 807                          * checksum computed over:
 808                          * (ip_source_addr, ip_destination_addr, l4_proto)
 809                          * Currently the tcp/ip stack has already done this.
 810                          */
 811                 }
 812 
 813                 offset = offsetof(ipha_t, ipha_protocol) + mac_hdr_len;
 814                 while (size <= offset) {
 815                         mp = mp->b_cont;
 816                         ASSERT(mp != NULL);
 817                         len = MBLKL(mp);
 818                         size += len;
 819                 }
 820                 pos = mp->b_rptr + offset + len - size;
 821 
 822                 l4_proto = *(uint8_t *)pos;
 823                 break;
 824         case ETHERTYPE_IPV6:
 825                 offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len;
 826                 while (size <= offset) {
 827                         mp = mp->b_cont;
 828                         ASSERT(mp != NULL);
 829                         len = MBLKL(mp);
 830                         size += len;
 831                 }
 832                 pos = mp->b_rptr + offset + len - size;
 833 
 834                 l4_proto = *(uint8_t *)pos;
 835                 break;
 836         default:
 837                 /* Unrecoverable error */
 838                 IXGBE_DEBUGLOG_0(NULL, "Ether type error with tx hcksum");
 839                 return (-2);
 840         }
 841 
 842         if (ctx->lso_flag) {
 843                 offset = mac_hdr_len + start;
 844                 while (size <= offset) {
 845                         mp = mp->b_cont;
 846                         ASSERT(mp != NULL);
 847                         len = MBLKL(mp);
 848                         size += len;
 849                 }
 850                 pos = mp->b_rptr + offset + len - size;
 851 
 852                 l4_hdr_len = TCP_HDR_LENGTH((tcph_t *)pos);
 853         } else {
 854                 /*
 855                  * l4 header length is only required for LSO
 856                  */
 857                 l4_hdr_len = 0;
 858         }
 859 
 860         ctx->mac_hdr_len = mac_hdr_len;
 861         ctx->ip_hdr_len = start;
 862         ctx->l4_proto = l4_proto;
 863         ctx->l4_hdr_len = l4_hdr_len;
 864 
 865         return (0);
 866 }
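     /*
      * Example of the parsing above (illustrative values): for an untagged
      * TCP/IPv4 LSO packet, etype = ETHERTYPE_IP gives mac_hdr_len = 14;
      * the checksum start offset from mac_hcksum_get() is used as the IP
      * header length (e.g. 20); l4_proto is read from ipha_protocol
      * (IPPROTO_TCP) and l4_hdr_len from the TCP header (e.g. 20), walking
      * b_cont as needed since no header is assumed to sit in one mblk.
      */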
 867 
 868 /*
 869  * ixgbe_check_context
 870  *
 871  * Check if a new context descriptor is needed
 872  */
 873 static boolean_t
 874 ixgbe_check_context(ixgbe_tx_ring_t *tx_ring, ixgbe_tx_context_t *ctx)
 875 {
 876         ixgbe_tx_context_t *last;
 877 
 878         if (ctx == NULL)
 879                 return (B_FALSE);
 880 
 881         /*
 882  * Compare the context data retrieved from the mblk with the
 883  * stored data of the last context descriptor. The fields that
 884  * need to be checked are:
 885  *      hcksum_flags
 886  *      l4_proto
 887  *      mac_hdr_len
 888  *      ip_hdr_len
 889  *      lso_flag
 890  *      mss (only checked for LSO)
 891  *      l4_hdr_len (only checked for LSO)
 892  * If any of these fields changes, a new context descriptor
 893  * is needed.
 894          */
 895         last = &tx_ring->tx_context;
 896 
 897         if ((ctx->hcksum_flags != last->hcksum_flags) ||
 898             (ctx->l4_proto != last->l4_proto) ||
 899             (ctx->mac_hdr_len != last->mac_hdr_len) ||
 900             (ctx->ip_hdr_len != last->ip_hdr_len) ||
 901             (ctx->lso_flag != last->lso_flag) ||
 902             (ctx->lso_flag && ((ctx->mss != last->mss) ||
 903             (ctx->l4_hdr_len != last->l4_hdr_len)))) {
 904                 return (B_TRUE);
 905         }
 906 
 907         return (B_FALSE);
 908 }
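     /*
      * Example (illustrative): a burst of TCP/IPv4 packets with identical
      * header lengths and checksum flags needs only one context descriptor;
      * for the second and later packets every compared field matches
      * tx_ring->tx_context, so ixgbe_check_context() returns B_FALSE.  A
      * switch to, say, a UDP packet or a different mss makes it return
      * B_TRUE again.
      */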
 909 
 910 /*
 911  * ixgbe_fill_context
 912  *
 913  * Fill the context descriptor with hardware checksum information
 914  */
 915 static void
 916 ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *ctx_tbd,
 917     ixgbe_tx_context_t *ctx)
 918 {
 919         /*
 920          * Fill the context descriptor with the checksum
 921          * context information we've got.
 922          */
 923         ctx_tbd->vlan_macip_lens = ctx->ip_hdr_len;
 924         ctx_tbd->vlan_macip_lens |= ctx->mac_hdr_len <<
 925             IXGBE_ADVTXD_MACLEN_SHIFT;
 926 
 927         ctx_tbd->type_tucmd_mlhl =
 928             IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT;
 929 
 930         if (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM)
 931                 ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
 932 
 933         if (ctx->hcksum_flags & HCK_PARTIALCKSUM) {
 934                 switch (ctx->l4_proto) {
 935                 case IPPROTO_TCP:
 936                         ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP;
 937                         break;
 938                 case IPPROTO_UDP:
 939                         /*
 940                          * We don't have to explicitly set:
 941                          *      ctx_tbd->type_tucmd_mlhl |=
 942                          *          IXGBE_ADVTXD_TUCMD_L4T_UDP;
 943                          * Because IXGBE_ADVTXD_TUCMD_L4T_UDP == 0b
 944                          */
 945                         break;
 946                 default:
 947                         /* Unrecoverable error */
 948                         IXGBE_DEBUGLOG_0(NULL, "L4 type error with tx hcksum");
 949                         break;
 950                 }
 951         }
 952 
 953         ctx_tbd->seqnum_seed = 0;
 954 
 955         if (ctx->lso_flag) {
 956                 ctx_tbd->mss_l4len_idx =
 957                     (ctx->l4_hdr_len << IXGBE_ADVTXD_L4LEN_SHIFT) |
 958                     (ctx->mss << IXGBE_ADVTXD_MSS_SHIFT);
 959         } else {
 960                 ctx_tbd->mss_l4len_idx = 0;
 961         }
 962 }
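     /*
      * Example of the resulting descriptor fields (illustrative values):
      * for a TCP/IPv4 LSO packet with mac_hdr_len = 14, ip_hdr_len = 20,
      * l4_hdr_len = 20 and mss = 1460, the code above produces:
      *
      *   vlan_macip_lens = 20 | (14 << IXGBE_ADVTXD_MACLEN_SHIFT)
      *   type_tucmd_mlhl = IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT |
      *       IXGBE_ADVTXD_TUCMD_IPV4 | IXGBE_ADVTXD_TUCMD_L4T_TCP
      *   mss_l4len_idx   = (20 << IXGBE_ADVTXD_L4LEN_SHIFT) |
      *       (1460 << IXGBE_ADVTXD_MSS_SHIFT)
      */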
 963 
 964 /*
 965  * ixgbe_tx_fill_ring
 966  *
 967  * Fill the tx descriptor ring with the data
 968  */
 969 static int
 970 ixgbe_tx_fill_ring(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list,
 971     ixgbe_tx_context_t *ctx, size_t mbsize)
 972 {
 973         struct ixgbe_hw *hw = &tx_ring->ixgbe->hw;
 974         boolean_t load_context;
 975         uint32_t index, tcb_index, desc_num;
 976         union ixgbe_adv_tx_desc *tbd, *first_tbd;
 977         tx_control_block_t *tcb, *first_tcb;
 978         uint32_t hcksum_flags;
 979         int i;
 980 
 981         ASSERT(mutex_owned(&tx_ring->tx_lock));
 982 
 983         tbd = NULL;
 984         first_tbd = NULL;
 985         first_tcb = NULL;
 986         desc_num = 0;
 987         hcksum_flags = 0;
 988         load_context = B_FALSE;
 989 
 990         /*
 991          * Get the index of the first tx descriptor that will be filled,
 992          * and the index of the first work list item that will be
 993          * associated with the first used tx control block in the pending
 994          * list. Note: the two indexes are the same.
 995          */
 996         index = tx_ring->tbd_tail;
 997         tcb_index = tx_ring->tbd_tail;
 998 
 999         if (ctx != NULL) {
1000                 hcksum_flags = ctx->hcksum_flags;
1001 
1002                 /*
1003                  * Check if a new context descriptor is needed for this packet
1004                  */
1005                 load_context = ixgbe_check_context(tx_ring, ctx);
1006 
1007                 if (load_context) {
1008                         tbd = &tx_ring->tbd_ring[index];
1009 
1010                         /*
1011                          * Fill the context descriptor with the
1012                          * hardware checksum offload information.
1013                          */
1014                         ixgbe_fill_context(
1015                             (struct ixgbe_adv_tx_context_desc *)tbd, ctx);
1016 
1017                         index = NEXT_INDEX(index, 1, tx_ring->ring_size);
1018                         desc_num++;
1019 
1020                         /*
1021                          * Store the checksum context data if
1022                          * a new context descriptor is added
1023                          */
1024                         tx_ring->tx_context = *ctx;
1025                 }
1026         }
1027 
1028         first_tbd = &tx_ring->tbd_ring[index];
1029 
1030         /*
1031          * Fill tx data descriptors with the data saved in the pending list.
1032          * The tx control blocks in the pending list are added to the work list
1033          * at the same time.
1034          *
1035          * The work list is strictly 1:1 corresponding to the descriptor ring.
1036          * One item of the work list corresponds to one tx descriptor. Because
1037          * one tx control block can span multiple tx descriptors, the tx
1038          * control block will be added to the first work list item that
1039          * corresponds to the first tx descriptor generated from that tx
1040          * control block.
1041          */
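             /*
              * Illustrative example (ignoring a possible leading context
              * descriptor): if a tcb's data descriptors land at ring indexes
              * 10..12, the tcb is stored at work_list[10] only, and the next
              * tcb from the pending list is stored at work_list[13].  The
              * recycle routines later use each tcb's desc_num to step from
              * one work list entry to the next.
              */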
1042         tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1043         first_tcb = tcb;
1044         while (tcb != NULL) {
1045 
1046                 for (i = 0; i < tcb->desc_num; i++) {
1047                         tbd = &tx_ring->tbd_ring[index];
1048 
1049                         tbd->read.buffer_addr = tcb->desc[i].address;
1050                         tbd->read.cmd_type_len = tcb->desc[i].length;
1051 
1052                         tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_DEXT
1053                             | IXGBE_ADVTXD_DTYP_DATA;
1054 
1055                         tbd->read.olinfo_status = 0;
1056 
1057                         index = NEXT_INDEX(index, 1, tx_ring->ring_size);
1058                         desc_num++;
1059                 }
1060 
1061                 /*
1062                  * Add the tx control block to the work list
1063                  */
1064                 ASSERT(tx_ring->work_list[tcb_index] == NULL);
1065                 tx_ring->work_list[tcb_index] = tcb;
1066 
1067                 tcb_index = index;
1068                 tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1069         }
1070 
1071         if (load_context) {
1072                 /*
1073                  * Count the context descriptor for
1074                  * the first tx control block.
1075                  */
1076                 first_tcb->desc_num++;
1077         }
1078         first_tcb->last_index = PREV_INDEX(index, 1, tx_ring->ring_size);
1079 
1080         /*
1081          * The Insert Ethernet CRC (IFCS) bit and the checksum fields are only
1082          * valid in the first descriptor of the packet.
1083          * The packet length (paylen) is set in the first descriptor:
1084          * 82599 and X540 require it in the paylen field with or without
1085          * LSO, while 82598 ignores it in non-LSO mode.
1086          */
1087         ASSERT(first_tbd != NULL);
1088         first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_IFCS;
1089 
1090         switch (hw->mac.type) {
1091         case ixgbe_mac_82598EB:
1092                 if (ctx != NULL && ctx->lso_flag) {
1093                         first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
1094                         first_tbd->read.olinfo_status |=
1095                             (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
1096                             - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
1097                 }
1098                 break;
1099 
1100         case ixgbe_mac_82599EB:
1101         case ixgbe_mac_X540:
1102                 if (ctx != NULL && ctx->lso_flag) {
1103                         first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
1104                         first_tbd->read.olinfo_status |=
1105                             (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
1106                             - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
1107                 } else {
1108                         first_tbd->read.olinfo_status |=
1109                             (mbsize << IXGBE_ADVTXD_PAYLEN_SHIFT);
1110                 }
1111                 break;
1112 
1113         default:
1114                 break;
1115         }
1116 
1117         /* Set hardware checksum bits */
1118         if (hcksum_flags != 0) {
1119                 if (hcksum_flags & HCK_IPV4_HDRCKSUM)
1120                         first_tbd->read.olinfo_status |=
1121                             IXGBE_ADVTXD_POPTS_IXSM;
1122                 if (hcksum_flags & HCK_PARTIALCKSUM)
1123                         first_tbd->read.olinfo_status |=
1124                             IXGBE_ADVTXD_POPTS_TXSM;
1125         }
1126 
1127         /*
1128          * The last descriptor of the packet needs the End Of Packet (EOP)
1129          * and Report Status (RS) bits set
1130          */
1131         ASSERT(tbd != NULL);
1132         tbd->read.cmd_type_len |=
1133             IXGBE_ADVTXD_DCMD_EOP | IXGBE_ADVTXD_DCMD_RS;
1134 
1135         /*
1136          * Sync the DMA buffer of the tx descriptor ring
1137          */
1138         DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORDEV);
1139 
1140         /*
1141          * Update the number of the free tx descriptors.
1142          * The mutual exclusion between the transmission and the recycling
1143          * (for the tx descriptor ring and the work list) is implemented
1144          * with the atomic operation on the number of the free tx descriptors.
1145          *
1146          * Note: we must always decrement the counter tbd_free before
1147          * advancing the hardware TDT pointer, to avoid a race
1148          * condition: otherwise the hardware could finish transmitting
1149          * the descriptors and the tx recycling could increment tbd_free
1150          * before we decrement it.
1151          */
1152         i = ixgbe_atomic_reserve(&tx_ring->tbd_free, desc_num);
1153         ASSERT(i >= 0);
1154 
1155         tx_ring->tbd_tail = index;
1156 
1157         /*
1158          * Advance the hardware TDT pointer of the tx descriptor ring
1159          */
1160         IXGBE_WRITE_REG(hw, IXGBE_TDT(tx_ring->index), index);
1161 
1162         if (ixgbe_check_acc_handle(tx_ring->ixgbe->osdep.reg_handle) !=
1163             DDI_FM_OK) {
1164                 ddi_fm_service_impact(tx_ring->ixgbe->dip,
1165                     DDI_SERVICE_DEGRADED);
1166                 atomic_or_32(&tx_ring->ixgbe->ixgbe_state, IXGBE_ERROR);
1167         }
1168 
1169         return (desc_num);
1170 }
1171 
1172 /*
1173  * ixgbe_save_desc
1174  *
1175  * Save the address/length pair to the private array
1176  * of the tx control block. The address/length pairs
1177  * will be filled into the tx descriptor ring later.
1178  */
1179 static void
1180 ixgbe_save_desc(tx_control_block_t *tcb, uint64_t address, size_t length)
1181 {
1182         sw_desc_t *desc;
1183 
1184         desc = &tcb->desc[tcb->desc_num];
1185         desc->address = address;
1186         desc->length = length;
1187 
1188         tcb->desc_num++;
1189 }
1190 
1191 /*
1192  * ixgbe_tx_recycle_legacy
1193  *
1194  * Recycle the tx descriptors and tx control blocks.
1195  *
1196  * The work list is traversed to check if the corresponding
1197  * tx descriptors have been transmitted. If so, the resources
1198  * bound to the tx control blocks will be freed, and those
1199  * tx control blocks will be returned to the free list.
1200  */
1201 uint32_t
1202 ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring)
1203 {
1204         uint32_t index, last_index, prev_index;
1205         int desc_num;
1206         boolean_t desc_done;
1207         tx_control_block_t *tcb;
1208         link_list_t pending_list;
1209         ixgbe_t *ixgbe = tx_ring->ixgbe;
1210 
1211         mutex_enter(&tx_ring->recycle_lock);
1212 
1213         ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1214 
1215         if (tx_ring->tbd_free == tx_ring->ring_size) {
1216                 tx_ring->recycle_fail = 0;
1217                 tx_ring->stall_watchdog = 0;
1218                 if (tx_ring->reschedule) {
1219                         tx_ring->reschedule = B_FALSE;
1220                         mac_tx_ring_update(ixgbe->mac_hdl,
1221                             tx_ring->ring_handle);
1222                 }
1223                 mutex_exit(&tx_ring->recycle_lock);
1224                 return (0);
1225         }
1226 
1227         /*
1228          * Sync the DMA buffer of the tx descriptor ring
1229          */
1230         DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1231 
1232         if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1233                 mutex_exit(&tx_ring->recycle_lock);
1234                 ddi_fm_service_impact(ixgbe->dip, DDI_SERVICE_DEGRADED);
1235                 atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
1236                 return (0);
1237         }
1238 
1239         LINK_LIST_INIT(&pending_list);
1240         desc_num = 0;
1241         index = tx_ring->tbd_head;   /* Index of next tbd/tcb to recycle */
1242 
1243         tcb = tx_ring->work_list[index];
1244         ASSERT(tcb != NULL);
1245 
1246         while (tcb != NULL) {
1247                 /*
1248                  * Get the last tx descriptor of this packet.
1249                  * If the last tx descriptor is done, then
1250                  * we can recycle all descriptors of a packet
1251                  * which usually includes several tx control blocks.
1252                  * For 82599, LSO descriptors cannot be recycled
1253                  * until the whole packet's transmission is done.
1254                  * That's why packet-level recycling is used here.
1255                  * For 82598, there is no such limit.
1256                  */
1257                 last_index = tcb->last_index;
1258                 /*
1259                  * MAX_TX_RING_SIZE is used to determine whether
1260                  * the index holds a valid value.
1261                  */
1262                 if (last_index == MAX_TX_RING_SIZE)
1263                         break;
1264 
1265                 /*
1266                  * Check if the Descriptor Done bit is set
1267                  */
1268                 desc_done = tx_ring->tbd_ring[last_index].wb.status &
1269                     IXGBE_TXD_STAT_DD;
1270                 if (desc_done) {
1271                         /*
1272                          * recycle all descriptors of the packet
1273                          */
1274                         while (tcb != NULL) {
1275                                 /*
1276                                  * Strip off the tx control block from
1277                                  * the work list, and add it to the
1278                                  * pending list.
1279                                  */
1280                                 tx_ring->work_list[index] = NULL;
1281                                 LIST_PUSH_TAIL(&pending_list, &tcb->link);
1282 
1283                                 /*
1284                                  * Count the total number of the tx
1285                                  * descriptors recycled
1286                                  */
1287                                 desc_num += tcb->desc_num;
1288 
1289                                 index = NEXT_INDEX(index, tcb->desc_num,
1290                                     tx_ring->ring_size);
1291 
1292                                 tcb = tx_ring->work_list[index];
1293 
1294                                 prev_index = PREV_INDEX(index, 1,
1295                                     tx_ring->ring_size);
1296                                 if (prev_index == last_index)
1297                                         break;
1298                         }
1299                 } else {
1300                         break;
1301                 }
1302         }
1303 
1304         /*
1305          * If no tx descriptors are recycled, no need to do more processing
1306          */
1307         if (desc_num == 0) {
1308                 tx_ring->recycle_fail++;
1309                 mutex_exit(&tx_ring->recycle_lock);
1310                 return (0);
1311         }
1312 
1313         tx_ring->recycle_fail = 0;
1314         tx_ring->stall_watchdog = 0;
1315 
1316         /*
1317          * Update the head index of the tx descriptor ring
1318          */
1319         tx_ring->tbd_head = index;
1320 
1321         /*
1322          * Update the number of the free tx descriptors with atomic operations
1323          */
1324         atomic_add_32(&tx_ring->tbd_free, desc_num);
1325 
1326         if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
1327             (tx_ring->reschedule)) {
1328                 tx_ring->reschedule = B_FALSE;
1329                 mac_tx_ring_update(ixgbe->mac_hdl,
1330                     tx_ring->ring_handle);
1331         }
1332         mutex_exit(&tx_ring->recycle_lock);
1333 
1334         /*
1335          * Free the resources used by the tx control blocks
1336          * in the pending list
1337          */
1338         tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
1339         while (tcb != NULL) {
1340                 /*
1341                  * Release the resources occupied by the tx control block
1342                  */
1343                 ixgbe_free_tcb(tcb);
1344 
1345                 tcb = (tx_control_block_t *)
1346                     LIST_GET_NEXT(&pending_list, &tcb->link);
1347         }
1348 
1349         /*
1350          * Add the tx control blocks in the pending list to the free list.
1351          */
1352         ixgbe_put_free_list(tx_ring, &pending_list);
1353 
1354         return (desc_num);
1355 }
1356 
1357 /*
1358  * ixgbe_tx_recycle_head_wb
1359  *
1360  * Check the head write-back, and recycle all the transmitted
1361  * tx descriptors and tx control blocks.
1362  */
1363 uint32_t
1364 ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring)
1365 {
1366         uint32_t index;
1367         uint32_t head_wb;
1368         int desc_num;
1369         tx_control_block_t *tcb;
1370         link_list_t pending_list;
1371         ixgbe_t *ixgbe = tx_ring->ixgbe;
1372 
1373         mutex_enter(&tx_ring->recycle_lock);
1374 
1375         ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1376 
1377         if (tx_ring->tbd_free == tx_ring->ring_size) {
1378                 tx_ring->recycle_fail = 0;
1379                 tx_ring->stall_watchdog = 0;
1380                 if (tx_ring->reschedule) {
1381                         tx_ring->reschedule = B_FALSE;
1382                         mac_tx_ring_update(ixgbe->mac_hdl,
1383                             tx_ring->ring_handle);
1384                 }
1385                 mutex_exit(&tx_ring->recycle_lock);
1386                 return (0);
1387         }
1388 
1389         /*
1390          * Sync the DMA buffer of the tx descriptor ring
1391          *
1392          * Note: in head write-back mode the tx descriptors are not
1393          * written back; the head write-back value is stored in the
1394          * extra tbd at the end of the DMA area, so that value still
1395          * needs to be synced for the kernel.
1396          *
1397          * DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1398          */
1399         (void) ddi_dma_sync(tx_ring->tbd_area.dma_handle,
1400             sizeof (union ixgbe_adv_tx_desc) * tx_ring->ring_size,
1401             sizeof (uint32_t),
1402             DDI_DMA_SYNC_FORKERNEL);
1403 
1404         if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1405                 mutex_exit(&tx_ring->recycle_lock);
1406                 ddi_fm_service_impact(ixgbe->dip,
1407                     DDI_SERVICE_DEGRADED);
1408                 atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
1409                 return (0);
1410         }
1411 
1412         LINK_LIST_INIT(&pending_list);
1413         desc_num = 0;
1414         index = tx_ring->tbd_head;   /* Next index to clean */
1415 
1416         /*
1417          * Get the value of head write-back
1418          */
1419         head_wb = *tx_ring->tbd_head_wb;
1420         while (index != head_wb) {
1421                 tcb = tx_ring->work_list[index];
1422                 ASSERT(tcb != NULL);
1423 
1424                 if (OFFSET(index, head_wb, tx_ring->ring_size) <
1425                     tcb->desc_num) {
1426                         /*
1427                          * The current tx control block has not been
1428                          * completely transmitted; stop recycling.
1429                          */
1430                         break;
1431                 }
1432 
1433                 /*
1434                  * Strip off the tx control block from the work list,
1435                  * and add it to the pending list.
1436                  */
1437                 tx_ring->work_list[index] = NULL;
1438                 LIST_PUSH_TAIL(&pending_list, &tcb->link);
1439 
1440                 /*
1441                  * Advance the index of the tx descriptor ring
1442                  */
1443                 index = NEXT_INDEX(index, tcb->desc_num, tx_ring->ring_size);
1444 
1445                 /*
1446                  * Count the total number of the tx descriptors recycled
1447                  */
1448                 desc_num += tcb->desc_num;
1449         }
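
        /*
         * Worked example (illustrative only, assuming OFFSET() yields the
         * forward distance from its first to its second argument modulo the
         * ring size, as its use above implies): with ring_size = 8,
         * tbd_head = 6 and head_wb = 2, the hardware has consumed
         * descriptors 6, 7, 0 and 1.  A tcb at index 6 spanning 3
         * descriptors is recycled because OFFSET(6, 2, 8) = 4 >= 3, and
         * index advances to NEXT_INDEX(6, 3, 8) = 1.  A following tcb at
         * index 1 spanning 2 descriptors is left alone because
         * OFFSET(1, 2, 8) = 1 < 2.
         */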
1450 
1451         /*
1452          * If no tx descriptors are recycled, no need to do more processing
1453          */
1454         if (desc_num == 0) {
1455                 tx_ring->recycle_fail++;
1456                 mutex_exit(&tx_ring->recycle_lock);
1457                 return (0);
1458         }
1459 
1460         tx_ring->recycle_fail = 0;
1461         tx_ring->stall_watchdog = 0;
1462 
1463         /*
1464          * Update the head index of the tx descriptor ring
1465          */
1466         tx_ring->tbd_head = index;
1467 
1468         /*
1469          * Update the number of the free tx descriptors with atomic operations
1470          */
1471         atomic_add_32(&tx_ring->tbd_free, desc_num);
1472 
1473         if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
1474             (tx_ring->reschedule)) {
1475                 tx_ring->reschedule = B_FALSE;
1476                 mac_tx_ring_update(ixgbe->mac_hdl,
1477                     tx_ring->ring_handle);
1478         }
1479         mutex_exit(&tx_ring->recycle_lock);
1480 
1481         /*
1482          * Free the resources used by the tx control blocks
1483          * in the pending list
1484          */
1485         tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
1486         while (tcb != NULL) {
1487                 /*
1488                  * Release the resources occupied by the tx control block
1489                  */
1490                 ixgbe_free_tcb(tcb);
1491 
1492                 tcb = (tx_control_block_t *)
1493                     LIST_GET_NEXT(&pending_list, &tcb->link);
1494         }
1495 
1496         /*
1497          * Add the tx control blocks in the pending list to the free list.
1498          */
1499         ixgbe_put_free_list(tx_ring, &pending_list);
1500 
1501         return (desc_num);
1502 }
1503 
1504 /*
1505  * ixgbe_free_tcb - free up the tx control block
1506  *
1507  * Free the resources of the tx control block, including
1508  * unbinding the previously bound DMA handle and resetting
1509  * the other control fields.
1510  */
1511 void
1512 ixgbe_free_tcb(tx_control_block_t *tcb)
1513 {
1514         switch (tcb->tx_type) {
1515         case USE_COPY:
1516                 /*
1517                  * Reset the buffer length that is used for copy
1518                  */
1519                 tcb->tx_buf.len = 0;
1520                 break;
1521         case USE_DMA:
1522                 /*
1523                  * Release the DMA resource that is used for
1524                  * DMA binding.
1525                  */
1526                 (void) ddi_dma_unbind_handle(tcb->tx_dma_handle);
1527                 break;
1528         default:
1529                 break;
1530         }
1531 
1532         /*
1533          * Free the mblk
1534          */
1535         if (tcb->mp != NULL) {
1536                 freemsg(tcb->mp);
1537                 tcb->mp = NULL;
1538         }
1539 
1540         tcb->tx_type = USE_NONE;
1541         tcb->last_index = MAX_TX_RING_SIZE;
1542         tcb->frag_num = 0;
1543         tcb->desc_num = 0;
1544 }
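
/*
 * Typical life cycle of a tx control block, as seen in this file: it is
 * taken from the free list with ixgbe_get_free_list(), sits on the ring's
 * work list while its descriptors are outstanding, is moved to a local
 * pending list by one of the recycle routines once the hardware is done
 * with it, is released with ixgbe_free_tcb(), and is finally returned to
 * the free list with ixgbe_put_free_list().
 */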
1545 
1546 /*
1547  * ixgbe_get_free_list - Get a free tx control block from the free list
1548  *
1549  * The atomic operation on the number of available tx control blocks
1550  * in the free list is used to keep this routine mutually exclusive
1551  * with the routine ixgbe_put_free_list().
1552  */
1553 static tx_control_block_t *
1554 ixgbe_get_free_list(ixgbe_tx_ring_t *tx_ring)
1555 {
1556         tx_control_block_t *tcb;
1557 
1558         /*
1559          * Check and update the number of free tx control blocks
1560          * in the free list.
1561          */
1562         if (ixgbe_atomic_reserve(&tx_ring->tcb_free, 1) < 0)
1563                 return (NULL);
1564 
1565         mutex_enter(&tx_ring->tcb_head_lock);
1566 
1567         tcb = tx_ring->free_list[tx_ring->tcb_head];
1568         ASSERT(tcb != NULL);
1569         tx_ring->free_list[tx_ring->tcb_head] = NULL;
1570         tx_ring->tcb_head = NEXT_INDEX(tx_ring->tcb_head, 1,
1571             tx_ring->free_list_size);
1572 
1573         mutex_exit(&tx_ring->tcb_head_lock);
1574 
1575         return (tcb);
1576 }
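
/*
 * ixgbe_atomic_reserve() is defined elsewhere in the driver.  A minimal
 * sketch of the reserve-if-available pattern it provides (illustrative
 * only; the actual implementation may differ):
 *
 *	int
 *	reserve_sketch(uint32_t *count_p, uint32_t n)
 *	{
 *		uint32_t oldval, newval;
 *
 *		do {
 *			oldval = *count_p;
 *			if (oldval < n)
 *				return (-1);
 *			newval = oldval - n;
 *		} while (atomic_cas_32(count_p, oldval, newval) != oldval);
 *
 *		return (newval);
 *	}
 *
 * This is what lets ixgbe_get_free_list() fail fast, without taking
 * tcb_head_lock, when the free list is empty.
 */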
1577 
1578 /*
1579  * ixgbe_put_free_list
1580  *
1581  * Put a list of used tx control blocks back to the free list
1582  *
1583  * The mutex tcb_tail_lock is used here to serialize concurrent callers
1584  * of this routine; the mutual exclusion between ixgbe_get_free_list and
1585  * ixgbe_put_free_list is provided by the atomic operation on tcb_free.
1586  */
1587 void
1588 ixgbe_put_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list)
1589 {
1590         uint32_t index;
1591         int tcb_num;
1592         tx_control_block_t *tcb;
1593 
1594         mutex_enter(&tx_ring->tcb_tail_lock);
1595 
1596         index = tx_ring->tcb_tail;
1597 
1598         tcb_num = 0;
1599         tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1600         while (tcb != NULL) {
1601                 ASSERT(tx_ring->free_list[index] == NULL);
1602                 tx_ring->free_list[index] = tcb;
1603 
1604                 tcb_num++;
1605 
1606                 index = NEXT_INDEX(index, 1, tx_ring->free_list_size);
1607 
1608                 tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1609         }
1610 
1611         tx_ring->tcb_tail = index;
1612 
1613         /*
1614          * Update the number of free tx control blocks in the
1615          * free list.  This operation must be done while holding
1616          * tcb_tail_lock.
1617          */
1618         atomic_add_32(&tx_ring->tcb_free, tcb_num);
1619 
1620         mutex_exit(&tx_ring->tcb_tail_lock);
1621 }
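
/*
 * Design note: the free list is managed as a ring buffer of tcb pointers.
 * ixgbe_get_free_list() consumes entries at tcb_head under tcb_head_lock,
 * ixgbe_put_free_list() appends entries at tcb_tail under tcb_tail_lock,
 * and the shared counter tcb_free (updated atomically) keeps the two
 * indices from passing each other, so the getter and the putter never
 * need to share a lock.
 */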