11546 Track TCP round-trip time in nanoseconds
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Portions contributed by: Brandon Baker <bbaker@delphix.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
    
      
    
    
          --- old/usr/src/uts/common/inet/tcp/tcp_output.c
          +++ new/usr/src/uts/common/inet/tcp/tcp_output.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  
  
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24      - * Copyright (c) 2014 by Delphix. All rights reserved.
       24 + * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
       25 + * Copyright 2019 Joyent, Inc.
  25   26   */
  26   27  
  27   28  /* This file contains all TCP output processing functions. */
  28   29  
  29   30  #include <sys/types.h>
  30   31  #include <sys/stream.h>
  31   32  #include <sys/strsun.h>
  32   33  #include <sys/strsubr.h>
  33   34  #include <sys/stropts.h>
  34   35  #include <sys/strlog.h>
  35   36  #define _SUN_TPI_VERSION 2
  36   37  #include <sys/tihdr.h>
  37   38  #include <sys/suntpi.h>
  38   39  #include <sys/xti_inet.h>
  39   40  #include <sys/timod.h>
  40   41  #include <sys/pattr.h>
  41   42  #include <sys/squeue_impl.h>
  42   43  #include <sys/squeue.h>
  43   44  #include <sys/sockio.h>
  44   45  #include <sys/tsol/tnet.h>
  45   46  
  46   47  #include <inet/common.h>
  47   48  #include <inet/ip.h>
  48   49  #include <inet/tcp.h>
  49   50  #include <inet/tcp_impl.h>
  50   51  #include <inet/snmpcom.h>
  
  
  51   52  #include <inet/proto_set.h>
  52   53  #include <inet/ipsec_impl.h>
  53   54  #include <inet/ip_ndp.h>
  54   55  
  55   56  static mblk_t   *tcp_get_seg_mp(tcp_t *, uint32_t, int32_t *);
  56   57  static void     tcp_wput_cmdblk(queue_t *, mblk_t *);
  57   58  static void     tcp_wput_flush(tcp_t *, mblk_t *);
  58   59  static void     tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
  59   60  static int      tcp_xmit_end(tcp_t *);
  60   61  static int      tcp_send(tcp_t *, const int, const int, const int,
  61      -                    const int, int *, uint_t *, int *, mblk_t **, mblk_t *);
       62 +                    const int, int *, uint32_t *, int *, mblk_t **, mblk_t *);
  62   63  static void     tcp_xmit_early_reset(char *, mblk_t *, uint32_t, uint32_t,
  63   64                      int, ip_recv_attr_t *, ip_stack_t *, conn_t *);
  64   65  static boolean_t        tcp_send_rst_chk(tcp_stack_t *);
  65   66  static void     tcp_process_shrunk_swnd(tcp_t *, uint32_t);
  66      -static void     tcp_fill_header(tcp_t *, uchar_t *, clock_t, int);
       67 +static void     tcp_fill_header(tcp_t *, uchar_t *, int);
  67   68  
  68   69  /*
  69   70   * Functions called directly via squeue having a prototype of edesc_t.
  70   71   */
  71   72  static void     tcp_wput_nondata(void *, mblk_t *, void *, ip_recv_attr_t *);
  72   73  static void     tcp_wput_ioctl(void *, mblk_t *, void *, ip_recv_attr_t *);
  73   74  static void     tcp_wput_proto(void *, mblk_t *, void *, ip_recv_attr_t *);
  74   75  
  75   76  /*
  76   77   * This controls how tiny a write must be before we try to copy it
  77   78   * into the mblk on the tail of the transmit queue.  Not much
  78   79   * speedup is observed for values larger than sixteen.  Zero will
  79   80   * disable the optimisation.
  80   81   */
  81   82  static int tcp_tx_pull_len = 16;
  82   83  
  83   84  int
  84   85  tcp_wput(queue_t *q, mblk_t *mp)
  85   86  {
  86   87          conn_t  *connp = Q_TO_CONN(q);
  87   88          tcp_t   *tcp;
  88   89          void (*output_proc)();
  89   90          t_scalar_t type;
  90   91          uchar_t *rptr;
  91   92          struct iocblk   *iocp;
  92   93          size_t size;
  93   94  
  94   95          ASSERT(connp->conn_ref >= 2);
  95   96  
  96   97          switch (DB_TYPE(mp)) {
  97   98          case M_DATA:
  98   99                  tcp = connp->conn_tcp;
  99  100                  ASSERT(tcp != NULL);
 100  101  
 101  102                  size = msgdsize(mp);
 102  103  
 103  104                  mutex_enter(&tcp->tcp_non_sq_lock);
 104  105                  tcp->tcp_squeue_bytes += size;
 105  106                  if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) {
 106  107                          tcp_setqfull(tcp);
 107  108                  }
 108  109                  mutex_exit(&tcp->tcp_non_sq_lock);
 109  110  
 110  111                  CONN_INC_REF(connp);
 111  112                  SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, connp,
 112  113                      NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
 113  114                  return (0);
 114  115  
 115  116          case M_CMD:
 116  117                  tcp_wput_cmdblk(q, mp);
 117  118                  return (0);
 118  119  
 119  120          case M_PROTO:
 120  121          case M_PCPROTO:
 121  122                  /*
 122  123                   * If it is an SNMP message, don't get behind the squeue
 123  124                   */
 124  125                  tcp = connp->conn_tcp;
 125  126                  rptr = mp->b_rptr;
 126  127                  if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) {
 127  128                          type = ((union T_primitives *)rptr)->type;
 128  129                  } else {
 129  130                          if (connp->conn_debug) {
 130  131                                  (void) strlog(TCP_MOD_ID, 0, 1,
 131  132                                      SL_ERROR|SL_TRACE,
 132  133                                      "tcp_wput_proto, dropping one...");
 133  134                          }
 134  135                          freemsg(mp);
 135  136                          return (0);
 136  137                  }
 137  138                  if (type == T_SVR4_OPTMGMT_REQ) {
 138  139                          /*
 139  140                           * All Solaris components should pass a db_credp
 140  141                           * for this TPI message, hence we ASSERT.
 141  142                           * But in case there is some other M_PROTO that looks
 142  143                           * like a TPI message sent by some other kernel
 143  144                           * component, we check and return an error.
 144  145                           */
 145  146                          cred_t  *cr = msg_getcred(mp, NULL);
 146  147  
 147  148                          ASSERT(cr != NULL);
 148  149                          if (cr == NULL) {
 149  150                                  tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
 150  151                                  return (0);
 151  152                          }
 152  153                          if (snmpcom_req(q, mp, tcp_snmp_set, ip_snmp_get,
 153  154                              cr)) {
 154  155                                  /*
 155  156                                   * This was a SNMP request
 156  157                                   */
 157  158                                  return (0);
 158  159                          } else {
 159  160                                  output_proc = tcp_wput_proto;
 160  161                          }
 161  162                  } else {
 162  163                          output_proc = tcp_wput_proto;
 163  164                  }
 164  165                  break;
 165  166          case M_IOCTL:
 166  167                  /*
 167  168                   * Most ioctls can be processed right away without going via
 168  169                   * squeues - process them right here. Those that do require
 169  170                   * squeue (currently _SIOCSOCKFALLBACK)
 170  171                   * are processed by tcp_wput_ioctl().
 171  172                   */
 172  173                  iocp = (struct iocblk *)mp->b_rptr;
 173  174                  tcp = connp->conn_tcp;
 174  175  
 175  176                  switch (iocp->ioc_cmd) {
 176  177                  case TCP_IOC_ABORT_CONN:
 177  178                          tcp_ioctl_abort_conn(q, mp);
 178  179                          return (0);
 179  180                  case TI_GETPEERNAME:
 180  181                  case TI_GETMYNAME:
 181  182                          mi_copyin(q, mp, NULL,
 182  183                              SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
 183  184                          return (0);
 184  185  
 185  186                  default:
 186  187                          output_proc = tcp_wput_ioctl;
 187  188                          break;
 188  189                  }
 189  190                  break;
 190  191          default:
 191  192                  output_proc = tcp_wput_nondata;
 192  193                  break;
 193  194          }
 194  195  
 195  196          CONN_INC_REF(connp);
 196  197          SQUEUE_ENTER_ONE(connp->conn_sqp, mp, output_proc, connp,
 197  198              NULL, tcp_squeue_flag, SQTAG_TCP_WPUT_OTHER);
 198  199          return (0);
 199  200  }
 200  201  
 201  202  /*
 202  203   * The TCP normal data output path.
 203  204   * NOTE: the logic of the fast path is duplicated from this function.
 204  205   */
 205  206  void
 206  207  tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent)
 207  208  {
 208  209          int             len;
 209  210          mblk_t          *local_time;
 210  211          mblk_t          *mp1;
 211  212          uint32_t        snxt;
 212  213          int             tail_unsent;
 213  214          int             tcpstate;
 214  215          int             usable = 0;
 215  216          mblk_t          *xmit_tail;
 216  217          int32_t         mss;
 217  218          int32_t         num_sack_blk = 0;
 218  219          int32_t         total_hdr_len;
 219  220          int32_t         tcp_hdr_len;
 220  221          int             rc;
 221  222          tcp_stack_t     *tcps = tcp->tcp_tcps;
 222  223          conn_t          *connp = tcp->tcp_connp;
 223  224          clock_t         now = LBOLT_FASTPATH;
 224  225  
 225  226          tcpstate = tcp->tcp_state;
 226  227          if (mp == NULL) {
 227  228                  /*
 228  229                   * tcp_wput_data() with NULL mp should only be called when
 229  230                   * there is unsent data.
 230  231                   */
 231  232                  ASSERT(tcp->tcp_unsent > 0);
 232  233                  /* Really tacky... but we need this for detached closes. */
 233  234                  len = tcp->tcp_unsent;
 234  235                  goto data_null;
 235  236          }
 236  237  
 237  238          ASSERT(mp->b_datap->db_type == M_DATA);
 238  239          /*
 239  240           * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ,
 240  241           * or before a connection attempt has begun.
 241  242           */
 242  243          if (tcpstate < TCPS_SYN_SENT || tcpstate > TCPS_CLOSE_WAIT ||
 243  244              (tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) {
 244  245                  if ((tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) {
 245  246  #ifdef DEBUG
 246  247                          cmn_err(CE_WARN,
 247  248                              "tcp_wput_data: data after ordrel, %s",
 248  249                              tcp_display(tcp, NULL,
 249  250                              DISP_ADDR_AND_PORT));
 250  251  #else
 251  252                          if (connp->conn_debug) {
 252  253                                  (void) strlog(TCP_MOD_ID, 0, 1,
 253  254                                      SL_TRACE|SL_ERROR,
 254  255                                      "tcp_wput_data: data after ordrel, %s\n",
 255  256                                      tcp_display(tcp, NULL,
 256  257                                      DISP_ADDR_AND_PORT));
 257  258                          }
 258  259  #endif /* DEBUG */
 259  260                  }
 260  261                  if (tcp->tcp_snd_zcopy_aware &&
 261  262                      (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY))
 262  263                          tcp_zcopy_notify(tcp);
 263  264                  freemsg(mp);
 264  265                  mutex_enter(&tcp->tcp_non_sq_lock);
 265  266                  if (tcp->tcp_flow_stopped &&
 266  267                      TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
 267  268                          tcp_clrqfull(tcp);
 268  269                  }
 269  270                  mutex_exit(&tcp->tcp_non_sq_lock);
 270  271                  return;
 271  272          }
 272  273  
 273  274          /* Strip empties */
 274  275          for (;;) {
 275  276                  ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
 276  277                      (uintptr_t)INT_MAX);
 277  278                  len = (int)(mp->b_wptr - mp->b_rptr);
 278  279                  if (len > 0)
 279  280                          break;
 280  281                  mp1 = mp;
 281  282                  mp = mp->b_cont;
 282  283                  freeb(mp1);
 283  284                  if (mp == NULL) {
 284  285                          return;
 285  286                  }
 286  287          }
 287  288  
 288  289          /* If we are the first on the list ... */
 289  290          if (tcp->tcp_xmit_head == NULL) {
 290  291                  tcp->tcp_xmit_head = mp;
 291  292                  tcp->tcp_xmit_tail = mp;
 292  293                  tcp->tcp_xmit_tail_unsent = len;
 293  294          } else {
 294  295                  /* If tiny tx and room in txq tail, pullup to save mblks. */
 295  296                  struct datab *dp;
 296  297  
 297  298                  mp1 = tcp->tcp_xmit_last;
 298  299                  if (len < tcp_tx_pull_len &&
 299  300                      (dp = mp1->b_datap)->db_ref == 1 &&
 300  301                      dp->db_lim - mp1->b_wptr >= len) {
 301  302                          ASSERT(len > 0);
 302  303                          ASSERT(!mp1->b_cont);
 303  304                          if (len == 1) {
 304  305                                  *mp1->b_wptr++ = *mp->b_rptr;
 305  306                          } else {
 306  307                                  bcopy(mp->b_rptr, mp1->b_wptr, len);
 307  308                                  mp1->b_wptr += len;
 308  309                          }
 309  310                          if (mp1 == tcp->tcp_xmit_tail)
 310  311                                  tcp->tcp_xmit_tail_unsent += len;
 311  312                          mp1->b_cont = mp->b_cont;
 312  313                          if (tcp->tcp_snd_zcopy_aware &&
 313  314                              (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY))
 314  315                                  mp1->b_datap->db_struioflag |= STRUIO_ZCNOTIFY;
 315  316                          freeb(mp);
 316  317                          mp = mp1;
 317  318                  } else {
 318  319                          tcp->tcp_xmit_last->b_cont = mp;
 319  320                  }
 320  321                  len += tcp->tcp_unsent;
 321  322          }
 322  323  
 323  324          /* Tack on however many more positive length mblks we have */
 324  325          if ((mp1 = mp->b_cont) != NULL) {
 325  326                  do {
 326  327                          int tlen;
 327  328                          ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <=
 328  329                              (uintptr_t)INT_MAX);
 329  330                          tlen = (int)(mp1->b_wptr - mp1->b_rptr);
 330  331                          if (tlen <= 0) {
 331  332                                  mp->b_cont = mp1->b_cont;
 332  333                                  freeb(mp1);
 333  334                          } else {
 334  335                                  len += tlen;
 335  336                                  mp = mp1;
 336  337                          }
 337  338                  } while ((mp1 = mp->b_cont) != NULL);
 338  339          }
 339  340          tcp->tcp_xmit_last = mp;
 340  341          tcp->tcp_unsent = len;
 341  342  
 342  343          if (urgent)
 343  344                  usable = 1;
 344  345  
 345  346  data_null:
 346  347          snxt = tcp->tcp_snxt;
 347  348          xmit_tail = tcp->tcp_xmit_tail;
 348  349          tail_unsent = tcp->tcp_xmit_tail_unsent;
 349  350  
 350  351          /*
 351  352           * Note that tcp_mss has been adjusted to take into account the
 352  353           * timestamp option if applicable.  Because SACK options do not
 353  354           * appear in every TCP segments and they are of variable lengths,
 354  355           * they cannot be included in tcp_mss.  Thus we need to calculate
 355  356           * the actual segment length when we need to send a segment which
 356  357           * includes SACK options.
 357  358           */
 358  359          if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
 359  360                  int32_t opt_len;
 360  361  
 361  362                  num_sack_blk = MIN(tcp->tcp_max_sack_blk,
 362  363                      tcp->tcp_num_sack_blk);
 363  364                  opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN *
 364  365                      2 + TCPOPT_HEADER_LEN;
 365  366                  mss = tcp->tcp_mss - opt_len;
 366  367                  total_hdr_len = connp->conn_ht_iphc_len + opt_len;
 367  368                  tcp_hdr_len = connp->conn_ht_ulp_len + opt_len;
 368  369          } else {
 369  370                  mss = tcp->tcp_mss;
 370  371                  total_hdr_len = connp->conn_ht_iphc_len;
 371  372                  tcp_hdr_len = connp->conn_ht_ulp_len;
 372  373          }
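                  /*
                   * Illustrative arithmetic (not part of this change): with
                   * three SACK blocks, opt_len works out to 3 * 8 (a
                   * sack_blk_t holds a begin/end pair of 32-bit sequence
                   * numbers, 8 bytes) + 2 NOPs + a 2-byte option header =
                   * 28 bytes, so such a segment carries 28 bytes less
                   * payload than a plain tcp_mss-sized one.
                   */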
 373  374  
 374  375          if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
 375  376              (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
 376  377                  TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
 377  378          }
 378  379          if (tcpstate == TCPS_SYN_RCVD) {
 379  380                  /*
 380  381                   * The three-way connection establishment handshake is not
 381  382                   * complete yet. We want to queue the data for transmission
 382  383           * after entering ESTABLISHED state (RFC793). A jump to the
 383  384           * "done" label effectively leaves data on the queue.
 384  385                   */
 385  386                  goto done;
 386  387          } else {
 387  388                  int usable_r;
 388  389  
 389  390                  /*
 390  391                   * In the special case when cwnd is zero, which can only
 391  392                   * happen if the connection is ECN capable, return now.
 392  393           * New segments are sent using tcp_timer().  The timer
 393  394                   * is set in tcp_input_data().
 394  395                   */
 395  396                  if (tcp->tcp_cwnd == 0) {
 396  397                          /*
 397  398                           * Note that tcp_cwnd is 0 before 3-way handshake is
 398  399                           * finished.
 399  400                           */
 400  401                          ASSERT(tcp->tcp_ecn_ok ||
 401  402                              tcp->tcp_state < TCPS_ESTABLISHED);
 402  403                          return;
 403  404                  }
 404  405  
 405  406                  /* NOTE: trouble if xmitting while SYN not acked? */
 406  407                  usable_r = snxt - tcp->tcp_suna;
 407  408                  usable_r = tcp->tcp_swnd - usable_r;
 408  409  
 409  410                  /*
 410  411                   * Check if the receiver has shrunk the window.  If
 411  412                   * tcp_wput_data() with NULL mp is called, tcp_fin_sent
 412  413                   * cannot be set as there is unsent data, so FIN cannot
 413  414                   * be sent out.  Otherwise, we need to take into account
 414  415           * be sent out.  Otherwise, we need to take the FIN into
 415  416           * account as it consumes an "invisible" sequence number.
 416  417                  ASSERT(tcp->tcp_fin_sent == 0);
 417  418                  if (usable_r < 0) {
 418  419                          /*
 419  420                           * The receiver has shrunk the window and we have sent
 420  421           * -usable_r bytes of data beyond the window; re-adjust.
 421  422                           *
 422  423                           * If TCP window scaling is enabled, there can be
 423  424                           * round down error as the advertised receive window
 424  425                           * is actually right shifted n bits.  This means that
 425  426                           * the lower n bits info is wiped out.  It will look
 426  427                           * like the window is shrunk.  Do a check here to
 427  428                           * see if the shrunk amount is actually within the
 428  429                           * error in window calculation.  If it is, just
 429  430                           * return.  Note that this check is inside the
 430  431                           * shrunk window check.  This makes sure that even
 431  432                           * though tcp_process_shrunk_swnd() is not called,
 432  433                           * we will stop further processing.
 433  434                           */
 434  435                          if ((-usable_r >> tcp->tcp_snd_ws) > 0) {
 435  436                                  tcp_process_shrunk_swnd(tcp, -usable_r);
 436  437                          }
 437  438                          return;
 438  439                  }
 439  440  
 440  441                  /* usable = MIN(swnd, cwnd) - unacked_bytes */
 441  442                  if (tcp->tcp_swnd > tcp->tcp_cwnd)
 442  443                          usable_r -= tcp->tcp_swnd - tcp->tcp_cwnd;
 443  444  
 444  445                  /* usable = MIN(usable, unsent) */
 445  446                  if (usable_r > len)
 446  447                          usable_r = len;
  
  
 447  448  
 448  449                  /* usable = MAX(usable, {1 for urgent, 0 for data}) */
 449  450                  if (usable_r > 0) {
 450  451                          usable = usable_r;
 451  452                  } else {
 452  453                          /* Bypass all other unnecessary processing. */
 453  454                          goto done;
 454  455                  }
 455  456          }
 456  457  
 457      -        local_time = (mblk_t *)now;
      458 +        local_time = (mblk_t *)(intptr_t)gethrtime();
 458  459  
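                  /*
                   * Sketch of the pattern introduced above (illustrative,
                   * not part of the diff): gethrtime() returns a monotonic
                   * hrtime_t in nanoseconds, which is smuggled through the
                   * mblk's b_prev pointer when the segment is stamped for
                   * transmission and presumably read back when the segment
                   * is ACKed, yielding a nanosecond-resolution RTT sample:
                   *
                   *     mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
                   *     ...
                   *     hrtime_t rtt_ns =
                   *         gethrtime() - (hrtime_t)(intptr_t)mp->b_prev;
                   *
                   * The intptr_t cast makes the integer-to-pointer
                   * conversion explicit; this relies on pointers being wide
                   * enough to hold the 64-bit timestamp.
                   */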
 459  460          /*
 460  461           * "Our" Nagle Algorithm.  This is not the same as in the old
 461  462           * BSD.  This is more in line with the true intent of Nagle.
 462  463           *
 463  464           * The conditions are:
 464  465           * 1. The amount of unsent data (or amount of data which can be
 465  466           *    sent, whichever is smaller) is less than Nagle limit.
 466  467           * 2. The last sent size is also less than Nagle limit.
 467  468           * 3. There is unack'ed data.
 468  469           * 4. Urgent pointer is not set.  Send urgent data ignoring the
 469  470           *    Nagle algorithm.  This reduces the probability that urgent
 470  471           *    bytes get "merged" together.
 471  472           * 5. The app has not closed the connection.  This eliminates the
 472  473           *    wait time of the receiving side waiting for the last piece of
 473  474           *    (small) data.
 474  475           *
 475  476           * If all are satisfied, exit without sending anything.  Note
 476  477           * that Nagle limit can be smaller than 1 MSS.  Nagle limit is
 477  478           * the smaller of 1 MSS and global tcp_naglim_def (default to be
 478  479           * 4095).
 479  480           */
 480  481          if (usable < (int)tcp->tcp_naglim &&
 481  482              tcp->tcp_naglim > tcp->tcp_last_sent_len &&
 482  483              snxt != tcp->tcp_suna &&
 483  484              !(tcp->tcp_valid_bits & TCP_URG_VALID) &&
 484  485              !(tcp->tcp_valid_bits & TCP_FSS_VALID)) {
 485  486                  goto done;
 486  487          }
 487  488  
 488  489          /*
 489  490           * If tcp_zero_win_probe is not set and the tcp->tcp_cork option
 490  491           * is set, then we have to force TCP not to send partial segment
 491  492           * (smaller than MSS bytes). We calculate the usable window here
 492  493           * based on full-mss segments and save any remaining data for
 493  494           * later. When tcp_zero_win_probe is set, TCP needs to send
 494  495           * something out to probe the zero window.
 495  496           */
 496  497          if (tcp->tcp_cork && !tcp->tcp_zero_win_probe) {
 497  498                  if (usable < mss)
 498  499                          goto done;
 499  500                  usable = (usable / mss) * mss;
 500  501          }
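                  /*
                   * Illustrative example (not part of this change): with
                   * mss = 1460 and usable = 5000, (5000 / 1460) * 1460 =
                   * 4380, i.e. three full segments; the remaining 620 bytes
                   * stay queued until more data arrives or the window opens
                   * further.
                   */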
 501  502  
 502  503          /* Update the latest receive window size in TCP header. */
 503  504          tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
 504  505  
 505  506          /* Send the packet. */
 506  507          rc = tcp_send(tcp, mss, total_hdr_len, tcp_hdr_len,
 507  508              num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail,
 508  509              local_time);
 509  510  
 510  511          /* Pretend that all we were trying to send really got sent */
 511  512          if (rc < 0 && tail_unsent < 0) {
 512  513                  do {
 513  514                          xmit_tail = xmit_tail->b_cont;
 514  515                          xmit_tail->b_prev = local_time;
 515  516                          ASSERT((uintptr_t)(xmit_tail->b_wptr -
 516  517                              xmit_tail->b_rptr) <= (uintptr_t)INT_MAX);
 517  518                          tail_unsent += (int)(xmit_tail->b_wptr -
 518  519                              xmit_tail->b_rptr);
 519  520                  } while (tail_unsent < 0);
 520  521          }
 521  522  done:;
 522  523          tcp->tcp_xmit_tail = xmit_tail;
 523  524          tcp->tcp_xmit_tail_unsent = tail_unsent;
 524  525          len = tcp->tcp_snxt - snxt;
 525  526          if (len) {
 526  527                  /*
 527  528                   * If new data was sent, need to update the notsack
 528  529                   * list, which is, after all, data blocks that have
 529  530                   * not been sack'ed by the receiver.  New data is
 530  531                   * not sack'ed.
 531  532                   */
 532  533                  if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
 533  534                          /* len is a negative value. */
 534  535                          tcp->tcp_pipe -= len;
 535  536                          tcp_notsack_update(&(tcp->tcp_notsack_list),
 536  537                              tcp->tcp_snxt, snxt,
 537  538                              &(tcp->tcp_num_notsack_blk),
 538  539                              &(tcp->tcp_cnt_notsack_list));
 539  540                  }
 540  541                  tcp->tcp_snxt = snxt + tcp->tcp_fin_sent;
 541  542                  tcp->tcp_rack = tcp->tcp_rnxt;
 542  543                  tcp->tcp_rack_cnt = 0;
 543  544                  if ((snxt + len) == tcp->tcp_suna) {
 544  545                          TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
 545  546                  }
 546  547          } else if (snxt == tcp->tcp_suna && tcp->tcp_swnd == 0) {
 547  548                  /*
 548  549                   * Didn't send anything. Make sure the timer is running
 549  550                   * so that we will probe a zero window.
 550  551                   */
 551  552                  TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
 552  553          }
 553  554          /* Note that len is the amount we just sent but with a negative sign */
 554  555          tcp->tcp_unsent += len;
 555  556          mutex_enter(&tcp->tcp_non_sq_lock);
 556  557          if (tcp->tcp_flow_stopped) {
 557  558                  if (TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
 558  559                          tcp_clrqfull(tcp);
 559  560                  }
 560  561          } else if (TCP_UNSENT_BYTES(tcp) >= connp->conn_sndbuf) {
 561  562                  if (!(tcp->tcp_detached))
 562  563                          tcp_setqfull(tcp);
 563  564          }
 564  565          mutex_exit(&tcp->tcp_non_sq_lock);
 565  566  }
 566  567  
 567  568  /*
 568  569   * Initial STREAMS write side put() procedure for sockets. It tries to
 569  570   * handle the T_CAPABILITY_REQ which sockfs sends down while setting
 570  571   * up the socket without using the squeue. Non T_CAPABILITY_REQ messages
 571  572   * are handled by tcp_wput() as usual.
 572  573   *
 573  574   * All further messages will also be handled by tcp_wput() because we cannot
 574  575   * be sure that the above short cut is safe later.
 575  576   */
 576  577  int
 577  578  tcp_wput_sock(queue_t *wq, mblk_t *mp)
 578  579  {
 579  580          conn_t                  *connp = Q_TO_CONN(wq);
 580  581          tcp_t                   *tcp = connp->conn_tcp;
 581  582          struct T_capability_req *car = (struct T_capability_req *)mp->b_rptr;
 582  583  
 583  584          ASSERT(wq->q_qinfo == &tcp_sock_winit);
 584  585          wq->q_qinfo = &tcp_winit;
 585  586  
 586  587          ASSERT(IPCL_IS_TCP(connp));
 587  588          ASSERT(TCP_IS_SOCKET(tcp));
 588  589  
 589  590          if (DB_TYPE(mp) == M_PCPROTO &&
 590  591              MBLKL(mp) == sizeof (struct T_capability_req) &&
 591  592              car->PRIM_type == T_CAPABILITY_REQ) {
 592  593                  tcp_capability_req(tcp, mp);
 593  594                  return (0);
 594  595          }
 595  596  
 596  597          tcp_wput(wq, mp);
 597  598          return (0);
 598  599  }
 599  600  
 600  601  /* ARGSUSED */
 601  602  int
 602  603  tcp_wput_fallback(queue_t *wq, mblk_t *mp)
 603  604  {
 604  605  #ifdef DEBUG
 605  606          cmn_err(CE_CONT, "tcp_wput_fallback: Message during fallback \n");
 606  607  #endif
 607  608          freemsg(mp);
 608  609          return (0);
 609  610  }
 610  611  
 611  612  /*
 612  613   * Called by tcp_wput() to handle misc non-M_DATA messages.
 613  614   */
 614  615  /* ARGSUSED */
 615  616  static void
 616  617  tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
 617  618  {
 618  619          conn_t  *connp = (conn_t *)arg;
 619  620          tcp_t   *tcp = connp->conn_tcp;
 620  621  
 621  622          ASSERT(DB_TYPE(mp) != M_IOCTL);
 622  623          /*
 623  624           * TCP is D_MP and qprocsoff() is done towards the end of the tcp_close.
 624  625           * Once the close starts, streamhead and sockfs will not let any data
 625  626           * packets come down (close ensures that there are no threads using the
 626  627           * queue and no new threads will come down) but since qprocsoff()
 627  628           * hasn't happened yet, a M_FLUSH or some non data message might
 628  629           * get reflected back (in response to our own FLUSHRW) and get
 629  630           * processed after tcp_close() is done. The conn would still be valid
 630  631           * because a ref would have been added, but we need to check the state
 631  632           * before actually processing the packet.
 632  633           */
 633  634          if (TCP_IS_DETACHED(tcp) || (tcp->tcp_state == TCPS_CLOSED)) {
 634  635                  freemsg(mp);
 635  636                  return;
 636  637          }
 637  638  
 638  639          switch (DB_TYPE(mp)) {
 639  640          case M_IOCDATA:
 640  641                  tcp_wput_iocdata(tcp, mp);
 641  642                  break;
 642  643          case M_FLUSH:
 643  644                  tcp_wput_flush(tcp, mp);
 644  645                  break;
 645  646          default:
 646  647                  ip_wput_nondata(connp->conn_wq, mp);
 647  648                  break;
 648  649          }
 649  650  }
 650  651  
 651  652  /* tcp_wput_flush is called by tcp_wput_nondata to handle M_FLUSH messages. */
 652  653  static void
 653  654  tcp_wput_flush(tcp_t *tcp, mblk_t *mp)
 654  655  {
 655  656          uchar_t fval = *mp->b_rptr;
 656  657          mblk_t  *tail;
 657  658          conn_t  *connp = tcp->tcp_connp;
 658  659          queue_t *q = connp->conn_wq;
 659  660  
 660  661          /* TODO: How should flush interact with urgent data? */
 661  662          if ((fval & FLUSHW) && tcp->tcp_xmit_head != NULL &&
 662  663              !(tcp->tcp_valid_bits & TCP_URG_VALID)) {
 663  664                  /*
 664  665                   * Flush only data that has not yet been put on the wire.  If
 665  666                   * we flush data that we have already transmitted, life, as we
 666  667                   * know it, may come to an end.
 667  668                   */
 668  669                  tail = tcp->tcp_xmit_tail;
 669  670                  tail->b_wptr -= tcp->tcp_xmit_tail_unsent;
 670  671                  tcp->tcp_xmit_tail_unsent = 0;
 671  672                  tcp->tcp_unsent = 0;
 672  673                  if (tail->b_wptr != tail->b_rptr)
 673  674                          tail = tail->b_cont;
 674  675                  if (tail) {
 675  676                          mblk_t **excess = &tcp->tcp_xmit_head;
 676  677                          for (;;) {
 677  678                                  mblk_t *mp1 = *excess;
 678  679                                  if (mp1 == tail)
 679  680                                          break;
 680  681                                  tcp->tcp_xmit_tail = mp1;
 681  682                                  tcp->tcp_xmit_last = mp1;
 682  683                                  excess = &mp1->b_cont;
 683  684                          }
 684  685                          *excess = NULL;
 685  686                          tcp_close_mpp(&tail);
 686  687                          if (tcp->tcp_snd_zcopy_aware)
 687  688                                  tcp_zcopy_notify(tcp);
 688  689                  }
 689  690                  /*
 690  691                   * We have no unsent data, so unsent must be less than
 691  692                   * conn_sndlowat, so re-enable flow.
 692  693                   */
 693  694                  mutex_enter(&tcp->tcp_non_sq_lock);
 694  695                  if (tcp->tcp_flow_stopped) {
 695  696                          tcp_clrqfull(tcp);
 696  697                  }
 697  698                  mutex_exit(&tcp->tcp_non_sq_lock);
 698  699          }
 699  700          /*
 700  701           * TODO: you can't just flush these, you have to increase rwnd for one
 701  702           * thing.  For another, how should urgent data interact?
 702  703           */
 703  704          if (fval & FLUSHR) {
 704  705                  *mp->b_rptr = fval & ~FLUSHW;
 705  706                  /* XXX */
 706  707                  qreply(q, mp);
 707  708                  return;
 708  709          }
 709  710          freemsg(mp);
 710  711  }
 711  712  
 712  713  /*
 713  714   * tcp_wput_iocdata is called by tcp_wput_nondata to handle all M_IOCDATA
 714  715   * messages.
 715  716   */
 716  717  static void
 717  718  tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp)
 718  719  {
 719  720          mblk_t          *mp1;
 720  721          struct iocblk   *iocp = (struct iocblk *)mp->b_rptr;
 721  722          STRUCT_HANDLE(strbuf, sb);
 722  723          uint_t          addrlen;
 723  724          conn_t          *connp = tcp->tcp_connp;
 724  725          queue_t         *q = connp->conn_wq;
 725  726  
 726  727          /* Make sure it is one of ours. */
 727  728          switch (iocp->ioc_cmd) {
 728  729          case TI_GETMYNAME:
 729  730          case TI_GETPEERNAME:
 730  731                  break;
 731  732          default:
 732  733                  /*
 733  734                   * If the conn is closing, then error the ioctl here. Otherwise
 734  735                   * use the CONN_IOCTLREF_* macros to hold off tcp_close until
 735  736                   * we're done here.
 736  737                   */
 737  738                  mutex_enter(&connp->conn_lock);
 738  739                  if (connp->conn_state_flags & CONN_CLOSING) {
 739  740                          mutex_exit(&connp->conn_lock);
 740  741                          iocp->ioc_error = EINVAL;
 741  742                          mp->b_datap->db_type = M_IOCNAK;
 742  743                          iocp->ioc_count = 0;
 743  744                          qreply(q, mp);
 744  745                          return;
 745  746                  }
 746  747  
 747  748                  CONN_INC_IOCTLREF_LOCKED(connp);
 748  749                  ip_wput_nondata(q, mp);
 749  750                  CONN_DEC_IOCTLREF(connp);
 750  751                  return;
 751  752          }
 752  753          switch (mi_copy_state(q, mp, &mp1)) {
 753  754          case -1:
 754  755                  return;
 755  756          case MI_COPY_CASE(MI_COPY_IN, 1):
 756  757                  break;
 757  758          case MI_COPY_CASE(MI_COPY_OUT, 1):
 758  759                  /* Copy out the strbuf. */
 759  760                  mi_copyout(q, mp);
 760  761                  return;
 761  762          case MI_COPY_CASE(MI_COPY_OUT, 2):
 762  763                  /* All done. */
 763  764                  mi_copy_done(q, mp, 0);
 764  765                  return;
 765  766          default:
 766  767                  mi_copy_done(q, mp, EPROTO);
 767  768                  return;
 768  769          }
 769  770          /* Check alignment of the strbuf */
 770  771          if (!OK_32PTR(mp1->b_rptr)) {
 771  772                  mi_copy_done(q, mp, EINVAL);
 772  773                  return;
 773  774          }
 774  775  
 775  776          STRUCT_SET_HANDLE(sb, iocp->ioc_flag, (void *)mp1->b_rptr);
 776  777  
 777  778          if (connp->conn_family == AF_INET)
 778  779                  addrlen = sizeof (sin_t);
 779  780          else
 780  781                  addrlen = sizeof (sin6_t);
 781  782  
 782  783          if (STRUCT_FGET(sb, maxlen) < addrlen) {
 783  784                  mi_copy_done(q, mp, EINVAL);
 784  785                  return;
 785  786          }
 786  787  
 787  788          switch (iocp->ioc_cmd) {
 788  789          case TI_GETMYNAME:
 789  790                  break;
 790  791          case TI_GETPEERNAME:
 791  792                  if (tcp->tcp_state < TCPS_SYN_RCVD) {
 792  793                          mi_copy_done(q, mp, ENOTCONN);
 793  794                          return;
 794  795                  }
 795  796                  break;
 796  797          }
 797  798          mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
 798  799          if (!mp1)
 799  800                  return;
 800  801  
 801  802          STRUCT_FSET(sb, len, addrlen);
 802  803          switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
 803  804          case TI_GETMYNAME:
 804  805                  (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr,
 805  806                      &addrlen);
 806  807                  break;
 807  808          case TI_GETPEERNAME:
 808  809                  (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr,
 809  810                      &addrlen);
 810  811                  break;
 811  812          }
 812  813          mp1->b_wptr += addrlen;
 813  814          /* Copy out the address */
 814  815          mi_copyout(q, mp);
 815  816  }
 816  817  
 817  818  /*
 818  819   * tcp_wput_ioctl is called by tcp_wput_nondata() to handle all M_IOCTL
 819  820   * messages.
 820  821   */
 821  822  /* ARGSUSED */
 822  823  static void
 823  824  tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
 824  825  {
 825  826          conn_t          *connp = (conn_t *)arg;
 826  827          tcp_t           *tcp = connp->conn_tcp;
 827  828          queue_t         *q = connp->conn_wq;
 828  829          struct iocblk   *iocp;
 829  830  
 830  831          ASSERT(DB_TYPE(mp) == M_IOCTL);
 831  832          /*
 832  833           * Try and ASSERT the minimum possible references on the
 833  834           * conn early enough. Since we are executing on write side,
 834  835           * the connection is obviously not detached and that means
 835  836           * there is a ref each for TCP and IP. Since we are behind
 836  837           * the squeue, the minimum references needed are 3. If the
 837  838           * conn is in classifier hash list, there should be an
 838  839           * extra ref for that (we check both the possibilities).
 839  840           */
 840  841          ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
 841  842              (connp->conn_fanout == NULL && connp->conn_ref >= 3));
 842  843  
 843  844          iocp = (struct iocblk *)mp->b_rptr;
 844  845          switch (iocp->ioc_cmd) {
 845  846          case _SIOCSOCKFALLBACK:
 846  847                  /*
 847  848                   * Either sockmod is about to be popped and the socket
 848  849                   * would now be treated as a plain stream, or a module
 849  850                   * is about to be pushed so we could no longer use read-
 850  851                   * side synchronous streams for fused loopback tcp.
 851  852                   * Drain any queued data and disable direct sockfs
 852  853                   * interface from now on.
 853  854                   */
 854  855                  if (!tcp->tcp_issocket) {
 855  856                          DB_TYPE(mp) = M_IOCNAK;
 856  857                          iocp->ioc_error = EINVAL;
 857  858                  } else {
 858  859                          tcp_use_pure_tpi(tcp);
 859  860                          DB_TYPE(mp) = M_IOCACK;
 860  861                          iocp->ioc_error = 0;
 861  862                  }
 862  863                  iocp->ioc_count = 0;
 863  864                  iocp->ioc_rval = 0;
 864  865                  qreply(q, mp);
 865  866                  return;
 866  867          }
 867  868  
 868  869          /*
 869  870           * If the conn is closing, then error the ioctl here. Otherwise bump the
 870  871           * conn_ioctlref to hold off tcp_close until we're done here.
 871  872           */
 872  873          mutex_enter(&(connp)->conn_lock);
 873  874          if ((connp)->conn_state_flags & CONN_CLOSING) {
 874  875                  mutex_exit(&(connp)->conn_lock);
 875  876                  iocp->ioc_error = EINVAL;
 876  877                  mp->b_datap->db_type = M_IOCNAK;
 877  878                  iocp->ioc_count = 0;
 878  879                  qreply(q, mp);
 879  880                  return;
 880  881          }
 881  882  
 882  883          CONN_INC_IOCTLREF_LOCKED(connp);
 883  884          ip_wput_nondata(q, mp);
 884  885          CONN_DEC_IOCTLREF(connp);
 885  886  }
 886  887  
 887  888  /*
 888  889   * This routine is called by tcp_wput() to handle all TPI requests.
 889  890   */
 890  891  /* ARGSUSED */
 891  892  static void
 892  893  tcp_wput_proto(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
 893  894  {
 894  895          conn_t          *connp = (conn_t *)arg;
 895  896          tcp_t           *tcp = connp->conn_tcp;
 896  897          union T_primitives *tprim = (union T_primitives *)mp->b_rptr;
 897  898          uchar_t         *rptr;
 898  899          t_scalar_t      type;
 899  900          cred_t          *cr;
 900  901  
 901  902          /*
 902  903           * Try and ASSERT the minimum possible references on the
 903  904           * conn early enough. Since we are executing on write side,
 904  905           * the connection is obviously not detached and that means
 905  906           * there is a ref each for TCP and IP. Since we are behind
 906  907           * the squeue, the minimum references needed are 3. If the
 907  908           * conn is in classifier hash list, there should be an
 908  909           * extra ref for that (we check both the possibilities).
 909  910           */
 910  911          ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
 911  912              (connp->conn_fanout == NULL && connp->conn_ref >= 3));
 912  913  
 913  914          rptr = mp->b_rptr;
 914  915          ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX);
 915  916          if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) {
 916  917                  type = ((union T_primitives *)rptr)->type;
 917  918                  if (type == T_EXDATA_REQ) {
 918  919                          tcp_output_urgent(connp, mp, arg2, NULL);
 919  920                  } else if (type != T_DATA_REQ) {
 920  921                          goto non_urgent_data;
 921  922                  } else {
 922  923                          /* TODO: options, flags, ... from user */
 923  924                          /* Set length to zero for reclamation below */
 924  925                          tcp_wput_data(tcp, mp->b_cont, B_TRUE);
 925  926                          freeb(mp);
 926  927                  }
 927  928                  return;
 928  929          } else {
 929  930                  if (connp->conn_debug) {
 930  931                          (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
 931  932                              "tcp_wput_proto, dropping one...");
 932  933                  }
 933  934                  freemsg(mp);
 934  935                  return;
 935  936          }
 936  937  
 937  938  non_urgent_data:
 938  939  
 939  940          switch ((int)tprim->type) {
 940  941          case O_T_BIND_REQ:      /* bind request */
 941  942          case T_BIND_REQ:        /* new semantics bind request */
 942  943                  tcp_tpi_bind(tcp, mp);
 943  944                  break;
 944  945          case T_UNBIND_REQ:      /* unbind request */
 945  946                  tcp_tpi_unbind(tcp, mp);
 946  947                  break;
 947  948          case O_T_CONN_RES:      /* old connection response XXX */
 948  949          case T_CONN_RES:        /* connection response */
 949  950                  tcp_tli_accept(tcp, mp);
 950  951                  break;
 951  952          case T_CONN_REQ:        /* connection request */
 952  953                  tcp_tpi_connect(tcp, mp);
 953  954                  break;
 954  955          case T_DISCON_REQ:      /* disconnect request */
 955  956                  tcp_disconnect(tcp, mp);
 956  957                  break;
 957  958          case T_CAPABILITY_REQ:
 958  959                  tcp_capability_req(tcp, mp);    /* capability request */
 959  960                  break;
 960  961          case T_INFO_REQ:        /* information request */
 961  962                  tcp_info_req(tcp, mp);
 962  963                  break;
 963  964          case T_SVR4_OPTMGMT_REQ:        /* manage options req */
 964  965          case T_OPTMGMT_REQ:
 965  966                  /*
 966  967                   * Note:  no support for snmpcom_req() through new
 967  968                   * T_OPTMGMT_REQ. See comments in ip.c
 968  969                   */
 969  970  
 970  971                  /*
 971  972                   * All Solaris components should pass a db_credp
 972  973                   * for this TPI message, hence we ASSERT.
 973  974                   * But in case there is some other M_PROTO that looks
 974  975                   * like a TPI message sent by some other kernel
 975  976                   * component, we check and return an error.
 976  977                   */
 977  978                  cr = msg_getcred(mp, NULL);
 978  979                  ASSERT(cr != NULL);
 979  980                  if (cr == NULL) {
 980  981                          tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
 981  982                          return;
 982  983                  }
 983  984                  /*
 984  985                   * If EINPROGRESS is returned, the request has been queued
 985  986                   * for subsequent processing by ip_restart_optmgmt(), which
 986  987                   * will do the CONN_DEC_REF().
 987  988                   */
 988  989                  if ((int)tprim->type == T_SVR4_OPTMGMT_REQ) {
 989  990                          svr4_optcom_req(connp->conn_wq, mp, cr, &tcp_opt_obj);
 990  991                  } else {
 991  992                          tpi_optcom_req(connp->conn_wq, mp, cr, &tcp_opt_obj);
 992  993                  }
 993  994                  break;
 994  995  
 995  996          case T_UNITDATA_REQ:    /* unitdata request */
 996  997                  tcp_err_ack(tcp, mp, TNOTSUPPORT, 0);
 997  998                  break;
 998  999          case T_ORDREL_REQ:      /* orderly release req */
 999 1000                  freemsg(mp);
1000 1001  
1001 1002                  if (tcp->tcp_fused)
1002 1003                          tcp_unfuse(tcp);
1003 1004  
1004 1005                  if (tcp_xmit_end(tcp) != 0) {
1005 1006                          /*
1006 1007                           * We were crossing FINs and got a reset from
1007 1008                           * the other side. Just ignore it.
1008 1009                           */
1009 1010                          if (connp->conn_debug) {
1010 1011                                  (void) strlog(TCP_MOD_ID, 0, 1,
1011 1012                                      SL_ERROR|SL_TRACE,
1012 1013                                      "tcp_wput_proto, T_ORDREL_REQ out of "
1013 1014                                      "state %s",
1014 1015                                      tcp_display(tcp, NULL,
1015 1016                                      DISP_ADDR_AND_PORT));
1016 1017                          }
1017 1018                  }
1018 1019                  break;
1019 1020          case T_ADDR_REQ:
1020 1021                  tcp_addr_req(tcp, mp);
1021 1022                  break;
1022 1023          default:
1023 1024                  if (connp->conn_debug) {
1024 1025                          (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
1025 1026                              "tcp_wput_proto, bogus TPI msg, type %d",
1026 1027                              tprim->type);
1027 1028                  }
1028 1029                  /*
1029 1030                   * We used to M_ERROR.  Sending TNOTSUPPORT gives the user
1030 1031                   * a chance to recover.
1031 1032                   */
1032 1033                  tcp_err_ack(tcp, mp, TNOTSUPPORT, 0);
1033 1034                  break;
1034 1035          }
1035 1036  }
1036 1037  
1037 1038  /*
1038 1039   * Handle special out-of-band ioctl requests (see PSARC/2008/265).
1039 1040   */
1040 1041  static void
1041 1042  tcp_wput_cmdblk(queue_t *q, mblk_t *mp)
1042 1043  {
1043 1044          void    *data;
1044 1045          mblk_t  *datamp = mp->b_cont;
1045 1046          conn_t  *connp = Q_TO_CONN(q);
1046 1047          tcp_t   *tcp = connp->conn_tcp;
1047 1048          cmdblk_t *cmdp = (cmdblk_t *)mp->b_rptr;
1048 1049  
1049 1050          if (datamp == NULL || MBLKL(datamp) < cmdp->cb_len) {
1050 1051                  cmdp->cb_error = EPROTO;
1051 1052                  qreply(q, mp);
1052 1053                  return;
1053 1054          }
1054 1055  
1055 1056          data = datamp->b_rptr;
1056 1057  
1057 1058          switch (cmdp->cb_cmd) {
1058 1059          case TI_GETPEERNAME:
1059 1060                  if (tcp->tcp_state < TCPS_SYN_RCVD)
1060 1061                          cmdp->cb_error = ENOTCONN;
1061 1062                  else
1062 1063                          cmdp->cb_error = conn_getpeername(connp, data,
1063 1064                              &cmdp->cb_len);
1064 1065                  break;
1065 1066          case TI_GETMYNAME:
1066 1067                  cmdp->cb_error = conn_getsockname(connp, data, &cmdp->cb_len);
1067 1068                  break;
1068 1069          default:
1069 1070                  cmdp->cb_error = EINVAL;
1070 1071                  break;
1071 1072          }
1072 1073  
1073 1074          qreply(q, mp);
1074 1075  }
1075 1076  
1076 1077  /*
1077 1078   * The TCP fast path write put procedure.
1078 1079   * NOTE: the logic of the fast path is duplicated from tcp_wput_data()
1079 1080   */
1080 1081  /* ARGSUSED */
1081 1082  void
1082 1083  tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1083 1084  {
1084 1085          int             len;
1085 1086          int             hdrlen;
1086 1087          int             plen;
1087 1088          mblk_t          *mp1;
1088 1089          uchar_t         *rptr;
1089 1090          uint32_t        snxt;
1090 1091          tcpha_t         *tcpha;
1091 1092          struct datab    *db;
1092 1093          uint32_t        suna;
1093 1094          uint32_t        mss;
1094 1095          ipaddr_t        *dst;
1095 1096          ipaddr_t        *src;
1096 1097          uint32_t        sum;
1097 1098          int             usable;
1098 1099          conn_t          *connp = (conn_t *)arg;
1099 1100          tcp_t           *tcp = connp->conn_tcp;
1100 1101          uint32_t        msize;
1101 1102          tcp_stack_t     *tcps = tcp->tcp_tcps;
1102 1103          ip_xmit_attr_t  *ixa;
1103 1104          clock_t         now;
1104 1105  
1105 1106          /*
1106 1107           * Try and ASSERT the minimum possible references on the
1107 1108           * conn early enough. Since we are executing on write side,
1108 1109           * the connection is obviously not detached and that means
1109 1110           * there is a ref each for TCP and IP. Since we are behind
1110 1111           * the squeue, the minimum references needed are 3. If the
1111 1112           * conn is in classifier hash list, there should be an
1112 1113           * extra ref for that (we check both the possibilities).
1113 1114           */
1114 1115          ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
1115 1116              (connp->conn_fanout == NULL && connp->conn_ref >= 3));
1116 1117  
1117 1118          ASSERT(DB_TYPE(mp) == M_DATA);
1118 1119          msize = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
1119 1120  
1120 1121          mutex_enter(&tcp->tcp_non_sq_lock);
1121 1122          tcp->tcp_squeue_bytes -= msize;
1122 1123          mutex_exit(&tcp->tcp_non_sq_lock);
1123 1124  
1124 1125          /* Bypass tcp protocol for fused tcp loopback */
1125 1126          if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize))
1126 1127                  return;
1127 1128  
1128 1129          mss = tcp->tcp_mss;
1129 1130          /*
1130 1131           * If ZEROCOPY has been turned off, try not to send any zero-copy
1131 1132           * messages down. Back off now.
1132 1133           */
1133 1134          if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_snd_zcopy_on)
1134 1135                  mp = tcp_zcopy_backoff(tcp, mp, B_FALSE);
1135 1136  
1136 1137  
1137 1138          ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
1138 1139          len = (int)(mp->b_wptr - mp->b_rptr);
1139 1140  
1140 1141          /*
1141 1142           * Criteria for fast path:
1142 1143           *   1. no unsent data
1143 1144           *   2. TCP_CORK not set
1144 1145           *   3. single mblk in request
1145 1146           *   4. connection established
1146 1147           *   5. data in mblk
1147 1148           *   6. len <= mss
1148 1149           *   7. no tcp_valid bits
1149 1150           */
1150 1151          if ((tcp->tcp_unsent != 0) ||
1151 1152              (tcp->tcp_cork) ||
1152 1153              (mp->b_cont != NULL) ||
1153 1154              (tcp->tcp_state != TCPS_ESTABLISHED) ||
1154 1155              (len == 0) ||
1155 1156              (len > mss) ||
1156 1157              (tcp->tcp_valid_bits != 0)) {
1157 1158                  tcp_wput_data(tcp, mp, B_FALSE);
1158 1159                  return;
1159 1160          }
1160 1161  
1161 1162          ASSERT(tcp->tcp_xmit_tail_unsent == 0);
1162 1163          ASSERT(tcp->tcp_fin_sent == 0);
1163 1164  
1164 1165          /* queue new packet onto retransmission queue */
1165 1166          if (tcp->tcp_xmit_head == NULL) {
1166 1167                  tcp->tcp_xmit_head = mp;
1167 1168          } else {
1168 1169                  tcp->tcp_xmit_last->b_cont = mp;
1169 1170          }
1170 1171          tcp->tcp_xmit_last = mp;
1171 1172          tcp->tcp_xmit_tail = mp;
1172 1173  
1173 1174          /* find out how much we can send */
1174 1175          /* BEGIN CSTYLED */
1175 1176          /*
  
    (708 lines elided)
  
1176 1177           *    un-acked     usable
1177 1178           *  |--------------|-----------------|
1178 1179           *  tcp_suna       tcp_snxt       tcp_suna+tcp_swnd
1179 1180           */
1180 1181          /* END CSTYLED */
1181 1182  
1182 1183          /* start sending from tcp_snxt */
1183 1184          snxt = tcp->tcp_snxt;
1184 1185  
1185 1186          /*
1186      -         * Check to see if this connection has been idled for some
1187      -         * time and no ACK is expected.  If it is, we need to slow
1188      -         * start again to get back the connection's "self-clock" as
1189      -         * described in VJ's paper.
     1187 +         * Check to see if this connection has been idle for some time and no
     1188 +         * ACK is expected. If so, then the congestion window size is no longer
     1189 +         * meaningfully tied to current network conditions.
1190 1190           *
1191      -         * Reinitialize tcp_cwnd after idle.
     1191 +         * We reinitialize tcp_cwnd, and slow start again to get back the
     1192 +         * connection's "self-clock" as described in Van Jacobson's 1988 paper
     1193 +         * "Congestion avoidance and control".
1192 1194           */
1193 1195          now = LBOLT_FASTPATH;
1194 1196          if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
1195 1197              (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
1196 1198                  TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
1197 1199          }
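                /*
                 * Illustrative example (hypothetical numbers, not part of
                 * this change): with tcp_rto at 400 ms, a connection to a
                 * remote network that has been idle for 2 s with all data
                 * acked takes this branch, and the congestion window is
                 * reset before the new burst goes out.
                 */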
1198 1200  
1199 1201          usable = tcp->tcp_swnd;         /* tcp window size */
1200 1202          if (usable > tcp->tcp_cwnd)
1201 1203                  usable = tcp->tcp_cwnd; /* congestion window smaller */
1202 1204          usable -= snxt;         /* subtract stuff already sent */
1203 1205          suna = tcp->tcp_suna;
1204 1206          usable += suna;
1205 1207          /* usable can be < 0 if the congestion window is smaller */
1206 1208          if (len > usable) {
1207 1209                  /* Can't send complete M_DATA in one shot */
1208 1210                  goto slow;
1209 1211          }
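        /*
         * Worked example of the window arithmetic above (hypothetical
         * values): with tcp_swnd = 64240, tcp_cwnd = 2920, tcp_snxt = 3000
         * and tcp_suna = 2000, usable is min(64240, 2920) - 3000 + 2000 =
         * 1920 bytes, so a single 1460-byte mblk passes while a larger
         * write takes the slow path.
         */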
1210 1212  
1211 1213          mutex_enter(&tcp->tcp_non_sq_lock);
1212 1214          if (tcp->tcp_flow_stopped &&
1213 1215              TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
1214 1216                  tcp_clrqfull(tcp);
1215 1217          }
1216 1218          mutex_exit(&tcp->tcp_non_sq_lock);
1217 1219  
1218 1220          /*
1219 1221           * Determine if there is anything to send (Nagle).  Hold the data if:
1220 1222           *
1221 1223           *   1. len < tcp_mss (i.e. small)
1222 1224           *   2. unacknowledged data is present
1223 1225           *   3. len < nagle limit
1224 1226           *   4. the last packet sent was < nagle limit
1225 1227           */
1226 1228          if ((len < mss) && (snxt != suna) &&
1227 1229              (len < (int)tcp->tcp_naglim) &&
1228 1230              (tcp->tcp_last_sent_len < tcp->tcp_naglim)) {
1229 1231                  /*
1230 1232                   * This was the first unsent packet and normally
1231 1233                   * mss < xmit_hiwater so there is no need to worry
1232 1234                   * about flow control. The next packet will go
1233 1235                   * through the flow control check in tcp_wput_data().
1234 1236                   */
1235 1237                  /* leftover work from above */
1236 1238                  tcp->tcp_unsent = len;
1237 1239                  tcp->tcp_xmit_tail_unsent = len;
1238 1240  
1239 1241                  return;
1240 1242          }
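        /*
         * Example of the Nagle test above (hypothetical): a 100-byte write
         * while 500 bytes are still unacked (snxt != suna), with the
         * previous segment also small, returns here without transmitting;
         * the data stays on the transmit list to be coalesced and sent once
         * the outstanding data is acked.
         */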
1241 1243  
1242 1244          /*
1243 1245           * len <= tcp->tcp_mss && len == unsent, so there is no sender silly
1244 1246           * window to avoid.  Can send now.
1245 1247           */
1246 1248  
1247 1249          if (snxt == suna) {
1248 1250                  TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
  
    (47 lines elided)
  
1249 1251          }
1250 1252  
1251 1253          /* we have always sent something */
1252 1254          tcp->tcp_rack_cnt = 0;
1253 1255  
1254 1256          tcp->tcp_snxt = snxt + len;
1255 1257          tcp->tcp_rack = tcp->tcp_rnxt;
1256 1258  
1257 1259          if ((mp1 = dupb(mp)) == NULL)
1258 1260                  goto no_memory;
1259      -        mp->b_prev = (mblk_t *)(uintptr_t)now;
     1261 +        mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
1260 1262          mp->b_next = (mblk_t *)(uintptr_t)snxt;
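        /*
         * b_prev now carries the high-resolution send time from gethrtime()
         * and b_next the starting sequence number; when this segment is
         * acked, the input path can derive an RTT sample in nanoseconds
         * from these fields, which is the point of this change.
         */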
1261 1263  
1262 1264          /* adjust tcp header information */
1263 1265          tcpha = tcp->tcp_tcpha;
1264 1266          tcpha->tha_flags = (TH_ACK|TH_PUSH);
1265 1267  
1266 1268          sum = len + connp->conn_ht_ulp_len + connp->conn_sum;
1267 1269          sum = (sum >> 16) + (sum & 0xFFFF);
1268 1270          tcpha->tha_sum = htons(sum);
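        /*
         * Sketch of the 16-bit fold above (illustrative values): if the
         * partial pseudo-header sum is 0x12345, then
         *
         *      sum = (0x12345 >> 16) + (0x12345 & 0xFFFF);
         *
         * yields 0x1 + 0x2345 = 0x2346, which fits the 16-bit tha_sum field
         * that IP (or the NIC) later completes and complements.
         */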
1269 1271  
1270 1272          tcpha->tha_seq = htonl(snxt);
1271 1273  
1272 1274          TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
1273 1275          TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
1274 1276          BUMP_LOCAL(tcp->tcp_obsegs);
1275 1277  
1276 1278          /* Update the latest receive window size in TCP header. */
1277 1279          tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
1278 1280  
1279 1281          tcp->tcp_last_sent_len = (ushort_t)len;
1280 1282  
1281 1283          plen = len + connp->conn_ht_iphc_len;
1282 1284  
1283 1285          ixa = connp->conn_ixa;
1284 1286          ixa->ixa_pktlen = plen;
1285 1287  
1286 1288          if (ixa->ixa_flags & IXAF_IS_IPV4) {
1287 1289                  tcp->tcp_ipha->ipha_length = htons(plen);
1288 1290          } else {
1289 1291                  tcp->tcp_ip6h->ip6_plen = htons(plen - IPV6_HDR_LEN);
1290 1292          }
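        /*
         * Note the asymmetry above: IPv4's ipha_length covers the IP header
         * itself, while IPv6's ip6_plen counts only the payload, hence the
         * IPV6_HDR_LEN subtraction.
         */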
1291 1293  
1292 1294          /* see if we need to allocate a mblk for the headers */
1293 1295          hdrlen = connp->conn_ht_iphc_len;
1294 1296          rptr = mp1->b_rptr - hdrlen;
1295 1297          db = mp1->b_datap;
1296 1298          if ((db->db_ref != 2) || rptr < db->db_base ||
1297 1299              (!OK_32PTR(rptr))) {
1298 1300                  /* NOTE: we assume allocb returns an OK_32PTR */
1299 1301                  mp = allocb(hdrlen + tcps->tcps_wroff_xtra, BPRI_MED);
1300 1302                  if (mp == NULL) {
1301 1303                          freemsg(mp1);
1302 1304                          goto no_memory;
1303 1305                  }
  
    (34 lines elided)
  
1304 1306                  mp->b_cont = mp1;
1305 1307                  mp1 = mp;
1306 1308                  /* Leave room for Link Level header */
1307 1309                  rptr = &mp1->b_rptr[tcps->tcps_wroff_xtra];
1308 1310                  mp1->b_wptr = &rptr[hdrlen];
1309 1311          }
1310 1312          mp1->b_rptr = rptr;
1311 1313  
1312 1314          /* Fill in the timestamp option. */
1313 1315          if (tcp->tcp_snd_ts_ok) {
1314      -                uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
1315      -
1316      -                U32_TO_BE32(llbolt,
1317      -                    (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
     1316 +                U32_TO_BE32(now,
     1317 +                    (char *)tcpha + TCP_MIN_HEADER_LENGTH + 4);
1318 1318                  U32_TO_BE32(tcp->tcp_ts_recent,
1319      -                    (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
     1319 +                    (char *)tcpha + TCP_MIN_HEADER_LENGTH + 8);
1320 1320          } else {
1321 1321                  ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
1322 1322          }
1323 1323  
1324 1324          /* copy header into outgoing packet */
1325 1325          dst = (ipaddr_t *)rptr;
1326 1326          src = (ipaddr_t *)connp->conn_ht_iphc;
1327 1327          dst[0] = src[0];
1328 1328          dst[1] = src[1];
1329 1329          dst[2] = src[2];
1330 1330          dst[3] = src[3];
1331 1331          dst[4] = src[4];
1332 1332          dst[5] = src[5];
1333 1333          dst[6] = src[6];
1334 1334          dst[7] = src[7];
1335 1335          dst[8] = src[8];
1336 1336          dst[9] = src[9];
1337 1337          if ((hdrlen -= 40) != 0) {
1338 1338                  hdrlen >>= 2;
1339 1339                  dst += 10;
1340 1340                  src += 10;
1341 1341                  do {
1342 1342                          *dst++ = *src++;
1343 1343                  } while (--hdrlen);
1344 1344          }
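        /*
         * The ten word assignments above copy the 40-byte minimal IPv4 plus
         * TCP header from the template; any remaining header bytes (IP or
         * TCP options) are copied four bytes at a time by the loop, since
         * hdrlen is always a multiple of four here.
         */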
1345 1345  
1346 1346          /*
1347 1347           * Set the ECN info in the TCP header.  Note that this
1348 1348           * is not the template header.
1349 1349           */
1350 1350          if (tcp->tcp_ecn_ok) {
1351 1351                  TCP_SET_ECT(tcp, rptr);
1352 1352  
1353 1353                  tcpha = (tcpha_t *)(rptr + ixa->ixa_ip_hdr_length);
1354 1354                  if (tcp->tcp_ecn_echo_on)
1355 1355                          tcpha->tha_flags |= TH_ECE;
1356 1356                  if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) {
1357 1357                          tcpha->tha_flags |= TH_CWR;
1358 1358                          tcp->tcp_ecn_cwr_sent = B_TRUE;
1359 1359                  }
1360 1360          }
1361 1361  
1362 1362          if (tcp->tcp_ip_forward_progress) {
1363 1363                  tcp->tcp_ip_forward_progress = B_FALSE;
1364 1364                  connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF;
1365 1365          } else {
1366 1366                  connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF;
1367 1367          }
1368 1368          tcp_send_data(tcp, mp1);
1369 1369          return;
1370 1370  
1371 1371          /*
1372 1372           * If we ran out of memory, we pretend to have sent the packet
1373 1373           * and that it was lost on the wire.
1374 1374           */
1375 1375  no_memory:
1376 1376          return;
1377 1377  
1378 1378  slow:
1379 1379          /* leftover work from above */
1380 1380          tcp->tcp_unsent = len;
1381 1381          tcp->tcp_xmit_tail_unsent = len;
1382 1382          tcp_wput_data(tcp, NULL, B_FALSE);
1383 1383  }
1384 1384  
1385 1385  /* ARGSUSED2 */
1386 1386  void
1387 1387  tcp_output_urgent(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1388 1388  {
1389 1389          int len;
1390 1390          uint32_t msize;
1391 1391          conn_t *connp = (conn_t *)arg;
1392 1392          tcp_t *tcp = connp->conn_tcp;
1393 1393  
1394 1394          msize = msgdsize(mp);
1395 1395  
1396 1396          len = msize - 1;
1397 1397          if (len < 0) {
1398 1398                  freemsg(mp);
1399 1399                  return;
1400 1400          }
1401 1401  
1402 1402          /*
1403 1403           * Try to force urgent data out on the wire. Even if we have unsent
1404 1404           * data this will at least send the urgent flag.
1405 1405           * XXX does not handle the "more" flag correctly.
1406 1406           */
1407 1407          len += tcp->tcp_unsent;
1408 1408          len += tcp->tcp_snxt;
1409 1409          tcp->tcp_urg = len;
1410 1410          tcp->tcp_valid_bits |= TCP_URG_VALID;
1411 1411  
1412 1412          /* Bypass tcp protocol for fused tcp loopback */
1413 1413          if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize))
1414 1414                  return;
1415 1415  
1416 1416          /* Strip off the T_EXDATA_REQ if the data is from TPI */
1417 1417          if (DB_TYPE(mp) != M_DATA) {
1418 1418                  mblk_t *mp1 = mp;
1419 1419                  ASSERT(!IPCL_IS_NONSTR(connp));
1420 1420                  mp = mp->b_cont;
1421 1421                  freeb(mp1);
1422 1422          }
1423 1423          tcp_wput_data(tcp, mp, B_TRUE);
1424 1424  }
1425 1425  
1426 1426  /*
1427 1427   * Called by the streams close routine via squeues when our client blows off
1428 1428   * its descriptor.  We take this to mean: "close the stream state NOW, close
1429 1429   * the tcp connection politely."  When SO_LINGER is set (with a non-zero
1430 1430   * linger time and it is not a nonblocking socket), this routine sleeps until
1431 1431   * the FIN is acked.
1432 1432   *
1433 1433   * NOTE: tcp_close potentially returns error when lingering.
1434 1434   * However, the stream head currently does not pass these errors
1435 1435   * to the application. 4.4BSD only returns EINTR and EWOULDBLOCK
1436 1436   * errors to the application (from tsleep()) and not errors
1437 1437   * like ECONNRESET caused by receiving a reset packet.
1438 1438   */
1439 1439  
1440 1440  /* ARGSUSED */
1441 1441  void
1442 1442  tcp_close_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1443 1443  {
1444 1444          char    *msg;
1445 1445          conn_t  *connp = (conn_t *)arg;
1446 1446          tcp_t   *tcp = connp->conn_tcp;
1447 1447          clock_t delta = 0;
1448 1448          tcp_stack_t     *tcps = tcp->tcp_tcps;
1449 1449  
1450 1450          /*
1451 1451           * When a non-STREAMS socket is being closed, it does not always
1452 1452           * stick around waiting for tcp_close_output to run and can therefore
1453 1453           * have dropped a reference already. So adjust the asserts accordingly.
1454 1454           */
1455 1455          ASSERT((connp->conn_fanout != NULL &&
1456 1456              connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 3 : 4)) ||
1457 1457              (connp->conn_fanout == NULL &&
1458 1458              connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 2 : 3)));
1459 1459  
1460 1460          mutex_enter(&tcp->tcp_eager_lock);
1461 1461          if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) {
1462 1462                  /*
1463 1463                   * Cleanup for listener. For non-STREAM sockets sockfs will
1464 1464                   * close all the eagers on 'q', so in that case only deal
1465 1465                   * with 'q0'.
1466 1466                   */
1467 1467                  tcp_eager_cleanup(tcp, IPCL_IS_NONSTR(connp) ? 1 : 0);
1468 1468                  tcp->tcp_wait_for_eagers = 1;
1469 1469          }
1470 1470          mutex_exit(&tcp->tcp_eager_lock);
1471 1471  
1472 1472          tcp->tcp_lso = B_FALSE;
1473 1473  
1474 1474          msg = NULL;
1475 1475          switch (tcp->tcp_state) {
1476 1476          case TCPS_CLOSED:
1477 1477          case TCPS_IDLE:
1478 1478                  break;
1479 1479          case TCPS_BOUND:
1480 1480                  if (tcp->tcp_listener != NULL) {
1481 1481                          ASSERT(IPCL_IS_NONSTR(connp));
1482 1482                          /*
1483 1483                           * Unlink from the listener and drop the reference
1484 1484                           * put on it by the eager. tcp_closei_local will not
1485 1485                           * do it because tcp_tconnind_started is TRUE.
1486 1486                           */
1487 1487                          mutex_enter(&tcp->tcp_saved_listener->tcp_eager_lock);
1488 1488                          tcp_eager_unlink(tcp);
1489 1489                          mutex_exit(&tcp->tcp_saved_listener->tcp_eager_lock);
1490 1490                          CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp);
1491 1491                  }
1492 1492                  break;
1493 1493          case TCPS_LISTEN:
1494 1494                  break;
1495 1495          case TCPS_SYN_SENT:
1496 1496                  msg = "tcp_close, during connect";
1497 1497                  break;
1498 1498          case TCPS_SYN_RCVD:
1499 1499                  /*
1500 1500                   * Close during the connect 3-way handshake;
1501 1501                   * there may or may not be pending data already
1502 1502                   * on the queue.  Process almost the same as in
1503 1503                   * the ESTABLISHED state.
1504 1504                   */
1505 1505                  /* FALLTHRU */
1506 1506          default:
1507 1507                  if (tcp->tcp_fused)
1508 1508                          tcp_unfuse(tcp);
1509 1509  
1510 1510                  /*
1511 1511                   * If SO_LINGER has set a zero linger time, abort the
1512 1512                   * connection with a reset.
1513 1513                   */
1514 1514                  if (connp->conn_linger && connp->conn_lingertime == 0) {
1515 1515                          msg = "tcp_close, zero lingertime";
1516 1516                          break;
1517 1517                  }
1518 1518  
1519 1519                  /*
1520 1520                   * Abort connection if there is unread data queued.
1521 1521                   */
1522 1522                  if (tcp->tcp_rcv_list || tcp->tcp_reass_head) {
1523 1523                          msg = "tcp_close, unread data";
1524 1524                          break;
1525 1525                  }
1526 1526  
1527 1527                  /*
1528 1528                   * Abort connection if it is being closed without first
1529 1529                   * being accepted. This can happen if a listening non-STREAM
1530 1530                   * socket wants to get rid of the socket, for example, if the
1531 1531                   * listener is closing.
1532 1532                   */
1533 1533                  if (tcp->tcp_listener != NULL) {
1534 1534                          ASSERT(IPCL_IS_NONSTR(connp));
1535 1535                          msg = "tcp_close, close before accept";
1536 1536  
1537 1537                          /*
1538 1538                           * Unlink from the listener and drop the reference
1539 1539                           * put on it by the eager. tcp_closei_local will not
1540 1540                           * do it because tcp_tconnind_started is TRUE.
1541 1541                           */
1542 1542                          mutex_enter(&tcp->tcp_saved_listener->tcp_eager_lock);
1543 1543                          tcp_eager_unlink(tcp);
1544 1544                          mutex_exit(&tcp->tcp_saved_listener->tcp_eager_lock);
1545 1545                          CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp);
1546 1546                          break;
1547 1547                  }
1548 1548  
1549 1549                  /*
1550 1550                   * Transmit the FIN before detaching the tcp_t.
1551 1551                   * After tcp_detach returns this queue/perimeter
1552 1552                   * no longer owns the tcp_t thus others can modify it.
1553 1553                   */
1554 1554                  (void) tcp_xmit_end(tcp);
1555 1555  
1556 1556                  /*
1557 1557                   * If lingering on close then wait until the fin is acked,
1558 1558                   * the SO_LINGER time passes, or a reset is sent/received.
1559 1559                   */
1560 1560                  if (connp->conn_linger && connp->conn_lingertime > 0 &&
1561 1561                      !(tcp->tcp_fin_acked) &&
1562 1562                      tcp->tcp_state >= TCPS_ESTABLISHED) {
1563 1563                          if (tcp->tcp_closeflags & (FNDELAY|FNONBLOCK)) {
1564 1564                                  tcp->tcp_client_errno = EWOULDBLOCK;
1565 1565                          } else if (tcp->tcp_client_errno == 0) {
1566 1566  
1567 1567                                  ASSERT(tcp->tcp_linger_tid == 0);
1568 1568  
1569 1569                                  /* conn_lingertime is in sec. */
1570 1570                                  tcp->tcp_linger_tid = TCP_TIMER(tcp,
1571 1571                                      tcp_close_linger_timeout,
1572 1572                                      connp->conn_lingertime * MILLISEC);
1573 1573  
1574 1574                                  /* tcp_close_linger_timeout will finish close */
1575 1575                                  if (tcp->tcp_linger_tid == 0)
1576 1576                                          tcp->tcp_client_errno = ENOSR;
1577 1577                                  else
1578 1578                                          return;
1579 1579                          }
1580 1580  
1581 1581                          /*
1582 1582                           * Check if we need to detach or just close
1583 1583                           * the instance.
1584 1584                           */
1585 1585                          if (tcp->tcp_state <= TCPS_LISTEN)
1586 1586                                  break;
1587 1587                  }
1588 1588  
1589 1589                  /*
1590 1590                   * Make sure that no other thread will access the conn_rq of
1591 1591                   * this instance (through lookups etc.) as conn_rq will go
1592 1592                   * away shortly.
1593 1593                   */
1594 1594                  tcp_acceptor_hash_remove(tcp);
1595 1595  
1596 1596                  mutex_enter(&tcp->tcp_non_sq_lock);
1597 1597                  if (tcp->tcp_flow_stopped) {
1598 1598                          tcp_clrqfull(tcp);
1599 1599                  }
1600 1600                  mutex_exit(&tcp->tcp_non_sq_lock);
1601 1601  
1602 1602                  if (tcp->tcp_timer_tid != 0) {
1603 1603                          delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
1604 1604                          tcp->tcp_timer_tid = 0;
1605 1605                  }
1606 1606                  /*
1607 1607                   * Need to cancel those timers which will not be used when
1608 1608                   * TCP is detached.  This has to be done before the conn_wq
1609 1609                   * is set to NULL.
1610 1610                   */
1611 1611                  tcp_timers_stop(tcp);
1612 1612  
1613 1613                  tcp->tcp_detached = B_TRUE;
1614 1614                  if (tcp->tcp_state == TCPS_TIME_WAIT) {
1615 1615                          tcp_time_wait_append(tcp);
1616 1616                          TCP_DBGSTAT(tcps, tcp_detach_time_wait);
1617 1617                          ASSERT(connp->conn_ref >=
1618 1618                              (IPCL_IS_NONSTR(connp) ? 2 : 3));
1619 1619                          goto finish;
1620 1620                  }
1621 1621  
1622 1622                  /*
1623 1623                   * If delta is zero the timer event wasn't executed and was
1624 1624                   * successfully canceled. In this case we need to restart it
1625 1625                   * with the minimal delta possible.
1626 1626                   */
1627 1627                  if (delta >= 0)
1628 1628                          tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer,
1629 1629                              delta ? delta : 1);
1630 1630  
1631 1631                  ASSERT(connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 2 : 3));
1632 1632                  goto finish;
1633 1633          }
1634 1634  
1635 1635          /* Detach did not complete. Still need to remove q from stream. */
1636 1636          if (msg) {
1637 1637                  if (tcp->tcp_state == TCPS_ESTABLISHED ||
1638 1638                      tcp->tcp_state == TCPS_CLOSE_WAIT)
1639 1639                          TCPS_BUMP_MIB(tcps, tcpEstabResets);
1640 1640                  if (tcp->tcp_state == TCPS_SYN_SENT ||
1641 1641                      tcp->tcp_state == TCPS_SYN_RCVD)
1642 1642                          TCPS_BUMP_MIB(tcps, tcpAttemptFails);
1643 1643                  tcp_xmit_ctl(msg, tcp,  tcp->tcp_snxt, 0, TH_RST);
1644 1644          }
1645 1645  
1646 1646          tcp_closei_local(tcp);
1647 1647          CONN_DEC_REF(connp);
1648 1648          ASSERT(connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 1 : 2));
1649 1649  
1650 1650  finish:
1651 1651          /*
1652 1652           * Don't change the queues in the case of a listener that has
1653 1653           * eagers in its q or q0. It could surprise the eagers.
1654 1654           * Instead wait for the eagers outside the squeue.
1655 1655           *
1656 1656           * For non-STREAMS sockets tcp_wait_for_eagers implies that
1657 1657           * we should delay the su_closed upcall until all eagers have
1658 1658           * dropped their references.
1659 1659           */
1660 1660          if (!tcp->tcp_wait_for_eagers) {
1661 1661                  tcp->tcp_detached = B_TRUE;
1662 1662                  connp->conn_rq = NULL;
1663 1663                  connp->conn_wq = NULL;
1664 1664  
1665 1665                  /* non-STREAM socket, release the upper handle */
1666 1666                  if (IPCL_IS_NONSTR(connp)) {
1667 1667                          ASSERT(connp->conn_upper_handle != NULL);
1668 1668                          (*connp->conn_upcalls->su_closed)
1669 1669                              (connp->conn_upper_handle);
1670 1670                          connp->conn_upper_handle = NULL;
1671 1671                          connp->conn_upcalls = NULL;
1672 1672                  }
1673 1673          }
1674 1674  
1675 1675          /* Signal tcp_close() to finish closing. */
1676 1676          mutex_enter(&tcp->tcp_closelock);
1677 1677          tcp->tcp_closed = 1;
1678 1678          cv_signal(&tcp->tcp_closecv);
1679 1679          mutex_exit(&tcp->tcp_closelock);
1680 1680  }
1681 1681  
1682 1682  /* ARGSUSED */
1683 1683  void
1684 1684  tcp_shutdown_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1685 1685  {
1686 1686          conn_t  *connp = (conn_t *)arg;
1687 1687          tcp_t   *tcp = connp->conn_tcp;
1688 1688  
1689 1689          freemsg(mp);
1690 1690  
1691 1691          if (tcp->tcp_fused)
1692 1692                  tcp_unfuse(tcp);
1693 1693  
1694 1694          if (tcp_xmit_end(tcp) != 0) {
1695 1695                  /*
1696 1696                   * We were crossing FINs and got a reset from
1697 1697                   * the other side. Just ignore it.
1698 1698                   */
1699 1699                  if (connp->conn_debug) {
1700 1700                          (void) strlog(TCP_MOD_ID, 0, 1,
1701 1701                              SL_ERROR|SL_TRACE,
1702 1702                              "tcp_shutdown_output() out of state %s",
1703 1703                              tcp_display(tcp, NULL, DISP_ADDR_AND_PORT));
1704 1704                  }
1705 1705          }
1706 1706  }
1707 1707  
1708 1708  #pragma inline(tcp_send_data)
1709 1709  
1710 1710  void
1711 1711  tcp_send_data(tcp_t *tcp, mblk_t *mp)
1712 1712  {
1713 1713          conn_t          *connp = tcp->tcp_connp;
1714 1714  
1715 1715          /*
1716 1716           * Check here to avoid sending a zero-copy message down to IP when
1717 1717           * the ZEROCOPY capability has been turned off.  We only need to deal
1718 1718           * with the race condition between sockfs and the notification here.
1719 1719           * Since we have already tried to back off tcp_xmit_head (when turning
1720 1720           * zero-copy off) and new messages in tcp_output(), we simply drop
1721 1721           * the dup'ed packet here and let tcp retransmit, if
1722 1722           * tcp_xmit_zc_clean is not true.
1723 1723           */
1724 1724          if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_snd_zcopy_on &&
1725 1725              !tcp->tcp_xmit_zc_clean) {
1726 1726                  ip_drop_output("TCP ZC was disabled but not clean", mp, NULL);
1727 1727                  freemsg(mp);
1728 1728                  return;
1729 1729          }
1730 1730  
1731 1731          DTRACE_TCP5(send, mblk_t *, NULL, ip_xmit_attr_t *, connp->conn_ixa,
1732 1732              __dtrace_tcp_void_ip_t *, mp->b_rptr, tcp_t *, tcp,
1733 1733              __dtrace_tcp_tcph_t *,
1734 1734              &mp->b_rptr[connp->conn_ixa->ixa_ip_hdr_length]);
1735 1735  
1736 1736          ASSERT(connp->conn_ixa->ixa_notify_cookie == connp->conn_tcp);
1737 1737          (void) conn_ip_output(mp, connp->conn_ixa);
1738 1738  }
1739 1739  
1740 1740  /* ARGSUSED2 */
1741 1741  void
1742 1742  tcp_send_synack(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1743 1743  {
1744 1744          conn_t  *econnp = (conn_t *)arg;
1745 1745          tcp_t   *tcp = econnp->conn_tcp;
1746 1746          ip_xmit_attr_t *ixa = econnp->conn_ixa;
1747 1747  
1748 1748          /* Guard against a RST having blown it away while on the squeue */
1749 1749          if (tcp->tcp_state == TCPS_CLOSED) {
1750 1750                  freemsg(mp);
1751 1751                  return;
1752 1752          }
1753 1753  
1754 1754          /*
1755 1755           * On the off chance that the eager received and responded to
1756 1756           * some other packet while the SYN|ACK was queued, we recalculate
1757 1757           * the ixa_pktlen. It would be better to fix the SYN/accept
1758 1758           * multithreading scheme to avoid this complexity.
1759 1759           */
1760 1760          ixa->ixa_pktlen = msgdsize(mp);
1761 1761          (void) conn_ip_output(mp, ixa);
1762 1762  }
1763 1763  
  
    (434 lines elided)
  
1764 1764  /*
1765 1765   * tcp_send() is called by tcp_wput_data() and returns one of the following:
1766 1766   *
1767 1767   * -1 = failed allocation.
1768 1768   *  0 = We've either successfully sent data, or our usable send window is too
1769 1769   *      small and we'd rather wait until later before sending again.
1770 1770   */
1771 1771  static int
1772 1772  tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len,
1773 1773      const int tcp_hdr_len, const int num_sack_blk, int *usable,
1774      -    uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time)
     1774 +    uint32_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time)
1775 1775  {
1776 1776          int             num_lso_seg = 1;
1777 1777          uint_t          lso_usable;
1778 1778          boolean_t       do_lso_send = B_FALSE;
1779 1779          tcp_stack_t     *tcps = tcp->tcp_tcps;
1780 1780          conn_t          *connp = tcp->tcp_connp;
1781 1781          ip_xmit_attr_t  *ixa = connp->conn_ixa;
1782 1782  
1783 1783          /*
1784 1784           * Check the LSO possibility.  The value of tcp->tcp_lso indicates
1785 1785           * whether the underlying connection is LSO capable.  Whether there
1786 1786           * is enough available data to initiate an LSO transmission is
1787 1787           * checked inside the for(){} loop below.
1788 1788           */
1789 1789          if (tcp->tcp_lso && (tcp->tcp_valid_bits & ~TCP_FSS_VALID) == 0)
1790 1790                  do_lso_send = B_TRUE;
1791 1791  
1792 1792          for (;;) {
1793 1793                  struct datab    *db;
1794 1794                  tcpha_t         *tcpha;
1795 1795                  uint32_t        sum;
1796 1796                  mblk_t          *mp, *mp1;
1797 1797                  uchar_t         *rptr;
1798 1798                  int             len;
1799 1799  
1800 1800                  /*
1801 1801                   * Calculate the maximum payload length we can send at one
1802 1802                   * time.
1803 1803                   */
1804 1804                  if (do_lso_send) {
1805 1805                          /*
1806 1806                           * Determine whether or not it's possible to do LSO,
1807 1807                           * and if so, how much data we can send.
1808 1808                           */
1809 1809                          if ((*usable - 1) / mss >= 1) {
1810 1810                                  lso_usable = MIN(tcp->tcp_lso_max, *usable);
1811 1811                                  num_lso_seg = lso_usable / mss;
1812 1812                                  if (lso_usable % mss) {
1813 1813                                          num_lso_seg++;
1814 1814                                          tcp->tcp_last_sent_len = (ushort_t)
1815 1815                                              (lso_usable % mss);
1816 1816                                  } else {
1817 1817                                          tcp->tcp_last_sent_len = (ushort_t)mss;
1818 1818                                  }
1819 1819                          } else {
1820 1820                                  do_lso_send = B_FALSE;
1821 1821                                  num_lso_seg = 1;
1822 1822                                  lso_usable = mss;
1823 1823                          }
1824 1824                  }
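                /*
                 * Worked example of the segmentation above (hypothetical,
                 * assuming tcp_lso_max >= 10000): with *usable = 10000 and
                 * mss = 1460, lso_usable = 10000, num_lso_seg = 7, and the
                 * final segment carries 10000 % 1460 = 1240 bytes, which is
                 * recorded in tcp_last_sent_len.
                 */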
1825 1825  
1826 1826                  ASSERT(num_lso_seg <= IP_MAXPACKET / mss + 1);
1827 1827  
1828 1828                  len = mss;
1829 1829                  if (len > *usable) {
1830 1830                          ASSERT(do_lso_send == B_FALSE);
1831 1831  
1832 1832                          len = *usable;
1833 1833                          if (len <= 0) {
1834 1834                                  /* Terminate the loop */
1835 1835                                  break;  /* success; too small */
1836 1836                          }
1837 1837                          /*
1838 1838                           * Sender silly-window avoidance.
1839 1839                           * Ignore this if we are going to send a
1840 1840                           * zero window probe out.
1841 1841                           *
1842 1842                           * TODO: force data into microscopic window?
1843 1843                           *      ==> (!pushed || (unsent > usable))
1844 1844                           */
1845 1845                          if (len < (tcp->tcp_max_swnd >> 1) &&
1846 1846                              (tcp->tcp_unsent - (*snxt - tcp->tcp_snxt)) > len &&
1847 1847                              !((tcp->tcp_valid_bits & TCP_URG_VALID) &&
1848 1848                              len == 1) && (! tcp->tcp_zero_win_probe)) {
1849 1849                                  /*
1850 1850                                   * If the retransmit timer is not running
1851 1851                                   * we start it so that we will retransmit
1852 1852                                   * in the case when the receiver has
1853 1853                                   * decremented the window.
1854 1854                                   */
1855 1855                                  if (*snxt == tcp->tcp_snxt &&
1856 1856                                      *snxt == tcp->tcp_suna) {
1857 1857                                          /*
1858 1858                                           * We are not supposed to send
1859 1859                                           * anything.  So let's wait a little
1860 1860                                           * bit longer before breaking SWS
1861 1861                                           * avoidance.
1862 1862                                           *
1863 1863                                           * What should the value be?
1864 1864                                           * Suggestion: MAX(init rexmit time,
1865 1865                                           * tcp->tcp_rto)
1866 1866                                           */
1867 1867                                          TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
1868 1868                                  }
1869 1869                                  break;  /* success; too small */
1870 1870                          }
1871 1871                  }
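                /*
                 * SWS example (hypothetical): with a 64240-byte maximum
                 * send window seen so far, a 100-byte usable window, and
                 * more than 100 bytes still unsent, the test above declines
                 * to send, restarting the retransmit timer if nothing is
                 * outstanding, rather than dribbling out a tiny segment.
                 */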
1872 1872  
1873 1873                  tcpha = tcp->tcp_tcpha;
1874 1874  
1875 1875                  /*
1876 1876                   * The reason to adjust len here is that we need to set the
1877 1877                   * flags and calculate the checksum.
1878 1878                   */
1879 1879                  if (do_lso_send)
1880 1880                          len = lso_usable;
1881 1881  
1882 1882                  *usable -= len; /* Approximate - can be adjusted later */
1883 1883                  if (*usable > 0)
1884 1884                          tcpha->tha_flags = TH_ACK;
1885 1885                  else
1886 1886                          tcpha->tha_flags = (TH_ACK | TH_PUSH);
1887 1887  
1888 1888                  /*
1889 1889                   * Prime the pump for IP's checksumming on our behalf,
1890 1890                   * including the adjustment for a source route, if any.
1891 1891                   * In the LSO case, the partial pseudo-header checksum must
1892 1892                   * exclude the TCP length, so zero tha_sum before IP computes
1893 1893                   * the pseudo-header checksum for partial checksum offload.
1894 1894                   */
1895 1895                  if (do_lso_send) {
1896 1896                          sum = 0;
1897 1897                  } else {
1898 1898                          sum = len + tcp_hdr_len + connp->conn_sum;
1899 1899                          sum = (sum >> 16) + (sum & 0xFFFF);
1900 1900                  }
1901 1901                  tcpha->tha_sum = htons(sum);
1902 1902                  tcpha->tha_seq = htonl(*snxt);
1903 1903  
1904 1904                  /*
1905 1905                   * Branch off to tcp_xmit_mp() if any of the VALID bits is
1906 1906                   * set.  For the case when TCP_FSS_VALID is the only valid
1907 1907                   * bit (normal active close), branch off only when we think
1908 1908                   * that the FIN flag needs to be set.  Note for this case,
1909 1909                   * that (snxt + len) may not reflect the actual seg_len,
1910 1910                   * as len may be further reduced in tcp_xmit_mp().  If len
1911 1911                   * gets modified, we will end up here again.
1912 1912                   */
1913 1913                  if (tcp->tcp_valid_bits != 0 &&
1914 1914                      (tcp->tcp_valid_bits != TCP_FSS_VALID ||
1915 1915                      ((*snxt + len) == tcp->tcp_fss))) {
1916 1916                          uchar_t         *prev_rptr;
1917 1917                          uint32_t        prev_snxt = tcp->tcp_snxt;
1918 1918  
1919 1919                          if (*tail_unsent == 0) {
1920 1920                                  ASSERT((*xmit_tail)->b_cont != NULL);
1921 1921                                  *xmit_tail = (*xmit_tail)->b_cont;
1922 1922                                  prev_rptr = (*xmit_tail)->b_rptr;
1923 1923                                  *tail_unsent = (int)((*xmit_tail)->b_wptr -
1924 1924                                      (*xmit_tail)->b_rptr);
1925 1925                          } else {
1926 1926                                  prev_rptr = (*xmit_tail)->b_rptr;
1927 1927                                  (*xmit_tail)->b_rptr = (*xmit_tail)->b_wptr -
1928 1928                                      *tail_unsent;
1929 1929                          }
1930 1930                          mp = tcp_xmit_mp(tcp, *xmit_tail, len, NULL, NULL,
1931 1931                              *snxt, B_FALSE, (uint32_t *)&len, B_FALSE);
1932 1932                          /* Restore tcp_snxt so we get amount sent right. */
1933 1933                          tcp->tcp_snxt = prev_snxt;
1934 1934                          if (prev_rptr == (*xmit_tail)->b_rptr) {
1935 1935                                  /*
1936 1936                                   * If the previous timestamp is still in use,
1937 1937                                   * don't stomp on it.
1938 1938                                   */
1939 1939                                  if ((*xmit_tail)->b_next == NULL) {
1940 1940                                          (*xmit_tail)->b_prev = local_time;
1941 1941                                          (*xmit_tail)->b_next =
1942 1942                                              (mblk_t *)(uintptr_t)(*snxt);
1943 1943                                  }
1944 1944                          } else
1945 1945                                  (*xmit_tail)->b_rptr = prev_rptr;
1946 1946  
1947 1947                          if (mp == NULL) {
1948 1948                                  return (-1);
1949 1949                          }
1950 1950                          mp1 = mp->b_cont;
1951 1951  
1952 1952                          if (len <= mss) /* LSO is unusable (!do_lso_send) */
1953 1953                                  tcp->tcp_last_sent_len = (ushort_t)len;
1954 1954                          while (mp1->b_cont) {
1955 1955                                  *xmit_tail = (*xmit_tail)->b_cont;
1956 1956                                  (*xmit_tail)->b_prev = local_time;
1957 1957                                  (*xmit_tail)->b_next =
1958 1958                                      (mblk_t *)(uintptr_t)(*snxt);
1959 1959                                  mp1 = mp1->b_cont;
1960 1960                          }
1961 1961                          *snxt += len;
1962 1962                          *tail_unsent = (*xmit_tail)->b_wptr - mp1->b_wptr;
1963 1963                          BUMP_LOCAL(tcp->tcp_obsegs);
1964 1964                          TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
1965 1965                          TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
1966 1966                          tcp_send_data(tcp, mp);
1967 1967                          continue;
1968 1968                  }
1969 1969  
1970 1970                  *snxt += len;   /* Adjust later if we don't send all of len */
1971 1971                  TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
1972 1972                  TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
1973 1973  
1974 1974                  if (*tail_unsent) {
1975 1975                          /* Are the bytes above us in flight? */
1976 1976                          rptr = (*xmit_tail)->b_wptr - *tail_unsent;
1977 1977                          if (rptr != (*xmit_tail)->b_rptr) {
1978 1978                                  *tail_unsent -= len;
1979 1979                                  if (len <= mss) /* LSO is unusable */
1980 1980                                          tcp->tcp_last_sent_len = (ushort_t)len;
1981 1981                                  len += total_hdr_len;
1982 1982                                  ixa->ixa_pktlen = len;
1983 1983  
1984 1984                                  if (ixa->ixa_flags & IXAF_IS_IPV4) {
1985 1985                                          tcp->tcp_ipha->ipha_length = htons(len);
1986 1986                                  } else {
1987 1987                                          tcp->tcp_ip6h->ip6_plen =
1988 1988                                              htons(len - IPV6_HDR_LEN);
1989 1989                                  }
1990 1990  
1991 1991                                  mp = dupb(*xmit_tail);
1992 1992                                  if (mp == NULL) {
1993 1993                                          return (-1);    /* out_of_mem */
1994 1994                                  }
1995 1995                                  mp->b_rptr = rptr;
1996 1996                                  /*
1997 1997                                   * If the old timestamp is no longer in use,
1998 1998                                   * sample a new timestamp now.
1999 1999                                   */
2000 2000                                  if ((*xmit_tail)->b_next == NULL) {
2001 2001                                          (*xmit_tail)->b_prev = local_time;
2002 2002                                          (*xmit_tail)->b_next =
2003 2003                                              (mblk_t *)(uintptr_t)(*snxt-len);
2004 2004                                  }
2005 2005                                  goto must_alloc;
2006 2006                          }
2007 2007                  } else {
2008 2008                          *xmit_tail = (*xmit_tail)->b_cont;
2009 2009                          ASSERT((uintptr_t)((*xmit_tail)->b_wptr -
2010 2010                              (*xmit_tail)->b_rptr) <= (uintptr_t)INT_MAX);
2011 2011                          *tail_unsent = (int)((*xmit_tail)->b_wptr -
2012 2012                              (*xmit_tail)->b_rptr);
2013 2013                  }
2014 2014  
2015 2015                  (*xmit_tail)->b_prev = local_time;
2016 2016                  (*xmit_tail)->b_next = (mblk_t *)(uintptr_t)(*snxt - len);
2017 2017  
2018 2018                  *tail_unsent -= len;
2019 2019                  if (len <= mss) /* LSO is unusable (!do_lso_send) */
2020 2020                          tcp->tcp_last_sent_len = (ushort_t)len;
2021 2021  
2022 2022                  len += total_hdr_len;
2023 2023                  ixa->ixa_pktlen = len;
2024 2024  
2025 2025                  if (ixa->ixa_flags & IXAF_IS_IPV4) {
2026 2026                          tcp->tcp_ipha->ipha_length = htons(len);
2027 2027                  } else {
2028 2028                          tcp->tcp_ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2029 2029                  }
2030 2030  
2031 2031                  mp = dupb(*xmit_tail);
2032 2032                  if (mp == NULL) {
2033 2033                          return (-1);    /* out_of_mem */
2034 2034                  }
2035 2035  
2036 2036                  len = total_hdr_len;
2037 2037                  /*
2038 2038                   * There are four reasons to allocate a new hdr mblk:
2039 2039                   *  1) The bytes above us are in use by another packet
2040 2040                   *  2) We don't have good alignment
2041 2041                   *  3) The mblk is being shared
2042 2042                   *  4) We don't have enough room for a header
2043 2043                   */
2044 2044                  rptr = mp->b_rptr - len;
2045 2045                  if (!OK_32PTR(rptr) ||
2046 2046                      ((db = mp->b_datap), db->db_ref != 2) ||
2047 2047                      rptr < db->db_base) {
2048 2048                          /* NOTE: we assume allocb returns an OK_32PTR */
2049 2049  
2050 2050                  must_alloc:;
2051 2051                          mp1 = allocb(connp->conn_ht_iphc_allocated +
2052 2052                              tcps->tcps_wroff_xtra, BPRI_MED);
2053 2053                          if (mp1 == NULL) {
2054 2054                                  freemsg(mp);
2055 2055                                  return (-1);    /* out_of_mem */
2056 2056                          }
2057 2057                          mp1->b_cont = mp;
2058 2058                          mp = mp1;
  
    (274 lines elided)
  
2059 2059                          /* Leave room for Link Level header */
2060 2060                          len = total_hdr_len;
2061 2061                          rptr = &mp->b_rptr[tcps->tcps_wroff_xtra];
2062 2062                          mp->b_wptr = &rptr[len];
2063 2063                  }
2064 2064  
2065 2065                  /*
2066 2066                   * Fill in the header using the template header, and add
2067 2067                   * options such as time-stamp, ECN and/or SACK, as needed.
2068 2068                   */
2069      -                tcp_fill_header(tcp, rptr, (clock_t)local_time, num_sack_blk);
     2069 +                tcp_fill_header(tcp, rptr, num_sack_blk);
2070 2070  
2071 2071                  mp->b_rptr = rptr;
2072 2072  
2073 2073                  if (*tail_unsent) {
2074 2074                          int spill = *tail_unsent;
2075 2075  
2076 2076                          mp1 = mp->b_cont;
2077 2077                          if (mp1 == NULL)
2078 2078                                  mp1 = mp;
2079 2079  
2080 2080                          /*
2081 2081                           * If we're a little short, tack on more mblks until
2082 2082                           * there is no more spillover.
2083 2083                           */
2084 2084                          while (spill < 0) {
2085 2085                                  mblk_t *nmp;
2086 2086                                  int nmpsz;
2087 2087  
2088 2088                                  nmp = (*xmit_tail)->b_cont;
2089 2089                                  nmpsz = MBLKL(nmp);
2090 2090  
2091 2091                                  /*
2092 2092                                   * Excess data in mblk; can we split it?
2093 2093                                   * If LSO is enabled for the connection,
2094 2094                                   * keep on splitting as this is a transient
2095 2095                                   * send path.
2096 2096                                   */
2097 2097                                  if (!do_lso_send && (spill + nmpsz > 0)) {
2098 2098                                          /*
2099 2099                                           * Don't split if stream head was
2100 2100                                           * told to break up larger writes
2101 2101                                           * into smaller ones.
2102 2102                                           */
2103 2103                                          if (tcp->tcp_maxpsz_multiplier > 0)
2104 2104                                                  break;
2105 2105  
2106 2106                                          /*
2107 2107                                           * Next mblk is less than SMSS/2
2108 2108                                           * rounded up to the nearest 64 bytes;
2109 2109                                           * let it get sent as part of the
2110 2110                                           * next segment.
2111 2111                                           */
2112 2112                                          if (tcp->tcp_localnet &&
2113 2113                                              !tcp->tcp_cork &&
2114 2114                                              (nmpsz < roundup((mss >> 1), 64)))
2115 2115                                                  break;
2116 2116                                  }
2117 2117  
2118 2118                                  *xmit_tail = nmp;
2119 2119                                  ASSERT((uintptr_t)nmpsz <= (uintptr_t)INT_MAX);
2120 2120                                  /* Stash for rtt use later */
2121 2121                                  (*xmit_tail)->b_prev = local_time;
2122 2122                                  (*xmit_tail)->b_next =
2123 2123                                      (mblk_t *)(uintptr_t)(*snxt - len);
2124 2124                                  mp1->b_cont = dupb(*xmit_tail);
2125 2125                                  mp1 = mp1->b_cont;
2126 2126  
2127 2127                                  spill += nmpsz;
2128 2128                                  if (mp1 == NULL) {
2129 2129                                          *tail_unsent = spill;
2130 2130                                          freemsg(mp);
2131 2131                                          return (-1);    /* out_of_mem */
2132 2132                                  }
2133 2133                          }
2134 2134  
2135 2135                          /* Trim back any surplus on the last mblk */
2136 2136                          if (spill >= 0) {
2137 2137                                  mp1->b_wptr -= spill;
2138 2138                                  *tail_unsent = spill;
2139 2139                          } else {
2140 2140                                  /*
2141 2141                                   * We did not send everything we could in
2142 2142                                   * order to remain within the b_cont limit.
2143 2143                                   */
2144 2144                                  *usable -= spill;
2145 2145                                  *snxt += spill;
2146 2146                                  tcp->tcp_last_sent_len += spill;
2147 2147                                  TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, spill);
2148 2148                                  /*
2149 2149                                   * Adjust the checksum
2150 2150                                   */
2151 2151                                  tcpha = (tcpha_t *)(rptr +
2152 2152                                      ixa->ixa_ip_hdr_length);
2153 2153                                  sum += spill;
2154 2154                                  sum = (sum >> 16) + (sum & 0xFFFF);
2155 2155                                  tcpha->tha_sum = htons(sum);
2156 2156                                  if (connp->conn_ipversion == IPV4_VERSION) {
2157 2157                                          sum = ntohs(
2158 2158                                              ((ipha_t *)rptr)->ipha_length) +
2159 2159                                              spill;
2160 2160                                          ((ipha_t *)rptr)->ipha_length =
2161 2161                                              htons(sum);
2162 2162                                  } else {
2163 2163                                          sum = ntohs(
2164 2164                                              ((ip6_t *)rptr)->ip6_plen) +
2165 2165                                              spill;
2166 2166                                          ((ip6_t *)rptr)->ip6_plen =
2167 2167                                              htons(sum);
2168 2168                                  }
2169 2169                                  ixa->ixa_pktlen += spill;
2170 2170                                  *tail_unsent = 0;
2171 2171                          }
2172 2172                  }
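                /*
                 * Checksum note (illustrative): because the pseudo-header
                 * sum was primed with the original len, sending spill extra
                 * bytes only requires adding spill to the partial sum and
                 * refolding it, e.g. a 100-byte (0x64) spill turns a sum of
                 * 0x2346 into 0x23AA.
                 */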
2173 2173                  if (tcp->tcp_ip_forward_progress) {
2174 2174                          tcp->tcp_ip_forward_progress = B_FALSE;
2175 2175                          ixa->ixa_flags |= IXAF_REACH_CONF;
2176 2176                  } else {
2177 2177                          ixa->ixa_flags &= ~IXAF_REACH_CONF;
2178 2178                  }
2179 2179  
2180 2180                  if (do_lso_send) {
2181 2181                          /* Append LSO information to the mp. */
2182 2182                          lso_info_set(mp, mss, HW_LSO);
2183 2183                          ixa->ixa_fragsize = IP_MAXPACKET;
2184 2184                          ixa->ixa_extra_ident = num_lso_seg - 1;
2185 2185  
2186 2186                          DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg,
2187 2187                              boolean_t, B_TRUE);
2188 2188  
2189 2189                          tcp_send_data(tcp, mp);
2190 2190  
2191 2191                          /*
2192 2192                           * Restore values of ixa_fragsize and ixa_extra_ident.
2193 2193                           */
2194 2194                          ixa->ixa_fragsize = ixa->ixa_pmtu;
2195 2195                          ixa->ixa_extra_ident = 0;
2196 2196                          tcp->tcp_obsegs += num_lso_seg;
2197 2197                          TCP_STAT(tcps, tcp_lso_times);
2198 2198                          TCP_STAT_UPDATE(tcps, tcp_lso_pkt_out, num_lso_seg);
2199 2199                  } else {
2200 2200                          /*
2201 2201                           * Make sure to clean up LSO information. Wherever a
2202 2202                           * new mp uses the prepended header room after dupb(),
2203 2203                           * lso_info_cleanup() should be called.
2204 2204                           */
2205 2205                          lso_info_cleanup(mp);
2206 2206                          tcp_send_data(tcp, mp);
2207 2207                          BUMP_LOCAL(tcp->tcp_obsegs);
2208 2208                  }
2209 2209          }
2210 2210  
2211 2211          return (0);
2212 2212  }
2213 2213  
2214 2214  /*
2215 2215   * Initiate closedown sequence on an active connection.  (May be called as
2216 2216   * writer.)  Return value zero for OK return, non-zero for error return.
2217 2217   */
2218 2218  static int
2219 2219  tcp_xmit_end(tcp_t *tcp)
2220 2220  {
2221 2221          mblk_t          *mp;
2222 2222          tcp_stack_t     *tcps = tcp->tcp_tcps;
2223 2223          iulp_t          uinfo;
2224 2224          ip_stack_t      *ipst = tcps->tcps_netstack->netstack_ip;
2225 2225          conn_t          *connp = tcp->tcp_connp;
2226 2226  
2227 2227          if (tcp->tcp_state < TCPS_SYN_RCVD ||
2228 2228              tcp->tcp_state > TCPS_CLOSE_WAIT) {
2229 2229                  /*
2230 2230                   * Invalid state; only states TCPS_SYN_RCVD,
2231 2231                   * TCPS_ESTABLISHED and TCPS_CLOSE_WAIT are valid
2232 2232                   */
2233 2233                  return (-1);
2234 2234          }
2235 2235  
2236 2236          tcp->tcp_fss = tcp->tcp_snxt + tcp->tcp_unsent;
2237 2237          tcp->tcp_valid_bits |= TCP_FSS_VALID;
2238 2238          /*
2239 2239           * If there is nothing more unsent, send the FIN now.
2240 2240           * Otherwise, it will go out with the last segment.
2241 2241           */
2242 2242          if (tcp->tcp_unsent == 0) {
2243 2243                  mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL,
2244 2244                      tcp->tcp_fss, B_FALSE, NULL, B_FALSE);
2245 2245  
2246 2246                  if (mp) {
2247 2247                          tcp_send_data(tcp, mp);
2248 2248                  } else {
2249 2249                          /*
2250 2250                           * Couldn't allocate msg.  Pretend we got it out.
2251 2251                           * Wait for rexmit timeout.
2252 2252                           */
2253 2253                          tcp->tcp_snxt = tcp->tcp_fss + 1;
2254 2254                          TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
2255 2255                  }
2256 2256  
2257 2257                  /*
2258 2258                   * If needed, update tcp_rexmit_snxt as tcp_snxt is
2259 2259                   * changed.
2260 2260                   */
2261 2261                  if (tcp->tcp_rexmit && tcp->tcp_rexmit_nxt == tcp->tcp_fss) {
2262 2262                          tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
2263 2263                  }
2264 2264          } else {
2265 2265                  /*
2266 2266                   * If tcp->tcp_cork is set, then the data will not get sent,
2267 2267                   * so we have to check that and unset it first.
2268 2268                   */
2269 2269                  if (tcp->tcp_cork)
2270 2270                          tcp->tcp_cork = B_FALSE;
2271 2271                  tcp_wput_data(tcp, NULL, B_FALSE);
2272 2272          }
2273 2273  
2274 2274          /*
2275 2275           * If TCP does not get enough samples of RTT or tcp_rtt_updates
2276 2276           * is 0, don't update the cache.
2277 2277           */
2278 2278          if (tcps->tcps_rtt_updates == 0 ||
2279 2279              tcp->tcp_rtt_update < tcps->tcps_rtt_updates)
2280 2280                  return (0);
2281 2281  
2282 2282          /*
2283 2283           * We do not have a good algorithm to update ssthresh at this time.
2284 2284           * So don't do any update.
2285 2285           */
2286 2286          bzero(&uinfo, sizeof (uinfo));
2287      -        uinfo.iulp_rtt = tcp->tcp_rtt_sa;
2288      -        uinfo.iulp_rtt_sd = tcp->tcp_rtt_sd;
     2287 +        uinfo.iulp_rtt = NSEC2MSEC(tcp->tcp_rtt_sa);
     2288 +        uinfo.iulp_rtt_sd = NSEC2MSEC(tcp->tcp_rtt_sd);
2289 2289  
2290 2290          /*
2291 2291           * Note that uinfo is kept for conn_faddr in the DCE. Could update even
2292 2292           * if source routed but we don't.
2293 2293           */
2294 2294          if (connp->conn_ipversion == IPV4_VERSION) {
2295 2295                  if (connp->conn_faddr_v4 !=  tcp->tcp_ipha->ipha_dst) {
2296 2296                          return (0);
2297 2297                  }
2298 2298                  (void) dce_update_uinfo_v4(connp->conn_faddr_v4, &uinfo, ipst);
2299 2299          } else {
2300 2300                  uint_t ifindex;
2301 2301  
2302 2302                  if (!(IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
2303 2303                      &tcp->tcp_ip6h->ip6_dst))) {
2304 2304                          return (0);
2305 2305                  }
2306 2306                  ifindex = 0;
2307 2307                  if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_faddr_v6)) {
2308 2308                          ip_xmit_attr_t *ixa = connp->conn_ixa;
2309 2309  
2310 2310                          /*
2311 2311                           * If we are going to create a DCE we'd better have
2312 2312                           * an ifindex
2313 2313                           */
2314 2314                          if (ixa->ixa_nce != NULL) {
2315 2315                                  ifindex = ixa->ixa_nce->nce_common->ncec_ill->
2316 2316                                      ill_phyint->phyint_ifindex;
2317 2317                          } else {
2318 2318                                  return (0);
2319 2319                          }
2320 2320                  }
2321 2321  
2322 2322                  (void) dce_update_uinfo(&connp->conn_faddr_v6, ifindex, &uinfo,
2323 2323                      ipst);
2324 2324          }
2325 2325          return (0);
2326 2326  }
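
The NSEC2MSEC() calls added above are the unit boundary of this change:
tcp_rtt_sa and tcp_rtt_sd are now maintained in nanoseconds, while the iulp_t
cached in the DCE remains in milliseconds.  A sketch of the conversion,
restating the effect of the <sys/time.h> macro:

    hrtime_t rtt_nsec = 123456789;                  /* ~123.5 ms */
    clock_t rtt_msec = rtt_nsec / (NANOSEC / MILLISEC);
    /* rtt_msec == 123 == NSEC2MSEC(rtt_nsec) */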
2327 2327  
2328 2328  /*
2329 2329   * Send out a control packet on the tcp connection specified.  This routine
2330 2330   * is typically called where we need a simple ACK or RST generated.
2331 2331   */
2332 2332  void
2333 2333  tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl)
2334 2334  {
2335 2335          uchar_t         *rptr;
2336 2336          tcpha_t         *tcpha;
2337 2337          ipha_t          *ipha = NULL;
2338 2338          ip6_t           *ip6h = NULL;
2339 2339          uint32_t        sum;
2340 2340          int             total_hdr_len;
2341 2341          int             ip_hdr_len;
2342 2342          mblk_t          *mp;
2343 2343          tcp_stack_t     *tcps = tcp->tcp_tcps;
2344 2344          conn_t          *connp = tcp->tcp_connp;
2345 2345          ip_xmit_attr_t  *ixa = connp->conn_ixa;
2346 2346  
2347 2347          /*
2348 2348           * Save sum for use in source route later.
2349 2349           */
2350 2350          sum = connp->conn_ht_ulp_len + connp->conn_sum;
2351 2351          total_hdr_len = connp->conn_ht_iphc_len;
2352 2352          ip_hdr_len = ixa->ixa_ip_hdr_length;
2353 2353  
2354 2354          /* If a text string is passed in with the request, pass it to strlog. */
2355 2355          if (str != NULL && connp->conn_debug) {
2356 2356                  (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
2357 2357                      "tcp_xmit_ctl: '%s', seq 0x%x, ack 0x%x, ctl 0x%x",
2358 2358                      str, seq, ack, ctl);
2359 2359          }
2360 2360          mp = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra,
2361 2361              BPRI_MED);
2362 2362          if (mp == NULL) {
2363 2363                  return;
2364 2364          }
2365 2365          rptr = &mp->b_rptr[tcps->tcps_wroff_xtra];
2366 2366          mp->b_rptr = rptr;
2367 2367          mp->b_wptr = &rptr[total_hdr_len];
2368 2368          bcopy(connp->conn_ht_iphc, rptr, total_hdr_len);
2369 2369  
2370 2370          ixa->ixa_pktlen = total_hdr_len;
2371 2371  
2372 2372          if (ixa->ixa_flags & IXAF_IS_IPV4) {
2373 2373                  ipha = (ipha_t *)rptr;
2374 2374                  ipha->ipha_length = htons(total_hdr_len);
2375 2375          } else {
2376 2376                  ip6h = (ip6_t *)rptr;
2377 2377                  ip6h->ip6_plen = htons(total_hdr_len - IPV6_HDR_LEN);
2378 2378          }
2379 2379          tcpha = (tcpha_t *)&rptr[ip_hdr_len];
2380 2380          tcpha->tha_flags = (uint8_t)ctl;
2381 2381          if (ctl & TH_RST) {
2382 2382                  TCPS_BUMP_MIB(tcps, tcpOutRsts);
2383 2383                  TCPS_BUMP_MIB(tcps, tcpOutControl);
2384 2384                  /*
2385 2385                   * Don't send TSopt w/ TH_RST packets per RFC 1323.
2386 2386                   */
2387 2387                  if (tcp->tcp_snd_ts_ok &&
2388 2388                      tcp->tcp_state > TCPS_SYN_SENT) {
2389 2389                          mp->b_wptr = &rptr[total_hdr_len - TCPOPT_REAL_TS_LEN];
2390 2390                          *(mp->b_wptr) = TCPOPT_EOL;
2391 2391  
2392 2392                          ixa->ixa_pktlen = total_hdr_len - TCPOPT_REAL_TS_LEN;
2393 2393  
2394 2394                          if (connp->conn_ipversion == IPV4_VERSION) {
2395 2395                                  ipha->ipha_length = htons(total_hdr_len -
2396 2396                                      TCPOPT_REAL_TS_LEN);
2397 2397                          } else {
2398 2398                                  ip6h->ip6_plen = htons(total_hdr_len -
2399 2399                                      IPV6_HDR_LEN - TCPOPT_REAL_TS_LEN);
2400 2400                          }
2401 2401                          tcpha->tha_offset_and_reserved -= (3 << 4);
2402 2402                          sum -= TCPOPT_REAL_TS_LEN;
2403 2403                  }
2404 2404          }
2405 2405          if (ctl & TH_ACK) {
2406 2406                  if (tcp->tcp_snd_ts_ok) {
2407 2407                          uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
2408 2408  
2409 2409                          U32_TO_BE32(llbolt,
2410 2410                              (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
2411 2411                          U32_TO_BE32(tcp->tcp_ts_recent,
2412 2412                              (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
2413 2413                  }
2414 2414  
2415 2415                  /* Update the latest receive window size in TCP header. */
2416 2416                  tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
2417 2417                  /* Track what we sent to the peer */
2418 2418                  tcp->tcp_tcpha->tha_win = tcpha->tha_win;
2419 2419                  tcp->tcp_rack = ack;
2420 2420                  tcp->tcp_rack_cnt = 0;
2421 2421                  TCPS_BUMP_MIB(tcps, tcpOutAck);
2422 2422          }
2423 2423          BUMP_LOCAL(tcp->tcp_obsegs);
2424 2424          tcpha->tha_seq = htonl(seq);
2425 2425          tcpha->tha_ack = htonl(ack);
2426 2426          /*
2427 2427           * Include the adjustment for a source route if any.
2428 2428           */
2429 2429          sum = (sum >> 16) + (sum & 0xFFFF);
2430 2430          tcpha->tha_sum = htons(sum);
2431 2431          tcp_send_data(tcp, mp);
2432 2432  }
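
For reference, the window update in tcp_xmit_ctl() stores the receive window
right-shifted by the negotiated window scale (RFC 1323) into the 16-bit header
field.  A sketch with assumed values:

    uint32_t rwnd = 1048576;        /* 1 MB receive window */
    int rcv_ws = 7;                 /* scale agreed at SYN time */
    uint16_t tha_win = htons(rwnd >> rcv_ws);
    /* 8192 goes on the wire; the peer recovers 8192 << 7 == 1048576 */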
2433 2433  
2434 2434  /*
2435 2435   * Generate a reset based on an inbound packet; connp is set by the caller
2436 2436   * when RST is in response to an unexpected inbound packet for which
2437 2437   * there is active tcp state in the system.
2438 2438   *
2439 2439   * IPSEC NOTE : Try to send the reply with the same protection as it came
2440 2440   * in.  We have the ip_recv_attr_t which is reversed to form the ip_xmit_attr_t.
2441 2441   * That way the packet will go out at the same level of protection as it
2442 2442   * came in with.
2443 2443   */
2444 2444  static void
2445 2445  tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, uint32_t ack, int ctl,
2446 2446      ip_recv_attr_t *ira, ip_stack_t *ipst, conn_t *connp)
2447 2447  {
2448 2448          ipha_t          *ipha = NULL;
2449 2449          ip6_t           *ip6h = NULL;
2450 2450          ushort_t        len;
2451 2451          tcpha_t         *tcpha;
2452 2452          int             i;
2453 2453          ipaddr_t        v4addr;
2454 2454          in6_addr_t      v6addr;
2455 2455          netstack_t      *ns = ipst->ips_netstack;
2456 2456          tcp_stack_t     *tcps = ns->netstack_tcp;
2457 2457          ip_xmit_attr_t  ixas, *ixa;
2458 2458          uint_t          ip_hdr_len = ira->ira_ip_hdr_length;
2459 2459          boolean_t       need_refrele = B_FALSE;         /* ixa_refrele(ixa) */
2460 2460          ushort_t        port;
2461 2461  
2462 2462          if (!tcp_send_rst_chk(tcps)) {
2463 2463                  TCP_STAT(tcps, tcp_rst_unsent);
2464 2464                  freemsg(mp);
2465 2465                  return;
2466 2466          }
2467 2467  
2468 2468          /*
2469 2469           * If connp != NULL we use conn_ixa to keep IP_NEXTHOP and other
2470 2470           * options from the listener. In that case the caller must ensure that
2471 2471           * we are running on the listener = connp squeue.
2472 2472           *
2473 2473           * We get a safe copy of conn_ixa so we don't need to restore anything
2474 2474           * we or ip_output_simple might change in the ixa.
2475 2475           */
2476 2476          if (connp != NULL) {
2477 2477                  ASSERT(connp->conn_on_sqp);
2478 2478  
2479 2479                  ixa = conn_get_ixa_exclusive(connp);
2480 2480                  if (ixa == NULL) {
2481 2481                          TCP_STAT(tcps, tcp_rst_unsent);
2482 2482                          freemsg(mp);
2483 2483                          return;
2484 2484                  }
2485 2485                  need_refrele = B_TRUE;
2486 2486          } else {
2487 2487                  bzero(&ixas, sizeof (ixas));
2488 2488                  ixa = &ixas;
2489 2489                  /*
2490 2490                   * IXAF_VERIFY_SOURCE is overkill since we know the
2491 2491                   * packet was for us.
2492 2492                   */
2493 2493                  ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE;
2494 2494                  ixa->ixa_protocol = IPPROTO_TCP;
2495 2495                  ixa->ixa_zoneid = ira->ira_zoneid;
2496 2496                  ixa->ixa_ifindex = 0;
2497 2497                  ixa->ixa_ipst = ipst;
2498 2498                  ixa->ixa_cred = kcred;
2499 2499                  ixa->ixa_cpid = NOPID;
2500 2500          }
2501 2501  
2502 2502          if (str && tcps->tcps_dbg) {
2503 2503                  (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
2504 2504                      "tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, "
2505 2505                      "flags 0x%x",
2506 2506                      str, seq, ack, ctl);
2507 2507          }
2508 2508          if (mp->b_datap->db_ref != 1) {
2509 2509                  mblk_t *mp1 = copyb(mp);
2510 2510                  freemsg(mp);
2511 2511                  mp = mp1;
2512 2512                  if (mp == NULL)
2513 2513                          goto done;
2514 2514          } else if (mp->b_cont) {
2515 2515                  freemsg(mp->b_cont);
2516 2516                  mp->b_cont = NULL;
2517 2517                  DB_CKSUMFLAGS(mp) = 0;
2518 2518          }
2519 2519          /*
2520 2520           * We skip reversing source route here.
2521 2521           * (for now we replace all IP options with EOL)
2522 2522           */
2523 2523          if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
2524 2524                  ipha = (ipha_t *)mp->b_rptr;
2525 2525                  for (i = IP_SIMPLE_HDR_LENGTH; i < (int)ip_hdr_len; i++)
2526 2526                          mp->b_rptr[i] = IPOPT_EOL;
2527 2527                  /*
2528 2528                   * Make sure that src address isn't flagrantly invalid.
2529 2529                   * Not all broadcast address checking for the src address
2530 2530                   * is possible, since we don't know the netmask of the src
2531 2531                   * addr.  No check for destination address is done, since
2532 2532                   * IP will not pass up a packet with a broadcast dest
2533 2533                   * address to TCP.  Similar checks are done below for IPv6.
2534 2534                   */
2535 2535                  if (ipha->ipha_src == 0 || ipha->ipha_src == INADDR_BROADCAST ||
2536 2536                      CLASSD(ipha->ipha_src)) {
2537 2537                          BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards);
2538 2538                          ip_drop_input("ipIfStatsInDiscards", mp, NULL);
2539 2539                          freemsg(mp);
2540 2540                          goto done;
2541 2541                  }
2542 2542          } else {
2543 2543                  ip6h = (ip6_t *)mp->b_rptr;
2544 2544  
2545 2545                  if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) ||
2546 2546                      IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src)) {
2547 2547                          BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInDiscards);
2548 2548                          ip_drop_input("ipIfStatsInDiscards", mp, NULL);
2549 2549                          freemsg(mp);
2550 2550                          goto done;
2551 2551                  }
2552 2552  
2553 2553                  /* Remove any extension headers assuming partial overlay */
2554 2554                  if (ip_hdr_len > IPV6_HDR_LEN) {
2555 2555                          uint8_t *to;
2556 2556  
2557 2557                          to = mp->b_rptr + ip_hdr_len - IPV6_HDR_LEN;
2558 2558                          ovbcopy(ip6h, to, IPV6_HDR_LEN);
2559 2559                          mp->b_rptr += ip_hdr_len - IPV6_HDR_LEN;
2560 2560                          ip_hdr_len = IPV6_HDR_LEN;
2561 2561                          ip6h = (ip6_t *)mp->b_rptr;
2562 2562                          ip6h->ip6_nxt = IPPROTO_TCP;
2563 2563                  }
2564 2564          }
2565 2565          tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len];
2566 2566          if (tcpha->tha_flags & TH_RST) {
2567 2567                  freemsg(mp);
2568 2568                  goto done;
2569 2569          }
2570 2570          tcpha->tha_offset_and_reserved = (5 << 4);
2571 2571          len = ip_hdr_len + sizeof (tcpha_t);
2572 2572          mp->b_wptr = &mp->b_rptr[len];
2573 2573          if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
2574 2574                  ipha->ipha_length = htons(len);
2575 2575                  /* Swap addresses */
2576 2576                  v4addr = ipha->ipha_src;
2577 2577                  ipha->ipha_src = ipha->ipha_dst;
2578 2578                  ipha->ipha_dst = v4addr;
2579 2579                  ipha->ipha_ident = 0;
2580 2580                  ipha->ipha_ttl = (uchar_t)tcps->tcps_ipv4_ttl;
2581 2581                  ixa->ixa_flags |= IXAF_IS_IPV4;
2582 2582                  ixa->ixa_ip_hdr_length = ip_hdr_len;
2583 2583          } else {
2584 2584                  ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2585 2585                  /* Swap addresses */
2586 2586                  v6addr = ip6h->ip6_src;
2587 2587                  ip6h->ip6_src = ip6h->ip6_dst;
2588 2588                  ip6h->ip6_dst = v6addr;
2589 2589                  ip6h->ip6_hops = (uchar_t)tcps->tcps_ipv6_hoplimit;
2590 2590                  ixa->ixa_flags &= ~IXAF_IS_IPV4;
2591 2591  
2592 2592                  if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_dst)) {
2593 2593                          ixa->ixa_flags |= IXAF_SCOPEID_SET;
2594 2594                          ixa->ixa_scopeid = ira->ira_ruifindex;
2595 2595                  }
2596 2596                  ixa->ixa_ip_hdr_length = IPV6_HDR_LEN;
2597 2597          }
2598 2598          ixa->ixa_pktlen = len;
2599 2599  
2600 2600          /* Swap the ports */
2601 2601          port = tcpha->tha_fport;
2602 2602          tcpha->tha_fport = tcpha->tha_lport;
2603 2603          tcpha->tha_lport = port;
2604 2604  
2605 2605          tcpha->tha_ack = htonl(ack);
2606 2606          tcpha->tha_seq = htonl(seq);
2607 2607          tcpha->tha_win = 0;
2608 2608          tcpha->tha_sum = htons(sizeof (tcpha_t));
2609 2609          tcpha->tha_flags = (uint8_t)ctl;
2610 2610          if (ctl & TH_RST) {
2611 2611                  if (ctl & TH_ACK) {
2612 2612                          /*
2613 2613                           * Probe connection rejection here.
2614 2614                           * tcp_xmit_listeners_reset() drops non-SYN segments
2615 2615                           * that do not specify TH_ACK in their flags without
2616 2616                           * calling this function.  As a consequence, if this
2617 2617                           * function is called with a TH_RST|TH_ACK ctl argument,
2618 2618                           * it is being called in response to a SYN segment
2619 2619                           * and thus the tcp:::accept-refused probe point
2620 2620                           * is valid here.
2621 2621                           */
2622 2622                          DTRACE_TCP5(accept__refused, mblk_t *, NULL,
2623 2623                              void, NULL, void_ip_t *, mp->b_rptr, tcp_t *, NULL,
2624 2624                              tcph_t *, tcpha);
2625 2625                  }
2626 2626                  TCPS_BUMP_MIB(tcps, tcpOutRsts);
2627 2627                  TCPS_BUMP_MIB(tcps, tcpOutControl);
2628 2628          }
2629 2629  
2630 2630          /* Discard any old label */
2631 2631          if (ixa->ixa_free_flags & IXA_FREE_TSL) {
2632 2632                  ASSERT(ixa->ixa_tsl != NULL);
2633 2633                  label_rele(ixa->ixa_tsl);
2634 2634                  ixa->ixa_free_flags &= ~IXA_FREE_TSL;
2635 2635          }
2636 2636          ixa->ixa_tsl = ira->ira_tsl;    /* Behave as a multi-level responder */
2637 2637  
2638 2638          if (ira->ira_flags & IRAF_IPSEC_SECURE) {
2639 2639                  /*
2640 2640                   * Apply IPsec based on how IPsec was applied to
2641 2641                   * the packet that caused the RST.
2642 2642                   */
2643 2643                  if (!ipsec_in_to_out(ira, ixa, mp, ipha, ip6h)) {
2644 2644                          BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
2645 2645                          /* Note: mp already consumed and ip_drop_packet done */
2646 2646                          goto done;
2647 2647                  }
2648 2648          } else {
2649 2649                  /*
2650 2650                   * This is in clear. The RST message we are building
2651 2651                   * here should go out in clear, independent of our policy.
2652 2652                   */
2653 2653                  ixa->ixa_flags |= IXAF_NO_IPSEC;
2654 2654          }
2655 2655  
2656 2656          DTRACE_TCP5(send, mblk_t *, NULL, ip_xmit_attr_t *, ixa,
2657 2657              __dtrace_tcp_void_ip_t *, mp->b_rptr, tcp_t *, NULL,
2658 2658              __dtrace_tcp_tcph_t *, tcpha);
2659 2659  
2660 2660          /*
2661 2661           * NOTE:  one might consider tracing a TCP packet here, but
2662 2662           * this function has no active TCP state and no tcp structure
2663 2663           * that has a trace buffer.  If we traced here, we would have
2664 2664           * to keep a local trace buffer in tcp_record_trace().
2665 2665           */
2666 2666  
2667 2667          (void) ip_output_simple(mp, ixa);
2668 2668  done:
2669 2669          ixa_cleanup(ixa);
2670 2670          if (need_refrele) {
2671 2671                  ASSERT(ixa != &ixas);
2672 2672                  ixa_refrele(ixa);
2673 2673          }
2674 2674  }
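
The db_ref check near the top of tcp_xmit_early_reset() is the usual STREAMS
copy-on-write idiom: the reply is built by rewriting the received headers in
place, which is only safe when this mblk holds the sole reference to its data
block.  A sketch of the distinction between the two DDI routines involved
(see dupb(9F) and copyb(9F)):

    mblk_t *shared = dupb(mp);  /* new mblk, same dblk; db_ref is bumped */
    mblk_t *priv = copyb(mp);   /* new mblk and new dblk; bytes are copied */
    /*
     * Writes through "shared" would be visible to other readers of mp;
     * writes through "priv" are private, which is what is needed here.
     */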
2675 2675  
2676 2676  /*
2677 2677   * Generate a "no listener here" RST in response to an "unknown" segment.
2678 2678   * connp is set by caller when RST is in response to an unexpected
2679 2679   * inbound packet for which there is active tcp state in the system.
2680 2680   * Note that we are reusing the incoming mp to construct the outgoing RST.
2681 2681   */
2682 2682  void
2683 2683  tcp_xmit_listeners_reset(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst,
2684 2684      conn_t *connp)
2685 2685  {
2686 2686          uchar_t         *rptr;
2687 2687          uint32_t        seg_len;
2688 2688          tcpha_t         *tcpha;
2689 2689          uint32_t        seg_seq;
2690 2690          uint32_t        seg_ack;
2691 2691          uint_t          flags;
2692 2692          ipha_t          *ipha;
2693 2693          ip6_t           *ip6h;
2694 2694          boolean_t       policy_present;
2695 2695          netstack_t      *ns = ipst->ips_netstack;
2696 2696          tcp_stack_t     *tcps = ns->netstack_tcp;
2697 2697          ipsec_stack_t   *ipss = tcps->tcps_netstack->netstack_ipsec;
2698 2698          uint_t          ip_hdr_len = ira->ira_ip_hdr_length;
2699 2699  
2700 2700          TCP_STAT(tcps, tcp_no_listener);
2701 2701  
2702 2702          /*
2703 2703           * DTrace this "unknown" segment as a tcp:::receive, as we did
2704 2704           * just receive something that was TCP.
2705 2705           */
2706 2706          DTRACE_TCP5(receive, mblk_t *, NULL, ip_xmit_attr_t *, NULL,
2707 2707              __dtrace_tcp_void_ip_t *, mp->b_rptr, tcp_t *, NULL,
2708 2708              __dtrace_tcp_tcph_t *, &mp->b_rptr[ip_hdr_len]);
2709 2709  
2710 2710          if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
2711 2711                  policy_present = ipss->ipsec_inbound_v4_policy_present;
2712 2712                  ipha = (ipha_t *)mp->b_rptr;
2713 2713                  ip6h = NULL;
2714 2714          } else {
2715 2715                  policy_present = ipss->ipsec_inbound_v6_policy_present;
2716 2716                  ipha = NULL;
2717 2717                  ip6h = (ip6_t *)mp->b_rptr;
2718 2718          }
2719 2719  
2720 2720          if (policy_present) {
2721 2721                  /*
2722 2722                   * The conn_t parameter is NULL because we already know
2723 2723                   * nobody's home.
2724 2724                   */
2725 2725                  mp = ipsec_check_global_policy(mp, (conn_t *)NULL, ipha, ip6h,
2726 2726                      ira, ns);
2727 2727                  if (mp == NULL)
2728 2728                          return;
2729 2729          }
2730 2730          if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) {
2731 2731                  DTRACE_PROBE2(
2732 2732                      tx__ip__log__error__nolistener__tcp,
2733 2733                      char *, "Could not reply with RST to mp(1)",
2734 2734                      mblk_t *, mp);
2735 2735                  ip2dbg(("tcp_xmit_listeners_reset: not permitted to reply\n"));
2736 2736                  freemsg(mp);
2737 2737                  return;
2738 2738          }
2739 2739  
2740 2740          rptr = mp->b_rptr;
2741 2741  
2742 2742          tcpha = (tcpha_t *)&rptr[ip_hdr_len];
2743 2743          seg_seq = ntohl(tcpha->tha_seq);
2744 2744          seg_ack = ntohl(tcpha->tha_ack);
2745 2745          flags = tcpha->tha_flags;
2746 2746  
2747 2747          seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcpha) + ip_hdr_len);
2748 2748          if (flags & TH_RST) {
2749 2749                  freemsg(mp);
2750 2750          } else if (flags & TH_ACK) {
2751 2751                  tcp_xmit_early_reset("no tcp, reset", mp, seg_ack, 0, TH_RST,
2752 2752                      ira, ipst, connp);
2753 2753          } else {
2754 2754                  if (flags & TH_SYN) {
2755 2755                          seg_len++;
2756 2756                  } else {
2757 2757                          /*
2758 2758                           * Here we violate the RFC.  Note that a normal
2759 2759                           * TCP will never send a segment without the ACK
2760 2760                           * flag, except for RST or SYN segment.  This
2761 2761                           * segment is neither.  Just drop it on the
2762 2762                           * floor.
2763 2763                           */
2764 2764                          freemsg(mp);
2765 2765                          TCP_STAT(tcps, tcp_rst_unsent);
2766 2766                          return;
2767 2767                  }
2768 2768  
2769 2769                  tcp_xmit_early_reset("no tcp, reset/ack", mp, 0,
2770 2770                      seg_seq + seg_len, TH_RST | TH_ACK, ira, ipst, connp);
2771 2771          }
2772 2772  }
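
The seq/ack selection above implements the RFC 793 reset-generation rules for
a segment with no matching connection.  A condensed sketch, where send_rst()
is a hypothetical stand-in for tcp_xmit_early_reset():

    if (flags & TH_RST) {
            /* never answer a RST with a RST */
    } else if (flags & TH_ACK) {
            send_rst(seg_ack, 0, TH_RST);           /* <SEQ=SEG.ACK> */
    } else {
            if (flags & TH_SYN)
                    seg_len++;          /* a SYN occupies one sequence */
            send_rst(0, seg_seq + seg_len, TH_RST | TH_ACK);
            /* <SEQ=0><ACK=SEG.SEQ+SEG.LEN> */
    }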
2773 2773  
2774 2774  /*
2775 2775   * Helper function for tcp_xmit_mp() in handling connection set up flag
2776 2776   * options setting.
2777 2777   */
2778 2778  static void
2779 2779  tcp_xmit_mp_aux_iss(tcp_t *tcp, conn_t *connp, tcpha_t *tcpha, mblk_t *mp,
2780 2780      uint_t *flags)
2781 2781  {
2782 2782          uint32_t u1;
2783 2783          uint8_t *wptr = mp->b_wptr;
2784 2784          tcp_stack_t *tcps = tcp->tcp_tcps;
2785 2785          boolean_t add_sack = B_FALSE;
2786 2786  
2787 2787          /*
2788 2788           * If TCP_ISS_VALID and the seq number is tcp_iss,
2789 2789           * TCP can only be in SYN-SENT, SYN-RCVD or
2790 2790           * FIN-WAIT-1 state.  It can be FIN-WAIT-1 if
2791 2791           * our SYN is not ack'ed but the app closes this
2792 2792           * TCP connection.
2793 2793           */
2794 2794          ASSERT(tcp->tcp_state == TCPS_SYN_SENT ||
2795 2795              tcp->tcp_state == TCPS_SYN_RCVD ||
2796 2796              tcp->tcp_state == TCPS_FIN_WAIT_1);
2797 2797  
2798 2798          /*
2799 2799           * Tack on the MSS option.  It is always needed
2800 2800           * for both active and passive open.
2801 2801           *
2802 2802           * MSS option value should be interface MTU - MIN
2803 2803           * TCP/IP header according to RFC 793 as it means
2804 2804           * the maximum segment size TCP can receive.  But
2805 2805           * to get around some broken middle boxes/end hosts
2806 2806           * out there, we allow the option value to be the
2807 2807           * same as the MSS option size on the peer side.
2808 2808           * In this way, the other side will not send
2809 2809           * anything larger than they can receive.
2810 2810           *
2811 2811           * Note that for SYN_SENT state, the ndd param
2812 2812           * tcp_use_smss_as_mss_opt has no effect as we
2813 2813           * don't know the peer's MSS option value. So
2814 2814           * the only case we need to take care of is in
2815 2815           * SYN_RCVD state, which is done later.
2816 2816           */
2817 2817          wptr[0] = TCPOPT_MAXSEG;
2818 2818          wptr[1] = TCPOPT_MAXSEG_LEN;
2819 2819          wptr += 2;
2820 2820          u1 = tcp->tcp_initial_pmtu - (connp->conn_ipversion == IPV4_VERSION ?
2821 2821              IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) - TCP_MIN_HEADER_LENGTH;
2822 2822          U16_TO_BE16(u1, wptr);
2823 2823          wptr += 2;
2824 2824  
2825 2825          /* Update the offset to cover the additional word */
2826 2826          tcpha->tha_offset_and_reserved += (1 << 4);
2827 2827  
2828 2828          switch (tcp->tcp_state) {
2829 2829          case TCPS_SYN_SENT:
2830 2830                  *flags = TH_SYN;
2831 2831  
2832 2832                  if (tcp->tcp_snd_sack_ok)
2833 2833                          add_sack = B_TRUE;
2834 2834  
2835 2835                  if (tcp->tcp_snd_ts_ok) {
2836 2836                          uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
2837 2837  
2838 2838                          if (add_sack) {
2839 2839                                  wptr[0] = TCPOPT_SACK_PERMITTED;
2840 2840                                  wptr[1] = TCPOPT_SACK_OK_LEN;
2841 2841                                  add_sack = B_FALSE;
2842 2842                          } else {
2843 2843                                  wptr[0] = TCPOPT_NOP;
2844 2844                                  wptr[1] = TCPOPT_NOP;
2845 2845                          }
2846 2846                          wptr[2] = TCPOPT_TSTAMP;
2847 2847                          wptr[3] = TCPOPT_TSTAMP_LEN;
2848 2848                          wptr += 4;
2849 2849                          U32_TO_BE32(llbolt, wptr);
2850 2850                          wptr += 4;
2851 2851                          ASSERT(tcp->tcp_ts_recent == 0);
2852 2852                          U32_TO_BE32(0L, wptr);
2853 2853                          wptr += 4;
2854 2854                          tcpha->tha_offset_and_reserved += (3 << 4);
2855 2855                  }
2856 2856  
2857 2857                  /*
2858 2858                   * Set up all the bits to tell other side
2859 2859                   * we are ECN capable.
2860 2860                   */
2861 2861                  if (tcp->tcp_ecn_ok)
2862 2862                          *flags |= (TH_ECE | TH_CWR);
2863 2863  
2864 2864                  break;
2865 2865  
2866 2866          case TCPS_SYN_RCVD:
2867 2867                  *flags |= TH_SYN;
2868 2868  
2869 2869                  /*
2870 2870                   * Reset the MSS option value to be SMSS
2871 2871                   * We should probably add back the bytes
2872 2872                   * for timestamp option and IPsec.  We
2873 2873                   * don't do that as this is a workaround
2874 2874                   * for broken middle boxes/end hosts, it
2875 2875                   * is better for us to be more cautious.
2876 2876                   * They may not take these things into
2877 2877                   * account in their SMSS calculation.  Thus
2878 2878                   * the peer's calculated SMSS may be smaller
2879 2879                   * than what it can be.  This should be OK.
2880 2880                   */
2881 2881                  if (tcps->tcps_use_smss_as_mss_opt) {
2882 2882                          u1 = tcp->tcp_mss;
2883 2883                          /*
2884 2884                           * Note that wptr points just past the MSS
2885 2885                           * option value.
2886 2886                           */
2887 2887                          U16_TO_BE16(u1, wptr - 2);
2888 2888                  }
2889 2889  
2890 2890                  /*
2891 2891                   * tcp_snd_ts_ok can only be set in TCPS_SYN_RCVD
2892 2892                   * when the peer also uses timestamps option.  And
2893 2893                   * the TCP header template must have already been
2894 2894                   * updated to include the timestamps option.
2895 2895                   */
2896 2896                  if (tcp->tcp_snd_sack_ok) {
2897 2897                          if (tcp->tcp_snd_ts_ok) {
2898 2898                                  uint8_t *tmp_wptr;
2899 2899  
2900 2900                                  /*
2901 2901                                   * Use the NOP in the header just
2902 2902                                   * before timestamps option.
2903 2903                                   */
2904 2904                                  tmp_wptr = (uint8_t *)tcpha +
2905 2905                                      TCP_MIN_HEADER_LENGTH;
2906 2906                                  ASSERT(tmp_wptr[0] == TCPOPT_NOP &&
2907 2907                                      tmp_wptr[1] == TCPOPT_NOP);
2908 2908                                  tmp_wptr[0] = TCPOPT_SACK_PERMITTED;
2909 2909                                  tmp_wptr[1] = TCPOPT_SACK_OK_LEN;
2910 2910                          } else {
2911 2911                                  add_sack = B_TRUE;
2912 2912                          }
2913 2913                  }
2914 2914  
2915 2915  
2916 2916                  /*
2917 2917                   * If the other side is ECN capable, reply
2918 2918                   * that we are also ECN capable.
2919 2919                   */
2920 2920                  if (tcp->tcp_ecn_ok)
2921 2921                          *flags |= TH_ECE;
2922 2922                  break;
2923 2923  
2924 2924          default:
2925 2925                  /*
2926 2926                   * The above ASSERT() makes sure that this
2927 2927                   * must be FIN-WAIT-1 state.  Our SYN has
2928 2928                   * not been ack'ed so retransmit it.
2929 2929                   */
2930 2930                  *flags |= TH_SYN;
2931 2931                  break;
2932 2932          }
2933 2933  
2934 2934          if (add_sack) {
2935 2935                  wptr[0] = TCPOPT_NOP;
2936 2936                  wptr[1] = TCPOPT_NOP;
2937 2937                  wptr[2] = TCPOPT_SACK_PERMITTED;
2938 2938                  wptr[3] = TCPOPT_SACK_OK_LEN;
2939 2939                  wptr += TCPOPT_REAL_SACK_OK_LEN;
2940 2940                  tcpha->tha_offset_and_reserved += (1 << 4);
2941 2941          }
2942 2942  
2943 2943          if (tcp->tcp_snd_ws_ok) {
2944 2944                  wptr[0] =  TCPOPT_NOP;
2945 2945                  wptr[1] =  TCPOPT_WSCALE;
2946 2946                  wptr[2] =  TCPOPT_WS_LEN;
2947 2947                  wptr[3] = (uchar_t)tcp->tcp_rcv_ws;
2948 2948                  wptr += TCPOPT_REAL_WS_LEN;
2949 2949                  tcpha->tha_offset_and_reserved += (1 << 4);
2950 2950          }
2951 2951  
2952 2952          mp->b_wptr = wptr;
2953 2953          u1 = (int)(mp->b_wptr - mp->b_rptr);
2954 2954          /*
2955 2955           * Get IP set to checksum on our behalf
2956 2956           * Include the adjustment for a source route if any.
2957 2957           */
2958 2958          u1 += connp->conn_sum;
2959 2959          u1 = (u1 >> 16) + (u1 & 0xFFFF);
2960 2960          tcpha->tha_sum = htons(u1);
2961 2961          TCPS_BUMP_MIB(tcps, tcpOutControl);
2962 2962  }
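
The MSS option emitted at the top of this helper uses the fixed 4-byte
kind/length/value encoding.  A sketch assuming an IPv4 path MTU of 1500
(20-byte IP header, 20-byte minimum TCP header):

    uint8_t opt[TCPOPT_MAXSEG_LEN];
    uint16_t mss = 1500 - IP_SIMPLE_HDR_LENGTH - TCP_MIN_HEADER_LENGTH;

    opt[0] = TCPOPT_MAXSEG;         /* kind = 2 */
    opt[1] = TCPOPT_MAXSEG_LEN;     /* length = 4 */
    U16_TO_BE16(mss, &opt[2]);      /* 1460 in network byte order */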
2963 2963  
2964 2964  /*
2965 2965   * Helper function for tcp_xmit_mp() in handling connection tear down
2966 2966   * flag setting and state changes.
2967 2967   */
2968 2968  static void
2969 2969  tcp_xmit_mp_aux_fss(tcp_t *tcp, ip_xmit_attr_t *ixa, uint_t *flags)
2970 2970  {
2971 2971          if (!tcp->tcp_fin_acked) {
2972 2972                  *flags |= TH_FIN;
2973 2973                  TCPS_BUMP_MIB(tcp->tcp_tcps, tcpOutControl);
2974 2974          }
2975 2975          if (!tcp->tcp_fin_sent) {
2976 2976                  tcp->tcp_fin_sent = B_TRUE;
2977 2977                  switch (tcp->tcp_state) {
2978 2978                  case TCPS_SYN_RCVD:
2979 2979                          tcp->tcp_state = TCPS_FIN_WAIT_1;
2980 2980                          DTRACE_TCP6(state__change, void, NULL,
2981 2981                              ip_xmit_attr_t *, ixa, void, NULL,
2982 2982                              tcp_t *, tcp, void, NULL,
2983 2983                              int32_t, TCPS_SYN_RCVD);
2984 2984                          break;
2985 2985                  case TCPS_ESTABLISHED:
2986 2986                          tcp->tcp_state = TCPS_FIN_WAIT_1;
2987 2987                          DTRACE_TCP6(state__change, void, NULL,
2988 2988                              ip_xmit_attr_t *, ixa, void, NULL,
2989 2989                              tcp_t *, tcp, void, NULL,
2990 2990                              int32_t, TCPS_ESTABLISHED);
2991 2991                          break;
2992 2992                  case TCPS_CLOSE_WAIT:
2993 2993                          tcp->tcp_state = TCPS_LAST_ACK;
2994 2994                          DTRACE_TCP6(state__change, void, NULL,
2995 2995                              ip_xmit_attr_t *, ixa, void, NULL,
2996 2996                              tcp_t *, tcp, void, NULL,
2997 2997                              int32_t, TCPS_CLOSE_WAIT);
2998 2998                          break;
2999 2999                  }
3000 3000                  if (tcp->tcp_suna == tcp->tcp_snxt)
3001 3001                          TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
3002 3002                  tcp->tcp_snxt = tcp->tcp_fss + 1;
3003 3003          }
3004 3004  }
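
For orientation, the transitions in tcp_xmit_mp_aux_fss() are the three
FIN-sending edges of the TCP state machine:

    SYN_RCVD    --send FIN-->  FIN_WAIT_1
    ESTABLISHED --send FIN-->  FIN_WAIT_1
    CLOSE_WAIT  --send FIN-->  LAST_ACK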
3005 3005  
3006 3006  /*
3007 3007   * tcp_xmit_mp is called to return a pointer to an mblk chain complete with
3008 3008   * ip and tcp header ready to pass down to IP.  If the mp passed in is
3009 3009   * non-NULL, then up to max_to_send bytes of data will be dup'ed off that
3010 3010   * mblk. (If sendall is not set, the dup'ing will stop at an mblk boundary;
3011 3011   * otherwise it will dup partial mblks.)
3012 3012   * Otherwise, an appropriate ACK packet will be generated.  This
3013 3013   * routine is not usually called to send new data for the first time.  It
3014 3014   * is mostly called out of the timer for retransmits, and to generate ACKs.
3015 3015   *
3016 3016   * If offset is not NULL, the returned mblk chain's first mblk's b_rptr will
3017 3017   * be adjusted by *offset.  And after dupb(), the offset and the ending mblk
3018 3018   * of the original mblk chain will be returned in *offset and *end_mp.
3019 3019   */
3020 3020  mblk_t *
3021 3021  tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset,
3022 3022      mblk_t **end_mp, uint32_t seq, boolean_t sendall, uint32_t *seg_len,
3023 3023      boolean_t rexmit)
3024 3024  {
3025 3025          int     data_length;
3026 3026          int32_t off = 0;
3027 3027          uint_t  flags;
3028 3028          mblk_t  *mp1;
3029 3029          mblk_t  *mp2;
3030 3030          uchar_t *rptr;
3031 3031          tcpha_t *tcpha;
3032 3032          int32_t num_sack_blk = 0;
3033 3033          int32_t sack_opt_len = 0;
3034 3034          tcp_stack_t     *tcps = tcp->tcp_tcps;
3035 3035          conn_t          *connp = tcp->tcp_connp;
3036 3036          ip_xmit_attr_t  *ixa = connp->conn_ixa;
3037 3037  
3038 3038          /* Allocate for our maximum TCP header + link-level */
3039 3039          mp1 = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra,
3040 3040              BPRI_MED);
3041 3041          if (mp1 == NULL)
3042 3042                  return (NULL);
3043 3043          data_length = 0;
3044 3044  
3045 3045          /*
3046 3046           * Note that tcp_mss has been adjusted to take into account the
3047 3047           * timestamp option if applicable.  Because SACK options do not
3048 3048           * appear in every TCP segment and they are of variable lengths,
3049 3049           * they cannot be included in tcp_mss.  Thus we need to calculate
3050 3050           * the actual segment length when we need to send a segment which
3051 3051           * includes SACK options.
3052 3052           */
3053 3053          if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
3054 3054                  num_sack_blk = MIN(tcp->tcp_max_sack_blk,
3055 3055                      tcp->tcp_num_sack_blk);
3056 3056                  sack_opt_len = num_sack_blk * sizeof (sack_blk_t) +
3057 3057                      TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN;
3058 3058                  if (max_to_send + sack_opt_len > tcp->tcp_mss)
3059 3059                          max_to_send -= sack_opt_len;
3060 3060          }
3061 3061  
3062 3062          if (offset != NULL) {
3063 3063                  off = *offset;
3064 3064                  /* We use offset as an indicator that end_mp is not NULL. */
3065 3065                  *end_mp = NULL;
3066 3066          }
3067 3067          for (mp2 = mp1; mp && data_length != max_to_send; mp = mp->b_cont) {
3068 3068                  /* This could be faster with cooperation from downstream */
3069 3069                  if (mp2 != mp1 && !sendall &&
3070 3070                      data_length + (int)(mp->b_wptr - mp->b_rptr) >
3071 3071                      max_to_send)
3072 3072                          /*
3073 3073                           * Don't send the next mblk since the whole mblk
3074 3074                           * does not fit.
3075 3075                           */
3076 3076                          break;
3077 3077                  mp2->b_cont = dupb(mp);
3078 3078                  mp2 = mp2->b_cont;
3079 3079                  if (!mp2) {
3080 3080                          freemsg(mp1);
3081 3081                          return (NULL);
3082 3082                  }
3083 3083                  mp2->b_rptr += off;
3084 3084                  ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <=
3085 3085                      (uintptr_t)INT_MAX);
3086 3086  
3087 3087                  data_length += (int)(mp2->b_wptr - mp2->b_rptr);
3088 3088                  if (data_length > max_to_send) {
3089 3089                          mp2->b_wptr -= data_length - max_to_send;
3090 3090                          data_length = max_to_send;
3091 3091                          off = mp2->b_wptr - mp->b_rptr;
3092 3092                          break;
3093 3093                  } else {
3094 3094                          off = 0;
3095 3095                  }
3096 3096          }
3097 3097          if (offset != NULL) {
3098 3098                  *offset = off;
3099 3099                  *end_mp = mp;
3100 3100          }
3101 3101          if (seg_len != NULL) {
3102 3102                  *seg_len = data_length;
3103 3103          }
3104 3104  
3105 3105          /* Update the latest receive window size in TCP header. */
3106 3106          tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
3107 3107  
3108 3108          rptr = mp1->b_rptr + tcps->tcps_wroff_xtra;
3109 3109          mp1->b_rptr = rptr;
3110 3110          mp1->b_wptr = rptr + connp->conn_ht_iphc_len + sack_opt_len;
3111 3111          bcopy(connp->conn_ht_iphc, rptr, connp->conn_ht_iphc_len);
3112 3112          tcpha = (tcpha_t *)&rptr[ixa->ixa_ip_hdr_length];
3113 3113          tcpha->tha_seq = htonl(seq);
3114 3114  
3115 3115          /*
3116 3116           * Using tcp_unsent to determine if the PUSH bit should be set assumes
3117 3117           * that this function was called from tcp_wput_data.  Thus, when called
3118 3118           * to retransmit data, the setting of the PUSH bit may appear somewhat
3119 3119           * random in that it might get set when it should not.  This
3120 3120           * should not pose any performance issues.
3121 3121           */
3122 3122          if (data_length != 0 && (tcp->tcp_unsent == 0 ||
3123 3123              tcp->tcp_unsent == data_length)) {
3124 3124                  flags = TH_ACK | TH_PUSH;
3125 3125          } else {
3126 3126                  flags = TH_ACK;
3127 3127          }
3128 3128  
3129 3129          if (tcp->tcp_ecn_ok) {
3130 3130                  if (tcp->tcp_ecn_echo_on)
3131 3131                          flags |= TH_ECE;
3132 3132  
3133 3133                  /*
3134 3134                   * Only set ECT bit and ECN_CWR if a segment contains new data.
3135 3135                   * There is no TCP flow control for non-data segments, and
3136 3136                   * only data segment is transmitted reliably.
3137 3137                   */
3138 3138                  if (data_length > 0 && !rexmit) {
3139 3139                          TCP_SET_ECT(tcp, rptr);
3140 3140                          if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) {
3141 3141                                  flags |= TH_CWR;
3142 3142                                  tcp->tcp_ecn_cwr_sent = B_TRUE;
3143 3143                          }
3144 3144                  }
3145 3145          }
3146 3146  
3147 3147          /* Check if any special processing needs to be done. */
3148 3148          if (tcp->tcp_valid_bits) {
3149 3149                  uint32_t u1;
3150 3150  
3151 3151                  /* We don't allow having SYN and FIN in the same segment... */
3152 3152                  if ((tcp->tcp_valid_bits & TCP_ISS_VALID) &&
3153 3153                      seq == tcp->tcp_iss) {
3154 3154                          /* Need to do connection set up processing. */
3155 3155                          tcp_xmit_mp_aux_iss(tcp, connp, tcpha, mp1, &flags);
3156 3156                  } else if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
3157 3157                      (seq + data_length) == tcp->tcp_fss) {
3158 3158                          /* Need to do connection tear down processing. */
3159 3159                          tcp_xmit_mp_aux_fss(tcp, ixa, &flags);
3160 3160                  }
3161 3161  
3162 3162                  /*
3163 3163                   * Need to do urgent pointer processing.
3164 3164                   *
3165 3165                   * Note the trick here.  u1 is unsigned.  When tcp_urg
3166 3166           * is smaller than seq, u1 will wrap around to a huge value.
3167 3167                   * So the comparison will fail.  Also note that tcp_urp
3168 3168                   * should be positive, see RFC 793 page 17.
3169 3169                   */
3170 3170                  u1 = tcp->tcp_urg - seq + TCP_OLD_URP_INTERPRETATION;
3171 3171                  if ((tcp->tcp_valid_bits & TCP_URG_VALID) && u1 != 0 &&
3172 3172                      u1 < (uint32_t)(64 * 1024)) {
3173 3173                          flags |= TH_URG;
3174 3174                          TCPS_BUMP_MIB(tcps, tcpOutUrg);
3175 3175                          tcpha->tha_urp = htons(u1);
3176 3176                  }
3177 3177          }
3178 3178          tcpha->tha_flags = (uchar_t)flags;
3179 3179          tcp->tcp_rack = tcp->tcp_rnxt;
3180 3180          tcp->tcp_rack_cnt = 0;
3181 3181  
3182 3182          /* Fill in the current value of timestamps option. */
3183 3183          if (tcp->tcp_snd_ts_ok) {
3184 3184                  if (tcp->tcp_state != TCPS_SYN_SENT) {
3185 3185                          uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
3186 3186  
3187 3187                          U32_TO_BE32(llbolt,
3188 3188                              (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
3189 3189                          U32_TO_BE32(tcp->tcp_ts_recent,
3190 3190                              (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
3191 3191                  }
3192 3192          }
3193 3193  
3194 3194          /* Fill in the SACK blocks. */
3195 3195          if (num_sack_blk > 0) {
3196 3196                  uchar_t *wptr = (uchar_t *)tcpha + connp->conn_ht_ulp_len;
3197 3197                  sack_blk_t *tmp;
3198 3198                  int32_t i;
3199 3199  
3200 3200                  wptr[0] = TCPOPT_NOP;
3201 3201                  wptr[1] = TCPOPT_NOP;
3202 3202                  wptr[2] = TCPOPT_SACK;
3203 3203                  wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk *
3204 3204                      sizeof (sack_blk_t);
3205 3205                  wptr += TCPOPT_REAL_SACK_LEN;
3206 3206  
3207 3207                  tmp = tcp->tcp_sack_list;
3208 3208                  for (i = 0; i < num_sack_blk; i++) {
3209 3209                          U32_TO_BE32(tmp[i].begin, wptr);
3210 3210                          wptr += sizeof (tcp_seq);
3211 3211                          U32_TO_BE32(tmp[i].end, wptr);
3212 3212                          wptr += sizeof (tcp_seq);
3213 3213                  }
3214 3214                  tcpha->tha_offset_and_reserved += ((num_sack_blk * 2 + 1) << 4);
3215 3215          }
3216 3216          ASSERT((uintptr_t)(mp1->b_wptr - rptr) <= (uintptr_t)INT_MAX);
3217 3217          data_length += (int)(mp1->b_wptr - rptr);
3218 3218  
3219 3219          ixa->ixa_pktlen = data_length;
3220 3220  
3221 3221          if (ixa->ixa_flags & IXAF_IS_IPV4) {
3222 3222                  ((ipha_t *)rptr)->ipha_length = htons(data_length);
3223 3223          } else {
3224 3224                  ip6_t *ip6 = (ip6_t *)rptr;
3225 3225  
3226 3226                  ip6->ip6_plen = htons(data_length - IPV6_HDR_LEN);
3227 3227          }
3228 3228  
3229 3229          /*
3230 3230           * Prime pump for IP
3231 3231           * Include the adjustment for a source route if any.
3232 3232           */
3233 3233          data_length -= ixa->ixa_ip_hdr_length;
3234 3234          data_length += connp->conn_sum;
3235 3235          data_length = (data_length >> 16) + (data_length & 0xFFFF);
3236 3236          tcpha->tha_sum = htons(data_length);
3237 3237          if (tcp->tcp_ip_forward_progress) {
3238 3238                  tcp->tcp_ip_forward_progress = B_FALSE;
3239 3239                  connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF;
3240 3240          } else {
3241 3241                  connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF;
3242 3242          }
3243 3243          return (mp1);
3244 3244  }
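
The urgent-pointer "trick" commented inside tcp_xmit_mp() relies on unsigned
wraparound: a single subtraction tests both that tcp_urg is not behind the
sequence being sent and that it lies within 64 KB of it.  A sketch with
assumed values:

    uint32_t seq = 1000, urg = 900;     /* urgent data already sent */
    uint32_t u1 = urg - seq + TCP_OLD_URP_INTERPRETATION;
    /*
     * u1 wraps to 4294967197, so the u1 < 64 * 1024 test fails and
     * TH_URG is not set.  With urg = 1900 instead, u1 == 901 and the
     * urgent pointer would be emitted.
     */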
3245 3245  
3246 3246  /*
3247 3247   * If this routine returns B_TRUE, TCP can generate a RST in response
3248 3248   * to a segment.  If it returns B_FALSE, TCP should not respond.
3249 3249   */
3250 3250  static boolean_t
3251 3251  tcp_send_rst_chk(tcp_stack_t *tcps)
3252 3252  {
3253 3253          int64_t now;
3254 3254  
3255 3255          /*
3256 3256           * TCP needs to protect itself from generating too many RSTs.
3257 3257           * An attacker can mount a DoS attack by sending us random segments
3258 3258           * soliciting RSTs.
3259 3259           *
3260 3260           * What we do here is to allow at most tcp_rst_sent_rate RSTs
3261 3261           * in each 1 second interval.  In this way, TCP still generates
3262 3262           * RSTs in normal cases but when under attack, the impact is
3263 3263           * limited.
3264 3264           */
3265 3265          if (tcps->tcps_rst_sent_rate_enabled != 0) {
3266 3266                  now = ddi_get_lbolt64();
3267 3267                  if (TICK_TO_MSEC(now - tcps->tcps_last_rst_intrvl) >
3268 3268                      1*SECONDS) {
3269 3269                          tcps->tcps_last_rst_intrvl = now;
3270 3270                          tcps->tcps_rst_cnt = 1;
3271 3271                  } else if (++tcps->tcps_rst_cnt > tcps->tcps_rst_sent_rate) {
3272 3272                          return (B_FALSE);
3273 3273                  }
3274 3274          }
3275 3275          return (B_TRUE);
3276 3276  }
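
The limiter above is a fixed-window counter.  A user-level analogue
(hypothetical helper; times in milliseconds):

    static int64_t last_intrvl;     /* start of the current window */
    static uint32_t rst_cnt;        /* RSTs counted in that window */

    static boolean_t
    rst_ok(int64_t now, uint32_t rate)
    {
            if (now - last_intrvl > 1000) {
                    last_intrvl = now;      /* open a new 1-second window */
                    rst_cnt = 1;
                    return (B_TRUE);
            }
            return (++rst_cnt <= rate ? B_TRUE : B_FALSE);
    }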
3277 3277  
3278 3278  /*
3279 3279   * This function handles all retransmissions if SACK is enabled for this
3280 3280   * connection.  First it calculates how many segments can be retransmitted
3281 3281   * based on tcp_pipe.  Then it goes thru the notsack list to find eligible
3282 3282   * segments.  A segment is eligible if sack_cnt for that segment is greater
3283 3283   * than or equal to tcp_dupack_fast_retransmit.  After it has retransmitted
3284 3284   * all eligible segments, it checks to see if TCP can send some new segments
3285 3285   * (fast recovery).  If it can, set the appropriate flag for tcp_input_data().
3286 3286   *
3287 3287   * Parameters:
3288 3288   *      tcp_t *tcp: the tcp structure of the connection.
3289 3289   *      uint_t *flags: in return, appropriate value will be set for
3290 3290   *      tcp_input_data().
3291 3291   */
3292 3292  void
3293 3293  tcp_sack_rexmit(tcp_t *tcp, uint_t *flags)
3294 3294  {
3295 3295          notsack_blk_t   *notsack_blk;
3296 3296          int32_t         usable_swnd;
3297 3297          int32_t         mss;
3298 3298          uint32_t        seg_len;
3299 3299          mblk_t          *xmit_mp;
3300 3300          tcp_stack_t     *tcps = tcp->tcp_tcps;
3301 3301  
3302 3302          ASSERT(tcp->tcp_notsack_list != NULL);
3303 3303          ASSERT(tcp->tcp_rexmit == B_FALSE);
3304 3304  
3305 3305          /* Defensive coding in case there is a bug... */
3306 3306          if (tcp->tcp_notsack_list == NULL) {
3307 3307                  return;
3308 3308          }
3309 3309          notsack_blk = tcp->tcp_notsack_list;
3310 3310          mss = tcp->tcp_mss;
3311 3311  
3312 3312          /*
3313 3313           * Limit the amount of outstanding data in the network to
3314 3314           * tcp_cwnd_ssthresh, which is half of the original congestion wnd.
3315 3315           */
3316 3316          usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe;
3317 3317  
3318 3318          /* At least retransmit 1 MSS of data. */
3319 3319          if (usable_swnd <= 0) {
3320 3320                  usable_swnd = mss;
3321 3321          }
3322 3322  
3323 3323          /* Make sure no new RTT samples will be taken. */
3324 3324          tcp->tcp_csuna = tcp->tcp_snxt;
3325 3325  
3326 3326          notsack_blk = tcp->tcp_notsack_list;
3327 3327          while (usable_swnd > 0) {
3328 3328                  mblk_t          *snxt_mp, *tmp_mp;
3329 3329                  tcp_seq         begin = tcp->tcp_sack_snxt;
3330 3330                  tcp_seq         end;
3331 3331                  int32_t         off;
3332 3332  
3333 3333                  for (; notsack_blk != NULL; notsack_blk = notsack_blk->next) {
3334 3334                          if (SEQ_GT(notsack_blk->end, begin) &&
3335 3335                              (notsack_blk->sack_cnt >=
3336 3336                              tcps->tcps_dupack_fast_retransmit)) {
3337 3337                                  end = notsack_blk->end;
3338 3338                                  if (SEQ_LT(begin, notsack_blk->begin)) {
3339 3339                                          begin = notsack_blk->begin;
3340 3340                                  }
3341 3341                                  break;
3342 3342                          }
3343 3343                  }
3344 3344                  /*
3345 3345                   * All holes are filled.  Manipulate tcp_cwnd to send more
3346 3346                   * if we can.  Note that after the SACK recovery, tcp_cwnd is
3347 3347                   * set to tcp_cwnd_ssthresh.
3348 3348                   */
3349 3349                  if (notsack_blk == NULL) {
3350 3350                          usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe;
3351 3351                          if (usable_swnd <= 0 || tcp->tcp_unsent == 0) {
3352 3352                                  tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna;
3353 3353                                  ASSERT(tcp->tcp_cwnd > 0);
3354 3354                                  return;
3355 3355                          } else {
3356 3356                                  usable_swnd = usable_swnd / mss;
3357 3357                                  tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna +
3358 3358                                      MAX(usable_swnd * mss, mss);
3359 3359                                  *flags |= TH_XMIT_NEEDED;
3360 3360                                  return;
3361 3361                          }
3362 3362                  }
3363 3363  
3364 3364                  /*
3365 3365                   * Note that we may send more than usable_swnd allows here
3366 3366                   * because of round off, but no more than 1 MSS of data.
3367 3367                   */
3368 3368                  seg_len = end - begin;
3369 3369                  if (seg_len > mss)
3370 3370                          seg_len = mss;
3371 3371                  snxt_mp = tcp_get_seg_mp(tcp, begin, &off);
3372 3372                  ASSERT(snxt_mp != NULL);
3373 3373                  /* This should not happen.  Defensive coding again... */
3374 3374                  if (snxt_mp == NULL) {
3375 3375                          return;
3376 3376                  }
3377 3377  
3378 3378                  xmit_mp = tcp_xmit_mp(tcp, snxt_mp, seg_len, &off,
3379 3379                      &tmp_mp, begin, B_TRUE, &seg_len, B_TRUE);
3380 3380                  if (xmit_mp == NULL)
3381 3381                          return;
  
3382 3382  
3383 3383                  usable_swnd -= seg_len;
3384 3384                  tcp->tcp_pipe += seg_len;
3385 3385                  tcp->tcp_sack_snxt = begin + seg_len;
3386 3386  
3387 3387                  tcp_send_data(tcp, xmit_mp);
3388 3388  
3389 3389                  /*
3390 3390                   * Update the send timestamp to avoid false retransmission.
3391 3391                   */
3392      -                snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
     3392 +                snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
3393 3393  
3394 3394                  TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3395 3395                  TCPS_UPDATE_MIB(tcps, tcpRetransBytes, seg_len);
3396 3396                  TCPS_BUMP_MIB(tcps, tcpOutSackRetransSegs);
3397 3397                  /*
3398 3398                   * Update tcp_rexmit_max to extend this SACK recovery phase.
3399 3399                   * This happens when new data sent during fast recovery is
3400 3400                   * also lost.  If TCP retransmits that new data, it needs
3401 3401                   * to extend the SACK recovery phase to avoid starting another
3402 3402                   * fast retransmit/recovery unnecessarily.
3403 3403                   */
3404 3404                  if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) {
3405 3405                          tcp->tcp_rexmit_max = tcp->tcp_sack_snxt;
3406 3406                  }
3407 3407          }
3408 3408  }
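
Editor's note: the scan above leans on the wraparound-safe sequence comparisons (SEQ_LT, SEQ_GT) rather than plain relational operators. A minimal user-space sketch of why that matters, with the macros restated in their conventional form (illustrative, not copied from this codebase):

#include <stdint.h>
#include <stdio.h>

/* Conventional wraparound-safe sequence comparisons (restated here). */
#define	SEQ_LT(a, b)	((int32_t)((a) - (b)) < 0)
#define	SEQ_GT(a, b)	((int32_t)((a) - (b)) > 0)

int
main(void)
{
	uint32_t begin = 0xfffffff0U;	/* just before the 32-bit wrap */
	uint32_t end = 0x00000010U;	/* just after the wrap */

	/* Plain '<' says end comes first; the casted difference is right. */
	printf("plain <: %d  SEQ_LT: %d\n", begin < end, SEQ_LT(begin, end));
	return (0);
}

The cast to a signed 32-bit difference keeps the ordering correct as long as the two sequence numbers are within 2^31 of each other.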
3409 3409  
3410 3410  /*
3411 3411   * tcp_ss_rexmit() is called to do slow start retransmission after a timeout
3412 3412   * or ICMP errors.
3413 3413   */
3414 3414  void
3415 3415  tcp_ss_rexmit(tcp_t *tcp)
3416 3416  {
3417 3417          uint32_t        snxt;
3418 3418          uint32_t        smax;
3419 3419          int32_t         win;
3420 3420          int32_t         mss;
3421 3421          int32_t         off;
3422 3422          mblk_t          *snxt_mp;
3423 3423          tcp_stack_t     *tcps = tcp->tcp_tcps;
3424 3424  
3425 3425          /*
3426 3426           * Note that tcp_rexmit can be set even though TCP has retransmitted
3427 3427           * all unack'ed segments.
3428 3428           */
3429 3429          if (SEQ_LT(tcp->tcp_rexmit_nxt, tcp->tcp_rexmit_max)) {
3430 3430                  smax = tcp->tcp_rexmit_max;
3431 3431                  snxt = tcp->tcp_rexmit_nxt;
3432 3432                  if (SEQ_LT(snxt, tcp->tcp_suna)) {
3433 3433                          snxt = tcp->tcp_suna;
3434 3434                  }
3435 3435                  win = MIN(tcp->tcp_cwnd, tcp->tcp_swnd);
3436 3436                  win -= snxt - tcp->tcp_suna;
3437 3437                  mss = tcp->tcp_mss;
3438 3438                  snxt_mp = tcp_get_seg_mp(tcp, snxt, &off);
3439 3439  
3440 3440                  while (SEQ_LT(snxt, smax) && (win > 0) && (snxt_mp != NULL)) {
3441 3441                          mblk_t  *xmit_mp;
3442 3442                          mblk_t  *old_snxt_mp = snxt_mp;
3443 3443                          uint32_t cnt = mss;
3444 3444  
3445 3445                          if (win < cnt) {
3446 3446                                  cnt = win;
3447 3447                          }
3448 3448                          if (SEQ_GT(snxt + cnt, smax)) {
3449 3449                                  cnt = smax - snxt;
3450 3450                          }
3451 3451                          xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off,
3452 3452                              &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE);
3453 3453                          if (xmit_mp == NULL)
  
3454 3454                                  return;
3455 3455  
3456 3456                          tcp_send_data(tcp, xmit_mp);
3457 3457  
3458 3458                          snxt += cnt;
3459 3459                          win -= cnt;
3460 3460                          /*
3461 3461                           * Update the send timestamp to avoid false
3462 3462                           * retransmission.
3463 3463                           */
3464      -                        old_snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
     3464 +                        old_snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
3465 3465                          TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3466 3466                          TCPS_UPDATE_MIB(tcps, tcpRetransBytes, cnt);
3467 3467  
3468 3468                          tcp->tcp_rexmit_nxt = snxt;
3469 3469                  }
3470 3470                  /*
3471 3471                   * If we have transmitted all we have at the time
3472 3472                   * we started the retransmission, we can leave
3473 3473                   * the rest of the job to tcp_wput_data().  But we
3474 3474                   * need to check the send window first.  If the
3475 3475                   * win is not 0, go on with tcp_wput_data().
3476 3476                   */
3477 3477                  if (SEQ_LT(snxt, smax) || win == 0) {
3478 3478                          return;
3479 3479                  }
3480 3480          }
3481 3481          /* Only call tcp_wput_data() if there is data to be sent. */
3482 3482          if (tcp->tcp_unsent) {
3483 3483                  tcp_wput_data(tcp, NULL, B_FALSE);
3484 3484          }
3485 3485  }
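
Editor's note: both retransmit paths above now stamp the mblk's b_prev with gethrtime() rather than ddi_get_lbolt(), so the stored send time is monotonic nanoseconds instead of clock ticks. A minimal sketch of the round-trip computation this enables on the ACK side; everything below is hypothetical stand-in code (the real sample is taken in the input path, which is not part of this file), and it assumes a pointer is wide enough to hold the 64-bit time, as on LP64 kernels:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

typedef struct mblk { struct mblk *b_prev; } mblk_t;	/* stand-in */
typedef int64_t hrtime_t;

/* Stand-in for the kernel's gethrtime(): monotonic nanoseconds. */
static hrtime_t
gethrtime_sketch(void)
{
	struct timespec ts;
	(void) clock_gettime(CLOCK_MONOTONIC, &ts);
	return ((hrtime_t)ts.tv_sec * 1000000000LL + ts.tv_nsec);
}

int
main(void)
{
	mblk_t mp;

	/* Send side: stash the nanosecond send time in b_prev. */
	mp.b_prev = (mblk_t *)(intptr_t)gethrtime_sketch();

	/* ACK side: the RTT sample is a simple subtraction, in ns. */
	hrtime_t rtt = gethrtime_sketch() - (hrtime_t)(intptr_t)mp.b_prev;
	printf("rtt sample: %lld ns\n", (long long)rtt);
	return (0);
}

Storing nanoseconds directly is what frees the RTT estimator from the tick-granularity floor of lbolt, per the change this webrev implements.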
3486 3486  
3487 3487  /*
3488 3488   * Do slow start retransmission after ICMP errors or PMTU changes.
3489 3489   */
3490 3490  void
3491 3491  tcp_rexmit_after_error(tcp_t *tcp)
3492 3492  {
3493 3493          /*
3494 3494           * If all sent data has been acknowledged or there is no data left
3495 3495           * to send, just return.
3496 3496           */
3497 3497          if (!SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) ||
3498 3498              (tcp->tcp_xmit_head == NULL))
3499 3499                  return;
3500 3500  
3501 3501          if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && (tcp->tcp_unsent == 0))
3502 3502                  tcp->tcp_rexmit_max = tcp->tcp_fss;
3503 3503          else
3504 3504                  tcp->tcp_rexmit_max = tcp->tcp_snxt;
3505 3505  
3506 3506          tcp->tcp_rexmit_nxt = tcp->tcp_suna;
3507 3507          tcp->tcp_rexmit = B_TRUE;
3508 3508          tcp->tcp_dupack_cnt = 0;
3509 3509          tcp_ss_rexmit(tcp);
3510 3510  }
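
Editor's note: tcp_rexmit_after_error() only rewinds the bookkeeping: tcp_rexmit_nxt drops back to tcp_suna and tcp_rexmit_max caps the range, after which tcp_ss_rexmit() walks that range one clamped segment at a time. A toy sketch of the bounding arithmetic, with made-up numbers:

#include <stdint.h>
#include <stdio.h>

#define	SEQ_LT(a, b)	((int32_t)((a) - (b)) < 0)
#define	SEQ_GT(a, b)	((int32_t)((a) - (b)) > 0)

int
main(void)
{
	/* Illustrative sequence numbers, not taken from a real connection. */
	uint32_t suna = 1000, snxt = 5000, mss = 1460;
	uint32_t rexmit_nxt = suna;	/* rewind to the oldest unacked byte */
	uint32_t rexmit_max = snxt;	/* retransmit no further than this */

	while (SEQ_LT(rexmit_nxt, rexmit_max)) {
		uint32_t cnt = mss;

		/* Clamp the final segment at rexmit_max, as the kernel does. */
		if (SEQ_GT(rexmit_nxt + cnt, rexmit_max))
			cnt = rexmit_max - rexmit_nxt;
		printf("retransmit [%u, %u)\n", rexmit_nxt, rexmit_nxt + cnt);
		rexmit_nxt += cnt;
	}
	return (0);
}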
3511 3511  
3512 3512  /*
3513 3513   * tcp_get_seg_mp() is called to get the pointer to a segment in the
3514 3514   * send queue which starts at the given sequence number. If the given
3515 3515   * sequence number is equal to the last valid sequence number (tcp_snxt),
3516 3516   * the returned mblk is the last valid mblk, and off is set to the length
3517 3517   * of that mblk.
3518 3518   *
3521 3521   * Parameters:
3522 3522   *      tcp_t *tcp: the tcp instance pointer.
3523 3523   *      uint32_t seq: the starting seq. no of the requested segment.
3524 3524   *      int32_t *off: after the execution, *off will be the offset to
3525 3525   *              the returned mblk which points to the requested seq no.
3526 3526   *              It is the caller's responsibility to send in a non-null off.
3527 3527   *
3528 3528   * Return:
3529 3529   *      A mblk_t pointer pointing to the requested segment in send queue.
3530 3530   */
3531 3531  static mblk_t *
3532 3532  tcp_get_seg_mp(tcp_t *tcp, uint32_t seq, int32_t *off)
3533 3533  {
3534 3534          int32_t cnt;
3535 3535          mblk_t  *mp;
3536 3536  
3537 3537          /* Defensive coding.  Make sure we don't send incorrect data. */
3538 3538          if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GT(seq, tcp->tcp_snxt))
3539 3539                  return (NULL);
3540 3540  
3541 3541          cnt = seq - tcp->tcp_suna;
3542 3542          mp = tcp->tcp_xmit_head;
3543 3543          while (cnt > 0 && mp != NULL) {
3544 3544                  cnt -= mp->b_wptr - mp->b_rptr;
3545 3545                  if (cnt <= 0) {
3546 3546                          cnt += mp->b_wptr - mp->b_rptr;
3547 3547                          break;
3548 3548                  }
3549 3549                  mp = mp->b_cont;
3550 3550          }
3551 3551          ASSERT(mp != NULL);
3552 3552          *off = cnt;
3553 3553          return (mp);
3554 3554  }
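
Editor's note: the loop in tcp_get_seg_mp() is the usual "walk a chained buffer until the running count goes non-positive" idiom. A self-contained user-space analogue, using a hypothetical blk_t stand-in instead of the STREAMS mblk_t:

#include <stdio.h>

/* Minimal stand-in for an mblk chain: each block holds some bytes. */
typedef struct blk {
	struct blk	*b_cont;
	int		b_len;		/* analogue of b_wptr - b_rptr */
} blk_t;

/* Find the block containing byte 'seq' past the chain start; set *off. */
static blk_t *
get_seg(blk_t *head, int seq, int *off)
{
	int cnt = seq;
	blk_t *mp = head;

	while (cnt > 0 && mp != NULL) {
		cnt -= mp->b_len;
		if (cnt <= 0) {
			cnt += mp->b_len;	/* offset within this block */
			break;
		}
		mp = mp->b_cont;
	}
	*off = cnt;
	return (mp);
}

int
main(void)
{
	blk_t c = { NULL, 300 }, b = { &c, 200 }, a = { &b, 100 };
	int off;
	blk_t *mp = get_seg(&a, 250, &off);

	/* Byte 250 lands 150 bytes into the second (200-byte) block. */
	printf("block len %d, off %d\n", mp->b_len, off);
	return (0);
}

Note the same edge case as the kernel code: when cnt first goes non-positive, adding the block length back yields the offset of the requested byte within that block.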
3555 3555  
3556 3556  /*
3557 3557   * This routine adjusts next-to-send sequence number variables, in the
3558 3558   * case where the receiver has shrunk its window.
3559 3559   */
3560 3560  void
3561 3561  tcp_update_xmit_tail(tcp_t *tcp, uint32_t snxt)
3562 3562  {
3563 3563          mblk_t *xmit_tail;
3564 3564          int32_t offset;
3565 3565  
3566 3566          tcp->tcp_snxt = snxt;
3567 3567  
3568 3568          /* Get the mblk, and the offset in it, as per the shrunk window */
3569 3569          xmit_tail = tcp_get_seg_mp(tcp, snxt, &offset);
3570 3570          ASSERT(xmit_tail != NULL);
3571 3571          tcp->tcp_xmit_tail = xmit_tail;
3572 3572          tcp->tcp_xmit_tail_unsent = xmit_tail->b_wptr -
3573 3573              xmit_tail->b_rptr - offset;
3574 3574  }
3575 3575  
3576 3576  /*
3577 3577   * This handles the case when the receiver has shrunk its window. Per RFC 1122,
3578 3578   * if the receiver shrinks the window, i.e. moves the right window edge to the
3579 3579   * left, we should not send new data, but should retransmit normally the
3580 3580   * old unacked data between suna and suna + swnd. We might have sent data
3581 3581   * that is now outside the new window; pretend that we didn't send it.
3582 3582   */
3583 3583  static void
3584 3584  tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_count)
3585 3585  {
3586 3586          uint32_t        snxt = tcp->tcp_snxt;
3587 3587  
3588 3588          ASSERT(shrunk_count > 0);
3589 3589  
3590 3590          if (!tcp->tcp_is_wnd_shrnk) {
3591 3591                  tcp->tcp_snxt_shrunk = snxt;
3592 3592                  tcp->tcp_is_wnd_shrnk = B_TRUE;
3593 3593          } else if (SEQ_GT(snxt, tcp->tcp_snxt_shrunk)) {
3594 3594                  tcp->tcp_snxt_shrunk = snxt;
3595 3595          }
3596 3596  
3597 3597          /* Pretend we didn't send the data outside the window */
3598 3598          snxt -= shrunk_count;
3599 3599  
3600 3600          /* Reset all the values per the now shrunk window */
3601 3601          tcp_update_xmit_tail(tcp, snxt);
3602 3602          tcp->tcp_unsent += shrunk_count;
3603 3603  
3604 3604          /*
3605 3605           * If the SACK option is set, delete the entire list of
3606 3606           * notsack'ed blocks.
3607 3607           */
3608 3608          TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp);
3609 3609  
3610 3610          if (tcp->tcp_suna == tcp->tcp_snxt && tcp->tcp_swnd == 0)
3611 3611                  /*
3612 3612                   * Make sure the timer is running so that we will probe a zero
3613 3613                   * window.
  
3614 3614                   */
3615 3615                  TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
3616 3616  }
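
Editor's note: the arithmetic in tcp_process_shrunk_swnd() amounts to rewinding tcp_snxt by the number of bytes that now fall outside the shrunk window and returning them to the unsent count. A toy walk-through with made-up numbers (nothing below comes from the kernel sources):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* Illustrative: we had sent up to snxt when the peer shrank swnd. */
	uint32_t suna = 1000;		/* oldest unacked byte */
	uint32_t snxt = 4000;		/* next to send; 3000 bytes in flight */
	uint32_t new_swnd = 2000;	/* shrunk window offered by the peer */

	/* Bytes already sent that the shrunk window no longer covers. */
	uint32_t shrunk_count = (snxt - suna) - new_swnd;	/* 1000 */

	/* Pretend those bytes were never sent: rewind snxt, grow unsent. */
	snxt -= shrunk_count;
	printf("rewound snxt to %u; %u bytes back on the unsent queue\n",
	    snxt, shrunk_count);
	return (0);
}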
3617 3617  
3618 3618  /*
3619 3619   * tcp_fill_header is called by tcp_send() to fill the outgoing TCP header
3620 3620   * with the template header, as well as other options such as time-stamp,
3621 3621   * ECN and/or SACK.
3622 3622   */
3623 3623  static void
3624      -tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk)
     3624 +tcp_fill_header(tcp_t *tcp, uchar_t *rptr, int num_sack_blk)
3625 3625  {
3626 3626          tcpha_t *tcp_tmpl, *tcpha;
3627 3627          uint32_t *dst, *src;
3628 3628          int hdrlen;
3629 3629          conn_t *connp = tcp->tcp_connp;
3630 3630  
3631 3631          ASSERT(OK_32PTR(rptr));
3632 3632  
3633 3633          /* Template header */
3634 3634          tcp_tmpl = tcp->tcp_tcpha;
3635 3635  
  
3636 3636          /* Header of outgoing packet */
3637 3637          tcpha = (tcpha_t *)(rptr + connp->conn_ixa->ixa_ip_hdr_length);
3638 3638  
3639 3639          /* dst and src are opaque 32-bit fields, used for copying */
3640 3640          dst = (uint32_t *)rptr;
3641 3641          src = (uint32_t *)connp->conn_ht_iphc;
3642 3642          hdrlen = connp->conn_ht_iphc_len;
3643 3643  
3644 3644          /* Fill time-stamp option if needed */
3645 3645          if (tcp->tcp_snd_ts_ok) {
3646      -                U32_TO_BE32((uint32_t)now,
     3646 +                U32_TO_BE32(LBOLT_FASTPATH,
3647 3647                      (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4);
3648 3648                  U32_TO_BE32(tcp->tcp_ts_recent,
3649 3649                      (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8);
3650 3650          } else {
3651 3651                  ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
3652 3652          }
3653 3653  
3654 3654          /*
3655 3655           * Copy the template header; is this really more efficient than
3656 3656           * calling bcopy()?  For simple IPv4/TCP, it may be the case,
3657 3657           * but perhaps not for other scenarios.
3658 3658           */
3659 3659          dst[0] = src[0];
3660 3660          dst[1] = src[1];
3661 3661          dst[2] = src[2];
3662 3662          dst[3] = src[3];
3663 3663          dst[4] = src[4];
3664 3664          dst[5] = src[5];
3665 3665          dst[6] = src[6];
3666 3666          dst[7] = src[7];
3667 3667          dst[8] = src[8];
3668 3668          dst[9] = src[9];
3669 3669          if (hdrlen -= 40) {
3670 3670                  hdrlen >>= 2;
3671 3671                  dst += 10;
3672 3672                  src += 10;
3673 3673                  do {
3674 3674                          *dst++ = *src++;
3675 3675                  } while (--hdrlen);
3676 3676          }
3677 3677  
3678 3678          /*
3679 3679           * Set the ECN info in the TCP header if it is not a zero
3680 3680           * window probe.  Zero window probe is only sent in
3681 3681           * tcp_wput_data() and tcp_timer().
3682 3682           */
3683 3683          if (tcp->tcp_ecn_ok && !tcp->tcp_zero_win_probe) {
3684 3684                  TCP_SET_ECT(tcp, rptr);
3685 3685  
3686 3686                  if (tcp->tcp_ecn_echo_on)
3687 3687                          tcpha->tha_flags |= TH_ECE;
3688 3688                  if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) {
3689 3689                          tcpha->tha_flags |= TH_CWR;
3690 3690                          tcp->tcp_ecn_cwr_sent = B_TRUE;
3691 3691                  }
3692 3692          }
3693 3693  
3694 3694          /* Fill in SACK options */
3695 3695          if (num_sack_blk > 0) {
3696 3696                  uchar_t *wptr = rptr + connp->conn_ht_iphc_len;
3697 3697                  sack_blk_t *tmp;
3698 3698                  int32_t i;
3699 3699  
3700 3700                  wptr[0] = TCPOPT_NOP;
3701 3701                  wptr[1] = TCPOPT_NOP;
3702 3702                  wptr[2] = TCPOPT_SACK;
3703 3703                  wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk *
3704 3704                      sizeof (sack_blk_t);
3705 3705                  wptr += TCPOPT_REAL_SACK_LEN;
3706 3706  
3707 3707                  tmp = tcp->tcp_sack_list;
3708 3708                  for (i = 0; i < num_sack_blk; i++) {
3709 3709                          U32_TO_BE32(tmp[i].begin, wptr);
3710 3710                          wptr += sizeof (tcp_seq);
3711 3711                          U32_TO_BE32(tmp[i].end, wptr);
3712 3712                          wptr += sizeof (tcp_seq);
3713 3713                  }
3714 3714                  tcpha->tha_offset_and_reserved +=
3715 3715                      ((num_sack_blk * 2 + 1) << 4);
3716 3716          }
3717 3717  }
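
Editor's note: the SACK branch above emits the canonical RFC 2018 wire layout: two NOPs for alignment, kind 5, a length of two bytes plus eight per block, then big-endian begin/end pairs. A self-contained sketch of the same encoding; the helper names are illustrative, and only the option kinds and lengths are standard:

#include <stdint.h>
#include <stdio.h>

#define	TCPOPT_NOP	1
#define	TCPOPT_SACK	5

/* Store a 32-bit value in network (big-endian) byte order. */
static void
put32(uint8_t *p, uint32_t v)
{
	p[0] = v >> 24; p[1] = v >> 16; p[2] = v >> 8; p[3] = v;
}

/* Emit one SACK option for n blocks (begin/end pairs); return its length. */
static int
fill_sack(uint8_t *buf, const uint32_t *blks, int n)
{
	uint8_t *wptr = buf;
	int i;

	*wptr++ = TCPOPT_NOP;	/* two NOPs keep the option 4-byte aligned */
	*wptr++ = TCPOPT_NOP;
	*wptr++ = TCPOPT_SACK;
	*wptr++ = 2 + n * 8;	/* kind + len bytes, plus 8 per block */
	for (i = 0; i < 2 * n; i++) {
		put32(wptr, blks[i]);
		wptr += 4;
	}
	return (wptr - buf);
}

int
main(void)
{
	uint8_t buf[40];
	uint32_t one_block[] = { 1000, 2460 };	/* begin, end */
	int len = fill_sack(buf, one_block, 1);

	printf("%d bytes on the wire; kind=%u len=%u\n", len, buf[2], buf[3]);
	return (0);
}

With one block the option occupies 12 bytes including the leading NOPs, which is why the data offset above grows by (num_sack_blk * 2 + 1) 32-bit words.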
  