4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2014 by Delphix. All rights reserved.
25 */
26
27 /* This file contains all TCP output processing functions. */
28
29 #include <sys/types.h>
30 #include <sys/stream.h>
31 #include <sys/strsun.h>
32 #include <sys/strsubr.h>
33 #include <sys/stropts.h>
34 #include <sys/strlog.h>
35 #define _SUN_TPI_VERSION 2
36 #include <sys/tihdr.h>
37 #include <sys/suntpi.h>
38 #include <sys/xti_inet.h>
39 #include <sys/timod.h>
40 #include <sys/pattr.h>
41 #include <sys/squeue_impl.h>
42 #include <sys/squeue.h>
43 #include <sys/sockio.h>
44 #include <sys/tsol/tnet.h>
45
46 #include <inet/common.h>
47 #include <inet/ip.h>
48 #include <inet/tcp.h>
49 #include <inet/tcp_impl.h>
50 #include <inet/snmpcom.h>
51 #include <inet/proto_set.h>
52 #include <inet/ipsec_impl.h>
53 #include <inet/ip_ndp.h>
54
55 static mblk_t *tcp_get_seg_mp(tcp_t *, uint32_t, int32_t *);
56 static void tcp_wput_cmdblk(queue_t *, mblk_t *);
57 static void tcp_wput_flush(tcp_t *, mblk_t *);
58 static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
59 static int tcp_xmit_end(tcp_t *);
60 static int tcp_send(tcp_t *, const int, const int, const int,
61 const int, int *, uint_t *, int *, mblk_t **, mblk_t *);
62 static void tcp_xmit_early_reset(char *, mblk_t *, uint32_t, uint32_t,
63 int, ip_recv_attr_t *, ip_stack_t *, conn_t *);
64 static boolean_t tcp_send_rst_chk(tcp_stack_t *);
65 static void tcp_process_shrunk_swnd(tcp_t *, uint32_t);
66 static void tcp_fill_header(tcp_t *, uchar_t *, clock_t, int);
67
68 /*
69 * Functions called directly via squeue having a prototype of edesc_t.
70 */
71 static void tcp_wput_nondata(void *, mblk_t *, void *, ip_recv_attr_t *);
72 static void tcp_wput_ioctl(void *, mblk_t *, void *, ip_recv_attr_t *);
73 static void tcp_wput_proto(void *, mblk_t *, void *, ip_recv_attr_t *);
74
75 /*
76 * This controls how tiny a write must be before we try to copy it
77 * into the mblk on the tail of the transmit queue. Not much
78 * speedup is observed for values larger than sixteen. Zero will
79 * disable the optimisation.
80 */
81 static int tcp_tx_pull_len = 16;
82
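/*
 * Illustrative sketch, not part of this file: roughly how a threshold
 * like tcp_tx_pull_len is applied.  A write shorter than the threshold
 * is copied into the spare space at the end of the last mblk already on
 * the transmit queue instead of being linked on as a new mblk, keeping
 * the chain short.  The helper name tcp_try_tail_copy() is hypothetical;
 * the mblk fields (b_wptr, b_datap) are standard STREAMS fields.
 */
static boolean_t
tcp_try_tail_copy(tcp_t *tcp, const uchar_t *buf, size_t len)
{
	mblk_t *tail = tcp->tcp_xmit_last;

	if (len == 0 || len > (size_t)tcp_tx_pull_len || tail == NULL)
		return (B_FALSE);

	/* Never scribble on a dblk that someone else also references. */
	if (tail->b_datap->db_ref != 1)
		return (B_FALSE);

	/* Only copy if the tail mblk has enough unused space left. */
	if ((size_t)(tail->b_datap->db_lim - tail->b_wptr) < len)
		return (B_FALSE);

	bcopy(buf, tail->b_wptr, len);
	tail->b_wptr += len;
	return (B_TRUE);
}
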
83 int
84 tcp_wput(queue_t *q, mblk_t *mp)
85 {
86 conn_t *connp = Q_TO_CONN(q);
437 return;
438 }
439
440 /* usable = MIN(swnd, cwnd) - unacked_bytes */
441 if (tcp->tcp_swnd > tcp->tcp_cwnd)
442 usable_r -= tcp->tcp_swnd - tcp->tcp_cwnd;
443
444 /* usable = MIN(usable, unsent) */
445 if (usable_r > len)
446 usable_r = len;
447
448 /* usable = MAX(usable, {1 for urgent, 0 for data}) */
449 if (usable_r > 0) {
450 usable = usable_r;
451 } else {
452 /* Bypass all other unnecessary processing. */
453 goto done;
454 }
455 }
456
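/*
 * Illustrative sketch, not part of this file: the send-window arithmetic
 * from the comments above collected in one place.  At most MIN(swnd,
 * cwnd) bytes may be outstanding; data already in flight (snxt - suna)
 * is subtracted, and the result is clipped to the amount of unsent data.
 * Field names mirror the tcp_t members used above; the helper name is
 * hypothetical.
 */
static int
tcp_usable_send_window(const tcp_t *tcp, int unsent)
{
	int64_t window = tcp->tcp_swnd;
	int64_t usable;

	if (window > tcp->tcp_cwnd)
		window = tcp->tcp_cwnd;	/* cwnd is the tighter limit */

	/* Subtract data already sent but not yet acknowledged. */
	usable = window - (int64_t)(tcp->tcp_snxt - tcp->tcp_suna);

	if (usable > unsent)
		usable = unsent;	/* cannot send more than is queued */
	if (usable < 0)
		usable = 0;	/* a window may shrink below what is in flight */
	return ((int)usable);
}
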
457 local_time = (mblk_t *)now;
458
459 /*
460 * "Our" Nagle Algorithm. This is not the same as in the old
461 * BSD. This is more in line with the true intent of Nagle.
462 *
463 * The conditions are:
464 * 1. The amount of unsent data (or amount of data which can be
465 * sent, whichever is smaller) is less than Nagle limit.
466 * 2. The last sent size is also less than Nagle limit.
467 * 3. There is unack'ed data.
468 * 4. Urgent pointer is not set. Send urgent data ignoring the
469 * Nagle algorithm. This reduces the probability that urgent
470 * bytes get "merged" together.
471 * 5. The app has not closed the connection. This eliminates the
472 * wait time of the receiving side waiting for the last piece of
473 * (small) data.
474 *
475 * If all are satisfied, exit without sending anything. Note
476 * that Nagle limit can be smaller than 1 MSS. Nagle limit is
477 * the smaller of 1 MSS and global tcp_naglim_def (default to be
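
/*
 * Illustrative sketch, not part of this file: the five Nagle conditions
 * listed above expressed as a single predicate.  When it returns B_TRUE
 * the small segment is held back until the outstanding data has been
 * acknowledged.  The helper name is hypothetical; tcp_naglim is assumed
 * to already hold the smaller of 1 MSS and tcp_naglim_def, and the
 * urgent/app_closed flags stand in for the corresponding state checks.
 */
static boolean_t
tcp_nagle_defer(const tcp_t *tcp, int sendable, boolean_t urgent,
    boolean_t app_closed)
{
	if (sendable >= (int)tcp->tcp_naglim)		/* condition 1 */
		return (B_FALSE);
	if (tcp->tcp_last_sent_len >= tcp->tcp_naglim)	/* condition 2 */
		return (B_FALSE);
	if (tcp->tcp_suna == tcp->tcp_snxt)	/* condition 3: nothing unacked */
		return (B_FALSE);
	if (urgent)					/* condition 4 */
		return (B_FALSE);
	if (app_closed)					/* condition 5 */
		return (B_FALSE);
	return (B_TRUE);				/* defer this small send */
}
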
1166 tcp->tcp_xmit_head = mp;
1167 } else {
1168 tcp->tcp_xmit_last->b_cont = mp;
1169 }
1170 tcp->tcp_xmit_last = mp;
1171 tcp->tcp_xmit_tail = mp;
1172
1173 /* find out how much we can send */
1174 /* BEGIN CSTYLED */
1175 /*
1176 * un-acked usable
1177 * |--------------|-----------------|
1178 * tcp_suna tcp_snxt tcp_suna+tcp_swnd
1179 */
1180 /* END CSTYLED */
1181
1182 /* start sending from tcp_snxt */
1183 snxt = tcp->tcp_snxt;
1184
1185 /*
1186 * Check to see if this connection has been idle for some
1187 * time and no ACK is expected. If so, we need to slow
1188 * start again to get back the connection's "self-clock" as
1189 * described in VJ's paper.
1190 *
1191 * Reinitialize tcp_cwnd after idle.
1192 */
1193 now = LBOLT_FASTPATH;
1194 if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
1195 (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
1196 TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
1197 }
1198
1199 usable = tcp->tcp_swnd; /* tcp window size */
1200 if (usable > tcp->tcp_cwnd)
1201 usable = tcp->tcp_cwnd; /* congestion window smaller */
1202 usable -= snxt; /* subtract stuff already sent */
1203 suna = tcp->tcp_suna;
1204 usable += suna;
1205 /* usable can be < 0 if the congestion window is smaller */
1206 if (len > usable) {
1207 /* Can't send complete M_DATA in one shot */
1208 goto slow;
1209 }
1210
1211 mutex_enter(&tcp->tcp_non_sq_lock);
1239 return;
1240 }
1241
1242 /*
1243 * len <= tcp->tcp_mss && len == unsent so no sender silly window. Can
1244 * send now.
1245 */
1246
1247 if (snxt == suna) {
1248 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
1249 }
1250
1251 /* we have always sent something */
1252 tcp->tcp_rack_cnt = 0;
1253
1254 tcp->tcp_snxt = snxt + len;
1255 tcp->tcp_rack = tcp->tcp_rnxt;
1256
1257 if ((mp1 = dupb(mp)) == 0)
1258 goto no_memory;
1259 mp->b_prev = (mblk_t *)(uintptr_t)now;
1260 mp->b_next = (mblk_t *)(uintptr_t)snxt;
1261
1262 /* adjust tcp header information */
1263 tcpha = tcp->tcp_tcpha;
1264 tcpha->tha_flags = (TH_ACK|TH_PUSH);
1265
1266 sum = len + connp->conn_ht_ulp_len + connp->conn_sum;
1267 sum = (sum >> 16) + (sum & 0xFFFF);
1268 tcpha->tha_sum = htons(sum);
1269
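/*
 * Illustrative sketch, not part of this file: the checksum lines above
 * seed tha_sum with a partial one's-complement sum (the payload length
 * plus the precomputed pseudo-header/ULP sum in conn_sum) and fold the
 * carry out of the high 16 bits back in once.  The value stored here is
 * a partial sum that later checksum processing (software or hardware
 * offload) completes; the fully general fold used when finishing a
 * checksum loops until no carry remains, as in this hypothetical helper.
 */
static uint16_t
csum_fold(uint32_t sum)
{
	/* Fold a 32-bit one's-complement accumulator down to 16 bits. */
	while ((sum >> 16) != 0)
		sum = (sum & 0xFFFF) + (sum >> 16);
	return ((uint16_t)sum);
}
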
1270 tcpha->tha_seq = htonl(snxt);
1271
1272 TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
1273 TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
1274 BUMP_LOCAL(tcp->tcp_obsegs);
1275
1276 /* Update the latest receive window size in TCP header. */
1277 tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
1278
1279 tcp->tcp_last_sent_len = (ushort_t)len;
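
/*
 * Illustrative sketch, not part of this file: what the tha_win update
 * above encodes.  The TCP header's window field is only 16 bits wide,
 * so with the window scale option (RFC 7323) the advertised value is
 * the receive window shifted right by the negotiated shift count
 * (tcp_rcv_ws), and the peer shifts it back left.  The clamp below is a
 * defensive addition in the sketch; the line above does not clamp.
 */
static uint16_t
tcp_advertised_window(uint32_t rwnd, uint32_t rcv_ws)
{
	uint32_t win = rwnd >> rcv_ws;

	if (win > 65535)
		win = 65535;	/* the 16-bit field can hold no more */
	return ((uint16_t)win);
}
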
1294 rptr = mp1->b_rptr - hdrlen;
1295 db = mp1->b_datap;
1296 if ((db->db_ref != 2) || rptr < db->db_base ||
1297 (!OK_32PTR(rptr))) {
1298 /* NOTE: we assume allocb returns an OK_32PTR */
1299 mp = allocb(hdrlen + tcps->tcps_wroff_xtra, BPRI_MED);
1300 if (!mp) {
1301 freemsg(mp1);
1302 goto no_memory;
1303 }
1304 mp->b_cont = mp1;
1305 mp1 = mp;
1306 /* Leave room for Link Level header */
1307 rptr = &mp1->b_rptr[tcps->tcps_wroff_xtra];
1308 mp1->b_wptr = &rptr[hdrlen];
1309 }
1310 mp1->b_rptr = rptr;
1311
1312 /* Fill in the timestamp option. */
1313 if (tcp->tcp_snd_ts_ok) {
1314 uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
1315
1316 U32_TO_BE32(llbolt,
1317 (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
1318 U32_TO_BE32(tcp->tcp_ts_recent,
1319 (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
1320 } else {
1321 ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
1322 }
1323
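/*
 * Illustrative sketch, not part of this file: the option bytes the two
 * U32_TO_BE32() calls above are patching.  Assuming the common layout
 * of two NOPs followed by the timestamp option (kind 8, length 10)
 * directly after the 20-byte fixed header, TSval lands at offset
 * TCP_MIN_HEADER_LENGTH + 4 and TSecr at TCP_MIN_HEADER_LENGTH + 8,
 * which is exactly where the code writes.  The struct is descriptive
 * only.
 */
typedef struct tcp_ts_option {
	uint8_t		ts_nop1;	/* 0x01, padding for alignment */
	uint8_t		ts_nop2;	/* 0x01 */
	uint8_t		ts_kind;	/* 8: timestamp option */
	uint8_t		ts_len;		/* 10 */
	uint32_t	ts_val;		/* sender's timestamp clock, network order */
	uint32_t	ts_ecr;		/* echo of the peer's TSval, network order */
} tcp_ts_option_t;
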
1324 /* copy header into outgoing packet */
1325 dst = (ipaddr_t *)rptr;
1326 src = (ipaddr_t *)connp->conn_ht_iphc;
1327 dst[0] = src[0];
1328 dst[1] = src[1];
1329 dst[2] = src[2];
1330 dst[3] = src[3];
1331 dst[4] = src[4];
1332 dst[5] = src[5];
1333 dst[6] = src[6];
1334 dst[7] = src[7];
1335 dst[8] = src[8];
1336 dst[9] = src[9];
1337 if (hdrlen -= 40) {
1338 hdrlen >>= 2;
1339 dst += 10;
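
/*
 * Illustrative sketch, not part of this file: the unrolled assignments
 * above copy the first 40 bytes of the template header (a plain IPv4
 * plus TCP header) as ten aligned 32-bit stores, and the "hdrlen -= 40"
 * tail then copies whatever remains (TCP options, or the larger IPv6
 * header) one word at a time.  A generic equivalent of the whole copy
 * is the hypothetical helper below; both rely on the 32-bit alignment
 * that the OK_32PTR() checks above guarantee.
 */
static void
tcp_copy_header_words(uint32_t *dst, const uint32_t *src, int hdrlen)
{
	int words = hdrlen >> 2;	/* header lengths are multiples of 4 */
	int i;

	for (i = 0; i < words; i++)
		dst[i] = src[i];
}
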
1754 /*
1755 * In the off-chance that the eager received and responded to
1756 * some other packet while the SYN|ACK was queued, we recalculate
1757 * the ixa_pktlen. It would be better to fix the SYN/accept
1758 * multithreading scheme to avoid this complexity.
1759 */
1760 ixa->ixa_pktlen = msgdsize(mp);
1761 (void) conn_ip_output(mp, ixa);
1762 }
1763
1764 /*
1765 * tcp_send() is called by tcp_wput_data() and returns one of the following:
1766 *
1767 * -1 = failed allocation.
1768 * 0 = We've either successfully sent data, or our usable send window is too
1769 * small and we'd rather wait until later before sending again.
1770 */
1771 static int
1772 tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len,
1773 const int tcp_hdr_len, const int num_sack_blk, int *usable,
1774 uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time)
1775 {
1776 int num_lso_seg = 1;
1777 uint_t lso_usable;
1778 boolean_t do_lso_send = B_FALSE;
1779 tcp_stack_t *tcps = tcp->tcp_tcps;
1780 conn_t *connp = tcp->tcp_connp;
1781 ip_xmit_attr_t *ixa = connp->conn_ixa;
1782
1783 /*
1784 * Check LSO possibility. The value of tcp->tcp_lso indicates whether
1785 * the underlying connection is LSO capable. Whether there is enough
1786 * available data to initiate an LSO transmission is checked in the
1787 * for(){} loops below.
1788 */
1789 if (tcp->tcp_lso && (tcp->tcp_valid_bits & ~TCP_FSS_VALID) == 0)
1790 do_lso_send = B_TRUE;
1791
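/*
 * Illustrative sketch, not part of this file: the general idea behind
 * do_lso_send and num_lso_seg declared above.  When the connection is
 * LSO capable and more than one MSS of usable window is available, a
 * multiple of the MSS can be handed to the NIC as one large send and
 * segmented in hardware.  The lso_max cap is an assumption standing in
 * for whatever per-interface limit applies; the helper is hypothetical
 * and only shows the shape of the calculation done in the loop below.
 */
static int
tcp_lso_segments(boolean_t do_lso_send, int usable, int mss, int lso_max)
{
	int nseg;

	if (!do_lso_send || usable <= mss)
		return (1);		/* ordinary single-segment send */

	nseg = usable / mss;		/* full segments the window allows */
	if (nseg * mss > lso_max)	/* respect the device's LSO limit */
		nseg = lso_max / mss;
	if (nseg < 1)
		nseg = 1;
	return (nseg);
}
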
1792 for (;;) {
1793 struct datab *db;
1794 tcpha_t *tcpha;
2049
2050 must_alloc:;
2051 mp1 = allocb(connp->conn_ht_iphc_allocated +
2052 tcps->tcps_wroff_xtra, BPRI_MED);
2053 if (mp1 == NULL) {
2054 freemsg(mp);
2055 return (-1); /* out_of_mem */
2056 }
2057 mp1->b_cont = mp;
2058 mp = mp1;
2059 /* Leave room for Link Level header */
2060 len = total_hdr_len;
2061 rptr = &mp->b_rptr[tcps->tcps_wroff_xtra];
2062 mp->b_wptr = &rptr[len];
2063 }
2064
2065 /*
2066 * Fill in the header using the template header, and add
2067 * options such as time-stamp, ECN and/or SACK, as needed.
2068 */
2069 tcp_fill_header(tcp, rptr, (clock_t)local_time, num_sack_blk);
2070
2071 mp->b_rptr = rptr;
2072
2073 if (*tail_unsent) {
2074 int spill = *tail_unsent;
2075
2076 mp1 = mp->b_cont;
2077 if (mp1 == NULL)
2078 mp1 = mp;
2079
2080 /*
2081 * If we're a little short, tack on more mblks until
2082 * there is no more spillover.
2083 */
2084 while (spill < 0) {
2085 mblk_t *nmp;
2086 int nmpsz;
2087
2088 nmp = (*xmit_tail)->b_cont;
2089 nmpsz = MBLKL(nmp);
2267 * so we have to check that and unset it first.
2268 */
2269 if (tcp->tcp_cork)
2270 tcp->tcp_cork = B_FALSE;
2271 tcp_wput_data(tcp, NULL, B_FALSE);
2272 }
2273
2274 /*
2275 * If TCP does not get enough samples of RTT or tcp_rtt_updates
2276 * is 0, don't update the cache.
2277 */
2278 if (tcps->tcps_rtt_updates == 0 ||
2279 tcp->tcp_rtt_update < tcps->tcps_rtt_updates)
2280 return (0);
2281
2282 /*
2283 * We do not have a good algorithm to update ssthresh at this time.
2284 * So don't do any update.
2285 */
2286 bzero(&uinfo, sizeof (uinfo));
2287 uinfo.iulp_rtt = tcp->tcp_rtt_sa;
2288 uinfo.iulp_rtt_sd = tcp->tcp_rtt_sd;
2289
2290 /*
2291 * Note that uinfo is kept for conn_faddr in the DCE. We could update it
2292 * even if the connection is source routed, but we don't.
2293 */
2294 if (connp->conn_ipversion == IPV4_VERSION) {
2295 if (connp->conn_faddr_v4 != tcp->tcp_ipha->ipha_dst) {
2296 return (0);
2297 }
2298 (void) dce_update_uinfo_v4(connp->conn_faddr_v4, &uinfo, ipst);
2299 } else {
2300 uint_t ifindex;
2301
2302 if (!(IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
2303 &tcp->tcp_ip6h->ip6_dst))) {
2304 return (0);
2305 }
2306 ifindex = 0;
2307 if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_faddr_v6)) {
2308 ip_xmit_attr_t *ixa = connp->conn_ixa;
3372 ASSERT(snxt_mp != NULL);
3373 /* This should not happen. Defensive coding again... */
3374 if (snxt_mp == NULL) {
3375 return;
3376 }
3377
3378 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, seg_len, &off,
3379 &tmp_mp, begin, B_TRUE, &seg_len, B_TRUE);
3380 if (xmit_mp == NULL)
3381 return;
3382
3383 usable_swnd -= seg_len;
3384 tcp->tcp_pipe += seg_len;
3385 tcp->tcp_sack_snxt = begin + seg_len;
3386
3387 tcp_send_data(tcp, xmit_mp);
3388
3389 /*
3390 * Update the send timestamp to avoid false retransmission.
3391 */
3392 snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
3393
3394 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3395 TCPS_UPDATE_MIB(tcps, tcpRetransBytes, seg_len);
3396 TCPS_BUMP_MIB(tcps, tcpOutSackRetransSegs);
3397 /*
3398 * Update tcp_rexmit_max to extend this SACK recovery phase.
3399 * This happens when new data sent during fast recovery is
3400 * also lost. If TCP retransmits that new data, it needs
3401 * to extend the SACK recovery phase to avoid starting another
3402 * fast retransmit/recovery unnecessarily.
3403 */
3404 if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) {
3405 tcp->tcp_rexmit_max = tcp->tcp_sack_snxt;
3406 }
3407 }
3408 }
3409
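/*
 * Illustrative sketch, not part of this file: SEQ_GT() and SEQ_LT(),
 * used throughout the retransmission code here, compare 32-bit sequence
 * numbers modulo 2^32 so the comparison stays correct across sequence
 * wraparound.  The conventional definition is the signed-difference
 * test below (shown with EXAMPLE_ names so as not to restate the real
 * macros verbatim).
 */
#define	EXAMPLE_SEQ_GT(a, b)	((int32_t)((a) - (b)) > 0)
#define	EXAMPLE_SEQ_LT(a, b)	((int32_t)((a) - (b)) < 0)
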
3410 /*
3411 * tcp_ss_rexmit() is called to do slow start retransmission after a timeout
3412 * or ICMP errors.
3444
3445 if (win < cnt) {
3446 cnt = win;
3447 }
3448 if (SEQ_GT(snxt + cnt, smax)) {
3449 cnt = smax - snxt;
3450 }
3451 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off,
3452 &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE);
3453 if (xmit_mp == NULL)
3454 return;
3455
3456 tcp_send_data(tcp, xmit_mp);
3457
3458 snxt += cnt;
3459 win -= cnt;
3460 /*
3461 * Update the send timestamp to avoid false
3462 * retransmission.
3463 */
3464 old_snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
3465 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3466 TCPS_UPDATE_MIB(tcps, tcpRetransBytes, cnt);
3467
3468 tcp->tcp_rexmit_nxt = snxt;
3469 }
3470 /*
3471 * If we have transmitted all we have at the time
3472 * we started the retransmission, we can leave
3473 * the rest of the job to tcp_wput_data(). But we
3474 * need to check the send window first. If the
3475 * win is not 0, go on with tcp_wput_data().
3476 */
3477 if (SEQ_LT(snxt, smax) || win == 0) {
3478 return;
3479 }
3480 }
3481 /* Only call tcp_wput_data() if there is data to be sent. */
3482 if (tcp->tcp_unsent) {
3483 tcp_wput_data(tcp, NULL, B_FALSE);
3484 }
3604 /*
3605 * If the SACK option is set, delete the entire list of
3606 * notsack'ed blocks.
3607 */
3608 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp);
3609
3610 if (tcp->tcp_suna == tcp->tcp_snxt && tcp->tcp_swnd == 0)
3611 /*
3612 * Make sure the timer is running so that we will probe a zero
3613 * window.
3614 */
3615 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
3616 }
3617
3618 /*
3619 * tcp_fill_header is called by tcp_send() to fill the outgoing TCP header
3620 * with the template header, as well as other options such as time-stamp,
3621 * ECN and/or SACK.
3622 */
3623 static void
3624 tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk)
3625 {
3626 tcpha_t *tcp_tmpl, *tcpha;
3627 uint32_t *dst, *src;
3628 int hdrlen;
3629 conn_t *connp = tcp->tcp_connp;
3630
3631 ASSERT(OK_32PTR(rptr));
3632
3633 /* Template header */
3634 tcp_tmpl = tcp->tcp_tcpha;
3635
3636 /* Header of outgoing packet */
3637 tcpha = (tcpha_t *)(rptr + connp->conn_ixa->ixa_ip_hdr_length);
3638
3639 /* dst and src are opaque 32-bit fields, used for copying */
3640 dst = (uint32_t *)rptr;
3641 src = (uint32_t *)connp->conn_ht_iphc;
3642 hdrlen = connp->conn_ht_iphc_len;
3643
3644 /* Fill time-stamp option if needed */
3645 if (tcp->tcp_snd_ts_ok) {
3646 U32_TO_BE32((uint32_t)now,
3647 (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4);
3648 U32_TO_BE32(tcp->tcp_ts_recent,
3649 (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8);
3650 } else {
3651 ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
3652 }
3653
3654 /*
3655 * Copy the template header; is this really more efficient than
3656 * calling bcopy()? For simple IPv4/TCP, it may be the case,
3657 * but perhaps not for other scenarios.
3658 */
3659 dst[0] = src[0];
3660 dst[1] = src[1];
3661 dst[2] = src[2];
3662 dst[3] = src[3];
3663 dst[4] = src[4];
3664 dst[5] = src[5];
3665 dst[6] = src[6];
3666 dst[7] = src[7];
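
/*
 * Illustrative sketch, not part of this file: the SACK option that
 * tcp_fill_header() appends when num_sack_blk is non-zero.  Its
 * on-the-wire layout is standard (RFC 2018): padding NOPs for
 * alignment, kind 5, a length of 2 + 8 * nblocks, then begin/end
 * sequence-number pairs describing data this endpoint has already
 * received out of order, so the peer need not retransmit it.  The
 * struct is descriptive only.
 */
typedef struct tcp_sack_option {
	uint8_t		sk_nop1;	/* 0x01, padding */
	uint8_t		sk_nop2;	/* 0x01, padding */
	uint8_t		sk_kind;	/* 5: SACK option */
	uint8_t		sk_len;		/* 2 + 8 * number of blocks */
	struct {
		uint32_t	sk_begin;	/* first sequence number covered */
		uint32_t	sk_end;		/* sequence number after the block */
	} sk_blk[4];		/* at most 3 blocks fit alongside timestamps */
} tcp_sack_option_t;
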
|
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
25 * Copyright 2019 Joyent, Inc.
26 */
27
28 /* This file contains all TCP output processing functions. */
29
30 #include <sys/types.h>
31 #include <sys/stream.h>
32 #include <sys/strsun.h>
33 #include <sys/strsubr.h>
34 #include <sys/stropts.h>
35 #include <sys/strlog.h>
36 #define _SUN_TPI_VERSION 2
37 #include <sys/tihdr.h>
38 #include <sys/suntpi.h>
39 #include <sys/xti_inet.h>
40 #include <sys/timod.h>
41 #include <sys/pattr.h>
42 #include <sys/squeue_impl.h>
43 #include <sys/squeue.h>
44 #include <sys/sockio.h>
45 #include <sys/tsol/tnet.h>
46
47 #include <inet/common.h>
48 #include <inet/ip.h>
49 #include <inet/tcp.h>
50 #include <inet/tcp_impl.h>
51 #include <inet/snmpcom.h>
52 #include <inet/proto_set.h>
53 #include <inet/ipsec_impl.h>
54 #include <inet/ip_ndp.h>
55
56 static mblk_t *tcp_get_seg_mp(tcp_t *, uint32_t, int32_t *);
57 static void tcp_wput_cmdblk(queue_t *, mblk_t *);
58 static void tcp_wput_flush(tcp_t *, mblk_t *);
59 static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
60 static int tcp_xmit_end(tcp_t *);
61 static int tcp_send(tcp_t *, const int, const int, const int,
62 const int, int *, uint32_t *, int *, mblk_t **, mblk_t *);
63 static void tcp_xmit_early_reset(char *, mblk_t *, uint32_t, uint32_t,
64 int, ip_recv_attr_t *, ip_stack_t *, conn_t *);
65 static boolean_t tcp_send_rst_chk(tcp_stack_t *);
66 static void tcp_process_shrunk_swnd(tcp_t *, uint32_t);
67 static void tcp_fill_header(tcp_t *, uchar_t *, int);
68
69 /*
70 * Functions called directly via squeue having a prototype of edesc_t.
71 */
72 static void tcp_wput_nondata(void *, mblk_t *, void *, ip_recv_attr_t *);
73 static void tcp_wput_ioctl(void *, mblk_t *, void *, ip_recv_attr_t *);
74 static void tcp_wput_proto(void *, mblk_t *, void *, ip_recv_attr_t *);
75
76 /*
77 * This controls how tiny a write must be before we try to copy it
78 * into the mblk on the tail of the transmit queue. Not much
79 * speedup is observed for values larger than sixteen. Zero will
80 * disable the optimisation.
81 */
82 static int tcp_tx_pull_len = 16;
83
84 int
85 tcp_wput(queue_t *q, mblk_t *mp)
86 {
87 conn_t *connp = Q_TO_CONN(q);
438 return;
439 }
440
441 /* usable = MIN(swnd, cwnd) - unacked_bytes */
442 if (tcp->tcp_swnd > tcp->tcp_cwnd)
443 usable_r -= tcp->tcp_swnd - tcp->tcp_cwnd;
444
445 /* usable = MIN(usable, unsent) */
446 if (usable_r > len)
447 usable_r = len;
448
449 /* usable = MAX(usable, {1 for urgent, 0 for data}) */
450 if (usable_r > 0) {
451 usable = usable_r;
452 } else {
453 /* Bypass all other unnecessary processing. */
454 goto done;
455 }
456 }
457
458 local_time = (mblk_t *)(intptr_t)gethrtime();
459
460 /*
461 * "Our" Nagle Algorithm. This is not the same as in the old
462 * BSD. This is more in line with the true intent of Nagle.
463 *
464 * The conditions are:
465 * 1. The amount of unsent data (or amount of data which can be
466 * sent, whichever is smaller) is less than Nagle limit.
467 * 2. The last sent size is also less than Nagle limit.
468 * 3. There is unack'ed data.
469 * 4. Urgent pointer is not set. Send urgent data ignoring the
470 * Nagle algorithm. This reduces the probability that urgent
471 * bytes get "merged" together.
472 * 5. The app has not closed the connection. This eliminates the
473 * wait time of the receiving side waiting for the last piece of
474 * (small) data.
475 *
476 * If all are satisfied, exit without sending anything. Note
477 * that Nagle limit can be smaller than 1 MSS. Nagle limit is
478 * the smaller of 1 MSS and global tcp_naglim_def (default to be
1167 tcp->tcp_xmit_head = mp;
1168 } else {
1169 tcp->tcp_xmit_last->b_cont = mp;
1170 }
1171 tcp->tcp_xmit_last = mp;
1172 tcp->tcp_xmit_tail = mp;
1173
1174 /* find out how much we can send */
1175 /* BEGIN CSTYLED */
1176 /*
1177 * un-acked usable
1178 * |--------------|-----------------|
1179 * tcp_suna tcp_snxt tcp_suna+tcp_swnd
1180 */
1181 /* END CSTYLED */
1182
1183 /* start sending from tcp_snxt */
1184 snxt = tcp->tcp_snxt;
1185
1186 /*
1187 * Check to see if this connection has been idle for some time and no
1188 * ACK is expected. If so, then the congestion window size is no longer
1189 * meaningfully tied to current network conditions.
1190 *
1191 * We reinitialize tcp_cwnd, and slow start again to get back the
1192 * connection's "self-clock" as described in Van Jacobson's 1988 paper
1193 * "Congestion avoidance and control".
1194 */
1195 now = LBOLT_FASTPATH;
1196 if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
1197 (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
1198 TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
1199 }
1200
1201 usable = tcp->tcp_swnd; /* tcp window size */
1202 if (usable > tcp->tcp_cwnd)
1203 usable = tcp->tcp_cwnd; /* congestion window smaller */
1204 usable -= snxt; /* subtract stuff already sent */
1205 suna = tcp->tcp_suna;
1206 usable += suna;
1207 /* usable can be < 0 if the congestion window is smaller */
1208 if (len > usable) {
1209 /* Can't send complete M_DATA in one shot */
1210 goto slow;
1211 }
1212
1213 mutex_enter(&tcp->tcp_non_sq_lock);
1241 return;
1242 }
1243
1244 /*
1245 * len <= tcp->tcp_mss && len == unsent so no sender silly window. Can
1246 * send now.
1247 */
1248
1249 if (snxt == suna) {
1250 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
1251 }
1252
1253 /* we have always sent something */
1254 tcp->tcp_rack_cnt = 0;
1255
1256 tcp->tcp_snxt = snxt + len;
1257 tcp->tcp_rack = tcp->tcp_rnxt;
1258
1259 if ((mp1 = dupb(mp)) == 0)
1260 goto no_memory;
1261 mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
1262 mp->b_next = (mblk_t *)(uintptr_t)snxt;
1263
1264 /* adjust tcp header information */
1265 tcpha = tcp->tcp_tcpha;
1266 tcpha->tha_flags = (TH_ACK|TH_PUSH);
1267
1268 sum = len + connp->conn_ht_ulp_len + connp->conn_sum;
1269 sum = (sum >> 16) + (sum & 0xFFFF);
1270 tcpha->tha_sum = htons(sum);
1271
1272 tcpha->tha_seq = htonl(snxt);
1273
1274 TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
1275 TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
1276 BUMP_LOCAL(tcp->tcp_obsegs);
1277
1278 /* Update the latest receive window size in TCP header. */
1279 tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
1280
1281 tcp->tcp_last_sent_len = (ushort_t)len;
1296 rptr = mp1->b_rptr - hdrlen;
1297 db = mp1->b_datap;
1298 if ((db->db_ref != 2) || rptr < db->db_base ||
1299 (!OK_32PTR(rptr))) {
1300 /* NOTE: we assume allocb returns an OK_32PTR */
1301 mp = allocb(hdrlen + tcps->tcps_wroff_xtra, BPRI_MED);
1302 if (!mp) {
1303 freemsg(mp1);
1304 goto no_memory;
1305 }
1306 mp->b_cont = mp1;
1307 mp1 = mp;
1308 /* Leave room for Link Level header */
1309 rptr = &mp1->b_rptr[tcps->tcps_wroff_xtra];
1310 mp1->b_wptr = &rptr[hdrlen];
1311 }
1312 mp1->b_rptr = rptr;
1313
1314 /* Fill in the timestamp option. */
1315 if (tcp->tcp_snd_ts_ok) {
1316 U32_TO_BE32(now,
1317 (char *)tcpha + TCP_MIN_HEADER_LENGTH + 4);
1318 U32_TO_BE32(tcp->tcp_ts_recent,
1319 (char *)tcpha + TCP_MIN_HEADER_LENGTH + 8);
1320 } else {
1321 ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
1322 }
1323
1324 /* copy header into outgoing packet */
1325 dst = (ipaddr_t *)rptr;
1326 src = (ipaddr_t *)connp->conn_ht_iphc;
1327 dst[0] = src[0];
1328 dst[1] = src[1];
1329 dst[2] = src[2];
1330 dst[3] = src[3];
1331 dst[4] = src[4];
1332 dst[5] = src[5];
1333 dst[6] = src[6];
1334 dst[7] = src[7];
1335 dst[8] = src[8];
1336 dst[9] = src[9];
1337 if (hdrlen -= 40) {
1338 hdrlen >>= 2;
1339 dst += 10;
1754 /*
1755 * In the off-chance that the eager received and responded to
1756 * some other packet while the SYN|ACK was queued, we recalculate
1757 * the ixa_pktlen. It would be better to fix the SYN/accept
1758 * multithreading scheme to avoid this complexity.
1759 */
1760 ixa->ixa_pktlen = msgdsize(mp);
1761 (void) conn_ip_output(mp, ixa);
1762 }
1763
1764 /*
1765 * tcp_send() is called by tcp_wput_data() and returns one of the following:
1766 *
1767 * -1 = failed allocation.
1768 * 0 = We've either successfully sent data, or our usable send window is too
1769 * small and we'd rather wait until later before sending again.
1770 */
1771 static int
1772 tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len,
1773 const int tcp_hdr_len, const int num_sack_blk, int *usable,
1774 uint32_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time)
1775 {
1776 int num_lso_seg = 1;
1777 uint_t lso_usable;
1778 boolean_t do_lso_send = B_FALSE;
1779 tcp_stack_t *tcps = tcp->tcp_tcps;
1780 conn_t *connp = tcp->tcp_connp;
1781 ip_xmit_attr_t *ixa = connp->conn_ixa;
1782
1783 /*
1784 * Check LSO possibility. The value of tcp->tcp_lso indicates whether
1785 * the underlying connection is LSO capable. Whether there is enough
1786 * available data to initiate an LSO transmission is checked in the
1787 * for(){} loops below.
1788 */
1789 if (tcp->tcp_lso && (tcp->tcp_valid_bits & ~TCP_FSS_VALID) == 0)
1790 do_lso_send = B_TRUE;
1791
1792 for (;;) {
1793 struct datab *db;
1794 tcpha_t *tcpha;
2049
2050 must_alloc:;
2051 mp1 = allocb(connp->conn_ht_iphc_allocated +
2052 tcps->tcps_wroff_xtra, BPRI_MED);
2053 if (mp1 == NULL) {
2054 freemsg(mp);
2055 return (-1); /* out_of_mem */
2056 }
2057 mp1->b_cont = mp;
2058 mp = mp1;
2059 /* Leave room for Link Level header */
2060 len = total_hdr_len;
2061 rptr = &mp->b_rptr[tcps->tcps_wroff_xtra];
2062 mp->b_wptr = &rptr[len];
2063 }
2064
2065 /*
2066 * Fill in the header using the template header, and add
2067 * options such as time-stamp, ECN and/or SACK, as needed.
2068 */
2069 tcp_fill_header(tcp, rptr, num_sack_blk);
2070
2071 mp->b_rptr = rptr;
2072
2073 if (*tail_unsent) {
2074 int spill = *tail_unsent;
2075
2076 mp1 = mp->b_cont;
2077 if (mp1 == NULL)
2078 mp1 = mp;
2079
2080 /*
2081 * If we're a little short, tack on more mblks until
2082 * there is no more spillover.
2083 */
2084 while (spill < 0) {
2085 mblk_t *nmp;
2086 int nmpsz;
2087
2088 nmp = (*xmit_tail)->b_cont;
2089 nmpsz = MBLKL(nmp);
2267 * so we have to check that and unset it first.
2268 */
2269 if (tcp->tcp_cork)
2270 tcp->tcp_cork = B_FALSE;
2271 tcp_wput_data(tcp, NULL, B_FALSE);
2272 }
2273
2274 /*
2275 * If TCP does not get enough samples of RTT or tcp_rtt_updates
2276 * is 0, don't update the cache.
2277 */
2278 if (tcps->tcps_rtt_updates == 0 ||
2279 tcp->tcp_rtt_update < tcps->tcps_rtt_updates)
2280 return (0);
2281
2282 /*
2283 * We do not have a good algorithm to update ssthresh at this time.
2284 * So don't do any update.
2285 */
2286 bzero(&uinfo, sizeof (uinfo));
2287 uinfo.iulp_rtt = NSEC2MSEC(tcp->tcp_rtt_sa);
2288 uinfo.iulp_rtt_sd = NSEC2MSEC(tcp->tcp_rtt_sd);
2289
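/*
 * Illustrative note, not part of this file: in this version of the code
 * the send timestamps come from gethrtime(), and the smoothed RTT
 * (tcp_rtt_sa) and its deviation (tcp_rtt_sd) are evidently kept in
 * nanoseconds, while the destination-cache metrics expect milliseconds,
 * hence the NSEC2MSEC() conversions above.  The conversion itself is
 * plain integer division, along the lines of this hypothetical helper.
 */
static uint64_t
example_nsec_to_msec(uint64_t nsec)
{
	return (nsec / 1000000ULL);	/* 1 millisecond is 1,000,000 nanoseconds */
}
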
2290 /*
2291 * Note that uinfo is kept for conn_faddr in the DCE. We could update it
2292 * even if the connection is source routed, but we don't.
2293 */
2294 if (connp->conn_ipversion == IPV4_VERSION) {
2295 if (connp->conn_faddr_v4 != tcp->tcp_ipha->ipha_dst) {
2296 return (0);
2297 }
2298 (void) dce_update_uinfo_v4(connp->conn_faddr_v4, &uinfo, ipst);
2299 } else {
2300 uint_t ifindex;
2301
2302 if (!(IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
2303 &tcp->tcp_ip6h->ip6_dst))) {
2304 return (0);
2305 }
2306 ifindex = 0;
2307 if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_faddr_v6)) {
2308 ip_xmit_attr_t *ixa = connp->conn_ixa;
3372 ASSERT(snxt_mp != NULL);
3373 /* This should not happen. Defensive coding again... */
3374 if (snxt_mp == NULL) {
3375 return;
3376 }
3377
3378 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, seg_len, &off,
3379 &tmp_mp, begin, B_TRUE, &seg_len, B_TRUE);
3380 if (xmit_mp == NULL)
3381 return;
3382
3383 usable_swnd -= seg_len;
3384 tcp->tcp_pipe += seg_len;
3385 tcp->tcp_sack_snxt = begin + seg_len;
3386
3387 tcp_send_data(tcp, xmit_mp);
3388
3389 /*
3390 * Update the send timestamp to avoid false retransmission.
3391 */
3392 snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
3393
3394 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3395 TCPS_UPDATE_MIB(tcps, tcpRetransBytes, seg_len);
3396 TCPS_BUMP_MIB(tcps, tcpOutSackRetransSegs);
3397 /*
3398 * Update tcp_rexmit_max to extend this SACK recovery phase.
3399 * This happens when new data sent during fast recovery is
3400 * also lost. If TCP retransmits that new data, it needs
3401 * to extend the SACK recovery phase to avoid starting another
3402 * fast retransmit/recovery unnecessarily.
3403 */
3404 if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) {
3405 tcp->tcp_rexmit_max = tcp->tcp_sack_snxt;
3406 }
3407 }
3408 }
3409
3410 /*
3411 * tcp_ss_rexmit() is called to do slow start retransmission after a timeout
3412 * or ICMP errors.
3444
3445 if (win < cnt) {
3446 cnt = win;
3447 }
3448 if (SEQ_GT(snxt + cnt, smax)) {
3449 cnt = smax - snxt;
3450 }
3451 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off,
3452 &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE);
3453 if (xmit_mp == NULL)
3454 return;
3455
3456 tcp_send_data(tcp, xmit_mp);
3457
3458 snxt += cnt;
3459 win -= cnt;
3460 /*
3461 * Update the send timestamp to avoid false
3462 * retransmission.
3463 */
3464 old_snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
3465 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3466 TCPS_UPDATE_MIB(tcps, tcpRetransBytes, cnt);
3467
3468 tcp->tcp_rexmit_nxt = snxt;
3469 }
3470 /*
3471 * If we have transmitted all we have at the time
3472 * we started the retransmission, we can leave
3473 * the rest of the job to tcp_wput_data(). But we
3474 * need to check the send window first. If the
3475 * win is not 0, go on with tcp_wput_data().
3476 */
3477 if (SEQ_LT(snxt, smax) || win == 0) {
3478 return;
3479 }
3480 }
3481 /* Only call tcp_wput_data() if there is data to be sent. */
3482 if (tcp->tcp_unsent) {
3483 tcp_wput_data(tcp, NULL, B_FALSE);
3484 }
3604 /*
3605 * If the SACK option is set, delete the entire list of
3606 * notsack'ed blocks.
3607 */
3608 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp);
3609
3610 if (tcp->tcp_suna == tcp->tcp_snxt && tcp->tcp_swnd == 0)
3611 /*
3612 * Make sure the timer is running so that we will probe a zero
3613 * window.
3614 */
3615 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
3616 }
3617
3618 /*
3619 * tcp_fill_header is called by tcp_send() to fill the outgoing TCP header
3620 * with the template header, as well as other options such as time-stamp,
3621 * ECN and/or SACK.
3622 */
3623 static void
3624 tcp_fill_header(tcp_t *tcp, uchar_t *rptr, int num_sack_blk)
3625 {
3626 tcpha_t *tcp_tmpl, *tcpha;
3627 uint32_t *dst, *src;
3628 int hdrlen;
3629 conn_t *connp = tcp->tcp_connp;
3630
3631 ASSERT(OK_32PTR(rptr));
3632
3633 /* Template header */
3634 tcp_tmpl = tcp->tcp_tcpha;
3635
3636 /* Header of outgoing packet */
3637 tcpha = (tcpha_t *)(rptr + connp->conn_ixa->ixa_ip_hdr_length);
3638
3639 /* dst and src are opaque 32-bit fields, used for copying */
3640 dst = (uint32_t *)rptr;
3641 src = (uint32_t *)connp->conn_ht_iphc;
3642 hdrlen = connp->conn_ht_iphc_len;
3643
3644 /* Fill time-stamp option if needed */
3645 if (tcp->tcp_snd_ts_ok) {
3646 U32_TO_BE32(LBOLT_FASTPATH,
3647 (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4);
3648 U32_TO_BE32(tcp->tcp_ts_recent,
3649 (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8);
3650 } else {
3651 ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
3652 }
3653
3654 /*
3655 * Copy the template header; is this really more efficient than
3656 * calling bcopy()? For simple IPv4/TCP, it may be the case,
3657 * but perhaps not for other scenarios.
3658 */
3659 dst[0] = src[0];
3660 dst[1] = src[1];
3661 dst[2] = src[2];
3662 dst[3] = src[3];
3663 dst[4] = src[4];
3664 dst[5] = src[5];
3665 dst[6] = src[6];
3666 dst[7] = src[7];
|