Print this page
11546 Track TCP round-trip time in nanoseconds
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Portions contributed by: Brandon Baker <bbaker@delphix.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>


   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  25  * Copyright 2016 Joyent, Inc.
  26  * Copyright (c) 2014 by Delphix. All rights reserved.
  27  */
  28 
  29 /* This file contains all TCP input processing functions. */
  30 
  31 #include <sys/types.h>
  32 #include <sys/stream.h>
  33 #include <sys/strsun.h>
  34 #include <sys/strsubr.h>
  35 #include <sys/stropts.h>
  36 #include <sys/strlog.h>
  37 #define _SUN_TPI_VERSION 2
  38 #include <sys/tihdr.h>
  39 #include <sys/suntpi.h>
  40 #include <sys/xti_inet.h>
  41 #include <sys/squeue_impl.h>
  42 #include <sys/squeue.h>
  43 #include <sys/tsol/tnet.h>
  44 
  45 #include <inet/common.h>
  46 #include <inet/ip.h>


 149 static uint32_t tcp_init_wnd_chk = 4096;
 150 
 151 /* Process ICMP source quench message or not. */
 152 static boolean_t tcp_icmp_source_quench = B_FALSE;
 153 
 154 static boolean_t tcp_outbound_squeue_switch = B_FALSE;
 155 
 156 static mblk_t   *tcp_conn_create_v4(conn_t *, conn_t *, mblk_t *,
 157                     ip_recv_attr_t *);
 158 static mblk_t   *tcp_conn_create_v6(conn_t *, conn_t *, mblk_t *,
 159                     ip_recv_attr_t *);
 160 static boolean_t        tcp_drop_q0(tcp_t *);
 161 static void     tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *);
 162 static mblk_t   *tcp_input_add_ancillary(tcp_t *, mblk_t *, ip_pkt_t *,
 163                     ip_recv_attr_t *);
 164 static void     tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *);
 165 static void     tcp_process_options(tcp_t *, tcpha_t *);
 166 static mblk_t   *tcp_reass(tcp_t *, mblk_t *, uint32_t);
 167 static void     tcp_reass_elim_overlap(tcp_t *, mblk_t *);
 168 static void     tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
 169 static void     tcp_set_rto(tcp_t *, time_t);
 170 static void     tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
 171 
 172 /*
 173  * Set the MSS associated with a particular tcp based on its current value,
 174  * and a new one passed in. Observe minimums and maximums, and reset other
 175  * state variables that we want to view as multiples of MSS.
 176  *
 177  * The value of MSS could be either increased or decreased.
 178  */
 179 void
 180 tcp_mss_set(tcp_t *tcp, uint32_t mss)
 181 {
 182         uint32_t        mss_max;
 183         tcp_stack_t     *tcps = tcp->tcp_tcps;
 184         conn_t          *connp = tcp->tcp_connp;
 185 
 186         if (connp->conn_ipversion == IPV4_VERSION)
 187                 mss_max = tcps->tcps_mss_max_ipv4;
 188         else
 189                 mss_max = tcps->tcps_mss_max_ipv6;


3345                     SEQ_LEQ(seg_seq, tcp->tcp_rnxt + tcp->tcp_rwnd));
3346                 freemsg(mp);
3347                 /*
3348                  * If the ACK flag is not set, just use our snxt as the
3349                  * seq number of the RST segment.
3350                  */
3351                 if (!(flags & TH_ACK)) {
3352                         seg_ack = tcp->tcp_snxt;
3353                 }
3354                 tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1,
3355                     TH_RST|TH_ACK);
3356                 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT);
3357                 (void) tcp_clean_death(tcp, ECONNRESET);
3358                 return;
3359         }
3360         /*
3361          * urp could be -1 when the urp field in the packet is 0
3362          * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent
3363          * byte was at seg_seq - 1, in which case we ignore the urgent flag.
3364          */
3365         if (flags & TH_URG && urp >= 0) {
3366                 if (!tcp->tcp_urp_last_valid ||
3367                     SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) {
3368                         /*
3369                          * Non-STREAMS sockets handle the urgent data a little
3370                          * differently from STREAMS based sockets. There is no
3371                          * need to mark any mblks with the MSG{NOT,}MARKNEXT
3372                          * flags to keep SIOCATMARK happy. Instead a
3373                          * su_signal_oob upcall is made to update the mark.
3374                          * Neither is a T_EXDATA_IND mblk needed to be
3375                          * prepended to the urgent data. The urgent data is
3376                          * delivered using the su_recv upcall, where we set
3377                          * the MSG_OOB flag to indicate that it is urg data.
3378                          *
3379                          * Neither TH_SEND_URP_MARK nor TH_MARKNEXT_NEEDED
3380                          * are used by non-STREAMS sockets.
3381                          */
3382                         if (IPCL_IS_NONSTR(connp)) {
3383                                 if (!TCP_IS_DETACHED(tcp)) {
3384                                         (*sockupcalls->su_signal_oob)
3385                                             (connp->conn_upper_handle, urp);


4287                          *
4288                          * By initializing tcp_cwnd_cnt to new tcp_cwnd and
4289                          * decrementing it by 1 MSS for every ACKs, tcp_cwnd is
4290                          * increased by 1 MSS for every RTTs.
4291                          */
4292                         if (tcp->tcp_cwnd_cnt <= 0) {
4293                                 tcp->tcp_cwnd_cnt = cwnd + add;
4294                         } else {
4295                                 tcp->tcp_cwnd_cnt -= add;
4296                                 add = 0;
4297                         }
4298                 }
4299                 tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max);
4300         }
4301 
4302         /* See if the latest urgent data has been acknowledged */
4303         if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
4304             SEQ_GT(seg_ack, tcp->tcp_urg))
4305                 tcp->tcp_valid_bits &= ~TCP_URG_VALID;
4306 
4307         /* Can we update the RTT estimates? */
4308         if (tcp->tcp_snd_ts_ok) {
4309                 /* Ignore zero timestamp echo-reply. */
4310                 if (tcpopt.tcp_opt_ts_ecr != 0) {
4311                         tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
4312                             (int32_t)tcpopt.tcp_opt_ts_ecr);
4313                 }
4314 
4315                 /* If needed, restart the timer. */
4316                 if (tcp->tcp_set_timer == 1) {
4317                         TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
4318                         tcp->tcp_set_timer = 0;
4319                 }
4320                 /*
4321                  * Update tcp_csuna in case the other side stops sending
4322                  * us timestamps.






4323                  */
4324                 tcp->tcp_csuna = tcp->tcp_snxt;
4325         } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
4326                 /*
4327                  * An ACK sequence we haven't seen before, so get the RTT
4328                  * and update the RTO. But first check if the timestamp is
4329                  * valid to use.
4330                  */
4331                 if ((mp1->b_next != NULL) &&
4332                     SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next)))
4333                         tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
4334                             (int32_t)(intptr_t)mp1->b_prev);
4335                 else
4336                         TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);

4337 
4338                 /* Remember the last sequence to be ACKed */
4339                 tcp->tcp_csuna = seg_ack;
4340                 if (tcp->tcp_set_timer == 1) {
4341                         TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
4342                         tcp->tcp_set_timer = 0;
4343                 }
4344         } else {
4345                 TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
4346         }
4347 
4348         /* Eat acknowledged bytes off the xmit queue. */
4349         for (;;) {
4350                 mblk_t  *mp2;
4351                 uchar_t *wptr;
4352 
4353                 wptr = mp1->b_wptr;
4354                 ASSERT((uintptr_t)(wptr - mp1->b_rptr) <= (uintptr_t)INT_MAX);
4355                 bytes_acked -= (int)(wptr - mp1->b_rptr);
4356                 if (bytes_acked < 0) {
4357                         mp1->b_rptr = wptr + bytes_acked;
4358                         /*
4359                          * Set a new timestamp if all the bytes timed by the
4360                          * old timestamp have been ack'ed.
4361                          */
4362                         if (SEQ_GT(seg_ack,
4363                             (uint32_t)(uintptr_t)(mp1->b_next))) {
4364                                 mp1->b_prev =
4365                                     (mblk_t *)(uintptr_t)LBOLT_FASTPATH;
4366                                 mp1->b_next = NULL;
4367                         }
4368                         break;
4369                 }
4370                 mp1->b_next = NULL;
4371                 mp1->b_prev = NULL;
4372                 mp2 = mp1;
4373                 mp1 = mp1->b_cont;
4374 
4375                 /*
4376                  * This notification is required for some zero-copy
4377                  * clients to maintain a copy semantic. After the data
4378                  * is ack'ed, client is safe to modify or reuse the buffer.
4379                  */
4380                 if (tcp->tcp_snd_zcopy_aware &&
4381                     (mp2->b_datap->db_struioflag & STRUIO_ZCNOTIFY))
4382                         tcp_zcopy_notify(tcp);
4383                 freeb(mp2);
4384                 if (bytes_acked == 0) {
4385                         if (mp1 == NULL) {


4822             TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
4823                 goto done;
4824 
4825         /* Any transmit work to do and a non-zero window? */
4826         if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_NEED_SACK_REXMIT|
4827             TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) {
4828                 if (flags & TH_REXMIT_NEEDED) {
4829                         uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna;
4830 
4831                         TCPS_BUMP_MIB(tcps, tcpOutFastRetrans);
4832                         if (snd_size > mss)
4833                                 snd_size = mss;
4834                         if (snd_size > tcp->tcp_swnd)
4835                                 snd_size = tcp->tcp_swnd;
4836                         mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size,
4837                             NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size,
4838                             B_TRUE);
4839 
4840                         if (mp1 != NULL) {
4841                                 tcp->tcp_xmit_head->b_prev =
4842                                     (mblk_t *)LBOLT_FASTPATH;
4843                                 tcp->tcp_csuna = tcp->tcp_snxt;
4844                                 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
4845                                 TCPS_UPDATE_MIB(tcps, tcpRetransBytes,
4846                                     snd_size);
4847                                 tcp_send_data(tcp, mp1);
4848                         }
4849                 }
4850                 if (flags & TH_NEED_SACK_REXMIT) {
4851                         tcp_sack_rexmit(tcp, &flags);
4852                 }
4853                 /*
4854                  * For TH_LIMIT_XMIT, tcp_wput_data() is called to send
4855                  * out new segment.  Note that tcp_rexmit should not be
4856                  * set, otherwise TH_LIMIT_XMIT should not be set.
4857                  */
4858                 if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) {
4859                         if (!tcp->tcp_rexmit) {
4860                                 tcp_wput_data(tcp, NULL, B_FALSE);
4861                         } else {
4862                                 tcp_ss_rexmit(tcp);
4863                         }
4864                 }
4865                 /*
4866                  * Adjust tcp_cwnd back to normal value after sending
4867                  * new data segments.
4868                  */
4869                 if (flags & TH_LIMIT_XMIT) {
4870                         tcp->tcp_cwnd -= mss << (tcp->tcp_dupack_cnt - 1);
4871                         /*
4872                          * This will restart the timer.  Restarting the
4873                          * timer is used to avoid a timeout before the
4874                          * limited transmitted segment's ACK gets back.
4875                          */
4876                         if (tcp->tcp_xmit_head != NULL)
4877                                 tcp->tcp_xmit_head->b_prev =
4878                                     (mblk_t *)LBOLT_FASTPATH;
4879                 }

4880 
4881                 /* Anything more to do? */
4882                 if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED|
4883                     TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
4884                         goto done;
4885         }
4886 ack_check:
4887         if (flags & TH_SEND_URP_MARK) {
4888                 ASSERT(tcp->tcp_urp_mark_mp);
4889                 ASSERT(!IPCL_IS_NONSTR(connp));
4890                 /*
4891                  * Send up any queued data and then send the mark message
4892                  */
4893                 if (tcp->tcp_rcv_list != NULL) {
4894                         flags |= tcp_rcv_drain(tcp);
4895 
4896                 }
4897                 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
4898                 mp1 = tcp->tcp_urp_mark_mp;
4899                 tcp->tcp_urp_mark_mp = NULL;


5194         }
5195         if (addflag.crb_ipv6_recvdstopts) {
5196                 toh = (struct T_opthdr *)optptr;
5197                 toh->level = IPPROTO_IPV6;
5198                 toh->name = IPV6_DSTOPTS;
5199                 toh->len = sizeof (*toh) + ipp->ipp_dstoptslen;
5200                 toh->status = 0;
5201                 optptr += sizeof (*toh);
5202                 bcopy(ipp->ipp_dstopts, optptr, ipp->ipp_dstoptslen);
5203                 optptr += ipp->ipp_dstoptslen;
5204                 ASSERT(OK_32PTR(optptr));
5205                 /* Save as last value */
5206                 ip_savebuf((void **)&tcp->tcp_dstopts, &tcp->tcp_dstoptslen,
5207                     (ipp->ipp_fields & IPPF_DSTOPTS),
5208                     ipp->ipp_dstopts, ipp->ipp_dstoptslen);
5209         }
5210         ASSERT(optptr == mp->b_wptr);
5211         return (mp);
5212 }
5213 
5214 /* The minimum of smoothed mean deviation in RTO calculation. */
5215 #define TCP_SD_MIN      400
5216 
5217 /*
5218  * Set RTO for this connection.  The formula is from Jacobson and Karels'
5219  * "Congestion Avoidance and Control" in SIGCOMM '88.  The variable names
5220  * are the same as those in Appendix A.2 of that paper.

5221  *
5222  * m = new measurement
5223  * sa = smoothed RTT average (8 * average estimates).
5224  * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates).
5225  */
5226 static void
5227 tcp_set_rto(tcp_t *tcp, clock_t rtt)
5228 {
5229         long m = TICK_TO_MSEC(rtt);
5230         clock_t sa = tcp->tcp_rtt_sa;
5231         clock_t sv = tcp->tcp_rtt_sd;
5232         clock_t rto;
5233         tcp_stack_t     *tcps = tcp->tcp_tcps;
5234 
5235         TCPS_BUMP_MIB(tcps, tcpRttUpdate);
5236         tcp->tcp_rtt_update++;
5237 
5238         /* tcp_rtt_sa is not 0 means this is a new sample. */
5239         if (sa != 0) {
5240                 /*
5241                  * Update average estimator:
5242                  *      new rtt = 7/8 old rtt + 1/8 Error













5243                  */
5244 
5245                 /* m is now Error in estimate. */
5246                 m -= sa >> 3;
5247                 if ((sa += m) <= 0) {
5248                         /*
5249                          * Don't allow the smoothed average to be negative.
5250                          * We use 0 to denote reinitialization of the
5251                          * variables.
5252                          */
5253                         sa = 1;
5254                 }
5255 
5256                 /*
5257                  * Update deviation estimator:
5258                  *      new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev)






5259                  */
5260                 if (m < 0)
5261                         m = -m;
5262                 m -= sv >> 2;
5263                 sv += m;
5264         } else {
5265                 /*
5266                  * This follows BSD's implementation.  So the reinitialized
5267                  * RTO is 3 * m.  We cannot go less than 2 because if the
5268                  * link is bandwidth dominated, doubling the window size
5269                  * during slow start means doubling the RTT.  We want to be
5270                  * more conservative when we reinitialize our estimates.  3
5271                  * is just a convenient number.
5272                  */
5273                 sa = m << 3;
5274                 sv = m << 1;
5275         }
5276         if (sv < TCP_SD_MIN) {
5277                 /*
5278                  * We do not know that if sa captures the delay ACK
5279                  * effect as in a long train of segments, a receiver
5280                  * does not delay its ACKs.  So set the minimum of sv
5281                  * to be TCP_SD_MIN, which is default to 400 ms, twice
5282                  * of BSD DATO.  That means the minimum of mean


5283                  * deviation is 100 ms.
5284                  *
5285                  */
5286                 sv = TCP_SD_MIN;
5287         }
5288         tcp->tcp_rtt_sa = sa;
5289         tcp->tcp_rtt_sd = sv;
5290         /*
5291          * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv)
5292          *
5293          * Add tcp_rexmit_interval extra in case of extreme environment
5294          * where the algorithm fails to work.  The default value of
5295          * tcp_rexmit_interval_extra should be 0.
5296          *
5297          * As we use a finer grained clock than BSD and update
5298          * RTO for every ACKs, add in another .25 of RTT to the
5299          * deviation of RTO to accomodate burstiness of 1/4 of
5300          * window size.
5301          */
5302         rto = (sa >> 3) + sv + tcps->tcps_rexmit_interval_extra + (sa >> 5);
5303 
5304         TCP_SET_RTO(tcp, rto);
5305 
5306         /* Now, we can reset tcp_timer_backoff to use the new RTO... */
5307         tcp->tcp_timer_backoff = 0;
5308 }
5309 
5310 /*
5311  * On a labeled system we have some protocols above TCP, such as RPC, which
5312  * appear to assume that every mblk in a chain has a db_credp.
5313  */
5314 static void
5315 tcp_setcred_data(mblk_t *mp, ip_recv_attr_t *ira)
5316 {
5317         ASSERT(is_system_labeled());
5318         ASSERT(ira->ira_cred != NULL);
5319 
5320         while (mp != NULL) {
5321                 mblk_setcred(mp, ira->ira_cred, NOPID);
5322                 mp = mp->b_cont;
5323         }
5324 }




   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  25  * Copyright 2019 Joyent, Inc.
  26  * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
  27  */
  28 
  29 /* This file contains all TCP input processing functions. */
  30 
  31 #include <sys/types.h>
  32 #include <sys/stream.h>
  33 #include <sys/strsun.h>
  34 #include <sys/strsubr.h>
  35 #include <sys/stropts.h>
  36 #include <sys/strlog.h>
  37 #define _SUN_TPI_VERSION 2
  38 #include <sys/tihdr.h>
  39 #include <sys/suntpi.h>
  40 #include <sys/xti_inet.h>
  41 #include <sys/squeue_impl.h>
  42 #include <sys/squeue.h>
  43 #include <sys/tsol/tnet.h>
  44 
  45 #include <inet/common.h>
  46 #include <inet/ip.h>


 149 static uint32_t tcp_init_wnd_chk = 4096;
 150 
 151 /* Process ICMP source quench message or not. */
 152 static boolean_t tcp_icmp_source_quench = B_FALSE;
 153 
 154 static boolean_t tcp_outbound_squeue_switch = B_FALSE;
 155 
 156 static mblk_t   *tcp_conn_create_v4(conn_t *, conn_t *, mblk_t *,
 157                     ip_recv_attr_t *);
 158 static mblk_t   *tcp_conn_create_v6(conn_t *, conn_t *, mblk_t *,
 159                     ip_recv_attr_t *);
 160 static boolean_t        tcp_drop_q0(tcp_t *);
 161 static void     tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *);
 162 static mblk_t   *tcp_input_add_ancillary(tcp_t *, mblk_t *, ip_pkt_t *,
 163                     ip_recv_attr_t *);
 164 static void     tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *);
 165 static void     tcp_process_options(tcp_t *, tcpha_t *);
 166 static mblk_t   *tcp_reass(tcp_t *, mblk_t *, uint32_t);
 167 static void     tcp_reass_elim_overlap(tcp_t *, mblk_t *);
 168 static void     tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
 169 static void     tcp_set_rto(tcp_t *, hrtime_t);
 170 static void     tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
 171 
 172 /*
 173  * Set the MSS associated with a particular tcp based on its current value,
 174  * and a new one passed in. Observe minimums and maximums, and reset other
 175  * state variables that we want to view as multiples of MSS.
 176  *
 177  * The value of MSS could be either increased or decreased.
 178  */
 179 void
 180 tcp_mss_set(tcp_t *tcp, uint32_t mss)
 181 {
 182         uint32_t        mss_max;
 183         tcp_stack_t     *tcps = tcp->tcp_tcps;
 184         conn_t          *connp = tcp->tcp_connp;
 185 
 186         if (connp->conn_ipversion == IPV4_VERSION)
 187                 mss_max = tcps->tcps_mss_max_ipv4;
 188         else
 189                 mss_max = tcps->tcps_mss_max_ipv6;


3345                     SEQ_LEQ(seg_seq, tcp->tcp_rnxt + tcp->tcp_rwnd));
3346                 freemsg(mp);
3347                 /*
3348                  * If the ACK flag is not set, just use our snxt as the
3349                  * seq number of the RST segment.
3350                  */
3351                 if (!(flags & TH_ACK)) {
3352                         seg_ack = tcp->tcp_snxt;
3353                 }
3354                 tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1,
3355                     TH_RST|TH_ACK);
3356                 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT);
3357                 (void) tcp_clean_death(tcp, ECONNRESET);
3358                 return;
3359         }
3360         /*
3361          * urp could be -1 when the urp field in the packet is 0
3362          * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent
3363          * byte was at seg_seq - 1, in which case we ignore the urgent flag.
3364          */
3365         if ((flags & TH_URG) && urp >= 0) {
3366                 if (!tcp->tcp_urp_last_valid ||
3367                     SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) {
3368                         /*
3369                          * Non-STREAMS sockets handle the urgent data a little
3370                          * differently from STREAMS based sockets. There is no
3371                          * need to mark any mblks with the MSG{NOT,}MARKNEXT
3372                          * flags to keep SIOCATMARK happy. Instead a
3373                          * su_signal_oob upcall is made to update the mark.
3374                          * Neither is a T_EXDATA_IND mblk needed to be
3375                          * prepended to the urgent data. The urgent data is
3376                          * delivered using the su_recv upcall, where we set
3377                          * the MSG_OOB flag to indicate that it is urg data.
3378                          *
3379                          * Neither TH_SEND_URP_MARK nor TH_MARKNEXT_NEEDED
3380                          * are used by non-STREAMS sockets.
3381                          */
3382                         if (IPCL_IS_NONSTR(connp)) {
3383                                 if (!TCP_IS_DETACHED(tcp)) {
3384                                         (*sockupcalls->su_signal_oob)
3385                                             (connp->conn_upper_handle, urp);


4287                          *
4288                          * By initializing tcp_cwnd_cnt to new tcp_cwnd and
4289                          * decrementing it by 1 MSS for every ACKs, tcp_cwnd is
4290                          * increased by 1 MSS for every RTTs.
4291                          */
4292                         if (tcp->tcp_cwnd_cnt <= 0) {
4293                                 tcp->tcp_cwnd_cnt = cwnd + add;
4294                         } else {
4295                                 tcp->tcp_cwnd_cnt -= add;
4296                                 add = 0;
4297                         }
4298                 }
4299                 tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max);
4300         }
4301 
4302         /* See if the latest urgent data has been acknowledged */
4303         if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
4304             SEQ_GT(seg_ack, tcp->tcp_urg))
4305                 tcp->tcp_valid_bits &= ~TCP_URG_VALID;
4306 













4307         /*
4308          * Update the RTT estimates. Note that we don't use the TCP
4309          * timestamp option to calculate RTT even if one is present. This is
4310          * because the timestamp option's resolution (CPU tick) is
4311          * too coarse to measure modern datacenter networks' microsecond
4312          * latencies. The timestamp field's resolution is limited by its
4313          * 4-byte width (see RFC1323), and since we always store a
4314          * high-resolution nanosecond precision timestamp along with the data,
4315          * there is no point in ever using the timestamp option.
4316          */
4317         if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {

4318                 /*
4319                  * An ACK sequence we haven't seen before, so get the RTT
4320                  * and update the RTO. But first check if the timestamp is
4321                  * valid to use.
4322                  */
4323                 if ((mp1->b_next != NULL) &&
4324                     SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) {
4325                         tcp_set_rto(tcp, gethrtime() -
4326                             (hrtime_t)(intptr_t)mp1->b_prev);
4327                 } else {
4328                         TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
4329                 }
4330 
4331                 /* Remember the last sequence to be ACKed */
4332                 tcp->tcp_csuna = seg_ack;
4333                 if (tcp->tcp_set_timer == 1) {
4334                         TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
4335                         tcp->tcp_set_timer = 0;
4336                 }
4337         } else {
4338                 TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
4339         }
4340 
4341         /* Eat acknowledged bytes off the xmit queue. */
4342         for (;;) {
4343                 mblk_t  *mp2;
4344                 uchar_t *wptr;
4345 
4346                 wptr = mp1->b_wptr;
4347                 ASSERT((uintptr_t)(wptr - mp1->b_rptr) <= (uintptr_t)INT_MAX);
4348                 bytes_acked -= (int)(wptr - mp1->b_rptr);
4349                 if (bytes_acked < 0) {
4350                         mp1->b_rptr = wptr + bytes_acked;
4351                         /*
4352                          * Set a new timestamp if all the bytes timed by the
4353                          * old timestamp have been ack'ed.
4354                          */
4355                         if (SEQ_GT(seg_ack,
4356                             (uint32_t)(uintptr_t)(mp1->b_next))) {
4357                                 mp1->b_prev =
4358                                     (mblk_t *)(intptr_t)gethrtime();
4359                                 mp1->b_next = NULL;
4360                         }
4361                         break;
4362                 }
4363                 mp1->b_next = NULL;
4364                 mp1->b_prev = NULL;
4365                 mp2 = mp1;
4366                 mp1 = mp1->b_cont;
4367 
4368                 /*
4369                  * This notification is required for some zero-copy
4370                  * clients to maintain a copy semantic. After the data
4371                  * is ack'ed, client is safe to modify or reuse the buffer.
4372                  */
4373                 if (tcp->tcp_snd_zcopy_aware &&
4374                     (mp2->b_datap->db_struioflag & STRUIO_ZCNOTIFY))
4375                         tcp_zcopy_notify(tcp);
4376                 freeb(mp2);
4377                 if (bytes_acked == 0) {
4378                         if (mp1 == NULL) {


4815             TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
4816                 goto done;
4817 
4818         /* Any transmit work to do and a non-zero window? */
4819         if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_NEED_SACK_REXMIT|
4820             TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) {
4821                 if (flags & TH_REXMIT_NEEDED) {
4822                         uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna;
4823 
4824                         TCPS_BUMP_MIB(tcps, tcpOutFastRetrans);
4825                         if (snd_size > mss)
4826                                 snd_size = mss;
4827                         if (snd_size > tcp->tcp_swnd)
4828                                 snd_size = tcp->tcp_swnd;
4829                         mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size,
4830                             NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size,
4831                             B_TRUE);
4832 
4833                         if (mp1 != NULL) {
4834                                 tcp->tcp_xmit_head->b_prev =
4835                                     (mblk_t *)(intptr_t)gethrtime();
4836                                 tcp->tcp_csuna = tcp->tcp_snxt;
4837                                 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
4838                                 TCPS_UPDATE_MIB(tcps, tcpRetransBytes,
4839                                     snd_size);
4840                                 tcp_send_data(tcp, mp1);
4841                         }
4842                 }
4843                 if (flags & TH_NEED_SACK_REXMIT) {
4844                         tcp_sack_rexmit(tcp, &flags);
4845                 }
4846                 /*
4847                  * For TH_LIMIT_XMIT, tcp_wput_data() is called to send
4848                  * out new segment.  Note that tcp_rexmit should not be
4849                  * set, otherwise TH_LIMIT_XMIT should not be set.
4850                  */
4851                 if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) {
4852                         if (!tcp->tcp_rexmit) {
4853                                 tcp_wput_data(tcp, NULL, B_FALSE);
4854                         } else {
4855                                 tcp_ss_rexmit(tcp);
4856                         }
4857                 }
4858                 /*
4859                  * Adjust tcp_cwnd back to normal value after sending
4860                  * new data segments.
4861                  */
4862                 if (flags & TH_LIMIT_XMIT) {
4863                         tcp->tcp_cwnd -= mss << (tcp->tcp_dupack_cnt - 1);
4864                         /*
4865                          * This will restart the timer.  Restarting the
4866                          * timer is used to avoid a timeout before the
4867                          * limited transmitted segment's ACK gets back.
4868                          */
4869                         if (tcp->tcp_xmit_head != NULL) {
4870                                 tcp->tcp_xmit_head->b_prev =
4871                                     (mblk_t *)(intptr_t)gethrtime();
4872                         }
4873                 }
4874 
4875                 /* Anything more to do? */
4876                 if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED|
4877                     TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
4878                         goto done;
4879         }
4880 ack_check:
4881         if (flags & TH_SEND_URP_MARK) {
4882                 ASSERT(tcp->tcp_urp_mark_mp);
4883                 ASSERT(!IPCL_IS_NONSTR(connp));
4884                 /*
4885                  * Send up any queued data and then send the mark message
4886                  */
4887                 if (tcp->tcp_rcv_list != NULL) {
4888                         flags |= tcp_rcv_drain(tcp);
4889 
4890                 }
4891                 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
4892                 mp1 = tcp->tcp_urp_mark_mp;
4893                 tcp->tcp_urp_mark_mp = NULL;


5188         }
5189         if (addflag.crb_ipv6_recvdstopts) {
5190                 toh = (struct T_opthdr *)optptr;
5191                 toh->level = IPPROTO_IPV6;
5192                 toh->name = IPV6_DSTOPTS;
5193                 toh->len = sizeof (*toh) + ipp->ipp_dstoptslen;
5194                 toh->status = 0;
5195                 optptr += sizeof (*toh);
5196                 bcopy(ipp->ipp_dstopts, optptr, ipp->ipp_dstoptslen);
5197                 optptr += ipp->ipp_dstoptslen;
5198                 ASSERT(OK_32PTR(optptr));
5199                 /* Save as last value */
5200                 ip_savebuf((void **)&tcp->tcp_dstopts, &tcp->tcp_dstoptslen,
5201                     (ipp->ipp_fields & IPPF_DSTOPTS),
5202                     ipp->ipp_dstopts, ipp->ipp_dstoptslen);
5203         }
5204         ASSERT(optptr == mp->b_wptr);
5205         return (mp);
5206 }
5207 
5208 /* The minimum of smoothed mean deviation in RTO calculation (nsec). */
5209 #define TCP_SD_MIN      400000000
5210 
5211 /*
5212  * Set RTO for this connection based on a new round-trip time measurement.
5213  * The formula is from Jacobson and Karels' "Congestion Avoidance and Control"
5214  * in SIGCOMM '88.  The variable names are the same as those in Appendix A.2
5215  * of that paper.
5216  *
5217  * m = new measurement
5218  * sa = smoothed RTT average (8 * average estimates).
5219  * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates).
5220  */
5221 static void
5222 tcp_set_rto(tcp_t *tcp, hrtime_t rtt)
5223 {
5224         hrtime_t m = rtt;
5225         hrtime_t sa = tcp->tcp_rtt_sa;
5226         hrtime_t sv = tcp->tcp_rtt_sd;

5227         tcp_stack_t *tcps = tcp->tcp_tcps;
5228 
5229         TCPS_BUMP_MIB(tcps, tcpRttUpdate);
5230         tcp->tcp_rtt_update++;
5231 
5232         /* tcp_rtt_sa is not 0 means this is a new sample. */
5233         if (sa != 0) {
5234                 /*
5235                  * Update average estimator (see section 2.3 of RFC6298):
5236                  *      SRTT = 7/8 SRTT + 1/8 rtt
5237                  *
5238                  * We maintain tcp_rtt_sa as 8 * SRTT, so this reduces to:
5239                  *      tcp_rtt_sa = 7 * SRTT + rtt
5240                  *      tcp_rtt_sa = 7 * (tcp_rtt_sa / 8) + rtt
5241                  *      tcp_rtt_sa = tcp_rtt_sa - (tcp_rtt_sa / 8) + rtt
5242                  *      tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 8))
5243                  *      tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 2^3))
5244                  *      tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa >> 3))
5245                  *
5246                  * (rtt - tcp_rtt_sa / 8) is simply the difference
5247                  * between the new rtt measurement and the existing smoothed
5248                  * RTT average. This is referred to as "Error" in subsequent
5249                  * calculations.
5250                  */
5251 
5252                 /* m is now Error. */
5253                 m -= sa >> 3;
5254                 if ((sa += m) <= 0) {
5255                         /*
5256                          * Don't allow the smoothed average to be negative.
5257                          * We use 0 to denote reinitialization of the
5258                          * variables.
5259                          */
5260                         sa = 1;
5261                 }
5262 
5263                 /*
5264                  * Update deviation estimator:
5265                  *  mdev = 3/4 mdev + 1/4 abs(Error)
5266                  *
5267                  * We maintain tcp_rtt_sd as 4 * mdev, so this reduces to:
5268                  *  tcp_rtt_sd = 3 * mdev + abs(Error)
5269                  *  tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd / 4) + abs(Error)
5270                  *  tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd / 2^2) + abs(Error)
5271                  *  tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd >> 2) + abs(Error)
5272                  */
5273                 if (m < 0)
5274                         m = -m;
5275                 m -= sv >> 2;
5276                 sv += m;
5277         } else {
5278                 /*
5279                  * This follows BSD's implementation.  So the reinitialized
5280                  * RTO is 3 * m.  We cannot go less than 2 because if the
5281                  * link is bandwidth dominated, doubling the window size
5282                  * during slow start means doubling the RTT.  We want to be
5283                  * more conservative when we reinitialize our estimates.  3
5284                  * is just a convenient number.
5285                  */
5286                 sa = m << 3;
5287                 sv = m << 1;
5288         }
5289         if (sv < TCP_SD_MIN) {
5290                 /*
5291                  * Since a receiver doesn't delay its ACKs during a long run of
5292                  * segments, sa may not have captured the effect of delayed ACK
5293                  * timeouts on the RTT.  To make sure we always account for the
5294                  * possible delay (and avoid the unnecessary retransmission),
5295                  * TCP_SD_MIN is set to 400ms, twice the delayed ACK timeout of
5296                  * 200ms on older SunOS/BSD systems and modern Windows systems
5297                  * (as of 2019).  This means that the minimum possible mean
5298                  * deviation is 100 ms.

5299                  */
5300                 sv = TCP_SD_MIN;
5301         }
5302         tcp->tcp_rtt_sa = sa;
5303         tcp->tcp_rtt_sd = sv;













5304 
5305         tcp->tcp_rto = tcp_calculate_rto(tcp, tcps, 0);
5306 
5307         /* Now, we can reset tcp_timer_backoff to use the new RTO... */
5308         tcp->tcp_timer_backoff = 0;
5309 }
5310 
5311 /*
5312  * On a labeled system we have some protocols above TCP, such as RPC, which
5313  * appear to assume that every mblk in a chain has a db_credp.
5314  */
5315 static void
5316 tcp_setcred_data(mblk_t *mp, ip_recv_attr_t *ira)
5317 {
5318         ASSERT(is_system_labeled());
5319         ASSERT(ira->ira_cred != NULL);
5320 
5321         while (mp != NULL) {
5322                 mblk_setcred(mp, ira->ira_cred, NOPID);
5323                 mp = mp->b_cont;
5324         }
5325 }