5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
25 * Copyright 2016 Joyent, Inc.
26 * Copyright (c) 2014 by Delphix. All rights reserved.
27 */
28
29 /* This file contains all TCP input processing functions. */
30
31 #include <sys/types.h>
32 #include <sys/stream.h>
33 #include <sys/strsun.h>
34 #include <sys/strsubr.h>
35 #include <sys/stropts.h>
36 #include <sys/strlog.h>
37 #define _SUN_TPI_VERSION 2
38 #include <sys/tihdr.h>
39 #include <sys/suntpi.h>
40 #include <sys/xti_inet.h>
41 #include <sys/squeue_impl.h>
42 #include <sys/squeue.h>
43 #include <sys/tsol/tnet.h>
44
45 #include <inet/common.h>
46 #include <inet/ip.h>
149 static uint32_t tcp_init_wnd_chk = 4096;
150
151 /* Process ICMP source quench message or not. */
152 static boolean_t tcp_icmp_source_quench = B_FALSE;
153
154 static boolean_t tcp_outbound_squeue_switch = B_FALSE;
155
156 static mblk_t *tcp_conn_create_v4(conn_t *, conn_t *, mblk_t *,
157 ip_recv_attr_t *);
158 static mblk_t *tcp_conn_create_v6(conn_t *, conn_t *, mblk_t *,
159 ip_recv_attr_t *);
160 static boolean_t tcp_drop_q0(tcp_t *);
161 static void tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *);
162 static mblk_t *tcp_input_add_ancillary(tcp_t *, mblk_t *, ip_pkt_t *,
163 ip_recv_attr_t *);
164 static void tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *);
165 static void tcp_process_options(tcp_t *, tcpha_t *);
166 static mblk_t *tcp_reass(tcp_t *, mblk_t *, uint32_t);
167 static void tcp_reass_elim_overlap(tcp_t *, mblk_t *);
168 static void tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
169 static void tcp_set_rto(tcp_t *, time_t);
170 static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
171
172 /*
173 * Set the MSS associated with a particular tcp based on its current value,
174 * and a new one passed in. Observe minimums and maximums, and reset other
175 * state variables that we want to view as multiples of MSS.
176 *
177 * The value of MSS could be either increased or decreased.
178 */
179 void
180 tcp_mss_set(tcp_t *tcp, uint32_t mss)
181 {
182 uint32_t mss_max;
183 tcp_stack_t *tcps = tcp->tcp_tcps;
184 conn_t *connp = tcp->tcp_connp;
185
186 if (connp->conn_ipversion == IPV4_VERSION)
187 mss_max = tcps->tcps_mss_max_ipv4;
188 else
189 mss_max = tcps->tcps_mss_max_ipv6;
3345 SEQ_LEQ(seg_seq, tcp->tcp_rnxt + tcp->tcp_rwnd));
3346 freemsg(mp);
3347 /*
3348 * If the ACK flag is not set, just use our snxt as the
3349 * seq number of the RST segment.
3350 */
3351 if (!(flags & TH_ACK)) {
3352 seg_ack = tcp->tcp_snxt;
3353 }
3354 tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1,
3355 TH_RST|TH_ACK);
3356 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT);
3357 (void) tcp_clean_death(tcp, ECONNRESET);
3358 return;
3359 }
3360 /*
3361 * urp could be -1 when the urp field in the packet is 0
3362 * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent
3363 * byte was at seg_seq - 1, in which case we ignore the urgent flag.
3364 */
3365 if (flags & TH_URG && urp >= 0) {
3366 if (!tcp->tcp_urp_last_valid ||
3367 SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) {
3368 /*
3369 * Non-STREAMS sockets handle the urgent data a little
3370 * differently from STREAMS based sockets. There is no
3371 * need to mark any mblks with the MSG{NOT,}MARKNEXT
3372 * flags to keep SIOCATMARK happy. Instead a
3373 * su_signal_oob upcall is made to update the mark.
3374 * Neither is a T_EXDATA_IND mblk needed to be
3375 * prepended to the urgent data. The urgent data is
3376 * delivered using the su_recv upcall, where we set
3377 * the MSG_OOB flag to indicate that it is urg data.
3378 *
3379 * Neither TH_SEND_URP_MARK nor TH_MARKNEXT_NEEDED
3380 * are used by non-STREAMS sockets.
3381 */
3382 if (IPCL_IS_NONSTR(connp)) {
3383 if (!TCP_IS_DETACHED(tcp)) {
3384 (*sockupcalls->su_signal_oob)
3385 (connp->conn_upper_handle, urp);
4287 *
4288 * By initializing tcp_cwnd_cnt to new tcp_cwnd and
4289 * decrementing it by 1 MSS for every ACKs, tcp_cwnd is
4290 * increased by 1 MSS for every RTTs.
4291 */
4292 if (tcp->tcp_cwnd_cnt <= 0) {
4293 tcp->tcp_cwnd_cnt = cwnd + add;
4294 } else {
4295 tcp->tcp_cwnd_cnt -= add;
4296 add = 0;
4297 }
4298 }
4299 tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max);
4300 }
4301
4302 /* See if the latest urgent data has been acknowledged */
4303 if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
4304 SEQ_GT(seg_ack, tcp->tcp_urg))
4305 tcp->tcp_valid_bits &= ~TCP_URG_VALID;
4306
4307 /* Can we update the RTT estimates? */
4308 if (tcp->tcp_snd_ts_ok) {
4309 /* Ignore zero timestamp echo-reply. */
4310 if (tcpopt.tcp_opt_ts_ecr != 0) {
4311 tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
4312 (int32_t)tcpopt.tcp_opt_ts_ecr);
4313 }
4314
4315 /* If needed, restart the timer. */
4316 if (tcp->tcp_set_timer == 1) {
4317 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
4318 tcp->tcp_set_timer = 0;
4319 }
4320 /*
4321 * Update tcp_csuna in case the other side stops sending
4322 * us timestamps.
4323 */
4324 tcp->tcp_csuna = tcp->tcp_snxt;
4325 } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
4326 /*
4327 * An ACK sequence we haven't seen before, so get the RTT
4328 * and update the RTO. But first check if the timestamp is
4329 * valid to use.
4330 */
4331 if ((mp1->b_next != NULL) &&
4332 SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next)))
4333 tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
4334 (int32_t)(intptr_t)mp1->b_prev);
4335 else
4336 TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
4337
4338 /* Remember the last sequence to be ACKed */
4339 tcp->tcp_csuna = seg_ack;
4340 if (tcp->tcp_set_timer == 1) {
4341 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
4342 tcp->tcp_set_timer = 0;
4343 }
4344 } else {
4345 TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
4346 }
4347
4348 /* Eat acknowledged bytes off the xmit queue. */
4349 for (;;) {
4350 mblk_t *mp2;
4351 uchar_t *wptr;
4352
4353 wptr = mp1->b_wptr;
4354 ASSERT((uintptr_t)(wptr - mp1->b_rptr) <= (uintptr_t)INT_MAX);
4355 bytes_acked -= (int)(wptr - mp1->b_rptr);
4356 if (bytes_acked < 0) {
4357 mp1->b_rptr = wptr + bytes_acked;
4358 /*
4359 * Set a new timestamp if all the bytes timed by the
4360 * old timestamp have been ack'ed.
4361 */
4362 if (SEQ_GT(seg_ack,
4363 (uint32_t)(uintptr_t)(mp1->b_next))) {
4364 mp1->b_prev =
4365 (mblk_t *)(uintptr_t)LBOLT_FASTPATH;
4366 mp1->b_next = NULL;
4367 }
4368 break;
4369 }
4370 mp1->b_next = NULL;
4371 mp1->b_prev = NULL;
4372 mp2 = mp1;
4373 mp1 = mp1->b_cont;
4374
4375 /*
4376 * This notification is required for some zero-copy
4377 * clients to maintain a copy semantic. After the data
4378 * is ack'ed, client is safe to modify or reuse the buffer.
4379 */
4380 if (tcp->tcp_snd_zcopy_aware &&
4381 (mp2->b_datap->db_struioflag & STRUIO_ZCNOTIFY))
4382 tcp_zcopy_notify(tcp);
4383 freeb(mp2);
4384 if (bytes_acked == 0) {
4385 if (mp1 == NULL) {
4822 TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
4823 goto done;
4824
4825 /* Any transmit work to do and a non-zero window? */
4826 if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_NEED_SACK_REXMIT|
4827 TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) {
4828 if (flags & TH_REXMIT_NEEDED) {
4829 uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna;
4830
4831 TCPS_BUMP_MIB(tcps, tcpOutFastRetrans);
4832 if (snd_size > mss)
4833 snd_size = mss;
4834 if (snd_size > tcp->tcp_swnd)
4835 snd_size = tcp->tcp_swnd;
4836 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size,
4837 NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size,
4838 B_TRUE);
4839
4840 if (mp1 != NULL) {
4841 tcp->tcp_xmit_head->b_prev =
4842 (mblk_t *)LBOLT_FASTPATH;
4843 tcp->tcp_csuna = tcp->tcp_snxt;
4844 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
4845 TCPS_UPDATE_MIB(tcps, tcpRetransBytes,
4846 snd_size);
4847 tcp_send_data(tcp, mp1);
4848 }
4849 }
4850 if (flags & TH_NEED_SACK_REXMIT) {
4851 tcp_sack_rexmit(tcp, &flags);
4852 }
4853 /*
4854 * For TH_LIMIT_XMIT, tcp_wput_data() is called to send
4855 * out new segment. Note that tcp_rexmit should not be
4856 * set, otherwise TH_LIMIT_XMIT should not be set.
4857 */
4858 if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) {
4859 if (!tcp->tcp_rexmit) {
4860 tcp_wput_data(tcp, NULL, B_FALSE);
4861 } else {
4862 tcp_ss_rexmit(tcp);
4863 }
4864 }
4865 /*
4866 * Adjust tcp_cwnd back to normal value after sending
4867 * new data segments.
4868 */
4869 if (flags & TH_LIMIT_XMIT) {
4870 tcp->tcp_cwnd -= mss << (tcp->tcp_dupack_cnt - 1);
4871 /*
4872 * This will restart the timer. Restarting the
4873 * timer is used to avoid a timeout before the
4874 * limited transmitted segment's ACK gets back.
4875 */
4876 if (tcp->tcp_xmit_head != NULL)
4877 tcp->tcp_xmit_head->b_prev =
4878 (mblk_t *)LBOLT_FASTPATH;
4879 }
4880
4881 /* Anything more to do? */
4882 if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED|
4883 TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
4884 goto done;
4885 }
4886 ack_check:
4887 if (flags & TH_SEND_URP_MARK) {
4888 ASSERT(tcp->tcp_urp_mark_mp);
4889 ASSERT(!IPCL_IS_NONSTR(connp));
4890 /*
4891 * Send up any queued data and then send the mark message
4892 */
4893 if (tcp->tcp_rcv_list != NULL) {
4894 flags |= tcp_rcv_drain(tcp);
4895
4896 }
4897 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
4898 mp1 = tcp->tcp_urp_mark_mp;
4899 tcp->tcp_urp_mark_mp = NULL;
5194 }
5195 if (addflag.crb_ipv6_recvdstopts) {
5196 toh = (struct T_opthdr *)optptr;
5197 toh->level = IPPROTO_IPV6;
5198 toh->name = IPV6_DSTOPTS;
5199 toh->len = sizeof (*toh) + ipp->ipp_dstoptslen;
5200 toh->status = 0;
5201 optptr += sizeof (*toh);
5202 bcopy(ipp->ipp_dstopts, optptr, ipp->ipp_dstoptslen);
5203 optptr += ipp->ipp_dstoptslen;
5204 ASSERT(OK_32PTR(optptr));
5205 /* Save as last value */
5206 ip_savebuf((void **)&tcp->tcp_dstopts, &tcp->tcp_dstoptslen,
5207 (ipp->ipp_fields & IPPF_DSTOPTS),
5208 ipp->ipp_dstopts, ipp->ipp_dstoptslen);
5209 }
5210 ASSERT(optptr == mp->b_wptr);
5211 return (mp);
5212 }
5213
5214 /* The minimum of smoothed mean deviation in RTO calculation. */
5215 #define TCP_SD_MIN 400
5216
5217 /*
5218 * Set RTO for this connection. The formula is from Jacobson and Karels'
5219 * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names
5220 * are the same as those in Appendix A.2 of that paper.
5221 *
5222 * m = new measurement
5223 * sa = smoothed RTT average (8 * average estimates).
5224 * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates).
5225 */
5226 static void
5227 tcp_set_rto(tcp_t *tcp, clock_t rtt)
5228 {
5229 long m = TICK_TO_MSEC(rtt);
5230 clock_t sa = tcp->tcp_rtt_sa;
5231 clock_t sv = tcp->tcp_rtt_sd;
5232 clock_t rto;
5233 tcp_stack_t *tcps = tcp->tcp_tcps;
5234
5235 TCPS_BUMP_MIB(tcps, tcpRttUpdate);
5236 tcp->tcp_rtt_update++;
5237
5238 /* tcp_rtt_sa is not 0 means this is a new sample. */
5239 if (sa != 0) {
5240 /*
5241 * Update average estimator:
5242 * new rtt = 7/8 old rtt + 1/8 Error
5243 */
5244
5245 /* m is now Error in estimate. */
5246 m -= sa >> 3;
5247 if ((sa += m) <= 0) {
5248 /*
5249 * Don't allow the smoothed average to be negative.
5250 * We use 0 to denote reinitialization of the
5251 * variables.
5252 */
5253 sa = 1;
5254 }
5255
5256 /*
5257 * Update deviation estimator:
5258 * new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev)
5259 */
5260 if (m < 0)
5261 m = -m;
5262 m -= sv >> 2;
5263 sv += m;
5264 } else {
5265 /*
5266 * This follows BSD's implementation. So the reinitialized
5267 * RTO is 3 * m. We cannot go less than 2 because if the
5268 * link is bandwidth dominated, doubling the window size
5269 * during slow start means doubling the RTT. We want to be
5270 * more conservative when we reinitialize our estimates. 3
5271 * is just a convenient number.
5272 */
5273 sa = m << 3;
5274 sv = m << 1;
5275 }
5276 if (sv < TCP_SD_MIN) {
5277 /*
5278 * We do not know that if sa captures the delay ACK
5279 * effect as in a long train of segments, a receiver
5280 * does not delay its ACKs. So set the minimum of sv
5281 * to be TCP_SD_MIN, which is default to 400 ms, twice
5282 * of BSD DATO. That means the minimum of mean
5283 * deviation is 100 ms.
5284 *
5285 */
5286 sv = TCP_SD_MIN;
5287 }
5288 tcp->tcp_rtt_sa = sa;
5289 tcp->tcp_rtt_sd = sv;
5290 /*
5291 * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv)
5292 *
5293 * Add tcp_rexmit_interval extra in case of extreme environment
5294 * where the algorithm fails to work. The default value of
5295 * tcp_rexmit_interval_extra should be 0.
5296 *
5297 * As we use a finer grained clock than BSD and update
5298 * RTO for every ACKs, add in another .25 of RTT to the
5299 * deviation of RTO to accomodate burstiness of 1/4 of
5300 * window size.
5301 */
5302 rto = (sa >> 3) + sv + tcps->tcps_rexmit_interval_extra + (sa >> 5);
5303
5304 TCP_SET_RTO(tcp, rto);
5305
5306 /* Now, we can reset tcp_timer_backoff to use the new RTO... */
5307 tcp->tcp_timer_backoff = 0;
5308 }
5309
5310 /*
5311 * On a labeled system we have some protocols above TCP, such as RPC, which
5312 * appear to assume that every mblk in a chain has a db_credp.
5313 */
5314 static void
5315 tcp_setcred_data(mblk_t *mp, ip_recv_attr_t *ira)
5316 {
5317 ASSERT(is_system_labeled());
5318 ASSERT(ira->ira_cred != NULL);
5319
5320 while (mp != NULL) {
5321 mblk_setcred(mp, ira->ira_cred, NOPID);
5322 mp = mp->b_cont;
5323 }
5324 }
|
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
25 * Copyright 2019 Joyent, Inc.
26 * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
27 */
28
29 /* This file contains all TCP input processing functions. */
30
31 #include <sys/types.h>
32 #include <sys/stream.h>
33 #include <sys/strsun.h>
34 #include <sys/strsubr.h>
35 #include <sys/stropts.h>
36 #include <sys/strlog.h>
37 #define _SUN_TPI_VERSION 2
38 #include <sys/tihdr.h>
39 #include <sys/suntpi.h>
40 #include <sys/xti_inet.h>
41 #include <sys/squeue_impl.h>
42 #include <sys/squeue.h>
43 #include <sys/tsol/tnet.h>
44
45 #include <inet/common.h>
46 #include <inet/ip.h>
149 static uint32_t tcp_init_wnd_chk = 4096;
150
151 /* Process ICMP source quench message or not. */
152 static boolean_t tcp_icmp_source_quench = B_FALSE;
153
154 static boolean_t tcp_outbound_squeue_switch = B_FALSE;
155
156 static mblk_t *tcp_conn_create_v4(conn_t *, conn_t *, mblk_t *,
157 ip_recv_attr_t *);
158 static mblk_t *tcp_conn_create_v6(conn_t *, conn_t *, mblk_t *,
159 ip_recv_attr_t *);
160 static boolean_t tcp_drop_q0(tcp_t *);
161 static void tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *);
162 static mblk_t *tcp_input_add_ancillary(tcp_t *, mblk_t *, ip_pkt_t *,
163 ip_recv_attr_t *);
164 static void tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *);
165 static void tcp_process_options(tcp_t *, tcpha_t *);
166 static mblk_t *tcp_reass(tcp_t *, mblk_t *, uint32_t);
167 static void tcp_reass_elim_overlap(tcp_t *, mblk_t *);
168 static void tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
169 static void tcp_set_rto(tcp_t *, hrtime_t);
170 static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
171
172 /*
173 * Set the MSS associated with a particular tcp based on its current value,
174 * and a new one passed in. Observe minimums and maximums, and reset other
175 * state variables that we want to view as multiples of MSS.
176 *
177 * The value of MSS could be either increased or decreased.
178 */
179 void
180 tcp_mss_set(tcp_t *tcp, uint32_t mss)
181 {
182 uint32_t mss_max;
183 tcp_stack_t *tcps = tcp->tcp_tcps;
184 conn_t *connp = tcp->tcp_connp;
185
186 if (connp->conn_ipversion == IPV4_VERSION)
187 mss_max = tcps->tcps_mss_max_ipv4;
188 else
189 mss_max = tcps->tcps_mss_max_ipv6;
3345 SEQ_LEQ(seg_seq, tcp->tcp_rnxt + tcp->tcp_rwnd));
3346 freemsg(mp);
3347 /*
3348 * If the ACK flag is not set, just use our snxt as the
3349 * seq number of the RST segment.
3350 */
3351 if (!(flags & TH_ACK)) {
3352 seg_ack = tcp->tcp_snxt;
3353 }
3354 tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1,
3355 TH_RST|TH_ACK);
3356 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT);
3357 (void) tcp_clean_death(tcp, ECONNRESET);
3358 return;
3359 }
3360 /*
3361 * urp could be -1 when the urp field in the packet is 0
3362 * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent
3363 * byte was at seg_seq - 1, in which case we ignore the urgent flag.
3364 */
3365 if ((flags & TH_URG) && urp >= 0) {
3366 if (!tcp->tcp_urp_last_valid ||
3367 SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) {
3368 /*
3369 * Non-STREAMS sockets handle the urgent data a little
3370 * differently from STREAMS based sockets. There is no
3371 * need to mark any mblks with the MSG{NOT,}MARKNEXT
3372 * flags to keep SIOCATMARK happy. Instead a
3373 * su_signal_oob upcall is made to update the mark.
3374 * Neither is a T_EXDATA_IND mblk needed to be
3375 * prepended to the urgent data. The urgent data is
3376 * delivered using the su_recv upcall, where we set
3377 * the MSG_OOB flag to indicate that it is urg data.
3378 *
3379 * Neither TH_SEND_URP_MARK nor TH_MARKNEXT_NEEDED
3380 * are used by non-STREAMS sockets.
3381 */
3382 if (IPCL_IS_NONSTR(connp)) {
3383 if (!TCP_IS_DETACHED(tcp)) {
3384 (*sockupcalls->su_signal_oob)
3385 (connp->conn_upper_handle, urp);
4287 *
4288 * By initializing tcp_cwnd_cnt to new tcp_cwnd and
4289 * decrementing it by 1 MSS for every ACKs, tcp_cwnd is
4290 * increased by 1 MSS for every RTTs.
4291 */
4292 if (tcp->tcp_cwnd_cnt <= 0) {
4293 tcp->tcp_cwnd_cnt = cwnd + add;
4294 } else {
4295 tcp->tcp_cwnd_cnt -= add;
4296 add = 0;
4297 }
4298 }
4299 tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max);
4300 }
4301
4302 /* See if the latest urgent data has been acknowledged */
4303 if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
4304 SEQ_GT(seg_ack, tcp->tcp_urg))
4305 tcp->tcp_valid_bits &= ~TCP_URG_VALID;
4306
4307 /*
4308 * Update the RTT estimates. Note that we don't use the TCP
4309 * timestamp option to calculate RTT even if one is present. This is
4310 * because the timestamp option's resolution (CPU tick) is
4311 * too coarse to measure modern datacenter networks' microsecond
4312 * latencies. The timestamp field's resolution is limited by its
4313 * 4-byte width (see RFC1323), and since we always store a
4314 * high-resolution nanosecond precision timestamp along with the data,
4315 * there is no point to ever using the timestamp option.
4316 */
4317 if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
4318 /*
4319 * An ACK sequence we haven't seen before, so get the RTT
4320 * and update the RTO. But first check if the timestamp is
4321 * valid to use.
4322 */
4323 if ((mp1->b_next != NULL) &&
4324 SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) {
4325 tcp_set_rto(tcp, gethrtime() -
4326 (hrtime_t)(intptr_t)mp1->b_prev);
4327 } else {
4328 TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
4329 }
4330
4331 /* Remember the last sequence to be ACKed */
4332 tcp->tcp_csuna = seg_ack;
4333 if (tcp->tcp_set_timer == 1) {
4334 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
4335 tcp->tcp_set_timer = 0;
4336 }
4337 } else {
4338 TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
4339 }
4340
4341 /* Eat acknowledged bytes off the xmit queue. */
4342 for (;;) {
4343 mblk_t *mp2;
4344 uchar_t *wptr;
4345
4346 wptr = mp1->b_wptr;
4347 ASSERT((uintptr_t)(wptr - mp1->b_rptr) <= (uintptr_t)INT_MAX);
4348 bytes_acked -= (int)(wptr - mp1->b_rptr);
4349 if (bytes_acked < 0) {
4350 mp1->b_rptr = wptr + bytes_acked;
4351 /*
4352 * Set a new timestamp if all the bytes timed by the
4353 * old timestamp have been ack'ed.
4354 */
4355 if (SEQ_GT(seg_ack,
4356 (uint32_t)(uintptr_t)(mp1->b_next))) {
4357 mp1->b_prev =
4358 (mblk_t *)(intptr_t)gethrtime();
4359 mp1->b_next = NULL;
4360 }
4361 break;
4362 }
4363 mp1->b_next = NULL;
4364 mp1->b_prev = NULL;
4365 mp2 = mp1;
4366 mp1 = mp1->b_cont;
4367
4368 /*
4369 * This notification is required for some zero-copy
4370 * clients to maintain a copy semantic. After the data
4371 * is ack'ed, client is safe to modify or reuse the buffer.
4372 */
4373 if (tcp->tcp_snd_zcopy_aware &&
4374 (mp2->b_datap->db_struioflag & STRUIO_ZCNOTIFY))
4375 tcp_zcopy_notify(tcp);
4376 freeb(mp2);
4377 if (bytes_acked == 0) {
4378 if (mp1 == NULL) {
4815 TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
4816 goto done;
4817
4818 /* Any transmit work to do and a non-zero window? */
4819 if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_NEED_SACK_REXMIT|
4820 TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) {
4821 if (flags & TH_REXMIT_NEEDED) {
4822 uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna;
4823
4824 TCPS_BUMP_MIB(tcps, tcpOutFastRetrans);
4825 if (snd_size > mss)
4826 snd_size = mss;
4827 if (snd_size > tcp->tcp_swnd)
4828 snd_size = tcp->tcp_swnd;
4829 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size,
4830 NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size,
4831 B_TRUE);
4832
4833 if (mp1 != NULL) {
4834 tcp->tcp_xmit_head->b_prev =
4835 (mblk_t *)(intptr_t)gethrtime();
4836 tcp->tcp_csuna = tcp->tcp_snxt;
4837 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
4838 TCPS_UPDATE_MIB(tcps, tcpRetransBytes,
4839 snd_size);
4840 tcp_send_data(tcp, mp1);
4841 }
4842 }
4843 if (flags & TH_NEED_SACK_REXMIT) {
4844 tcp_sack_rexmit(tcp, &flags);
4845 }
4846 /*
4847 * For TH_LIMIT_XMIT, tcp_wput_data() is called to send
4848 * out new segment. Note that tcp_rexmit should not be
4849 * set, otherwise TH_LIMIT_XMIT should not be set.
4850 */
4851 if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) {
4852 if (!tcp->tcp_rexmit) {
4853 tcp_wput_data(tcp, NULL, B_FALSE);
4854 } else {
4855 tcp_ss_rexmit(tcp);
4856 }
4857 }
4858 /*
4859 * Adjust tcp_cwnd back to normal value after sending
4860 * new data segments.
4861 */
4862 if (flags & TH_LIMIT_XMIT) {
4863 tcp->tcp_cwnd -= mss << (tcp->tcp_dupack_cnt - 1);
4864 /*
4865 * This will restart the timer. Restarting the
4866 * timer is used to avoid a timeout before the
4867 * limited transmitted segment's ACK gets back.
4868 */
4869 if (tcp->tcp_xmit_head != NULL) {
4870 tcp->tcp_xmit_head->b_prev =
4871 (mblk_t *)(intptr_t)gethrtime();
4872 }
4873 }
4874
4875 /* Anything more to do? */
4876 if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED|
4877 TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
4878 goto done;
4879 }
4880 ack_check:
4881 if (flags & TH_SEND_URP_MARK) {
4882 ASSERT(tcp->tcp_urp_mark_mp);
4883 ASSERT(!IPCL_IS_NONSTR(connp));
4884 /*
4885 * Send up any queued data and then send the mark message
4886 */
4887 if (tcp->tcp_rcv_list != NULL) {
4888 flags |= tcp_rcv_drain(tcp);
4889
4890 }
4891 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
4892 mp1 = tcp->tcp_urp_mark_mp;
4893 tcp->tcp_urp_mark_mp = NULL;
5188 }
5189 if (addflag.crb_ipv6_recvdstopts) {
5190 toh = (struct T_opthdr *)optptr;
5191 toh->level = IPPROTO_IPV6;
5192 toh->name = IPV6_DSTOPTS;
5193 toh->len = sizeof (*toh) + ipp->ipp_dstoptslen;
5194 toh->status = 0;
5195 optptr += sizeof (*toh);
5196 bcopy(ipp->ipp_dstopts, optptr, ipp->ipp_dstoptslen);
5197 optptr += ipp->ipp_dstoptslen;
5198 ASSERT(OK_32PTR(optptr));
5199 /* Save as last value */
5200 ip_savebuf((void **)&tcp->tcp_dstopts, &tcp->tcp_dstoptslen,
5201 (ipp->ipp_fields & IPPF_DSTOPTS),
5202 ipp->ipp_dstopts, ipp->ipp_dstoptslen);
5203 }
5204 ASSERT(optptr == mp->b_wptr);
5205 return (mp);
5206 }
5207
5208 /* The minimum of smoothed mean deviation in RTO calculation (nsec). */
5209 #define TCP_SD_MIN 400000000
5210
5211 /*
5212 * Set RTO for this connection based on a new round-trip time measurement.
5213 * The formula is from Jacobson and Karels' "Congestion Avoidance and Control"
5214 * in SIGCOMM '88. The variable names are the same as those in Appendix A.2
5215 * of that paper.
5216 *
5217 * m = new measurement
5218 * sa = smoothed RTT average (8 * average estimates).
5219 * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates).
5220 */
5221 static void
5222 tcp_set_rto(tcp_t *tcp, hrtime_t rtt)
5223 {
5224 hrtime_t m = rtt;
5225 hrtime_t sa = tcp->tcp_rtt_sa;
5226 hrtime_t sv = tcp->tcp_rtt_sd;
5227 tcp_stack_t *tcps = tcp->tcp_tcps;
5228
5229 TCPS_BUMP_MIB(tcps, tcpRttUpdate);
5230 tcp->tcp_rtt_update++;
5231
5232 /* tcp_rtt_sa is not 0 means this is a new sample. */
5233 if (sa != 0) {
5234 /*
5235 * Update average estimator (see section 2.3 of RFC6298):
5236 * SRTT = 7/8 SRTT + 1/8 rtt
5237 *
5238 * We maintain tcp_rtt_sa as 8 * SRTT, so this reduces to:
5239 * tcp_rtt_sa = 7 * SRTT + rtt
5240 * tcp_rtt_sa = 7 * (tcp_rtt_sa / 8) + rtt
5241 * tcp_rtt_sa = tcp_rtt_sa - (tcp_rtt_sa / 8) + rtt
5242 * tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 8))
5243 * tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 2^3))
5244 * tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa >> 3))
5245 *
5246 * (rtt - tcp_rtt_sa / 8) is simply the difference
5247 * between the new rtt measurement and the existing smoothed
5248 * RTT average. This is referred to as "Error" in subsequent
5249 * calculations.
5250 */
5251
5252 /* m is now Error. */
5253 m -= sa >> 3;
5254 if ((sa += m) <= 0) {
5255 /*
5256 * Don't allow the smoothed average to be negative.
5257 * We use 0 to denote reinitialization of the
5258 * variables.
5259 */
5260 sa = 1;
5261 }
5262
5263 /*
5264 * Update deviation estimator:
5265 * mdev = 3/4 mdev + 1/4 abs(Error)
5266 *
5267 * We maintain tcp_rtt_sd as 4 * mdev, so this reduces to:
5268 * tcp_rtt_sd = 3 * mdev + abs(Error)
5269 * tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd / 4) + abs(Error)
5270 * tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd / 2^2) + abs(Error)
5271 * tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd >> 2) + abs(Error)
5272 */
5273 if (m < 0)
5274 m = -m;
5275 m -= sv >> 2;
5276 sv += m;
5277 } else {
5278 /*
5279 * This follows BSD's implementation. So the reinitialized
5280 * RTO is 3 * m. We cannot go less than 2 because if the
5281 * link is bandwidth dominated, doubling the window size
5282 * during slow start means doubling the RTT. We want to be
5283 * more conservative when we reinitialize our estimates. 3
5284 * is just a convenient number.
5285 */
5286 sa = m << 3;
5287 sv = m << 1;
5288 }
5289 if (sv < TCP_SD_MIN) {
5290 /*
5291 * Since a receiver doesn't delay its ACKs during a long run of
5292 * segments, sa may not have captured the effect of delayed ACK
5293 * timeouts on the RTT. To make sure we always account for the
5294 * possible delay (and avoid the unnecessary retransmission),
5295 * TCP_SD_MIN is set to 400ms, twice the delayed ACK timeout of
5296 * 200ms on older SunOS/BSD systems and modern Windows systems
5297 * (as of 2019). This means that the minimum possible mean
5298 * deviation is 100 ms.
5299 */
5300 sv = TCP_SD_MIN;
5301 }
5302 tcp->tcp_rtt_sa = sa;
5303 tcp->tcp_rtt_sd = sv;
5304
5305 tcp->tcp_rto = tcp_calculate_rto(tcp, tcps, 0);
5306
5307 /* Now, we can reset tcp_timer_backoff to use the new RTO... */
5308 tcp->tcp_timer_backoff = 0;
5309 }
5310
5311 /*
5312 * On a labeled system we have some protocols above TCP, such as RPC, which
5313 * appear to assume that every mblk in a chain has a db_credp.
5314 */
5315 static void
5316 tcp_setcred_data(mblk_t *mp, ip_recv_attr_t *ira)
5317 {
5318 ASSERT(is_system_labeled());
5319 ASSERT(ira->ira_cred != NULL);
5320
5321 while (mp != NULL) {
5322 mblk_setcred(mp, ira->ira_cred, NOPID);
5323 mp = mp->b_cont;
5324 }
5325 }
|