153
154 static boolean_t tcp_outbound_squeue_switch = B_FALSE;
155
156 static mblk_t *tcp_conn_create_v4(conn_t *, conn_t *, mblk_t *,
157 ip_recv_attr_t *);
158 static mblk_t *tcp_conn_create_v6(conn_t *, conn_t *, mblk_t *,
159 ip_recv_attr_t *);
160 static boolean_t tcp_drop_q0(tcp_t *);
161 static void tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *);
162 static mblk_t *tcp_input_add_ancillary(tcp_t *, mblk_t *, ip_pkt_t *,
163 ip_recv_attr_t *);
164 static void tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *);
165 static void tcp_process_options(tcp_t *, tcpha_t *);
166 static mblk_t *tcp_reass(tcp_t *, mblk_t *, uint32_t);
167 static void tcp_reass_elim_overlap(tcp_t *, mblk_t *);
168 static void tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
169 static void tcp_set_rto(tcp_t *, hrtime_t);
170 static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
171
172 /*
173 * Set the MSS associated with a particular tcp based on its current value,
174 * and a new one passed in. Observe minimums and maximums, and reset other
175 * state variables that we want to view as multiples of MSS.
176 *
177  * The value of MSS could be either increased or decreased.
178 */
179 void
180 tcp_mss_set(tcp_t *tcp, uint32_t mss)
181 {
182 uint32_t mss_max;
183 tcp_stack_t *tcps = tcp->tcp_tcps;
184 conn_t *connp = tcp->tcp_connp;
185
186 if (connp->conn_ipversion == IPV4_VERSION)
187 mss_max = tcps->tcps_mss_max_ipv4;
188 else
189 mss_max = tcps->tcps_mss_max_ipv6;
190
191 if (mss < tcps->tcps_mss_min)
192 mss = tcps->tcps_mss_min;
531 IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) + TCP_MIN_HEADER_LENGTH);
532
533 /*
534 * Set MSS to the smaller one of both ends of the connection.
535 * We should not have called tcp_mss_set() before, but our
536 * side of the MSS should have been set to a proper value
537 * by tcp_set_destination(). tcp_mss_set() will also set up the
538 * STREAM head parameters properly.
539 *
540 * If we have a larger-than-16-bit window but the other side
541 * didn't want to do window scale, tcp_rwnd_set() will take
542 * care of that.
543 */
544 tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss));
545
546 /*
547 * Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been
548 * updated properly.
549 */
550 TCP_SET_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial);
551 }
552
553 /*
554 * Add a new piece to the tcp reassembly queue. If the gap at the beginning
555 * is filled, return as much as we can. The message passed in may be
556 * multi-part, chained using b_cont. "start" is the starting sequence
557 * number for this piece.
558 */
559 static mblk_t *
560 tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start)
561 {
562 uint32_t end, bytes;
563 mblk_t *mp1;
564 mblk_t *mp2;
565 mblk_t *next_mp;
566 uint32_t u1;
567 tcp_stack_t *tcps = tcp->tcp_tcps;
568
569
570 /* Walk through all the new pieces. */
1388 "reached: %u attempts dropped total\n",
1389 ntohs(listener->tcp_connp->conn_lport),
1390 tlc->tlc_max, tlc->tlc_drop);
1391 tlc->tlc_report_time = now;
1392 }
1393 goto error2;
1394 }
1395 tlc_set = B_TRUE;
1396 }
1397
1398 mutex_exit(&listener->tcp_eager_lock);
1399
1400 /*
1401 * IP sets ira_sqp to either the senders conn_sqp (for loopback)
1402 * or based on the ring (for packets from GLD). Otherwise it is
1403 * set based on lbolt i.e., a somewhat random number.
1404 */
1405 ASSERT(ira->ira_sqp != NULL);
1406 new_sqp = ira->ira_sqp;
1407
1408 econnp = (conn_t *)tcp_get_conn(arg2, tcps);
1409 if (econnp == NULL)
1410 goto error2;
1411
1412 ASSERT(econnp->conn_netstack == lconnp->conn_netstack);
1413 econnp->conn_sqp = new_sqp;
1414 econnp->conn_initial_sqp = new_sqp;
1415 econnp->conn_ixa->ixa_sqp = new_sqp;
1416
1417 econnp->conn_fport = tcpha->tha_lport;
1418 econnp->conn_lport = tcpha->tha_fport;
1419
1420 err = conn_inherit_parent(lconnp, econnp);
1421 if (err != 0)
1422 goto error3;
1423
1424 /* We already know the laddr of the new connection is ours */
1425 econnp->conn_ixa->ixa_src_generation = ipst->ips_src_generation;
1426
1427 ASSERT(OK_32PTR(mp->b_rptr));
1428 ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION ||
2307 tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
2308 {
2309 int32_t bytes_acked;
2310 int32_t gap;
2311 mblk_t *mp1;
2312 uint_t flags;
2313 uint32_t new_swnd = 0;
2314 uchar_t *iphdr;
2315 uchar_t *rptr;
2316 int32_t rgap;
2317 uint32_t seg_ack;
2318 int seg_len;
2319 uint_t ip_hdr_len;
2320 uint32_t seg_seq;
2321 tcpha_t *tcpha;
2322 int urp;
2323 tcp_opt_t tcpopt;
2324 ip_pkt_t ipp;
2325 boolean_t ofo_seg = B_FALSE; /* Out of order segment */
2326 uint32_t cwnd;
2327 uint32_t add;
2328 int npkt;
2329 int mss;
2330 conn_t *connp = (conn_t *)arg;
2331 squeue_t *sqp = (squeue_t *)arg2;
2332 tcp_t *tcp = connp->conn_tcp;
2333 tcp_stack_t *tcps = tcp->tcp_tcps;
2334 sock_upcalls_t *sockupcalls;
2335
2336 /*
2337 * RST from fused tcp loopback peer should trigger an unfuse.
2338 */
2339 if (tcp->tcp_fused) {
2340 TCP_STAT(tcps, tcp_fusion_aborted);
2341 tcp_unfuse(tcp);
2342 }
2343
2344 iphdr = mp->b_rptr;
2345 rptr = mp->b_rptr;
2346 ASSERT(OK_32PTR(rptr));
2347
2348 ip_hdr_len = ira->ira_ip_hdr_length;
2584 tcp->tcp_suna = tcp->tcp_iss + 1;
2585 tcp->tcp_valid_bits &= ~TCP_ISS_VALID;
2586
2587 /*
2588 * If SYN was retransmitted, need to reset all
2589 * retransmission info. This is because this
2590 * segment will be treated as a dup ACK.
2591 */
2592 if (tcp->tcp_rexmit) {
2593 tcp->tcp_rexmit = B_FALSE;
2594 tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
2595 tcp->tcp_rexmit_max = tcp->tcp_snxt;
2596 tcp->tcp_ms_we_have_waited = 0;
2597
2598 /*
2599 * Set tcp_cwnd back to 1 MSS, per
2600 * recommendation from
2601 * draft-floyd-incr-init-win-01.txt,
2602 * Increasing TCP's Initial Window.
2603 */
2604 tcp->tcp_cwnd = tcp->tcp_mss;
2605 }
2606
2607 tcp->tcp_swl1 = seg_seq;
2608 tcp->tcp_swl2 = seg_ack;
2609
2610 new_swnd = ntohs(tcpha->tha_win);
2611 tcp->tcp_swnd = new_swnd;
2612 if (new_swnd > tcp->tcp_max_swnd)
2613 tcp->tcp_max_swnd = new_swnd;
2614
2615 /*
2616 * Always send the three-way handshake ack immediately
2617 * in order to make the connection complete as soon as
2618 * possible on the accepting host.
2619 */
2620 flags |= TH_ACK_NEEDED;
2621
2622 /*
2623 * Trace connect-established here.
3806 ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *,
3807 iphdr, tcp_t *, tcp, tcph_t *, tcpha);
3808 }
3809 TCPS_CONN_INC(tcps);
3810
3811 tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */
3812 bytes_acked--;
3813 /* SYN was acked - making progress */
3814 tcp->tcp_ip_forward_progress = B_TRUE;
3815
3816 /*
3817 * If SYN was retransmitted, need to reset all
3818 * retransmission info as this segment will be
3819 * treated as a dup ACK.
3820 */
3821 if (tcp->tcp_rexmit) {
3822 tcp->tcp_rexmit = B_FALSE;
3823 tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
3824 tcp->tcp_rexmit_max = tcp->tcp_snxt;
3825 tcp->tcp_ms_we_have_waited = 0;
3826 tcp->tcp_cwnd = mss;
3827 }
3828
3829 /*
3830 * We set the send window to zero here.
3831 * This is needed if there is data to be
3832 * processed already on the queue.
3833 * Later (at swnd_update label), the
3834 * "new_swnd > tcp_swnd" condition is satisfied
3835 * the XMIT_NEEDED flag is set in the current
3836 * (SYN_RCVD) state. This ensures tcp_wput_data() is
3837 * called if there is already data on queue in
3838 * this state.
3839 */
3840 tcp->tcp_swnd = 0;
3841
3842 if (new_swnd > tcp->tcp_max_swnd)
3843 tcp->tcp_max_swnd = new_swnd;
3844 tcp->tcp_swl1 = seg_seq;
3845 tcp->tcp_swl2 = seg_ack;
3849 DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *,
3850 connp->conn_ixa, void, NULL, tcp_t *, tcp, void, NULL,
3851 int32_t, TCPS_SYN_RCVD);
3852
3853 /* Fuse when both sides are in ESTABLISHED state */
3854 if (tcp->tcp_loopback && do_tcp_fusion)
3855 tcp_fuse(tcp, iphdr, tcpha);
3856
3857 }
3858 /* This code follows 4.4BSD-Lite2 mostly. */
3859 if (bytes_acked < 0)
3860 goto est;
3861
3862 /*
3863 * If TCP is ECN capable and the congestion experience bit is
3864 * set, reduce tcp_cwnd and tcp_ssthresh. But this should only be
3865 * done once per window (or more loosely, per RTT).
3866 */
3867 if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max))
3868 tcp->tcp_cwr = B_FALSE;
3869 if (tcp->tcp_ecn_ok && (flags & TH_ECE)) {
3870 if (!tcp->tcp_cwr) {
3871 npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / mss;
3872 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss;
3873 tcp->tcp_cwnd = npkt * mss;
3874 /*
3875 * If the cwnd is 0, use the timer to clock out
3876 * new segments. This is required by the ECN spec.
3877 */
3878 if (npkt == 0) {
3879 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
3880 /*
3881 * This makes sure that when the ACK comes
3882 * back, we will increase tcp_cwnd by 1 MSS.
3883 */
3884 tcp->tcp_cwnd_cnt = 0;
3885 }
3886 tcp->tcp_cwr = B_TRUE;
3887 /*
3888 * This marks the end of the current window of in
3889 * flight data. That is why we don't use
3890 * tcp_suna + tcp_swnd. Only data in flight can
3891 * provide ECN info.
3892 */
3893 tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
3894 tcp->tcp_ecn_cwr_sent = B_FALSE;
3895 }
3896 }
3897
3898 mp1 = tcp->tcp_xmit_head;
3899 if (bytes_acked == 0) {
3900 if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) {
3901 int dupack_cnt;
3902
3903 TCPS_BUMP_MIB(tcps, tcpInDupAck);
3904 /*
3905 * Fast retransmit. When we have seen exactly three
3906 * identical ACKs while we have unacked data
3907 * outstanding we take it as a hint that our peer
3908 * dropped something.
3909 *
3910 * If TCP is retransmitting, don't do fast retransmit.
3911 */
3912 if (mp1 && tcp->tcp_suna != tcp->tcp_snxt &&
3913 ! tcp->tcp_rexmit) {
3914 /* Do Limited Transmit */
3915 if ((dupack_cnt = ++tcp->tcp_dupack_cnt) <
3916 tcps->tcps_dupack_fast_retransmit) {
3917 /*
3918 * RFC 3042
3919 *
3920 * What we need to do is temporarily
3921 * increase tcp_cwnd so that new
3922 * data can be sent if it is allowed
3923 * by the receive window (tcp_rwnd).
3924 * tcp_wput_data() will take care of
3925 * the rest.
3926 *
3927 * If the connection is SACK capable,
3928 * only do limited xmit when there
3929 * is SACK info.
3930 *
3931 * Note how tcp_cwnd is incremented.
3932 * The first dup ACK will increase
3933 * it by 1 MSS. The second dup ACK
3934 * will increase it by 2 MSS. This
3935 * means that only 1 new segment will
3936 * be sent for each dup ACK.
3943 (tcp->tcp_dupack_cnt - 1);
3944 flags |= TH_LIMIT_XMIT;
3945 }
3946 } else if (dupack_cnt ==
3947 tcps->tcps_dupack_fast_retransmit) {
3948
3949 /*
3950 * If we have reduced tcp_ssthresh
3951 * because of ECN, do not reduce it again
3952 * unless it is already one window of data
3953 * away. After one window of data, tcp_cwr
3954 * should then be cleared. Note that
3955 * for non ECN capable connection, tcp_cwr
3956 * should always be false.
3957 *
3958 * Adjust cwnd since the duplicate
3959 * ack indicates that a packet was
3960 * dropped (due to congestion.)
3961 */
3962 if (!tcp->tcp_cwr) {
3963 npkt = ((tcp->tcp_snxt -
3964 tcp->tcp_suna) >> 1) / mss;
3965 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
3966 mss;
3967 tcp->tcp_cwnd = (npkt +
3968 tcp->tcp_dupack_cnt) * mss;
3969 }
3970 if (tcp->tcp_ecn_ok) {
3971 tcp->tcp_cwr = B_TRUE;
3972 tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
3973 tcp->tcp_ecn_cwr_sent = B_FALSE;
3974 }
3975
3976 /*
3977 * We do Hoe's algorithm. Refer to her
3978 * paper "Improving the Start-up Behavior
3979 * of a Congestion Control Scheme for TCP,"
3980 * appeared in SIGCOMM'96.
3981 *
3982 * Save highest seq no we have sent so far.
3983 * Be careful about the invisible FIN byte.
3984 */
3985 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
3986 (tcp->tcp_unsent == 0)) {
3987 tcp->tcp_rexmit_max = tcp->tcp_fss;
3988 } else {
4010 tcp->tcp_fack;
4011 tcp->tcp_sack_snxt = seg_ack;
4012 flags |= TH_NEED_SACK_REXMIT;
4013 } else {
4014 /*
4015 * Always initialize tcp_pipe
4016 * even though we don't have
4017 * any SACK info. If later
4018 * we get SACK info and
4019 * tcp_pipe is not initialized,
4020 * funny things will happen.
4021 */
4022 tcp->tcp_pipe =
4023 tcp->tcp_cwnd_ssthresh;
4024 }
4025 } else {
4026 flags |= TH_REXMIT_NEEDED;
4027 } /* tcp_snd_sack_ok */
4028
4029 } else {
4030 /*
4031 * Here we perform congestion
4032 * avoidance, but NOT slow start.
4033 * This is known as the Fast
4034 * Recovery Algorithm.
4035 */
4036 if (tcp->tcp_snd_sack_ok &&
4037 tcp->tcp_notsack_list != NULL) {
4038 flags |= TH_NEED_SACK_REXMIT;
4039 tcp->tcp_pipe -= mss;
4040 if (tcp->tcp_pipe < 0)
4041 tcp->tcp_pipe = 0;
4042 } else {
4043 /*
4044 * We know that one more packet has
4045 * left the pipe thus we can update
4046 * cwnd.
4047 */
4048 cwnd = tcp->tcp_cwnd + mss;
4049 if (cwnd > tcp->tcp_cwnd_max)
4050 cwnd = tcp->tcp_cwnd_max;
4051 tcp->tcp_cwnd = cwnd;
4052 if (tcp->tcp_unsent > 0)
4053 flags |= TH_XMIT_NEEDED;
4054 }
4055 }
4056 }
4057 } else if (tcp->tcp_zero_win_probe) {
4058 /*
4059 * If the window has opened, need to arrange
4060 * to send additional data.
4061 */
4062 if (new_swnd != 0) {
4063 /* tcp_suna != tcp_snxt */
4064 /* Packet contains a window update */
4065 TCPS_BUMP_MIB(tcps, tcpInWinUpdate);
4066 tcp->tcp_zero_win_probe = 0;
4067 tcp->tcp_timer_backoff = 0;
4068 tcp->tcp_ms_we_have_waited = 0;
4069
4070 /*
4163
4164 /*
4165 * TCP gets a new ACK, update the notsack'ed list to delete those
4166 * blocks that are covered by this ACK.
4167 */
4168 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
4169 tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack,
4170 &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list));
4171 }
4172
4173 /*
4174 * If we got an ACK after fast retransmit, check to see
4175 * if it is a partial ACK. If it is not and the congestion
4176 * window was inflated to account for the other side's
4177 * cached packets, retract it. If it is, do Hoe's algorithm.
4178 */
4179 if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) {
4180 ASSERT(tcp->tcp_rexmit == B_FALSE);
4181 if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) {
4182 tcp->tcp_dupack_cnt = 0;
4183 /*
4184 * Restore the orig tcp_cwnd_ssthresh after
4185 * fast retransmit phase.
4186 */
4187 if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
4188 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh;
4189 }
4190 tcp->tcp_rexmit_max = seg_ack;
4191 tcp->tcp_cwnd_cnt = 0;
4192
4193 /*
4194 * Remove all notsack info to avoid confusion with
4195 			 * the next fast retransmit/recovery phase.
4196 */
4197 if (tcp->tcp_snd_sack_ok) {
4198 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list,
4199 tcp);
4200 }
4201 } else {
4202 if (tcp->tcp_snd_sack_ok &&
4203 tcp->tcp_notsack_list != NULL) {
4204 flags |= TH_NEED_SACK_REXMIT;
4205 tcp->tcp_pipe -= mss;
4206 if (tcp->tcp_pipe < 0)
4207 tcp->tcp_pipe = 0;
4208 } else {
4209 /*
4210 * Hoe's algorithm:
4211 *
4212 * Retransmit the unack'ed segment and
4213 * restart fast recovery. Note that we
4214 * need to scale back tcp_cwnd to the
4215 * original value when we started fast
4216 * recovery. This is to prevent overly
4217 * aggressive behaviour in sending new
4218 * segments.
4219 */
4220 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh +
4221 tcps->tcps_dupack_fast_retransmit * mss;
4222 tcp->tcp_cwnd_cnt = tcp->tcp_cwnd;
4223 flags |= TH_REXMIT_NEEDED;
4224 }
4225 }
4226 } else {
4227 tcp->tcp_dupack_cnt = 0;
4228 if (tcp->tcp_rexmit) {
4229 /*
4230 			 * TCP is retransmitting. If the ACK ack's all
4231 * outstanding data, update tcp_rexmit_max and
4232 * tcp_rexmit_nxt. Otherwise, update tcp_rexmit_nxt
4233 * to the correct value.
4234 *
4235 * Note that SEQ_LEQ() is used. This is to avoid
4236 * unnecessary fast retransmit caused by dup ACKs
4237 * received when TCP does slow start retransmission
4238 * after a time out. During this phase, TCP may
4239 * send out segments which are already received.
4240 * This causes dup ACKs to be sent back.
4241 */
4262 tcp->tcp_timer_backoff = 0;
4263 }
4264
4265 /*
4266 * If tcp_xmit_head is NULL, then it must be the FIN being ack'ed.
4267 * Note that it cannot be the SYN being ack'ed. The code flow
4268 * will not reach here.
4269 */
4270 if (mp1 == NULL) {
4271 goto fin_acked;
4272 }
4273
4274 /*
4275 * Update the congestion window.
4276 *
4277 * If TCP is not ECN capable or TCP is ECN capable but the
4278 * congestion experience bit is not set, increase the tcp_cwnd as
4279 * usual.
4280 */
4281 if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) {
4282 cwnd = tcp->tcp_cwnd;
4283 add = mss;
4284
4285 if (cwnd >= tcp->tcp_cwnd_ssthresh) {
4286 /*
4287 * This is to prevent an increase of less than 1 MSS of
4288 * tcp_cwnd. With partial increase, tcp_wput_data()
4289 * may send out tinygrams in order to preserve mblk
4290 * boundaries.
4291 *
4292 * By initializing tcp_cwnd_cnt to new tcp_cwnd and
4293 * decrementing it by 1 MSS for every ACKs, tcp_cwnd is
4294 * increased by 1 MSS for every RTTs.
4295 */
4296 if (tcp->tcp_cwnd_cnt <= 0) {
4297 tcp->tcp_cwnd_cnt = cwnd + add;
4298 } else {
4299 tcp->tcp_cwnd_cnt -= add;
4300 add = 0;
4301 }
4302 }
4303 tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max);
4304 }
4305
4306 /* See if the latest urgent data has been acknowledged */
4307 if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
4308 SEQ_GT(seg_ack, tcp->tcp_urg))
4309 tcp->tcp_valid_bits &= ~TCP_URG_VALID;
4310
4311 /*
4312 * Update the RTT estimates. Note that we don't use the TCP
4313 * timestamp option to calculate RTT even if one is present. This is
4314 * because the timestamp option's resolution (CPU tick) is
4315 * too coarse to measure modern datacenter networks' microsecond
4316 * latencies. The timestamp field's resolution is limited by its
4317 * 4-byte width (see RFC1323), and since we always store a
4318 	 * high-resolution nanosecond precision timestamp along with the data,
4319 * there is no point to ever using the timestamp option.
4320 */
4321 if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
4322 /*
4323 * An ACK sequence we haven't seen before, so get the RTT
4324 * and update the RTO. But first check if the timestamp is
5617 default:
5618 break;
5619 }
5620 break;
5621 case ICMP_SOURCE_QUENCH: {
5622 /*
5623 * use a global boolean to control
5624 * whether TCP should respond to ICMP_SOURCE_QUENCH.
5625 * The default is false.
5626 */
5627 if (tcp_icmp_source_quench) {
5628 /*
5629 * Reduce the sending rate as if we got a
5630 * retransmit timeout
5631 */
5632 uint32_t npkt;
5633
5634 npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) /
5635 tcp->tcp_mss;
5636 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss;
5637 tcp->tcp_cwnd = tcp->tcp_mss;
5638 tcp->tcp_cwnd_cnt = 0;
5639 }
5640 break;
5641 }
5642 }
5643 freemsg(mp);
5644 }
5645
5646 /*
5647 * tcp_icmp_error_ipv6 is called from tcp_icmp_input to process ICMPv6
5648 * error messages passed up by IP.
5649 * Assumes that IP has pulled up all the extension headers as well
5650 * as the ICMPv6 header.
5651 */
5652 static void
5653 tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, ip_recv_attr_t *ira)
5654 {
5655 icmp6_t *icmp6;
5656 ip6_t *ip6h;
|
153
154 static boolean_t tcp_outbound_squeue_switch = B_FALSE;
155
156 static mblk_t *tcp_conn_create_v4(conn_t *, conn_t *, mblk_t *,
157 ip_recv_attr_t *);
158 static mblk_t *tcp_conn_create_v6(conn_t *, conn_t *, mblk_t *,
159 ip_recv_attr_t *);
160 static boolean_t tcp_drop_q0(tcp_t *);
161 static void tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *);
162 static mblk_t *tcp_input_add_ancillary(tcp_t *, mblk_t *, ip_pkt_t *,
163 ip_recv_attr_t *);
164 static void tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *);
165 static void tcp_process_options(tcp_t *, tcpha_t *);
166 static mblk_t *tcp_reass(tcp_t *, mblk_t *, uint32_t);
167 static void tcp_reass_elim_overlap(tcp_t *, mblk_t *);
168 static void tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
169 static void tcp_set_rto(tcp_t *, hrtime_t);
170 static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
171
172 /*
173 * CC wrapper hook functions
174 */
175 static void
176 cc_ack_received(tcp_t *tcp, uint32_t seg_ack, int32_t bytes_acked,
177 uint16_t type)
178 {
179 uint32_t old_cwnd = tcp->tcp_cwnd;
180
181 tcp->tcp_ccv.bytes_this_ack = bytes_acked;
182 if (tcp->tcp_cwnd <= tcp->tcp_swnd)
183 tcp->tcp_ccv.flags |= CCF_CWND_LIMITED;
184 else
185 tcp->tcp_ccv.flags &= ~CCF_CWND_LIMITED;
186
187 if (type == CC_ACK) {
188 if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
189 if (tcp->tcp_ccv.flags & CCF_RTO)
190 tcp->tcp_ccv.flags &= ~CCF_RTO;
191
192 tcp->tcp_ccv.t_bytes_acked +=
193 min(tcp->tcp_ccv.bytes_this_ack,
194 tcp->tcp_tcps->tcps_abc_l_var * tcp->tcp_mss);
195 if (tcp->tcp_ccv.t_bytes_acked >= tcp->tcp_cwnd) {
196 tcp->tcp_ccv.t_bytes_acked -= tcp->tcp_cwnd;
197 tcp->tcp_ccv.flags |= CCF_ABC_SENTAWND;
198 }
199 } else {
200 tcp->tcp_ccv.flags &= ~CCF_ABC_SENTAWND;
201 tcp->tcp_ccv.t_bytes_acked = 0;
202 }
203 }
204
205 if (CC_ALGO(tcp)->ack_received != NULL) {
206 /*
207 * The FreeBSD code where this originated had a comment "Find
208 * a way to live without this" in several places where curack
209 * got set. If they eventually dump curack from the cc
210 * variables, we'll need to adapt our code.
211 */
212 tcp->tcp_ccv.curack = seg_ack;
213 CC_ALGO(tcp)->ack_received(&tcp->tcp_ccv, type);
214 }
215
216 DTRACE_PROBE3(cwnd__cc__ack__received, tcp_t *, tcp, uint32_t, old_cwnd,
217 uint32_t, tcp->tcp_cwnd);
218 }
219
220 void
221 cc_cong_signal(tcp_t *tcp, uint32_t seg_ack, uint32_t type)
222 {
223 uint32_t old_cwnd = tcp->tcp_cwnd;
224 uint32_t old_cwnd_ssthresh = tcp->tcp_cwnd_ssthresh;
225 switch (type) {
226 case CC_NDUPACK:
227 if (!IN_FASTRECOVERY(tcp->tcp_ccv.flags)) {
228 tcp->tcp_rexmit_max = tcp->tcp_snxt;
229 if (tcp->tcp_ecn_ok) {
230 tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
231 tcp->tcp_cwr = B_TRUE;
232 tcp->tcp_ecn_cwr_sent = B_FALSE;
233 }
234 }
235 break;
236 case CC_ECN:
237 if (!IN_CONGRECOVERY(tcp->tcp_ccv.flags)) {
238 tcp->tcp_rexmit_max = tcp->tcp_snxt;
239 if (tcp->tcp_ecn_ok) {
240 tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
241 tcp->tcp_cwr = B_TRUE;
242 tcp->tcp_ecn_cwr_sent = B_FALSE;
243 }
244 }
245 break;
246 case CC_RTO:
247 tcp->tcp_ccv.flags |= CCF_RTO;
248 tcp->tcp_dupack_cnt = 0;
249 tcp->tcp_ccv.t_bytes_acked = 0;
250 /*
251 * Give up on fast recovery and congestion recovery if we were
252 * attempting either.
253 */
254 EXIT_RECOVERY(tcp->tcp_ccv.flags);
255 if (CC_ALGO(tcp)->cong_signal == NULL) {
256 /*
257 * RFC5681 Section 3.1
258 * ssthresh = max (FlightSize / 2, 2*SMSS) eq (4)
259 */
260 tcp->tcp_cwnd_ssthresh = max(
261 (tcp->tcp_snxt - tcp->tcp_suna) / 2 / tcp->tcp_mss,
262 2) * tcp->tcp_mss;
263 tcp->tcp_cwnd = tcp->tcp_mss;
264 }
265
266 if (tcp->tcp_ecn_ok) {
267 tcp->tcp_cwr = B_TRUE;
268 tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
269 tcp->tcp_ecn_cwr_sent = B_FALSE;
270 }
271 break;
272 }
273
274 if (CC_ALGO(tcp)->cong_signal != NULL) {
275 tcp->tcp_ccv.curack = seg_ack;
276 CC_ALGO(tcp)->cong_signal(&tcp->tcp_ccv, type);
277 }
278
279 DTRACE_PROBE6(cwnd__cc__cong__signal, tcp_t *, tcp, uint32_t, old_cwnd,
280 uint32_t, tcp->tcp_cwnd, uint32_t, old_cwnd_ssthresh,
281 uint32_t, tcp->tcp_cwnd_ssthresh, uint32_t, type);
282 }
283
284 static void
285 cc_post_recovery(tcp_t *tcp, uint32_t seg_ack)
286 {
287 uint32_t old_cwnd = tcp->tcp_cwnd;
288
289 if (CC_ALGO(tcp)->post_recovery != NULL) {
290 tcp->tcp_ccv.curack = seg_ack;
291 CC_ALGO(tcp)->post_recovery(&tcp->tcp_ccv);
292 }
293 tcp->tcp_ccv.t_bytes_acked = 0;
294
295 DTRACE_PROBE3(cwnd__cc__post__recovery, tcp_t *, tcp,
296 uint32_t, old_cwnd, uint32_t, tcp->tcp_cwnd);
297 }
298
299 /*
300 * Set the MSS associated with a particular tcp based on its current value,
301 * and a new one passed in. Observe minimums and maximums, and reset other
302 * state variables that we want to view as multiples of MSS.
303 *
304  * The value of MSS could be either increased or decreased.
305 */
306 void
307 tcp_mss_set(tcp_t *tcp, uint32_t mss)
308 {
309 uint32_t mss_max;
310 tcp_stack_t *tcps = tcp->tcp_tcps;
311 conn_t *connp = tcp->tcp_connp;
312
313 if (connp->conn_ipversion == IPV4_VERSION)
314 mss_max = tcps->tcps_mss_max_ipv4;
315 else
316 mss_max = tcps->tcps_mss_max_ipv6;
317
318 if (mss < tcps->tcps_mss_min)
319 mss = tcps->tcps_mss_min;
658 IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) + TCP_MIN_HEADER_LENGTH);
659
660 /*
661 * Set MSS to the smaller one of both ends of the connection.
662 * We should not have called tcp_mss_set() before, but our
663 * side of the MSS should have been set to a proper value
664 * by tcp_set_destination(). tcp_mss_set() will also set up the
665 * STREAM head parameters properly.
666 *
667 * If we have a larger-than-16-bit window but the other side
668 * didn't want to do window scale, tcp_rwnd_set() will take
669 * care of that.
670 */
671 tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss));
672
673 /*
674 * Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been
675 * updated properly.
676 */
677 TCP_SET_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial);
678
679 if (tcp->tcp_cc_algo->conn_init != NULL)
680 tcp->tcp_cc_algo->conn_init(&tcp->tcp_ccv);
681 }
682
683 /*
684 * Add a new piece to the tcp reassembly queue. If the gap at the beginning
685 * is filled, return as much as we can. The message passed in may be
686 * multi-part, chained using b_cont. "start" is the starting sequence
687 * number for this piece.
688 */
689 static mblk_t *
690 tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start)
691 {
692 uint32_t end, bytes;
693 mblk_t *mp1;
694 mblk_t *mp2;
695 mblk_t *next_mp;
696 uint32_t u1;
697 tcp_stack_t *tcps = tcp->tcp_tcps;
698
699
700 /* Walk through all the new pieces. */
1518 "reached: %u attempts dropped total\n",
1519 ntohs(listener->tcp_connp->conn_lport),
1520 tlc->tlc_max, tlc->tlc_drop);
1521 tlc->tlc_report_time = now;
1522 }
1523 goto error2;
1524 }
1525 tlc_set = B_TRUE;
1526 }
1527
1528 mutex_exit(&listener->tcp_eager_lock);
1529
1530 /*
1531 * IP sets ira_sqp to either the senders conn_sqp (for loopback)
1532 * or based on the ring (for packets from GLD). Otherwise it is
1533 * set based on lbolt i.e., a somewhat random number.
1534 */
1535 ASSERT(ira->ira_sqp != NULL);
1536 new_sqp = ira->ira_sqp;
1537
1538 econnp = tcp_get_conn(arg2, tcps);
1539 if (econnp == NULL)
1540 goto error2;
1541
1542 ASSERT(econnp->conn_netstack == lconnp->conn_netstack);
1543 econnp->conn_sqp = new_sqp;
1544 econnp->conn_initial_sqp = new_sqp;
1545 econnp->conn_ixa->ixa_sqp = new_sqp;
1546
1547 econnp->conn_fport = tcpha->tha_lport;
1548 econnp->conn_lport = tcpha->tha_fport;
1549
1550 err = conn_inherit_parent(lconnp, econnp);
1551 if (err != 0)
1552 goto error3;
1553
1554 /* We already know the laddr of the new connection is ours */
1555 econnp->conn_ixa->ixa_src_generation = ipst->ips_src_generation;
1556
1557 ASSERT(OK_32PTR(mp->b_rptr));
1558 ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION ||
2437 tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
2438 {
2439 int32_t bytes_acked;
2440 int32_t gap;
2441 mblk_t *mp1;
2442 uint_t flags;
2443 uint32_t new_swnd = 0;
2444 uchar_t *iphdr;
2445 uchar_t *rptr;
2446 int32_t rgap;
2447 uint32_t seg_ack;
2448 int seg_len;
2449 uint_t ip_hdr_len;
2450 uint32_t seg_seq;
2451 tcpha_t *tcpha;
2452 int urp;
2453 tcp_opt_t tcpopt;
2454 ip_pkt_t ipp;
2455 boolean_t ofo_seg = B_FALSE; /* Out of order segment */
2456 uint32_t cwnd;
2457 int mss;
2458 conn_t *connp = (conn_t *)arg;
2459 squeue_t *sqp = (squeue_t *)arg2;
2460 tcp_t *tcp = connp->conn_tcp;
2461 tcp_stack_t *tcps = tcp->tcp_tcps;
2462 sock_upcalls_t *sockupcalls;
2463
2464 /*
2465 * RST from fused tcp loopback peer should trigger an unfuse.
2466 */
2467 if (tcp->tcp_fused) {
2468 TCP_STAT(tcps, tcp_fusion_aborted);
2469 tcp_unfuse(tcp);
2470 }
2471
2472 iphdr = mp->b_rptr;
2473 rptr = mp->b_rptr;
2474 ASSERT(OK_32PTR(rptr));
2475
2476 ip_hdr_len = ira->ira_ip_hdr_length;
2712 tcp->tcp_suna = tcp->tcp_iss + 1;
2713 tcp->tcp_valid_bits &= ~TCP_ISS_VALID;
2714
2715 /*
2716 * If SYN was retransmitted, need to reset all
2717 * retransmission info. This is because this
2718 * segment will be treated as a dup ACK.
2719 */
2720 if (tcp->tcp_rexmit) {
2721 tcp->tcp_rexmit = B_FALSE;
2722 tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
2723 tcp->tcp_rexmit_max = tcp->tcp_snxt;
2724 tcp->tcp_ms_we_have_waited = 0;
2725
2726 /*
2727 * Set tcp_cwnd back to 1 MSS, per
2728 * recommendation from
2729 * draft-floyd-incr-init-win-01.txt,
2730 * Increasing TCP's Initial Window.
2731 */
2732 DTRACE_PROBE3(cwnd__retransmitted__syn,
2733 tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
2734 uint32_t, tcp->tcp_mss);
2735 tcp->tcp_cwnd = tcp->tcp_mss;
2736 }
2737
2738 tcp->tcp_swl1 = seg_seq;
2739 tcp->tcp_swl2 = seg_ack;
2740
2741 new_swnd = ntohs(tcpha->tha_win);
2742 tcp->tcp_swnd = new_swnd;
2743 if (new_swnd > tcp->tcp_max_swnd)
2744 tcp->tcp_max_swnd = new_swnd;
2745
2746 /*
2747 * Always send the three-way handshake ack immediately
2748 * in order to make the connection complete as soon as
2749 * possible on the accepting host.
2750 */
2751 flags |= TH_ACK_NEEDED;
2752
2753 /*
2754 * Trace connect-established here.
3937 ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *,
3938 iphdr, tcp_t *, tcp, tcph_t *, tcpha);
3939 }
3940 TCPS_CONN_INC(tcps);
3941
3942 tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */
3943 bytes_acked--;
3944 /* SYN was acked - making progress */
3945 tcp->tcp_ip_forward_progress = B_TRUE;
3946
3947 /*
3948 * If SYN was retransmitted, need to reset all
3949 * retransmission info as this segment will be
3950 * treated as a dup ACK.
3951 */
3952 if (tcp->tcp_rexmit) {
3953 tcp->tcp_rexmit = B_FALSE;
3954 tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
3955 tcp->tcp_rexmit_max = tcp->tcp_snxt;
3956 tcp->tcp_ms_we_have_waited = 0;
3957 DTRACE_PROBE3(cwnd__retransmitted__syn,
3958 tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
3959 uint32_t, tcp->tcp_mss);
3960 tcp->tcp_cwnd = mss;
3961 }
3962
3963 /*
3964 * We set the send window to zero here.
3965 * This is needed if there is data to be
3966 * processed already on the queue.
3967 * Later (at swnd_update label), the
3968 * "new_swnd > tcp_swnd" condition is satisfied
3969 * the XMIT_NEEDED flag is set in the current
3970 * (SYN_RCVD) state. This ensures tcp_wput_data() is
3971 * called if there is already data on queue in
3972 * this state.
3973 */
3974 tcp->tcp_swnd = 0;
3975
3976 if (new_swnd > tcp->tcp_max_swnd)
3977 tcp->tcp_max_swnd = new_swnd;
3978 tcp->tcp_swl1 = seg_seq;
3979 tcp->tcp_swl2 = seg_ack;
3983 DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *,
3984 connp->conn_ixa, void, NULL, tcp_t *, tcp, void, NULL,
3985 int32_t, TCPS_SYN_RCVD);
3986
3987 /* Fuse when both sides are in ESTABLISHED state */
3988 if (tcp->tcp_loopback && do_tcp_fusion)
3989 tcp_fuse(tcp, iphdr, tcpha);
3990
3991 }
3992 /* This code follows 4.4BSD-Lite2 mostly. */
3993 if (bytes_acked < 0)
3994 goto est;
3995
3996 /*
3997 * If TCP is ECN capable and the congestion experience bit is
3998 * set, reduce tcp_cwnd and tcp_ssthresh. But this should only be
3999 * done once per window (or more loosely, per RTT).
4000 */
4001 if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max))
4002 tcp->tcp_cwr = B_FALSE;
4003 if (tcp->tcp_ecn_ok && (flags & TH_ECE) && !tcp->tcp_cwr) {
4004 cc_cong_signal(tcp, seg_ack, CC_ECN);
4005 /*
4006 * If the cwnd is 0, use the timer to clock out
4007 * new segments. This is required by the ECN spec.
4008 */
4009 if (tcp->tcp_cwnd == 0)
4010 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
4011 tcp->tcp_cwr = B_TRUE;
4012 /*
4013 * This marks the end of the current window of in
4014 * flight data. That is why we don't use
4015 * tcp_suna + tcp_swnd. Only data in flight can
4016 * provide ECN info.
4017 */
4018 tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
4019 }
4020
4021 mp1 = tcp->tcp_xmit_head;
4022 if (bytes_acked == 0) {
4023 if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) {
4024 int dupack_cnt;
4025
4026 TCPS_BUMP_MIB(tcps, tcpInDupAck);
4027 /*
4028 * Fast retransmit. When we have seen exactly three
4029 * identical ACKs while we have unacked data
4030 * outstanding we take it as a hint that our peer
4031 * dropped something.
4032 *
4033 * If TCP is retransmitting, don't do fast retransmit.
4034 */
4035 if (mp1 && tcp->tcp_suna != tcp->tcp_snxt &&
4036 ! tcp->tcp_rexmit) {
4037 /* Do Limited Transmit */
4038 if ((dupack_cnt = ++tcp->tcp_dupack_cnt) <
4039 tcps->tcps_dupack_fast_retransmit) {
4040 cc_ack_received(tcp, seg_ack,
4041 bytes_acked, CC_DUPACK);
4042 /*
4043 * RFC 3042
4044 *
4045 * What we need to do is temporarily
4046 * increase tcp_cwnd so that new
4047 * data can be sent if it is allowed
4048 * by the receive window (tcp_rwnd).
4049 * tcp_wput_data() will take care of
4050 * the rest.
4051 *
4052 * If the connection is SACK capable,
4053 * only do limited xmit when there
4054 * is SACK info.
4055 *
4056 * Note how tcp_cwnd is incremented.
4057 * The first dup ACK will increase
4058 * it by 1 MSS. The second dup ACK
4059 * will increase it by 2 MSS. This
4060 * means that only 1 new segment will
4061 * be sent for each dup ACK.
4068 (tcp->tcp_dupack_cnt - 1);
4069 flags |= TH_LIMIT_XMIT;
4070 }
4071 } else if (dupack_cnt ==
4072 tcps->tcps_dupack_fast_retransmit) {
4073
4074 /*
4075 * If we have reduced tcp_ssthresh
4076 * because of ECN, do not reduce it again
4077 * unless it is already one window of data
4078 * away. After one window of data, tcp_cwr
4079 * should then be cleared. Note that
4080 * for non ECN capable connection, tcp_cwr
4081 * should always be false.
4082 *
4083 * Adjust cwnd since the duplicate
4084 * ack indicates that a packet was
4085 * dropped (due to congestion.)
4086 */
4087 if (!tcp->tcp_cwr) {
4088 cc_cong_signal(tcp, seg_ack,
4089 CC_NDUPACK);
4090 cc_ack_received(tcp, seg_ack,
4091 bytes_acked, CC_DUPACK);
4092 }
4093 if (tcp->tcp_ecn_ok) {
4094 tcp->tcp_cwr = B_TRUE;
4095 tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
4096 tcp->tcp_ecn_cwr_sent = B_FALSE;
4097 }
4098
4099 /*
4100 * We do Hoe's algorithm. Refer to her
4101 * paper "Improving the Start-up Behavior
4102 * of a Congestion Control Scheme for TCP,"
4103 * appeared in SIGCOMM'96.
4104 *
4105 * Save highest seq no we have sent so far.
4106 * Be careful about the invisible FIN byte.
4107 */
4108 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
4109 (tcp->tcp_unsent == 0)) {
4110 tcp->tcp_rexmit_max = tcp->tcp_fss;
4111 } else {
4133 tcp->tcp_fack;
4134 tcp->tcp_sack_snxt = seg_ack;
4135 flags |= TH_NEED_SACK_REXMIT;
4136 } else {
4137 /*
4138 * Always initialize tcp_pipe
4139 * even though we don't have
4140 * any SACK info. If later
4141 * we get SACK info and
4142 * tcp_pipe is not initialized,
4143 * funny things will happen.
4144 */
4145 tcp->tcp_pipe =
4146 tcp->tcp_cwnd_ssthresh;
4147 }
4148 } else {
4149 flags |= TH_REXMIT_NEEDED;
4150 } /* tcp_snd_sack_ok */
4151
4152 } else {
4153 cc_ack_received(tcp, seg_ack,
4154 bytes_acked, CC_DUPACK);
4155 /*
4156 * Here we perform congestion
4157 * avoidance, but NOT slow start.
4158 * This is known as the Fast
4159 * Recovery Algorithm.
4160 */
4161 if (tcp->tcp_snd_sack_ok &&
4162 tcp->tcp_notsack_list != NULL) {
4163 flags |= TH_NEED_SACK_REXMIT;
4164 tcp->tcp_pipe -= mss;
4165 if (tcp->tcp_pipe < 0)
4166 tcp->tcp_pipe = 0;
4167 } else {
4168 /*
4169 * We know that one more packet has
4170 * left the pipe thus we can update
4171 * cwnd.
4172 */
4173 cwnd = tcp->tcp_cwnd + mss;
4174 if (cwnd > tcp->tcp_cwnd_max)
4175 cwnd = tcp->tcp_cwnd_max;
4176 DTRACE_PROBE3(cwnd__fast__recovery,
4177 tcp_t *, tcp,
4178 uint32_t, tcp->tcp_cwnd,
4179 uint32_t, cwnd);
4180 tcp->tcp_cwnd = cwnd;
4181 if (tcp->tcp_unsent > 0)
4182 flags |= TH_XMIT_NEEDED;
4183 }
4184 }
4185 }
4186 } else if (tcp->tcp_zero_win_probe) {
4187 /*
4188 * If the window has opened, need to arrange
4189 * to send additional data.
4190 */
4191 if (new_swnd != 0) {
4192 /* tcp_suna != tcp_snxt */
4193 /* Packet contains a window update */
4194 TCPS_BUMP_MIB(tcps, tcpInWinUpdate);
4195 tcp->tcp_zero_win_probe = 0;
4196 tcp->tcp_timer_backoff = 0;
4197 tcp->tcp_ms_we_have_waited = 0;
4198
4199 /*
4292
4293 /*
4294 * TCP gets a new ACK, update the notsack'ed list to delete those
4295 * blocks that are covered by this ACK.
4296 */
4297 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
4298 tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack,
4299 &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list));
4300 }
4301
4302 /*
4303 * If we got an ACK after fast retransmit, check to see
4304 * if it is a partial ACK. If it is not and the congestion
4305 * window was inflated to account for the other side's
4306 * cached packets, retract it. If it is, do Hoe's algorithm.
4307 */
4308 if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) {
4309 ASSERT(tcp->tcp_rexmit == B_FALSE);
4310 if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) {
4311 tcp->tcp_dupack_cnt = 0;
4312
4313 cc_post_recovery(tcp, seg_ack);
4314
4315 tcp->tcp_rexmit_max = seg_ack;
4316
4317 /*
4318 * Remove all notsack info to avoid confusion with
4319 * the next fast retransmit/recovery phase.
4320 */
4321 if (tcp->tcp_snd_sack_ok) {
4322 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list,
4323 tcp);
4324 }
4325 } else {
4326 if (tcp->tcp_snd_sack_ok &&
4327 tcp->tcp_notsack_list != NULL) {
4328 flags |= TH_NEED_SACK_REXMIT;
4329 tcp->tcp_pipe -= mss;
4330 if (tcp->tcp_pipe < 0)
4331 tcp->tcp_pipe = 0;
4332 } else {
4333 /*
4334 * Hoe's algorithm:
4335 *
4336 * Retransmit the unack'ed segment and
4337 * restart fast recovery. Note that we
4338 * need to scale back tcp_cwnd to the
4339 * original value when we started fast
4340 * recovery. This is to prevent overly
4341 * aggressive behaviour in sending new
4342 * segments.
4343 */
4344 cwnd = tcp->tcp_cwnd_ssthresh +
4345 tcps->tcps_dupack_fast_retransmit * mss;
4346 DTRACE_PROBE3(cwnd__fast__retransmit__part__ack,
4347 tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
4348 uint32_t, cwnd);
4349 tcp->tcp_cwnd = cwnd;
4350 tcp->tcp_cwnd_cnt = tcp->tcp_cwnd;
4351 flags |= TH_REXMIT_NEEDED;
4352 }
4353 }
4354 } else {
4355 tcp->tcp_dupack_cnt = 0;
4356 if (tcp->tcp_rexmit) {
4357 /*
4358 * TCP is retransmitting. If the ACK acks all
4359 * outstanding data, update tcp_rexmit_max and
4360 * tcp_rexmit_nxt. Otherwise, update tcp_rexmit_nxt
4361 * to the correct value.
4362 *
4363 * Note that SEQ_LEQ() is used. This is to avoid
4364 * unnecessary fast retransmit caused by dup ACKs
4365 * received when TCP does slow start retransmission
4366 * after a time out. During this phase, TCP may
4367 * send out segments which are already received.
4368 * This causes dup ACKs to be sent back.
4369 */
4390 tcp->tcp_timer_backoff = 0;
4391 }
4392
4393 /*
4394 * If tcp_xmit_head is NULL, then it must be the FIN being ack'ed.
4395 * Note that it cannot be the SYN being ack'ed. The code flow
4396 * will not reach here.
4397 */
4398 if (mp1 == NULL) {
4399 goto fin_acked;
4400 }
4401
4402 /*
4403 * Update the congestion window.
4404 *
4405 * If TCP is not ECN capable or TCP is ECN capable but the
4406 * congestion experience bit is not set, increase the tcp_cwnd as
4407 * usual.
4408 */
4409 if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) {
4410 if (IN_RECOVERY(tcp->tcp_ccv.flags)) {
4411 EXIT_RECOVERY(tcp->tcp_ccv.flags);
4412 }
4413 cc_ack_received(tcp, seg_ack, bytes_acked, CC_ACK);
4414 }
4415
4416 /* See if the latest urgent data has been acknowledged */
4417 if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
4418 SEQ_GT(seg_ack, tcp->tcp_urg))
4419 tcp->tcp_valid_bits &= ~TCP_URG_VALID;
4420
4421 /*
4422 * Update the RTT estimates. Note that we don't use the TCP
4423 * timestamp option to calculate RTT even if one is present. This is
4424 * because the timestamp option's resolution (CPU tick) is
4425 * too coarse to measure modern datacenter networks' microsecond
4426 * latencies. The timestamp field's resolution is limited by its
4427 * 4-byte width (see RFC1323), and since we always store a
4428 * high-resolution nanosecond precision timestamp along with the data,
4429 * there is no point to ever using the timestamp option.
4430 */
4431 if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
4432 /*
4433 * An ACK sequence we haven't seen before, so get the RTT
4434 * and update the RTO. But first check if the timestamp is
5727 default:
5728 break;
5729 }
5730 break;
5731 case ICMP_SOURCE_QUENCH: {
5732 /*
5733 * use a global boolean to control
5734 * whether TCP should respond to ICMP_SOURCE_QUENCH.
5735 * The default is false.
5736 */
5737 if (tcp_icmp_source_quench) {
5738 /*
5739 * Reduce the sending rate as if we got a
5740 * retransmit timeout
5741 */
5742 uint32_t npkt;
5743
5744 npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) /
5745 tcp->tcp_mss;
5746 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss;
5747
5748 DTRACE_PROBE3(cwnd__source__quench, tcp_t *, tcp,
5749 uint32_t, tcp->tcp_cwnd,
5750 uint32_t, tcp->tcp_mss);
5751 tcp->tcp_cwnd = tcp->tcp_mss;
5752 tcp->tcp_cwnd_cnt = 0;
5753 }
5754 break;
5755 }
5756 }
5757 freemsg(mp);
5758 }
5759
5760 /*
5761 * tcp_icmp_error_ipv6 is called from tcp_icmp_input to process ICMPv6
5762 * error messages passed up by IP.
5763 * Assumes that IP has pulled up all the extension headers as well
5764 * as the ICMPv6 header.
5765 */
5766 static void
5767 tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, ip_recv_attr_t *ira)
5768 {
5769 icmp6_t *icmp6;
5770 ip6_t *ip6h;
|