11553 Want pluggable TCP congestion control algorithms
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Robert Mustacchi <robert.mustacchi@joyent.com>


 153 
 154 static boolean_t tcp_outbound_squeue_switch = B_FALSE;
 155 
 156 static mblk_t   *tcp_conn_create_v4(conn_t *, conn_t *, mblk_t *,
 157                     ip_recv_attr_t *);
 158 static mblk_t   *tcp_conn_create_v6(conn_t *, conn_t *, mblk_t *,
 159                     ip_recv_attr_t *);
 160 static boolean_t        tcp_drop_q0(tcp_t *);
 161 static void     tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *);
 162 static mblk_t   *tcp_input_add_ancillary(tcp_t *, mblk_t *, ip_pkt_t *,
 163                     ip_recv_attr_t *);
 164 static void     tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *);
 165 static void     tcp_process_options(tcp_t *, tcpha_t *);
 166 static mblk_t   *tcp_reass(tcp_t *, mblk_t *, uint32_t);
 167 static void     tcp_reass_elim_overlap(tcp_t *, mblk_t *);
 168 static void     tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
 169 static void     tcp_set_rto(tcp_t *, hrtime_t);
 170 static void     tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
 171 
 172 /*
 173  * Set the MSS associated with a particular tcp based on its current value,
 174  * and a new one passed in. Observe minimums and maximums, and reset other
 175  * state variables that we want to view as multiples of MSS.
 176  *
 177  * The value of MSS could be either increased or decreased.
 178  */
 179 void
 180 tcp_mss_set(tcp_t *tcp, uint32_t mss)
 181 {
 182         uint32_t        mss_max;
 183         tcp_stack_t     *tcps = tcp->tcp_tcps;
 184         conn_t          *connp = tcp->tcp_connp;
 185 
 186         if (connp->conn_ipversion == IPV4_VERSION)
 187                 mss_max = tcps->tcps_mss_max_ipv4;
 188         else
 189                 mss_max = tcps->tcps_mss_max_ipv6;
 190 
 191         if (mss < tcps->tcps_mss_min)
 192                 mss = tcps->tcps_mss_min;


 531             IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) + TCP_MIN_HEADER_LENGTH);
 532 
 533         /*
 534          * Set MSS to the smaller one of both ends of the connection.
 535          * We should not have called tcp_mss_set() before, but our
 536          * side of the MSS should have been set to a proper value
 537          * by tcp_set_destination().  tcp_mss_set() will also set up the
 538          * STREAM head parameters properly.
 539          *
 540          * If we have a larger-than-16-bit window but the other side
 541          * didn't want to do window scale, tcp_rwnd_set() will take
 542          * care of that.
 543          */
 544         tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss));
 545 
 546         /*
 547          * Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been
 548          * updated properly.
 549          */
 550         TCP_SET_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial);
 551 }
 552 
 553 /*
 554  * Add a new piece to the tcp reassembly queue.  If the gap at the beginning
 555  * is filled, return as much as we can.  The message passed in may be
 556  * multi-part, chained using b_cont.  "start" is the starting sequence
 557  * number for this piece.
 558  */
 559 static mblk_t *
 560 tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start)
 561 {
 562         uint32_t        end, bytes;
 563         mblk_t          *mp1;
 564         mblk_t          *mp2;
 565         mblk_t          *next_mp;
 566         uint32_t        u1;
 567         tcp_stack_t     *tcps = tcp->tcp_tcps;
 568 
 569 
 570         /* Walk through all the new pieces. */


1388                                     "reached: %u attempts dropped total\n",
1389                                     ntohs(listener->tcp_connp->conn_lport),
1390                                     tlc->tlc_max, tlc->tlc_drop);
1391                                 tlc->tlc_report_time = now;
1392                         }
1393                         goto error2;
1394                 }
1395                 tlc_set = B_TRUE;
1396         }
1397 
1398         mutex_exit(&listener->tcp_eager_lock);
1399 
1400         /*
 1401          * IP sets ira_sqp to either the sender's conn_sqp (for loopback)
 1402          * or based on the ring (for packets from GLD). Otherwise it is
 1403          * set based on lbolt, i.e., a somewhat random number.
1404          */
1405         ASSERT(ira->ira_sqp != NULL);
1406         new_sqp = ira->ira_sqp;
1407 
1408         econnp = (conn_t *)tcp_get_conn(arg2, tcps);
1409         if (econnp == NULL)
1410                 goto error2;
1411 
1412         ASSERT(econnp->conn_netstack == lconnp->conn_netstack);
1413         econnp->conn_sqp = new_sqp;
1414         econnp->conn_initial_sqp = new_sqp;
1415         econnp->conn_ixa->ixa_sqp = new_sqp;
1416 
1417         econnp->conn_fport = tcpha->tha_lport;
1418         econnp->conn_lport = tcpha->tha_fport;
1419 
1420         err = conn_inherit_parent(lconnp, econnp);
1421         if (err != 0)
1422                 goto error3;
1423 
1424         /* We already know the laddr of the new connection is ours */
1425         econnp->conn_ixa->ixa_src_generation = ipst->ips_src_generation;
1426 
1427         ASSERT(OK_32PTR(mp->b_rptr));
1428         ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION ||


2307 tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
2308 {
2309         int32_t         bytes_acked;
2310         int32_t         gap;
2311         mblk_t          *mp1;
2312         uint_t          flags;
2313         uint32_t        new_swnd = 0;
2314         uchar_t         *iphdr;
2315         uchar_t         *rptr;
2316         int32_t         rgap;
2317         uint32_t        seg_ack;
2318         int             seg_len;
2319         uint_t          ip_hdr_len;
2320         uint32_t        seg_seq;
2321         tcpha_t         *tcpha;
2322         int             urp;
2323         tcp_opt_t       tcpopt;
2324         ip_pkt_t        ipp;
2325         boolean_t       ofo_seg = B_FALSE; /* Out of order segment */
2326         uint32_t        cwnd;
2327         uint32_t        add;
2328         int             npkt;
2329         int             mss;
2330         conn_t          *connp = (conn_t *)arg;
2331         squeue_t        *sqp = (squeue_t *)arg2;
2332         tcp_t           *tcp = connp->conn_tcp;
2333         tcp_stack_t     *tcps = tcp->tcp_tcps;
2334         sock_upcalls_t  *sockupcalls;
2335 
2336         /*
2337          * RST from fused tcp loopback peer should trigger an unfuse.
2338          */
2339         if (tcp->tcp_fused) {
2340                 TCP_STAT(tcps, tcp_fusion_aborted);
2341                 tcp_unfuse(tcp);
2342         }
2343 
2344         iphdr = mp->b_rptr;
2345         rptr = mp->b_rptr;
2346         ASSERT(OK_32PTR(rptr));
2347 
2348         ip_hdr_len = ira->ira_ip_hdr_length;


2584                         tcp->tcp_suna = tcp->tcp_iss + 1;
2585                         tcp->tcp_valid_bits &= ~TCP_ISS_VALID;
2586 
2587                         /*
2588                          * If SYN was retransmitted, need to reset all
2589                          * retransmission info.  This is because this
2590                          * segment will be treated as a dup ACK.
2591                          */
2592                         if (tcp->tcp_rexmit) {
2593                                 tcp->tcp_rexmit = B_FALSE;
2594                                 tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
2595                                 tcp->tcp_rexmit_max = tcp->tcp_snxt;
2596                                 tcp->tcp_ms_we_have_waited = 0;
2597 
2598                                 /*
2599                                  * Set tcp_cwnd back to 1 MSS, per
2600                                  * recommendation from
2601                                  * draft-floyd-incr-init-win-01.txt,
2602                                  * Increasing TCP's Initial Window.
2603                                  */
2604                                 tcp->tcp_cwnd = tcp->tcp_mss;
2605                         }
2606 
2607                         tcp->tcp_swl1 = seg_seq;
2608                         tcp->tcp_swl2 = seg_ack;
2609 
2610                         new_swnd = ntohs(tcpha->tha_win);
2611                         tcp->tcp_swnd = new_swnd;
2612                         if (new_swnd > tcp->tcp_max_swnd)
2613                                 tcp->tcp_max_swnd = new_swnd;
2614 
2615                         /*
2616                          * Always send the three-way handshake ack immediately
2617                          * in order to make the connection complete as soon as
2618                          * possible on the accepting host.
2619                          */
2620                         flags |= TH_ACK_NEEDED;
2621 
2622                         /*
2623                          * Trace connect-established here.


3806                             ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *,
3807                             iphdr, tcp_t *, tcp, tcph_t *, tcpha);
3808                 }
3809                 TCPS_CONN_INC(tcps);
3810 
3811                 tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */
3812                 bytes_acked--;
3813                 /* SYN was acked - making progress */
3814                 tcp->tcp_ip_forward_progress = B_TRUE;
3815 
3816                 /*
3817                  * If SYN was retransmitted, need to reset all
3818                  * retransmission info as this segment will be
3819                  * treated as a dup ACK.
3820                  */
3821                 if (tcp->tcp_rexmit) {
3822                         tcp->tcp_rexmit = B_FALSE;
3823                         tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
3824                         tcp->tcp_rexmit_max = tcp->tcp_snxt;
3825                         tcp->tcp_ms_we_have_waited = 0;
3826                         tcp->tcp_cwnd = mss;
3827                 }
3828 
3829                 /*
3830                  * We set the send window to zero here.
3831                  * This is needed if there is data to be
3832                  * processed already on the queue.
3833                  * Later (at swnd_update label), the
 3834                  * "new_swnd > tcp_swnd" condition is satisfied and
3835                  * the XMIT_NEEDED flag is set in the current
3836                  * (SYN_RCVD) state. This ensures tcp_wput_data() is
3837                  * called if there is already data on queue in
3838                  * this state.
3839                  */
3840                 tcp->tcp_swnd = 0;
3841 
3842                 if (new_swnd > tcp->tcp_max_swnd)
3843                         tcp->tcp_max_swnd = new_swnd;
3844                 tcp->tcp_swl1 = seg_seq;
3845                 tcp->tcp_swl2 = seg_ack;


3849                 DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *,
3850                     connp->conn_ixa, void, NULL, tcp_t *, tcp, void, NULL,
3851                     int32_t, TCPS_SYN_RCVD);
3852 
3853                 /* Fuse when both sides are in ESTABLISHED state */
3854                 if (tcp->tcp_loopback && do_tcp_fusion)
3855                         tcp_fuse(tcp, iphdr, tcpha);
3856 
3857         }
3858         /* This code follows 4.4BSD-Lite2 mostly. */
3859         if (bytes_acked < 0)
3860                 goto est;
3861 
3862         /*
3863          * If TCP is ECN capable and the congestion experience bit is
3864          * set, reduce tcp_cwnd and tcp_ssthresh.  But this should only be
3865          * done once per window (or more loosely, per RTT).
3866          */
3867         if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max))
3868                 tcp->tcp_cwr = B_FALSE;
3869         if (tcp->tcp_ecn_ok && (flags & TH_ECE)) {
3870                 if (!tcp->tcp_cwr) {
3871                         npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / mss;
3872                         tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss;
3873                         tcp->tcp_cwnd = npkt * mss;
3874                         /*
3875                          * If the cwnd is 0, use the timer to clock out
3876                          * new segments.  This is required by the ECN spec.
3877                          */
3878                         if (npkt == 0) {
3879                                 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
3880                                 /*
3881                                  * This makes sure that when the ACK comes
3882                                  * back, we will increase tcp_cwnd by 1 MSS.
3883                                  */
3884                                 tcp->tcp_cwnd_cnt = 0;
3885                         }
3886                         tcp->tcp_cwr = B_TRUE;
3887                         /*
3888                          * This marks the end of the current window of in
3889                          * flight data.  That is why we don't use
3890                          * tcp_suna + tcp_swnd.  Only data in flight can
3891                          * provide ECN info.
3892                          */
3893                         tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
3894                         tcp->tcp_ecn_cwr_sent = B_FALSE;
3895                 }
3896         }
3897 
3898         mp1 = tcp->tcp_xmit_head;
3899         if (bytes_acked == 0) {
3900                 if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) {
3901                         int dupack_cnt;
3902 
3903                         TCPS_BUMP_MIB(tcps, tcpInDupAck);
3904                         /*
3905                          * Fast retransmit.  When we have seen exactly three
3906                          * identical ACKs while we have unacked data
3907                          * outstanding we take it as a hint that our peer
3908                          * dropped something.
3909                          *
3910                          * If TCP is retransmitting, don't do fast retransmit.
3911                          */
3912                         if (mp1 && tcp->tcp_suna != tcp->tcp_snxt &&
3913                             ! tcp->tcp_rexmit) {
3914                                 /* Do Limited Transmit */
3915                                 if ((dupack_cnt = ++tcp->tcp_dupack_cnt) <
3916                                     tcps->tcps_dupack_fast_retransmit) {
3917                                         /*
3918                                          * RFC 3042
3919                                          *
3920                                          * What we need to do is temporarily
3921                                          * increase tcp_cwnd so that new
3922                                          * data can be sent if it is allowed
3923                                          * by the receive window (tcp_rwnd).
3924                                          * tcp_wput_data() will take care of
3925                                          * the rest.
3926                                          *
3927                                          * If the connection is SACK capable,
3928                                          * only do limited xmit when there
3929                                          * is SACK info.
3930                                          *
3931                                          * Note how tcp_cwnd is incremented.
3932                                          * The first dup ACK will increase
3933                                          * it by 1 MSS.  The second dup ACK
3934                                          * will increase it by 2 MSS.  This
3935                                          * means that only 1 new segment will
3936                                          * be sent for each dup ACK.


3943                                                     (tcp->tcp_dupack_cnt - 1);
3944                                                 flags |= TH_LIMIT_XMIT;
3945                                         }
3946                                 } else if (dupack_cnt ==
3947                                     tcps->tcps_dupack_fast_retransmit) {
3948 
3949                                 /*
3950                                  * If we have reduced tcp_ssthresh
3951                                  * because of ECN, do not reduce it again
3952                                  * unless it is already one window of data
3953                                  * away.  After one window of data, tcp_cwr
3954                                  * should then be cleared.  Note that
3955                                  * for non ECN capable connection, tcp_cwr
3956                                  * should always be false.
3957                                  *
3958                                  * Adjust cwnd since the duplicate
3959                                  * ack indicates that a packet was
3960                                  * dropped (due to congestion.)
3961                                  */
3962                                 if (!tcp->tcp_cwr) {
3963                                         npkt = ((tcp->tcp_snxt -
3964                                             tcp->tcp_suna) >> 1) / mss;
3965                                         tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
3966                                             mss;
3967                                         tcp->tcp_cwnd = (npkt +
3968                                             tcp->tcp_dupack_cnt) * mss;
3969                                 }
3970                                 if (tcp->tcp_ecn_ok) {
3971                                         tcp->tcp_cwr = B_TRUE;
3972                                         tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
3973                                         tcp->tcp_ecn_cwr_sent = B_FALSE;
3974                                 }
3975 
3976                                 /*
3977                                  * We do Hoe's algorithm.  Refer to her
3978                                  * paper "Improving the Start-up Behavior
3979                                  * of a Congestion Control Scheme for TCP,"
3980                                  * appeared in SIGCOMM'96.
 3981                                  * which appeared in SIGCOMM '96.
3982                                  * Save highest seq no we have sent so far.
3983                                  * Be careful about the invisible FIN byte.
3984                                  */
3985                                 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
3986                                     (tcp->tcp_unsent == 0)) {
3987                                         tcp->tcp_rexmit_max = tcp->tcp_fss;
3988                                 } else {


4010                                                     tcp->tcp_fack;
4011                                                 tcp->tcp_sack_snxt = seg_ack;
4012                                                 flags |= TH_NEED_SACK_REXMIT;
4013                                         } else {
4014                                                 /*
4015                                                  * Always initialize tcp_pipe
4016                                                  * even though we don't have
4017                                                  * any SACK info.  If later
4018                                                  * we get SACK info and
4019                                                  * tcp_pipe is not initialized,
4020                                                  * funny things will happen.
4021                                                  */
4022                                                 tcp->tcp_pipe =
4023                                                     tcp->tcp_cwnd_ssthresh;
4024                                         }
4025                                 } else {
4026                                         flags |= TH_REXMIT_NEEDED;
4027                                 } /* tcp_snd_sack_ok */
4028 
4029                                 } else {
4030                                         /*
4031                                          * Here we perform congestion
4032                                          * avoidance, but NOT slow start.
4033                                          * This is known as the Fast
4034                                          * Recovery Algorithm.
4035                                          */
4036                                         if (tcp->tcp_snd_sack_ok &&
4037                                             tcp->tcp_notsack_list != NULL) {
4038                                                 flags |= TH_NEED_SACK_REXMIT;
4039                                                 tcp->tcp_pipe -= mss;
4040                                                 if (tcp->tcp_pipe < 0)
4041                                                         tcp->tcp_pipe = 0;
4042                                         } else {
4043                                         /*
4044                                          * We know that one more packet has
4045                                          * left the pipe thus we can update
4046                                          * cwnd.
4047                                          */
4048                                         cwnd = tcp->tcp_cwnd + mss;
4049                                         if (cwnd > tcp->tcp_cwnd_max)
4050                                                 cwnd = tcp->tcp_cwnd_max;
4051                                         tcp->tcp_cwnd = cwnd;
4052                                         if (tcp->tcp_unsent > 0)
4053                                                 flags |= TH_XMIT_NEEDED;
4054                                         }
4055                                 }
4056                         }
4057                 } else if (tcp->tcp_zero_win_probe) {
4058                         /*
4059                          * If the window has opened, need to arrange
4060                          * to send additional data.
4061                          */
4062                         if (new_swnd != 0) {
4063                                 /* tcp_suna != tcp_snxt */
4064                                 /* Packet contains a window update */
4065                                 TCPS_BUMP_MIB(tcps, tcpInWinUpdate);
4066                                 tcp->tcp_zero_win_probe = 0;
4067                                 tcp->tcp_timer_backoff = 0;
4068                                 tcp->tcp_ms_we_have_waited = 0;
4069 
4070                                 /*


4163 
4164         /*
4165          * TCP gets a new ACK, update the notsack'ed list to delete those
4166          * blocks that are covered by this ACK.
4167          */
4168         if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
4169                 tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack,
4170                     &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list));
4171         }
4172 
4173         /*
4174          * If we got an ACK after fast retransmit, check to see
4175          * if it is a partial ACK.  If it is not and the congestion
4176          * window was inflated to account for the other side's
4177          * cached packets, retract it.  If it is, do Hoe's algorithm.
4178          */
4179         if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) {
4180                 ASSERT(tcp->tcp_rexmit == B_FALSE);
4181                 if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) {
4182                         tcp->tcp_dupack_cnt = 0;
4183                         /*
4184                          * Restore the orig tcp_cwnd_ssthresh after
4185                          * fast retransmit phase.
4186                          */
4187                         if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
4188                                 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh;
4189                         }
4190                         tcp->tcp_rexmit_max = seg_ack;
4191                         tcp->tcp_cwnd_cnt = 0;
4192 
4193                         /*
4194                          * Remove all notsack info to avoid confusion with
 4195                          * the next fast retransmit/recovery phase.
4196                          */
4197                         if (tcp->tcp_snd_sack_ok) {
4198                                 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list,
4199                                     tcp);
4200                         }
4201                 } else {
4202                         if (tcp->tcp_snd_sack_ok &&
4203                             tcp->tcp_notsack_list != NULL) {
4204                                 flags |= TH_NEED_SACK_REXMIT;
4205                                 tcp->tcp_pipe -= mss;
4206                                 if (tcp->tcp_pipe < 0)
4207                                         tcp->tcp_pipe = 0;
4208                         } else {
4209                                 /*
4210                                  * Hoe's algorithm:
4211                                  *
4212                                  * Retransmit the unack'ed segment and
4213                                  * restart fast recovery.  Note that we
4214                                  * need to scale back tcp_cwnd to the
4215                                  * original value when we started fast
4216                                  * recovery.  This is to prevent overly
4217                                  * aggressive behaviour in sending new
4218                                  * segments.
4219                                  */
4220                                 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh +
4221                                     tcps->tcps_dupack_fast_retransmit * mss;
4222                                 tcp->tcp_cwnd_cnt = tcp->tcp_cwnd;
4223                                 flags |= TH_REXMIT_NEEDED;
4224                         }
4225                 }
4226         } else {
4227                 tcp->tcp_dupack_cnt = 0;
4228                 if (tcp->tcp_rexmit) {
4229                         /*
 4230                          * TCP is retransmitting.  If the ACK ack's all
4231                          * outstanding data, update tcp_rexmit_max and
4232                          * tcp_rexmit_nxt.  Otherwise, update tcp_rexmit_nxt
4233                          * to the correct value.
4234                          *
4235                          * Note that SEQ_LEQ() is used.  This is to avoid
4236                          * unnecessary fast retransmit caused by dup ACKs
4237                          * received when TCP does slow start retransmission
4238                          * after a time out.  During this phase, TCP may
4239                          * send out segments which are already received.
4240                          * This causes dup ACKs to be sent back.
4241                          */


4262                 tcp->tcp_timer_backoff = 0;
4263         }
4264 
4265         /*
4266          * If tcp_xmit_head is NULL, then it must be the FIN being ack'ed.
4267          * Note that it cannot be the SYN being ack'ed.  The code flow
4268          * will not reach here.
4269          */
4270         if (mp1 == NULL) {
4271                 goto fin_acked;
4272         }
4273 
4274         /*
4275          * Update the congestion window.
4276          *
4277          * If TCP is not ECN capable or TCP is ECN capable but the
4278          * congestion experience bit is not set, increase the tcp_cwnd as
4279          * usual.
4280          */
4281         if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) {
4282                 cwnd = tcp->tcp_cwnd;
4283                 add = mss;
4284 
4285                 if (cwnd >= tcp->tcp_cwnd_ssthresh) {
4286                         /*
4287                          * This is to prevent an increase of less than 1 MSS of
4288                          * tcp_cwnd.  With partial increase, tcp_wput_data()
4289                          * may send out tinygrams in order to preserve mblk
4290                          * boundaries.
4291                          *
4292                          * By initializing tcp_cwnd_cnt to new tcp_cwnd and
 4293                          * decrementing it by 1 MSS for every ACK, tcp_cwnd is
 4294                          * increased by 1 MSS for every RTT.
4295                          */
4296                         if (tcp->tcp_cwnd_cnt <= 0) {
4297                                 tcp->tcp_cwnd_cnt = cwnd + add;
4298                         } else {
4299                                 tcp->tcp_cwnd_cnt -= add;
4300                                 add = 0;
4301                         }

4302                 }
4303                 tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max);
4304         }
4305 
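The comment above describes how tcp_cwnd_cnt converts per-ACK credit into at most one MSS of growth per round trip once cwnd has passed ssthresh, so that cwnd never grows by a fraction of an MSS. The same arithmetic in isolation, as a hypothetical helper:

#include <stdint.h>

/*
 * Congestion-avoidance growth: charge a full cwnd's worth of ACK credit
 * before adding one MSS, so cwnd grows by roughly one MSS per RTT and
 * never by a fraction of an MSS.  Mirrors the tcp_cwnd_cnt logic above.
 */
static void
cong_avoid_ack(uint32_t *cwnd, int32_t *cwnd_cnt, uint32_t mss,
    uint32_t cwnd_max)
{
	uint32_t add = mss;

	if (*cwnd_cnt <= 0) {
		*cwnd_cnt = (int32_t)(*cwnd + add);	/* re-arm the counter */
	} else {
		*cwnd_cnt -= add;			/* not a full RTT yet */
		add = 0;
	}
	*cwnd = (*cwnd + add < cwnd_max) ? *cwnd + add : cwnd_max;
}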
4306         /* See if the latest urgent data has been acknowledged */
4307         if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
4308             SEQ_GT(seg_ack, tcp->tcp_urg))
4309                 tcp->tcp_valid_bits &= ~TCP_URG_VALID;
4310 
4311         /*
4312          * Update the RTT estimates. Note that we don't use the TCP
4313          * timestamp option to calculate RTT even if one is present. This is
4314          * because the timestamp option's resolution (CPU tick) is
4315          * too coarse to measure modern datacenter networks' microsecond
4316          * latencies. The timestamp field's resolution is limited by its
4317          * 4-byte width (see RFC1323), and since we always store a
 4318          * high-resolution nanosecond precision timestamp along with the data,
4319          * there is no point to ever using the timestamp option.
4320          */
4321         if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
4322                 /*
4323                  * An ACK sequence we haven't seen before, so get the RTT
4324                  * and update the RTO. But first check if the timestamp is


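The comment above explains that RTT samples come from the high-resolution timestamps stored with the data rather than from the TCP timestamp option. For reference, the textbook RFC 6298 smoothing such a sample would typically feed, with the clock-granularity term and RTO min/max clamping omitted; this is not necessarily what tcp_set_rto() implements:

#include <stdint.h>

struct rtt_est {
	int64_t srtt_ns;	/* smoothed RTT */
	int64_t rttvar_ns;	/* RTT variation */
	int64_t rto_ns;		/* retransmission timeout */
};

/* RFC 6298 smoothing with the standard 1/8 and 1/4 gains. */
static void
rtt_sample(struct rtt_est *e, int64_t r_ns)
{
	if (e->srtt_ns == 0) {			/* first measurement */
		e->srtt_ns = r_ns;
		e->rttvar_ns = r_ns / 2;
	} else {
		int64_t delta = e->srtt_ns - r_ns;

		if (delta < 0)
			delta = -delta;
		e->rttvar_ns += (delta - e->rttvar_ns) / 4;
		e->srtt_ns += (r_ns - e->srtt_ns) / 8;
	}
	e->rto_ns = e->srtt_ns + 4 * e->rttvar_ns;
}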
5617                 default:
5618                         break;
5619                 }
5620                 break;
5621         case ICMP_SOURCE_QUENCH: {
5622                 /*
5623                  * use a global boolean to control
5624                  * whether TCP should respond to ICMP_SOURCE_QUENCH.
5625                  * The default is false.
5626                  */
5627                 if (tcp_icmp_source_quench) {
5628                         /*
5629                          * Reduce the sending rate as if we got a
5630                          * retransmit timeout
5631                          */
5632                         uint32_t npkt;
5633 
5634                         npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) /
5635                             tcp->tcp_mss;
5636                         tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss;
5637                         tcp->tcp_cwnd = tcp->tcp_mss;
5638                         tcp->tcp_cwnd_cnt = 0;
5639                 }
5640                 break;
5641         }
5642         }
5643         freemsg(mp);
5644 }
5645 
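The ICMP_SOURCE_QUENCH case above (disabled by default) reduces the sending rate the same way a retransmit timeout would: ssthresh becomes half the flight size in segments, floored at two, and cwnd restarts from one MSS. The same computation in isolation, as a hypothetical helper:

#include <stdint.h>

/* Collapse the window the way the source-quench handler above does. */
static void
quench_cwnd(uint32_t snxt, uint32_t suna, uint32_t mss,
    uint32_t *cwnd, uint32_t *ssthresh)
{
	uint32_t npkt = ((snxt - suna) >> 1) / mss;	/* half the flight */

	*ssthresh = ((npkt > 2) ? npkt : 2) * mss;
	*cwnd = mss;					/* restart from 1 MSS */
}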
5646 /*
5647  * tcp_icmp_error_ipv6 is called from tcp_icmp_input to process ICMPv6
5648  * error messages passed up by IP.
5649  * Assumes that IP has pulled up all the extension headers as well
5650  * as the ICMPv6 header.
5651  */
5652 static void
5653 tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, ip_recv_attr_t *ira)
5654 {
5655         icmp6_t         *icmp6;
5656         ip6_t           *ip6h;




 153 
 154 static boolean_t tcp_outbound_squeue_switch = B_FALSE;
 155 
 156 static mblk_t   *tcp_conn_create_v4(conn_t *, conn_t *, mblk_t *,
 157                     ip_recv_attr_t *);
 158 static mblk_t   *tcp_conn_create_v6(conn_t *, conn_t *, mblk_t *,
 159                     ip_recv_attr_t *);
 160 static boolean_t        tcp_drop_q0(tcp_t *);
 161 static void     tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *);
 162 static mblk_t   *tcp_input_add_ancillary(tcp_t *, mblk_t *, ip_pkt_t *,
 163                     ip_recv_attr_t *);
 164 static void     tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *);
 165 static void     tcp_process_options(tcp_t *, tcpha_t *);
 166 static mblk_t   *tcp_reass(tcp_t *, mblk_t *, uint32_t);
 167 static void     tcp_reass_elim_overlap(tcp_t *, mblk_t *);
 168 static void     tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
 169 static void     tcp_set_rto(tcp_t *, hrtime_t);
 170 static void     tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
 171 
 172 /*
 173  * CC wrapper hook functions
 174  */
 175 static void
 176 cc_ack_received(tcp_t *tcp, uint32_t seg_ack, int32_t bytes_acked,
 177     uint16_t type)
 178 {
 179         uint32_t old_cwnd = tcp->tcp_cwnd;
 180 
 181         tcp->tcp_ccv.bytes_this_ack = bytes_acked;
 182         if (tcp->tcp_cwnd <= tcp->tcp_swnd)
 183                 tcp->tcp_ccv.flags |= CCF_CWND_LIMITED;
 184         else
 185                 tcp->tcp_ccv.flags &= ~CCF_CWND_LIMITED;
 186 
 187         if (type == CC_ACK) {
 188                 if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
 189                         if (tcp->tcp_ccv.flags & CCF_RTO)
 190                                 tcp->tcp_ccv.flags &= ~CCF_RTO;
 191 
 192                         tcp->tcp_ccv.t_bytes_acked +=
 193                             min(tcp->tcp_ccv.bytes_this_ack,
 194                             tcp->tcp_tcps->tcps_abc_l_var * tcp->tcp_mss);
 195                         if (tcp->tcp_ccv.t_bytes_acked >= tcp->tcp_cwnd) {
 196                                 tcp->tcp_ccv.t_bytes_acked -= tcp->tcp_cwnd;
 197                                 tcp->tcp_ccv.flags |= CCF_ABC_SENTAWND;
 198                         }
 199                 } else {
 200                         tcp->tcp_ccv.flags &= ~CCF_ABC_SENTAWND;
 201                         tcp->tcp_ccv.t_bytes_acked = 0;
 202                 }
 203         }
 204 
 205         if (CC_ALGO(tcp)->ack_received != NULL) {
 206                 /*
 207                  * The FreeBSD code where this originated had a comment "Find
 208                  * a way to live without this" in several places where curack
 209                  * got set.  If they eventually dump curack from the cc
 210                  * variables, we'll need to adapt our code.
 211                  */
 212                 tcp->tcp_ccv.curack = seg_ack;
 213                 CC_ALGO(tcp)->ack_received(&tcp->tcp_ccv, type);
 214         }
 215 
 216         DTRACE_PROBE3(cwnd__cc__ack__received, tcp_t *, tcp, uint32_t, old_cwnd,
 217             uint32_t, tcp->tcp_cwnd);
 218 }
 219 
 220 void
 221 cc_cong_signal(tcp_t *tcp, uint32_t seg_ack, uint32_t type)
 222 {
 223         uint32_t old_cwnd = tcp->tcp_cwnd;
 224         uint32_t old_cwnd_ssthresh = tcp->tcp_cwnd_ssthresh;
 225         switch (type) {
 226         case CC_NDUPACK:
 227                 if (!IN_FASTRECOVERY(tcp->tcp_ccv.flags)) {
 228                         tcp->tcp_rexmit_max = tcp->tcp_snxt;
 229                         if (tcp->tcp_ecn_ok) {
 230                                 tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
 231                                 tcp->tcp_cwr = B_TRUE;
 232                                 tcp->tcp_ecn_cwr_sent = B_FALSE;
 233                         }
 234                 }
 235                 break;
 236         case CC_ECN:
 237                 if (!IN_CONGRECOVERY(tcp->tcp_ccv.flags)) {
 238                         tcp->tcp_rexmit_max = tcp->tcp_snxt;
 239                         if (tcp->tcp_ecn_ok) {
 240                                 tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
 241                                 tcp->tcp_cwr = B_TRUE;
 242                                 tcp->tcp_ecn_cwr_sent = B_FALSE;
 243                         }
 244                 }
 245                 break;
 246         case CC_RTO:
 247                 tcp->tcp_ccv.flags |= CCF_RTO;
 248                 tcp->tcp_dupack_cnt = 0;
 249                 tcp->tcp_ccv.t_bytes_acked = 0;
 250                 /*
 251                  * Give up on fast recovery and congestion recovery if we were
 252                  * attempting either.
 253                  */
 254                 EXIT_RECOVERY(tcp->tcp_ccv.flags);
 255                 if (CC_ALGO(tcp)->cong_signal == NULL) {
 256                         /*
 257                          * RFC5681 Section 3.1
 258                          * ssthresh = max (FlightSize / 2, 2*SMSS) eq (4)
 259                          */
 260                         tcp->tcp_cwnd_ssthresh = max(
 261                             (tcp->tcp_snxt - tcp->tcp_suna) / 2 / tcp->tcp_mss,
 262                             2) * tcp->tcp_mss;
 263                         tcp->tcp_cwnd = tcp->tcp_mss;
 264                 }
 265 
 266                 if (tcp->tcp_ecn_ok) {
 267                         tcp->tcp_cwr = B_TRUE;
 268                         tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
 269                         tcp->tcp_ecn_cwr_sent = B_FALSE;
 270                 }
 271                 break;
 272         }
 273 
 274         if (CC_ALGO(tcp)->cong_signal != NULL) {
 275                 tcp->tcp_ccv.curack = seg_ack;
 276                 CC_ALGO(tcp)->cong_signal(&tcp->tcp_ccv, type);
 277         }
 278 
 279         DTRACE_PROBE6(cwnd__cc__cong__signal, tcp_t *, tcp, uint32_t, old_cwnd,
 280             uint32_t, tcp->tcp_cwnd, uint32_t, old_cwnd_ssthresh,
 281             uint32_t, tcp->tcp_cwnd_ssthresh, uint32_t, type);
 282 }
 283 
 284 static void
 285 cc_post_recovery(tcp_t *tcp, uint32_t seg_ack)
 286 {
 287         uint32_t old_cwnd = tcp->tcp_cwnd;
 288 
 289         if (CC_ALGO(tcp)->post_recovery != NULL) {
 290                 tcp->tcp_ccv.curack = seg_ack;
 291                 CC_ALGO(tcp)->post_recovery(&tcp->tcp_ccv);
 292         }
 293         tcp->tcp_ccv.t_bytes_acked = 0;
 294 
 295         DTRACE_PROBE3(cwnd__cc__post__recovery, tcp_t *, tcp,
 296             uint32_t, old_cwnd, uint32_t, tcp->tcp_cwnd);
 297 }
 298 
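The wrapper functions above (cc_ack_received(), cc_cong_signal(), cc_post_recovery(), together with the conn_init hook invoked at connection setup) are the points where the algorithm attached to a connection via CC_ALGO() gets to act. The following standalone sketch shows the shape of such a pluggable module: an ops table carrying those four hooks, a New Reno-flavoured ack_received, and the RFC 5681 ssthresh formula quoted above. The type names (cc_state, cc_ops) and the demo algorithm are illustrative assumptions, not the illumos cc interface or any shipped algorithm.

/*
 * Illustrative only: a pluggable congestion-control module expressed as an
 * ops table with the hook names used above.  cc_state/cc_ops are hypothetical
 * stand-ins, not the illumos interface.
 */
#include <stdint.h>
#include <stdio.h>

struct cc_state {			/* per-connection CC variables */
	uint32_t cwnd;			/* congestion window, bytes */
	uint32_t ssthresh;		/* slow-start threshold, bytes */
	uint32_t mss;			/* sender maximum segment size */
	uint32_t flight;		/* bytes currently in flight */
};

struct cc_ops {				/* one instance per algorithm */
	const char *name;
	void (*conn_init)(struct cc_state *);
	void (*ack_received)(struct cc_state *, uint32_t bytes_acked);
	void (*cong_signal)(struct cc_state *);
	void (*post_recovery)(struct cc_state *);
};

static void
demo_conn_init(struct cc_state *cc)
{
	cc->cwnd = 4 * cc->mss;			/* arbitrary initial window */
	cc->ssthresh = UINT32_MAX;
}

static void
demo_ack_received(struct cc_state *cc, uint32_t bytes_acked)
{
	if (cc->cwnd < cc->ssthresh)
		cc->cwnd += bytes_acked;	/* slow start */
	else
		cc->cwnd += cc->mss * cc->mss / cc->cwnd; /* cong. avoidance */
}

static void
demo_cong_signal(struct cc_state *cc)
{
	/* RFC 5681 eq. (4): ssthresh = max(FlightSize / 2, 2 * SMSS) */
	uint32_t half = cc->flight / 2;

	cc->ssthresh = (half > 2 * cc->mss) ? half : 2 * cc->mss;
	cc->cwnd = cc->ssthresh;
}

static void
demo_post_recovery(struct cc_state *cc)
{
	if (cc->cwnd > cc->ssthresh)
		cc->cwnd = cc->ssthresh;	/* deflate after recovery */
}

static const struct cc_ops demo_cc = {
	"demo", demo_conn_init, demo_ack_received,
	demo_cong_signal, demo_post_recovery
};

int
main(void)
{
	struct cc_state cc = { 0, 0, 1460, 8 * 1460 };

	demo_cc.conn_init(&cc);
	demo_cc.ack_received(&cc, 1460);	/* what cc_ack_received() forwards */
	demo_cc.cong_signal(&cc);		/* e.g. on CC_NDUPACK or CC_ECN */
	demo_cc.post_recovery(&cc);
	(void) printf("%s: cwnd=%u ssthresh=%u\n", demo_cc.name,
	    cc.cwnd, cc.ssthresh);
	return (0);
}

A real algorithm would of course operate on the connection's tcp_ccv state and be selected per connection; that indirection is exactly what the wrappers above introduce.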
 299 /*
 300  * Set the MSS associated with a particular tcp based on its current value,
 301  * and a new one passed in. Observe minimums and maximums, and reset other
 302  * state variables that we want to view as multiples of MSS.
 303  *
 304  * The value of MSS could be either increased or decreased.
 305  */
 306 void
 307 tcp_mss_set(tcp_t *tcp, uint32_t mss)
 308 {
 309         uint32_t        mss_max;
 310         tcp_stack_t     *tcps = tcp->tcp_tcps;
 311         conn_t          *connp = tcp->tcp_connp;
 312 
 313         if (connp->conn_ipversion == IPV4_VERSION)
 314                 mss_max = tcps->tcps_mss_max_ipv4;
 315         else
 316                 mss_max = tcps->tcps_mss_max_ipv6;
 317 
 318         if (mss < tcps->tcps_mss_min)
 319                 mss = tcps->tcps_mss_min;

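tcp_mss_set() above clamps the proposed MSS to the stack's limits before recomputing MSS-derived state; only the minimum check is visible in this hunk, with mss_max chosen per address family just before it. A minimal sketch of that clamping with the limits passed in explicitly (hypothetical helper, not the kernel function):

#include <stdint.h>

/* Clamp a proposed MSS to [mss_min, mss_max]; illustrative only. */
static uint32_t
clamp_mss(uint32_t mss, uint32_t mss_min, uint32_t mss_max)
{
	if (mss < mss_min)
		mss = mss_min;
	if (mss > mss_max)
		mss = mss_max;
	return (mss);
}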

 658             IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) + TCP_MIN_HEADER_LENGTH);
 659 
 660         /*
 661          * Set MSS to the smaller one of both ends of the connection.
 662          * We should not have called tcp_mss_set() before, but our
 663          * side of the MSS should have been set to a proper value
 664          * by tcp_set_destination().  tcp_mss_set() will also set up the
 665          * STREAM head parameters properly.
 666          *
 667          * If we have a larger-than-16-bit window but the other side
 668          * didn't want to do window scale, tcp_rwnd_set() will take
 669          * care of that.
 670          */
 671         tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss));
 672 
 673         /*
 674          * Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been
 675          * updated properly.
 676          */
 677         TCP_SET_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial);
 678 
 679         if (tcp->tcp_cc_algo->conn_init != NULL)
 680                 tcp->tcp_cc_algo->conn_init(&tcp->tcp_ccv);
 681 }
 682 
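The hunk above settles the connection MSS as the smaller of the peer's advertised MSS option and our own value, then sizes the initial congestion window from tcps_slow_start_initial and lets the algorithm's conn_init hook adjust it. As a rough illustration, the MIN step and, for comparison, the RFC 3390 initial-window formula; neither function is a claim about what TCP_SET_INIT_CWND expands to.

#include <stdint.h>

/* Negotiated MSS is the smaller of what either side can handle. */
static uint32_t
negotiate_mss(uint32_t peer_opt_mss, uint32_t our_mss)
{
	return (peer_opt_mss < our_mss ? peer_opt_mss : our_mss);
}

/* RFC 3390 initial window: min(4*MSS, max(2*MSS, 4380 bytes)). */
static uint32_t
rfc3390_init_cwnd(uint32_t mss)
{
	uint32_t iw = (2 * mss > 4380) ? 2 * mss : 4380;

	return ((4 * mss < iw) ? 4 * mss : iw);
}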
 683 /*
 684  * Add a new piece to the tcp reassembly queue.  If the gap at the beginning
 685  * is filled, return as much as we can.  The message passed in may be
 686  * multi-part, chained using b_cont.  "start" is the starting sequence
 687  * number for this piece.
 688  */
 689 static mblk_t *
 690 tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start)
 691 {
 692         uint32_t        end, bytes;
 693         mblk_t          *mp1;
 694         mblk_t          *mp2;
 695         mblk_t          *next_mp;
 696         uint32_t        u1;
 697         tcp_stack_t     *tcps = tcp->tcp_tcps;
 698 
 699 
 700         /* Walk through all the new pieces. */


1518                                     "reached: %u attempts dropped total\n",
1519                                     ntohs(listener->tcp_connp->conn_lport),
1520                                     tlc->tlc_max, tlc->tlc_drop);
1521                                 tlc->tlc_report_time = now;
1522                         }
1523                         goto error2;
1524                 }
1525                 tlc_set = B_TRUE;
1526         }
1527 
1528         mutex_exit(&listener->tcp_eager_lock);
1529 
1530         /*
 1531          * IP sets ira_sqp to either the sender's conn_sqp (for loopback)
 1532          * or based on the ring (for packets from GLD). Otherwise it is
 1533          * set based on lbolt, i.e., a somewhat random number.
1534          */
1535         ASSERT(ira->ira_sqp != NULL);
1536         new_sqp = ira->ira_sqp;
1537 
1538         econnp = tcp_get_conn(arg2, tcps);
1539         if (econnp == NULL)
1540                 goto error2;
1541 
1542         ASSERT(econnp->conn_netstack == lconnp->conn_netstack);
1543         econnp->conn_sqp = new_sqp;
1544         econnp->conn_initial_sqp = new_sqp;
1545         econnp->conn_ixa->ixa_sqp = new_sqp;
1546 
1547         econnp->conn_fport = tcpha->tha_lport;
1548         econnp->conn_lport = tcpha->tha_fport;
1549 
1550         err = conn_inherit_parent(lconnp, econnp);
1551         if (err != 0)
1552                 goto error3;
1553 
1554         /* We already know the laddr of the new connection is ours */
1555         econnp->conn_ixa->ixa_src_generation = ipst->ips_src_generation;
1556 
1557         ASSERT(OK_32PTR(mp->b_rptr));
1558         ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION ||


2437 tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
2438 {
2439         int32_t         bytes_acked;
2440         int32_t         gap;
2441         mblk_t          *mp1;
2442         uint_t          flags;
2443         uint32_t        new_swnd = 0;
2444         uchar_t         *iphdr;
2445         uchar_t         *rptr;
2446         int32_t         rgap;
2447         uint32_t        seg_ack;
2448         int             seg_len;
2449         uint_t          ip_hdr_len;
2450         uint32_t        seg_seq;
2451         tcpha_t         *tcpha;
2452         int             urp;
2453         tcp_opt_t       tcpopt;
2454         ip_pkt_t        ipp;
2455         boolean_t       ofo_seg = B_FALSE; /* Out of order segment */
2456         uint32_t        cwnd;
2457         int             mss;
2458         conn_t          *connp = (conn_t *)arg;
2459         squeue_t        *sqp = (squeue_t *)arg2;
2460         tcp_t           *tcp = connp->conn_tcp;
2461         tcp_stack_t     *tcps = tcp->tcp_tcps;
2462         sock_upcalls_t  *sockupcalls;
2463 
2464         /*
2465          * RST from fused tcp loopback peer should trigger an unfuse.
2466          */
2467         if (tcp->tcp_fused) {
2468                 TCP_STAT(tcps, tcp_fusion_aborted);
2469                 tcp_unfuse(tcp);
2470         }
2471 
2472         iphdr = mp->b_rptr;
2473         rptr = mp->b_rptr;
2474         ASSERT(OK_32PTR(rptr));
2475 
2476         ip_hdr_len = ira->ira_ip_hdr_length;


2712                         tcp->tcp_suna = tcp->tcp_iss + 1;
2713                         tcp->tcp_valid_bits &= ~TCP_ISS_VALID;
2714 
2715                         /*
2716                          * If SYN was retransmitted, need to reset all
2717                          * retransmission info.  This is because this
2718                          * segment will be treated as a dup ACK.
2719                          */
2720                         if (tcp->tcp_rexmit) {
2721                                 tcp->tcp_rexmit = B_FALSE;
2722                                 tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
2723                                 tcp->tcp_rexmit_max = tcp->tcp_snxt;
2724                                 tcp->tcp_ms_we_have_waited = 0;
2725 
2726                                 /*
2727                                  * Set tcp_cwnd back to 1 MSS, per
2728                                  * recommendation from
2729                                  * draft-floyd-incr-init-win-01.txt,
2730                                  * Increasing TCP's Initial Window.
2731                                  */
2732                                 DTRACE_PROBE3(cwnd__retransmitted__syn,
2733                                     tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
2734                                     uint32_t, tcp->tcp_mss);
2735                                 tcp->tcp_cwnd = tcp->tcp_mss;
2736                         }
2737 
2738                         tcp->tcp_swl1 = seg_seq;
2739                         tcp->tcp_swl2 = seg_ack;
2740 
2741                         new_swnd = ntohs(tcpha->tha_win);
2742                         tcp->tcp_swnd = new_swnd;
2743                         if (new_swnd > tcp->tcp_max_swnd)
2744                                 tcp->tcp_max_swnd = new_swnd;
2745 
2746                         /*
2747                          * Always send the three-way handshake ack immediately
2748                          * in order to make the connection complete as soon as
2749                          * possible on the accepting host.
2750                          */
2751                         flags |= TH_ACK_NEEDED;
2752 
2753                         /*
2754                          * Trace connect-established here.


3937                             ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *,
3938                             iphdr, tcp_t *, tcp, tcph_t *, tcpha);
3939                 }
3940                 TCPS_CONN_INC(tcps);
3941 
3942                 tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */
3943                 bytes_acked--;
3944                 /* SYN was acked - making progress */
3945                 tcp->tcp_ip_forward_progress = B_TRUE;
3946 
3947                 /*
3948                  * If SYN was retransmitted, need to reset all
3949                  * retransmission info as this segment will be
3950                  * treated as a dup ACK.
3951                  */
3952                 if (tcp->tcp_rexmit) {
3953                         tcp->tcp_rexmit = B_FALSE;
3954                         tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
3955                         tcp->tcp_rexmit_max = tcp->tcp_snxt;
3956                         tcp->tcp_ms_we_have_waited = 0;
3957                         DTRACE_PROBE3(cwnd__retransmitted__syn,
3958                             tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
3959                             uint32_t, tcp->tcp_mss);
3960                         tcp->tcp_cwnd = mss;
3961                 }
3962 
3963                 /*
3964                  * We set the send window to zero here.
3965                  * This is needed if there is data to be
3966                  * processed already on the queue.
3967                  * Later (at swnd_update label), the
 3968                  * "new_swnd > tcp_swnd" condition is satisfied and
3969                  * the XMIT_NEEDED flag is set in the current
3970                  * (SYN_RCVD) state. This ensures tcp_wput_data() is
3971                  * called if there is already data on queue in
3972                  * this state.
3973                  */
3974                 tcp->tcp_swnd = 0;
3975 
3976                 if (new_swnd > tcp->tcp_max_swnd)
3977                         tcp->tcp_max_swnd = new_swnd;
3978                 tcp->tcp_swl1 = seg_seq;
3979                 tcp->tcp_swl2 = seg_ack;


3983                 DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *,
3984                     connp->conn_ixa, void, NULL, tcp_t *, tcp, void, NULL,
3985                     int32_t, TCPS_SYN_RCVD);
3986 
3987                 /* Fuse when both sides are in ESTABLISHED state */
3988                 if (tcp->tcp_loopback && do_tcp_fusion)
3989                         tcp_fuse(tcp, iphdr, tcpha);
3990 
3991         }
3992         /* This code follows 4.4BSD-Lite2 mostly. */
3993         if (bytes_acked < 0)
3994                 goto est;
3995 
3996         /*
3997          * If TCP is ECN capable and the congestion experience bit is
3998          * set, reduce tcp_cwnd and tcp_ssthresh.  But this should only be
3999          * done once per window (or more loosely, per RTT).
4000          */
4001         if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max))
4002                 tcp->tcp_cwr = B_FALSE;
4003         if (tcp->tcp_ecn_ok && (flags & TH_ECE) && !tcp->tcp_cwr) {
4004                 cc_cong_signal(tcp, seg_ack, CC_ECN);
4005                 /*
4006                  * If the cwnd is 0, use the timer to clock out
4007                  * new segments.  This is required by the ECN spec.
4008                  */
4009                 if (tcp->tcp_cwnd == 0)
4010                         TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
4011                 tcp->tcp_cwr = B_TRUE;
4012                 /*
4013                  * This marks the end of the current window of in
4014                  * flight data.  That is why we don't use
4015                  * tcp_suna + tcp_swnd.  Only data in flight can
4016                  * provide ECN info.
4017                  */
4018                 tcp->tcp_cwr_snd_max = tcp->tcp_snxt;

4019         }

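The ECN block above reacts to an ECE-marked ACK at most once per window of in-flight data: CWR state is left once the ACK passes tcp_cwr_snd_max; otherwise congestion is signalled (CC_ECN), tcp_snxt is recorded as the end of the current window, and the retransmit timer takes over if cwnd collapsed to zero. A small sketch of just that once-per-window gate (names are illustrative):

#include <stdbool.h>
#include <stdint.h>

/* 32-bit sequence-space comparison, as SEQ_GT() in the code above. */
#define	SEQ_GT(a, b)	((int32_t)((a) - (b)) > 0)

struct ecn_gate {
	bool cwr;		/* currently inside a reduction window */
	uint32_t cwr_snd_max;	/* snd_nxt recorded at the last reduction */
};

/*
 * Return true when an ECE-marked ACK should trigger a cwnd reduction;
 * at most one reduction is allowed per window of in-flight data.
 */
static bool
ecn_should_reduce(struct ecn_gate *g, uint32_t seg_ack, uint32_t snd_nxt,
    bool ece)
{
	if (g->cwr && SEQ_GT(seg_ack, g->cwr_snd_max))
		g->cwr = false;		/* the marked window is fully acked */
	if (!ece || g->cwr)
		return (false);
	g->cwr = true;
	g->cwr_snd_max = snd_nxt;	/* end of the current window */
	return (true);
}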
4020 
4021         mp1 = tcp->tcp_xmit_head;
4022         if (bytes_acked == 0) {
4023                 if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) {
4024                         int dupack_cnt;
4025 
4026                         TCPS_BUMP_MIB(tcps, tcpInDupAck);
4027                         /*
4028                          * Fast retransmit.  When we have seen exactly three
4029                          * identical ACKs while we have unacked data
4030                          * outstanding we take it as a hint that our peer
4031                          * dropped something.
4032                          *
4033                          * If TCP is retransmitting, don't do fast retransmit.
4034                          */
4035                         if (mp1 && tcp->tcp_suna != tcp->tcp_snxt &&
4036                             ! tcp->tcp_rexmit) {
4037                                 /* Do Limited Transmit */
4038                                 if ((dupack_cnt = ++tcp->tcp_dupack_cnt) <
4039                                     tcps->tcps_dupack_fast_retransmit) {
4040                                         cc_ack_received(tcp, seg_ack,
4041                                             bytes_acked, CC_DUPACK);
4042                                         /*
4043                                          * RFC 3042
4044                                          *
4045                                          * What we need to do is temporarily
4046                                          * increase tcp_cwnd so that new
4047                                          * data can be sent if it is allowed
4048                                          * by the receive window (tcp_rwnd).
4049                                          * tcp_wput_data() will take care of
4050                                          * the rest.
4051                                          *
4052                                          * If the connection is SACK capable,
4053                                          * only do limited xmit when there
4054                                          * is SACK info.
4055                                          *
4056                                          * Note how tcp_cwnd is incremented.
4057                                          * The first dup ACK will increase
4058                                          * it by 1 MSS.  The second dup ACK
4059                                          * will increase it by 2 MSS.  This
4060                                          * means that only 1 new segment will
4061                                          * be sent for each dup ACK.


4068                                                     (tcp->tcp_dupack_cnt - 1);
4069                                                 flags |= TH_LIMIT_XMIT;
4070                                         }
4071                                 } else if (dupack_cnt ==
4072                                     tcps->tcps_dupack_fast_retransmit) {
4073 
4074                                 /*
4075                                  * If we have reduced tcp_ssthresh
4076                                  * because of ECN, do not reduce it again
4077                                  * unless it is already one window of data
4078                                  * away.  After one window of data, tcp_cwr
4079                                  * should then be cleared.  Note that
4080                                  * for non ECN capable connection, tcp_cwr
4081                                  * should always be false.
4082                                  *
4083                                  * Adjust cwnd since the duplicate
4084                                  * ack indicates that a packet was
4085                                  * dropped (due to congestion.)
4086                                  */
4087                                 if (!tcp->tcp_cwr) {
4088                                         cc_cong_signal(tcp, seg_ack,
4089                                             CC_NDUPACK);
4090                                         cc_ack_received(tcp, seg_ack,
4091                                             bytes_acked, CC_DUPACK);
4092                                 }
4093                                 if (tcp->tcp_ecn_ok) {
4094                                         tcp->tcp_cwr = B_TRUE;
4095                                         tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
4096                                         tcp->tcp_ecn_cwr_sent = B_FALSE;
4097                                 }
4098 
4099                                 /*
4100                                  * We do Hoe's algorithm.  Refer to her
4101                                  * paper "Improving the Start-up Behavior
4102                                  * of a Congestion Control Scheme for TCP,"
4103                                  * appeared in SIGCOMM'96.
4104                                  *
4105                                  * Save highest seq no we have sent so far.
4106                                  * Be careful about the invisible FIN byte.
4107                                  */
4108                                 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
4109                                     (tcp->tcp_unsent == 0)) {
4110                                         tcp->tcp_rexmit_max = tcp->tcp_fss;
4111                                 } else {


4133                                                     tcp->tcp_fack;
4134                                                 tcp->tcp_sack_snxt = seg_ack;
4135                                                 flags |= TH_NEED_SACK_REXMIT;
4136                                         } else {
4137                                                 /*
4138                                                  * Always initialize tcp_pipe
4139                                                  * even though we don't have
4140                                                  * any SACK info.  If later
4141                                                  * we get SACK info and
4142                                                  * tcp_pipe is not initialized,
4143                                                  * funny things will happen.
4144                                                  */
4145                                                 tcp->tcp_pipe =
4146                                                     tcp->tcp_cwnd_ssthresh;
4147                                         }
4148                                 } else {
4149                                         flags |= TH_REXMIT_NEEDED;
4150                                 } /* tcp_snd_sack_ok */
4151 
4152                                 } else {
4153                                         cc_ack_received(tcp, seg_ack,
4154                                             bytes_acked, CC_DUPACK);
4155                                         /*
4156                                          * Here we perform congestion
4157                                          * avoidance, but NOT slow start.
4158                                          * This is known as the Fast
4159                                          * Recovery Algorithm.
4160                                          */
4161                                         if (tcp->tcp_snd_sack_ok &&
4162                                             tcp->tcp_notsack_list != NULL) {
4163                                                 flags |= TH_NEED_SACK_REXMIT;
4164                                                 tcp->tcp_pipe -= mss;
4165                                                 if (tcp->tcp_pipe < 0)
4166                                                         tcp->tcp_pipe = 0;
4167                                         } else {
4168                                         /*
4169                                          * We know that one more packet has
4170                                          * left the pipe thus we can update
4171                                          * cwnd.
4172                                          */
4173                                         cwnd = tcp->tcp_cwnd + mss;
4174                                         if (cwnd > tcp->tcp_cwnd_max)
4175                                                 cwnd = tcp->tcp_cwnd_max;
4176                                         DTRACE_PROBE3(cwnd__fast__recovery,
4177                                             tcp_t *, tcp,
4178                                             uint32_t, tcp->tcp_cwnd,
4179                                             uint32_t, cwnd);
4180                                         tcp->tcp_cwnd = cwnd;
4181                                         if (tcp->tcp_unsent > 0)
4182                                                 flags |= TH_XMIT_NEEDED;
4183                                         }
4184                                 }
4185                         }
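/*
 * Illustrative sketch (not the actual stack code): during fast recovery each
 * further duplicate ACK means one more segment has left the network, so the
 * code above inflates tcp_cwnd by one MSS, capped at tcp_cwnd_max.  A
 * simplified version of that arithmetic, with hypothetical names:
 */
#include <stdint.h>

static uint32_t
sketch_inflate_cwnd(uint32_t cwnd, uint32_t mss, uint32_t cwnd_max)
{
	uint32_t inflated = cwnd + mss;	/* one segment has left the pipe */

	/* For example, mss 1460 and cwnd 14600 give 16060, unless capped. */
	return (inflated > cwnd_max ? cwnd_max : inflated);
}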
4186                 } else if (tcp->tcp_zero_win_probe) {
4187                         /*
4188                          * If the window has opened, need to arrange
 4189                          * If the window has opened, we need to arrange
4190                          */
4191                         if (new_swnd != 0) {
4192                                 /* tcp_suna != tcp_snxt */
4193                                 /* Packet contains a window update */
4194                                 TCPS_BUMP_MIB(tcps, tcpInWinUpdate);
4195                                 tcp->tcp_zero_win_probe = 0;
4196                                 tcp->tcp_timer_backoff = 0;
4197                                 tcp->tcp_ms_we_have_waited = 0;
4198 
4199                                 /*


4292 
4293         /*
4294          * TCP gets a new ACK, update the notsack'ed list to delete those
4295          * blocks that are covered by this ACK.
4296          */
4297         if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
4298                 tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack,
4299                     &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list));
4300         }
4301 
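/*
 * Illustrative sketch (not the actual stack code): tcp_notsack_remove() above
 * trims the list of blocks that have not yet been SACKed once a new
 * cumulative ACK covers them.  The stand-alone version below, with
 * hypothetical names (sketch_blk_t, sketch_notsack_remove), assumes a list
 * kept in ascending sequence order; it is not the real list code.
 */
#include <stdint.h>
#include <stdlib.h>

typedef struct sketch_blk {
	uint32_t begin;			/* first sequence number of the hole */
	uint32_t end;			/* one past the last sequence number */
	struct sketch_blk *next;
} sketch_blk_t;

/* Signed wrap-around sequence comparison, the same idea as SEQ_LEQ(). */
#define	SKETCH_SEQ_LEQ(a, b)	((int32_t)((a) - (b)) <= 0)

static void
sketch_notsack_remove(sketch_blk_t **list, uint32_t ack)
{
	sketch_blk_t *blk;

	/* Free every block the cumulative ACK fully covers. */
	while ((blk = *list) != NULL && SKETCH_SEQ_LEQ(blk->end, ack)) {
		*list = blk->next;
		free(blk);
	}
	/* A partially covered block just has its left edge advanced. */
	if (blk != NULL && SKETCH_SEQ_LEQ(blk->begin, ack))
		blk->begin = ack;
}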
4302         /*
4303          * If we got an ACK after fast retransmit, check to see
4304          * if it is a partial ACK.  If it is not and the congestion
4305          * window was inflated to account for the other side's
4306          * cached packets, retract it.  If it is, do Hoe's algorithm.
4307          */
4308         if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) {
4309                 ASSERT(tcp->tcp_rexmit == B_FALSE);
4310                 if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) {
4311                         tcp->tcp_dupack_cnt = 0;
4312 
4313                         cc_post_recovery(tcp, seg_ack);
4314 




4315                         tcp->tcp_rexmit_max = seg_ack;

4316 
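/*
 * Illustrative sketch (not the actual stack code): when the ACK covers all of
 * the data outstanding at the time of the fast retransmit, the call to
 * cc_post_recovery() above lets the plugged-in algorithm deflate the window
 * it inflated during recovery.  The hypothetical helper below shows a
 * NewReno-style deflation along the lines of RFC 6582: drop cwnd back to
 * ssthresh, or to the data still in flight plus one MSS if that is less.
 */
#include <stdint.h>

static uint32_t
sketch_post_recovery_cwnd(uint32_t ssthresh, uint32_t flight, uint32_t mss)
{
	uint32_t limit = flight + mss;	/* enough to send one new segment */

	return (ssthresh < limit ? ssthresh : limit);
}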
4317                         /*
4318                          * Remove all notsack info to avoid confusion with
 4319                          * the next fast retransmit/recovery phase.
4320                          */
4321                         if (tcp->tcp_snd_sack_ok) {
4322                                 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list,
4323                                     tcp);
4324                         }
4325                 } else {
4326                         if (tcp->tcp_snd_sack_ok &&
4327                             tcp->tcp_notsack_list != NULL) {
4328                                 flags |= TH_NEED_SACK_REXMIT;
4329                                 tcp->tcp_pipe -= mss;
4330                                 if (tcp->tcp_pipe < 0)
4331                                         tcp->tcp_pipe = 0;
4332                         } else {
4333                                 /*
4334                                  * Hoe's algorithm:
4335                                  *
4336                                  * Retransmit the unack'ed segment and
4337                                  * restart fast recovery.  Note that we
4338                                  * need to scale back tcp_cwnd to the
4339                                  * original value when we started fast
4340                                  * recovery.  This is to prevent overly
4341                                  * aggressive behaviour in sending new
4342                                  * segments.
4343                                  */
4344                                 cwnd = tcp->tcp_cwnd_ssthresh +
4345                                     tcps->tcps_dupack_fast_retransmit * mss;
4346                                 DTRACE_PROBE3(cwnd__fast__retransmit__part__ack,
4347                                     tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
4348                                     uint32_t, cwnd);
4349                                 tcp->tcp_cwnd = cwnd;
4350                                 tcp->tcp_cwnd_cnt = tcp->tcp_cwnd;
4351                                 flags |= TH_REXMIT_NEEDED;
4352                         }
4353                 }
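/*
 * Illustrative sketch (not the actual stack code): for a partial ACK with no
 * SACK information, the code above restarts fast recovery by scaling
 * tcp_cwnd back to tcp_cwnd_ssthresh plus tcps_dupack_fast_retransmit
 * segments.  A hypothetical helper, with assumed example values:
 */
#include <stdint.h>

static uint32_t
sketch_partial_ack_cwnd(uint32_t ssthresh, uint32_t dupack_thresh,
    uint32_t mss)
{
	/*
	 * With an assumed mss of 1460, ssthresh of 8760 and the usual
	 * threshold of three duplicate ACKs, this yields 13140 bytes.
	 */
	return (ssthresh + dupack_thresh * mss);
}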
4354         } else {
4355                 tcp->tcp_dupack_cnt = 0;
4356                 if (tcp->tcp_rexmit) {
4357                         /*
 4358                          * TCP is retransmitting.  If the ACK acks all
4359                          * outstanding data, update tcp_rexmit_max and
4360                          * tcp_rexmit_nxt.  Otherwise, update tcp_rexmit_nxt
4361                          * to the correct value.
4362                          *
4363                          * Note that SEQ_LEQ() is used.  This is to avoid
4364                          * unnecessary fast retransmit caused by dup ACKs
4365                          * received when TCP does slow start retransmission
 4366                          * after a timeout.  During this phase, TCP may
4367                          * send out segments which are already received.
4368                          * This causes dup ACKs to be sent back.
4369                          */


4390                 tcp->tcp_timer_backoff = 0;
4391         }
4392 
4393         /*
4394          * If tcp_xmit_head is NULL, then it must be the FIN being ack'ed.
4395          * Note that it cannot be the SYN being ack'ed.  The code flow
4396          * will not reach here.
4397          */
4398         if (mp1 == NULL) {
4399                 goto fin_acked;
4400         }
4401 
4402         /*
4403          * Update the congestion window.
4404          *
4405          * If TCP is not ECN capable or TCP is ECN capable but the
4406          * congestion experience bit is not set, increase the tcp_cwnd as
4407          * usual.
4408          */
4409         if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) {
4410                 if (IN_RECOVERY(tcp->tcp_ccv.flags)) {
4411                         EXIT_RECOVERY(tcp->tcp_ccv.flags);

















4412                 }
4413                 cc_ack_received(tcp, seg_ack, bytes_acked, CC_ACK);
4414         }


4415 
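/*
 * Illustrative sketch (not the actual stack code): cc_ack_received(..., CC_ACK)
 * above is where the selected congestion-control module grows the window on a
 * new ACK.  The hypothetical helper below shows the classic RFC 5681 policy a
 * NewReno-style module would apply: exponential growth in slow start, roughly
 * one MSS per round trip in congestion avoidance.
 */
#include <stdint.h>

static uint32_t
sketch_ack_received_cwnd(uint32_t cwnd, uint32_t ssthresh, uint32_t mss,
    uint32_t bytes_acked)
{
	if (cwnd < ssthresh) {
		/* Slow start: grow by the newly acked bytes, at most one MSS. */
		cwnd += (bytes_acked < mss ? bytes_acked : mss);
	} else {
		/* Congestion avoidance: about one MSS per window's worth of ACKs. */
		cwnd += (mss * mss) / cwnd;
	}
	return (cwnd);
}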
4416         /* See if the latest urgent data has been acknowledged */
4417         if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
4418             SEQ_GT(seg_ack, tcp->tcp_urg))
4419                 tcp->tcp_valid_bits &= ~TCP_URG_VALID;
4420 
4421         /*
4422          * Update the RTT estimates. Note that we don't use the TCP
4423          * timestamp option to calculate RTT even if one is present. This is
4424          * because the timestamp option's resolution (CPU tick) is
4425          * too coarse to measure modern datacenter networks' microsecond
4426          * latencies. The timestamp field's resolution is limited by its
 4427          * 4-byte width (see RFC 1323), and since we always store a
 4428          * high-resolution nanosecond-precision timestamp along with the data,
 4429          * there is no point in ever using the timestamp option.
4430          */
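/*
 * Illustrative sketch (not the actual stack code): the comment above explains
 * that RTT samples come from the stack's own high-resolution send timestamps
 * rather than the TCP timestamp option.  However the sample is obtained, the
 * smoothing can follow the textbook RFC 6298 estimator; a hypothetical
 * nanosecond-based version is below (the stack's own tcp_set_rto() may differ
 * in its fixed-point details and clamping).
 */
#include <stdint.h>

typedef struct {
	int64_t srtt;		/* smoothed round-trip time, ns */
	int64_t rttvar;		/* round-trip time variance, ns */
	int64_t rto;		/* retransmission timeout, ns */
} sketch_rtt_t;

static void
sketch_update_rto(sketch_rtt_t *rt, int64_t rtt_ns)
{
	if (rt->srtt == 0) {
		/* First sample: SRTT = R, RTTVAR = R/2. */
		rt->srtt = rtt_ns;
		rt->rttvar = rtt_ns / 2;
	} else {
		int64_t delta = rtt_ns - rt->srtt;

		if (delta < 0)
			delta = -delta;
		/* RTTVAR = 3/4 RTTVAR + 1/4 |SRTT - R'|, SRTT = 7/8 SRTT + 1/8 R'. */
		rt->rttvar = (3 * rt->rttvar + delta) / 4;
		rt->srtt = (7 * rt->srtt + rtt_ns) / 8;
	}
	/* RTO = SRTT + 4 * RTTVAR (clamping to stack minimums omitted). */
	rt->rto = rt->srtt + 4 * rt->rttvar;
}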
4431         if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
4432                 /*
4433                  * An ACK sequence we haven't seen before, so get the RTT
4434                  * and update the RTO. But first check if the timestamp is


5727                 default:
5728                         break;
5729                 }
5730                 break;
5731         case ICMP_SOURCE_QUENCH: {
5732                 /*
 5733                  * Use a global boolean to control
5734                  * whether TCP should respond to ICMP_SOURCE_QUENCH.
5735                  * The default is false.
5736                  */
5737                 if (tcp_icmp_source_quench) {
5738                         /*
5739                          * Reduce the sending rate as if we got a
5740                          * retransmit timeout
5741                          */
5742                         uint32_t npkt;
5743 
5744                         npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) /
5745                             tcp->tcp_mss;
5746                         tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss;
5747 
5748                         DTRACE_PROBE3(cwnd__source__quench, tcp_t *, tcp,
5749                             uint32_t, tcp->tcp_cwnd,
5750                             uint32_t, tcp->tcp_mss);
5751                         tcp->tcp_cwnd = tcp->tcp_mss;
5752                         tcp->tcp_cwnd_cnt = 0;
5753                 }
5754                 break;
5755         }
5756         }
5757         freemsg(mp);
5758 }
5759 
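/*
 * Illustrative sketch (not the actual stack code): the ICMP_SOURCE_QUENCH case
 * above halves the outstanding data to derive a new slow-start threshold
 * (never less than two segments) and collapses the window to a single MSS.
 * A hypothetical stand-alone version of that arithmetic:
 */
#include <stdint.h>

static void
sketch_source_quench(uint32_t snxt, uint32_t suna, uint32_t mss,
    uint32_t *ssthresh, uint32_t *cwnd)
{
	/* Half the unacknowledged data, expressed in whole segments. */
	uint32_t npkt = ((snxt - suna) >> 1) / mss;

	/*
	 * Example: 29200 bytes outstanding in 1460-byte segments gives
	 * npkt = 10, so ssthresh becomes 14600 and cwnd drops to 1460.
	 */
	*ssthresh = (npkt > 2 ? npkt : 2) * mss;
	*cwnd = mss;
}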
5760 /*
5761  * tcp_icmp_error_ipv6 is called from tcp_icmp_input to process ICMPv6
5762  * error messages passed up by IP.
5763  * Assumes that IP has pulled up all the extension headers as well
5764  * as the ICMPv6 header.
5765  */
5766 static void
5767 tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, ip_recv_attr_t *ira)
5768 {
5769         icmp6_t         *icmp6;
5770         ip6_t           *ip6h;