Print this page
11553 Want pluggable TCP congestion control algorithms
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Robert Mustacchi <robert.mustacchi@joyent.com>

*** 168,177 **** --- 168,304 ---- static void tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *); static void tcp_set_rto(tcp_t *, hrtime_t); static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *); /* + * CC wrapper hook functions + * + * Each wrapper adjusts connection state (tcp_t fields and tcp->tcp_ccv), + * calls the matching hook of CC_ALGO(tcp) when that hook is non-NULL, and + * fires a DTrace probe reporting tcp_cwnd before and after. + * + * cc_ack_received() stores bytes_acked in bytes_this_ack and sets + * CCF_CWND_LIMITED when tcp_cwnd <= tcp_swnd (clearing it otherwise). For + * type CC_ACK it also maintains t_bytes_acked and CCF_ABC_SENTAWND. The + * ack_received hook then receives the ccv and type, with seg_ack in curack. + */ + static void + cc_ack_received(tcp_t *tcp, uint32_t seg_ack, int32_t bytes_acked, + uint16_t type) + { + uint32_t old_cwnd = tcp->tcp_cwnd; + + tcp->tcp_ccv.bytes_this_ack = bytes_acked; + if (tcp->tcp_cwnd <= tcp->tcp_swnd) + tcp->tcp_ccv.flags |= CCF_CWND_LIMITED; + else + tcp->tcp_ccv.flags &= ~CCF_CWND_LIMITED; + + if (type == CC_ACK) { + if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) { + if (tcp->tcp_ccv.flags & CCF_RTO) + tcp->tcp_ccv.flags &= ~CCF_RTO; + /* + * Credit at most tcps_abc_l_var * tcp_mss bytes per ACK; once + * a full tcp_cwnd of bytes has accumulated, deduct it and set + * CCF_ABC_SENTAWND. + */ + tcp->tcp_ccv.t_bytes_acked += + min(tcp->tcp_ccv.bytes_this_ack, + tcp->tcp_tcps->tcps_abc_l_var * tcp->tcp_mss); + if (tcp->tcp_ccv.t_bytes_acked >= tcp->tcp_cwnd) { + tcp->tcp_ccv.t_bytes_acked -= tcp->tcp_cwnd; + tcp->tcp_ccv.flags |= CCF_ABC_SENTAWND; + } + } else { /* tcp_cwnd <= tcp_cwnd_ssthresh: restart the byte count */ + tcp->tcp_ccv.flags &= ~CCF_ABC_SENTAWND; + tcp->tcp_ccv.t_bytes_acked = 0; + } + } + + if (CC_ALGO(tcp)->ack_received != NULL) { + /* + * The FreeBSD code where this originated had a comment "Find + * a way to live without this" in several places where curack + * got set. If they eventually dump curack from the cc + * variables, we'll need to adapt our code. 
+ */ + tcp->tcp_ccv.curack = seg_ack; + CC_ALGO(tcp)->ack_received(&tcp->tcp_ccv, type); + } + + DTRACE_PROBE3(cwnd__cc__ack__received, tcp_t *, tcp, uint32_t, old_cwnd, + uint32_t, tcp->tcp_cwnd); + } + /* + * Handle a congestion event of the given type. + * + * CC_NDUPACK / CC_ECN: unless the corresponding recovery state is already + * flagged in tcp_ccv.flags (IN_FASTRECOVERY / IN_CONGRECOVERY), set + * tcp_rexmit_max to tcp_snxt and, when tcp_ecn_ok, arm the CWR state + * (tcp_cwr, tcp_cwr_snd_max, tcp_ecn_cwr_sent). + * + * CC_RTO: set CCF_RTO, clear tcp_dupack_cnt and t_bytes_acked, leave + * recovery, and arm the CWR state when tcp_ecn_ok. If the algorithm has + * no cong_signal hook, tcp_cwnd_ssthresh and tcp_cwnd are reset here. + * + * Any other type skips the switch (there is no default case). In every + * case the cong_signal hook, when present, is then called with seg_ack + * stored in curack, and a probe reports the old and new cwnd and ssthresh. + * + * Unlike the other two wrappers this function is not declared static. + */ + void + cc_cong_signal(tcp_t *tcp, uint32_t seg_ack, uint32_t type) + { + uint32_t old_cwnd = tcp->tcp_cwnd; + uint32_t old_cwnd_ssthresh = tcp->tcp_cwnd_ssthresh; + switch (type) { + case CC_NDUPACK: + if (!IN_FASTRECOVERY(tcp->tcp_ccv.flags)) { + tcp->tcp_rexmit_max = tcp->tcp_snxt; + if (tcp->tcp_ecn_ok) { + tcp->tcp_cwr_snd_max = tcp->tcp_snxt; + tcp->tcp_cwr = B_TRUE; + tcp->tcp_ecn_cwr_sent = B_FALSE; + } + } + break; + case CC_ECN: + if (!IN_CONGRECOVERY(tcp->tcp_ccv.flags)) { + tcp->tcp_rexmit_max = tcp->tcp_snxt; + if (tcp->tcp_ecn_ok) { + tcp->tcp_cwr_snd_max = tcp->tcp_snxt; + tcp->tcp_cwr = B_TRUE; + tcp->tcp_ecn_cwr_sent = B_FALSE; + } + } + break; + case CC_RTO: + tcp->tcp_ccv.flags |= CCF_RTO; + tcp->tcp_dupack_cnt = 0; + tcp->tcp_ccv.t_bytes_acked = 0; + /* + * Give up on fast recovery and congestion recovery if we were + * attempting either. 
+ */ + EXIT_RECOVERY(tcp->tcp_ccv.flags); + if (CC_ALGO(tcp)->cong_signal == NULL) { + /* + * RFC5681 Section 3.1 + * ssthresh = max (FlightSize / 2, 2*SMSS) eq (4) + * + * FlightSize is taken as tcp_snxt - tcp_suna. The division by + * tcp_mss before the multiplication rounds the result down to a + * whole number of segments. tcp_cwnd restarts at one tcp_mss. + */ + tcp->tcp_cwnd_ssthresh = max( + (tcp->tcp_snxt - tcp->tcp_suna) / 2 / tcp->tcp_mss, + 2) * tcp->tcp_mss; + tcp->tcp_cwnd = tcp->tcp_mss; + } + + if (tcp->tcp_ecn_ok) { + tcp->tcp_cwr = B_TRUE; + tcp->tcp_cwr_snd_max = tcp->tcp_snxt; + tcp->tcp_ecn_cwr_sent = B_FALSE; + } + break; + } + + if (CC_ALGO(tcp)->cong_signal != NULL) { + tcp->tcp_ccv.curack = seg_ack; + CC_ALGO(tcp)->cong_signal(&tcp->tcp_ccv, type); + } + + DTRACE_PROBE6(cwnd__cc__cong__signal, tcp_t *, tcp, uint32_t, old_cwnd, + uint32_t, tcp->tcp_cwnd, uint32_t, old_cwnd_ssthresh, + uint32_t, tcp->tcp_cwnd_ssthresh, uint32_t, type); + } + /* + * Run the algorithm's post_recovery hook, when it has one, with seg_ack + * stored in curack. t_bytes_acked is cleared afterwards whether or not a + * hook exists, and a probe reports tcp_cwnd before and after. + */ + static void + cc_post_recovery(tcp_t *tcp, uint32_t seg_ack) + { + uint32_t old_cwnd = tcp->tcp_cwnd; + + if (CC_ALGO(tcp)->post_recovery != NULL) { + tcp->tcp_ccv.curack = seg_ack; + CC_ALGO(tcp)->post_recovery(&tcp->tcp_ccv); + } + tcp->tcp_ccv.t_bytes_acked = 0; + + DTRACE_PROBE3(cwnd__cc__post__recovery, tcp_t *, tcp, + uint32_t, old_cwnd, uint32_t, tcp->tcp_cwnd); + } + + /* * Set the MSS associated with a particular tcp based on its current value, * and a new one passed in. Observe minimums and maximums, and reset other * state variables that we want to view as multiples of MSS. * * The value of MSS could be either increased or descreased.
*** 546,555 **** --- 673,685 ---- /* * Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been * updated properly. */ TCP_SET_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial); + + if (tcp->tcp_cc_algo->conn_init != NULL) + tcp->tcp_cc_algo->conn_init(&tcp->tcp_ccv); } /* * Add a new piece to the tcp reassembly queue. If the gap at the beginning * is filled, return as much as we can. The message passed in may be
*** 1403,1413 **** * set based on lbolt i.e., a somewhat random number. */ ASSERT(ira->ira_sqp != NULL); new_sqp = ira->ira_sqp; ! econnp = (conn_t *)tcp_get_conn(arg2, tcps); if (econnp == NULL) goto error2; ASSERT(econnp->conn_netstack == lconnp->conn_netstack); econnp->conn_sqp = new_sqp; --- 1533,1543 ---- * set based on lbolt i.e., a somewhat random number. */ ASSERT(ira->ira_sqp != NULL); new_sqp = ira->ira_sqp; ! econnp = tcp_get_conn(arg2, tcps); if (econnp == NULL) goto error2; ASSERT(econnp->conn_netstack == lconnp->conn_netstack); econnp->conn_sqp = new_sqp;
*** 2322,2333 **** int urp; tcp_opt_t tcpopt; ip_pkt_t ipp; boolean_t ofo_seg = B_FALSE; /* Out of order segment */ uint32_t cwnd; - uint32_t add; - int npkt; int mss; conn_t *connp = (conn_t *)arg; squeue_t *sqp = (squeue_t *)arg2; tcp_t *tcp = connp->conn_tcp; tcp_stack_t *tcps = tcp->tcp_tcps; --- 2452,2461 ----
*** 2599,2608 **** --- 2727,2739 ---- * Set tcp_cwnd back to 1 MSS, per * recommendation from * draft-floyd-incr-init-win-01.txt, * Increasing TCP's Initial Window. */ + DTRACE_PROBE3(cwnd__retransmitted__syn, + tcp_t *, tcp, uint32_t, tcp->tcp_cwnd, + uint32_t, tcp->tcp_mss); tcp->tcp_cwnd = tcp->tcp_mss; } tcp->tcp_swl1 = seg_seq; tcp->tcp_swl2 = seg_ack;
*** 3821,3830 **** --- 3952,3964 ---- if (tcp->tcp_rexmit) { tcp->tcp_rexmit = B_FALSE; tcp->tcp_rexmit_nxt = tcp->tcp_snxt; tcp->tcp_rexmit_max = tcp->tcp_snxt; tcp->tcp_ms_we_have_waited = 0; + DTRACE_PROBE3(cwnd__retransmitted__syn, + tcp_t *, tcp, uint32_t, tcp->tcp_cwnd, + uint32_t, tcp->tcp_mss); tcp->tcp_cwnd = mss; } /* * We set the send window to zero here.
*** 3864,3901 **** * set, reduce tcp_cwnd and tcp_ssthresh. But this should only be * done once per window (or more loosely, per RTT). */ if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max)) tcp->tcp_cwr = B_FALSE; ! if (tcp->tcp_ecn_ok && (flags & TH_ECE)) { ! if (!tcp->tcp_cwr) { ! npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / mss; ! tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss; ! tcp->tcp_cwnd = npkt * mss; /* * If the cwnd is 0, use the timer to clock out * new segments. This is required by the ECN spec. */ ! if (npkt == 0) { TCP_TIMER_RESTART(tcp, tcp->tcp_rto); - /* - * This makes sure that when the ACK comes - * back, we will increase tcp_cwnd by 1 MSS. - */ - tcp->tcp_cwnd_cnt = 0; - } tcp->tcp_cwr = B_TRUE; /* * This marks the end of the current window of in * flight data. That is why we don't use * tcp_suna + tcp_swnd. Only data in flight can * provide ECN info. */ tcp->tcp_cwr_snd_max = tcp->tcp_snxt; - tcp->tcp_ecn_cwr_sent = B_FALSE; } - } mp1 = tcp->tcp_xmit_head; if (bytes_acked == 0) { if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) { int dupack_cnt; --- 3998,4024 ---- * set, reduce tcp_cwnd and tcp_ssthresh. But this should only be * done once per window (or more loosely, per RTT). */ if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max)) tcp->tcp_cwr = B_FALSE; ! if (tcp->tcp_ecn_ok && (flags & TH_ECE) && !tcp->tcp_cwr) { ! cc_cong_signal(tcp, seg_ack, CC_ECN); /* * If the cwnd is 0, use the timer to clock out * new segments. This is required by the ECN spec. */ ! if (tcp->tcp_cwnd == 0) TCP_TIMER_RESTART(tcp, tcp->tcp_rto); tcp->tcp_cwr = B_TRUE; /* * This marks the end of the current window of in * flight data. That is why we don't use * tcp_suna + tcp_swnd. Only data in flight can * provide ECN info. */ tcp->tcp_cwr_snd_max = tcp->tcp_snxt; } mp1 = tcp->tcp_xmit_head; if (bytes_acked == 0) { if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) { int dupack_cnt;
*** 3912,3921 **** --- 4035,4046 ---- if (mp1 && tcp->tcp_suna != tcp->tcp_snxt && ! tcp->tcp_rexmit) { /* Do Limited Transmit */ if ((dupack_cnt = ++tcp->tcp_dupack_cnt) < tcps->tcps_dupack_fast_retransmit) { + cc_ack_received(tcp, seg_ack, + bytes_acked, CC_DUPACK); /* * RFC 3042 * * What we need to do is temporarily * increase tcp_cwnd so that new
*** 3958,3973 **** * Adjust cwnd since the duplicate * ack indicates that a packet was * dropped (due to congestion.) */ if (!tcp->tcp_cwr) { ! npkt = ((tcp->tcp_snxt - ! tcp->tcp_suna) >> 1) / mss; ! tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * ! mss; ! tcp->tcp_cwnd = (npkt + ! tcp->tcp_dupack_cnt) * mss; } if (tcp->tcp_ecn_ok) { tcp->tcp_cwr = B_TRUE; tcp->tcp_cwr_snd_max = tcp->tcp_snxt; tcp->tcp_ecn_cwr_sent = B_FALSE; --- 4083,4096 ---- * Adjust cwnd since the duplicate * ack indicates that a packet was * dropped (due to congestion.) */ if (!tcp->tcp_cwr) { ! cc_cong_signal(tcp, seg_ack, ! CC_NDUPACK); ! cc_ack_received(tcp, seg_ack, ! bytes_acked, CC_DUPACK); } if (tcp->tcp_ecn_ok) { tcp->tcp_cwr = B_TRUE; tcp->tcp_cwr_snd_max = tcp->tcp_snxt; tcp->tcp_ecn_cwr_sent = B_FALSE;
*** 4025,4034 **** --- 4148,4159 ---- } else { flags |= TH_REXMIT_NEEDED; } /* tcp_snd_sack_ok */ } else { + cc_ack_received(tcp, seg_ack, + bytes_acked, CC_DUPACK); /* * Here we perform congestion * avoidance, but NOT slow start. * This is known as the Fast * Recovery Algorithm.
*** 4046,4055 **** --- 4171,4184 ---- * cwnd. */ cwnd = tcp->tcp_cwnd + mss; if (cwnd > tcp->tcp_cwnd_max) cwnd = tcp->tcp_cwnd_max; + DTRACE_PROBE3(cwnd__fast__recovery, + tcp_t *, tcp, + uint32_t, tcp->tcp_cwnd, + uint32_t, cwnd); tcp->tcp_cwnd = cwnd; if (tcp->tcp_unsent > 0) flags |= TH_XMIT_NEEDED; } }
*** 4178,4196 **** */ if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) { ASSERT(tcp->tcp_rexmit == B_FALSE); if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) { tcp->tcp_dupack_cnt = 0; ! /* ! * Restore the orig tcp_cwnd_ssthresh after ! * fast retransmit phase. ! */ ! if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) { ! tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh; ! } tcp->tcp_rexmit_max = seg_ack; - tcp->tcp_cwnd_cnt = 0; /* * Remove all notsack info to avoid confusion with * the next fast retrasnmit/recovery phase. */ --- 4307,4320 ---- */ if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) { ASSERT(tcp->tcp_rexmit == B_FALSE); if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) { tcp->tcp_dupack_cnt = 0; ! ! cc_post_recovery(tcp, seg_ack); ! tcp->tcp_rexmit_max = seg_ack; /* * Remove all notsack info to avoid confusion with * the next fast retrasnmit/recovery phase. */
*** 4215,4226 **** * original value when we started fast * recovery. This is to prevent overly * aggressive behaviour in sending new * segments. */ ! tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh + tcps->tcps_dupack_fast_retransmit * mss; tcp->tcp_cwnd_cnt = tcp->tcp_cwnd; flags |= TH_REXMIT_NEEDED; } } } else { --- 4339,4354 ---- * original value when we started fast * recovery. This is to prevent overly * aggressive behaviour in sending new * segments. */ ! cwnd = tcp->tcp_cwnd_ssthresh + tcps->tcps_dupack_fast_retransmit * mss; + DTRACE_PROBE3(cwnd__fast__retransmit__part__ack, + tcp_t *, tcp, uint32_t, tcp->tcp_cwnd, + uint32_t, cwnd); + tcp->tcp_cwnd = cwnd; tcp->tcp_cwnd_cnt = tcp->tcp_cwnd; flags |= TH_REXMIT_NEEDED; } } } else {
*** 4277,4309 **** * If TCP is not ECN capable or TCP is ECN capable but the * congestion experience bit is not set, increase the tcp_cwnd as * usual. */ if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) { ! cwnd = tcp->tcp_cwnd; ! add = mss; ! ! if (cwnd >= tcp->tcp_cwnd_ssthresh) { ! /* ! * This is to prevent an increase of less than 1 MSS of ! * tcp_cwnd. With partial increase, tcp_wput_data() ! * may send out tinygrams in order to preserve mblk ! * boundaries. ! * ! * By initializing tcp_cwnd_cnt to new tcp_cwnd and ! * decrementing it by 1 MSS for every ACKs, tcp_cwnd is ! * increased by 1 MSS for every RTTs. ! */ ! if (tcp->tcp_cwnd_cnt <= 0) { ! tcp->tcp_cwnd_cnt = cwnd + add; ! } else { ! tcp->tcp_cwnd_cnt -= add; ! add = 0; } } - tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max); - } /* See if the latest urgent data has been acknowledged */ if ((tcp->tcp_valid_bits & TCP_URG_VALID) && SEQ_GT(seg_ack, tcp->tcp_urg)) tcp->tcp_valid_bits &= ~TCP_URG_VALID; --- 4405,4419 ---- * If TCP is not ECN capable or TCP is ECN capable but the * congestion experience bit is not set, increase the tcp_cwnd as * usual. */ if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) { ! if (IN_RECOVERY(tcp->tcp_ccv.flags)) { ! EXIT_RECOVERY(tcp->tcp_ccv.flags); } + cc_ack_received(tcp, seg_ack, bytes_acked, CC_ACK); } /* See if the latest urgent data has been acknowledged */ if ((tcp->tcp_valid_bits & TCP_URG_VALID) && SEQ_GT(seg_ack, tcp->tcp_urg)) tcp->tcp_valid_bits &= ~TCP_URG_VALID;
*** 5632,5641 **** --- 5742,5755 ---- uint32_t npkt; npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / tcp->tcp_mss; tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss; + + DTRACE_PROBE3(cwnd__source__quench, tcp_t *, tcp, + uint32_t, tcp->tcp_cwnd, + uint32_t, tcp->tcp_mss); tcp->tcp_cwnd = tcp->tcp_mss; tcp->tcp_cwnd_cnt = 0; } break; }