Print this page
11553 Want pluggable TCP congestion control algorithms
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Robert Mustacchi <robert.mustacchi@joyent.com>
*** 168,177 ****
--- 168,304 ----
static void tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
static void tcp_set_rto(tcp_t *, hrtime_t);
static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
/*
+ * CC wrapper hook functions
+ */
+ /*
+  * Congestion-control wrapper invoked for each acceptable ACK.  It
+  * refreshes the per-connection cc variables (tcp_ccv) that the
+  * pluggable algorithm consumes, then hands the ACK to the algorithm's
+  * ack_received() hook (if any).  'type' is CC_ACK for an in-sequence
+  * ACK or CC_DUPACK for a duplicate ACK.
+  */
+ static void
+ cc_ack_received(tcp_t *tcp, uint32_t seg_ack, int32_t bytes_acked,
+ uint16_t type)
+ {
+ uint32_t old_cwnd = tcp->tcp_cwnd;
+
+ tcp->tcp_ccv.bytes_this_ack = bytes_acked;
+ /*
+  * Tell the algorithm whether the congestion window, rather than the
+  * peer's advertised window, is what currently limits transmission.
+  */
+ if (tcp->tcp_cwnd <= tcp->tcp_swnd)
+ tcp->tcp_ccv.flags |= CCF_CWND_LIMITED;
+ else
+ tcp->tcp_ccv.flags &= ~CCF_CWND_LIMITED;
+
+ /*
+  * Appropriate Byte Counting (RFC 3465) bookkeeping, done only for
+  * normal ACKs and only while in congestion avoidance (cwnd above
+  * ssthresh).  tcps_abc_l_var bounds the per-ACK byte credit ("L").
+  */
+ if (type == CC_ACK) {
+ if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
+ if (tcp->tcp_ccv.flags & CCF_RTO)
+ tcp->tcp_ccv.flags &= ~CCF_RTO;
+
+ tcp->tcp_ccv.t_bytes_acked +=
+ min(tcp->tcp_ccv.bytes_this_ack,
+ tcp->tcp_tcps->tcps_abc_l_var * tcp->tcp_mss);
+ if (tcp->tcp_ccv.t_bytes_acked >= tcp->tcp_cwnd) {
+ /* A full cwnd of data was acked: note it for the algo. */
+ tcp->tcp_ccv.t_bytes_acked -= tcp->tcp_cwnd;
+ tcp->tcp_ccv.flags |= CCF_ABC_SENTAWND;
+ }
+ } else {
+ tcp->tcp_ccv.flags &= ~CCF_ABC_SENTAWND;
+ tcp->tcp_ccv.t_bytes_acked = 0;
+ }
+ }
+
+ if (CC_ALGO(tcp)->ack_received != NULL) {
+ /*
+ * The FreeBSD code where this originated had a comment "Find
+ * a way to live without this" in several places where curack
+ * got set. If they eventually dump curack from the cc
+ * variables, we'll need to adapt our code.
+ */
+ tcp->tcp_ccv.curack = seg_ack;
+ CC_ALGO(tcp)->ack_received(&tcp->tcp_ccv, type);
+ }
+
+ /* Trace the cwnd transition made by the algorithm, if any. */
+ DTRACE_PROBE3(cwnd__cc__ack__received, tcp_t *, tcp, uint32_t, old_cwnd,
+ uint32_t, tcp->tcp_cwnd);
+ }
+
+ /*
+  * Congestion-control wrapper invoked when a congestion event is
+  * detected: CC_NDUPACK on entering fast retransmit (dup-ACK
+  * threshold), CC_ECN on receipt of an ECN echo, or CC_RTO on a
+  * retransmission timeout.  Performs the protocol-mandated state
+  * changes and then notifies the algorithm's cong_signal() hook.
+  */
+ void
+ cc_cong_signal(tcp_t *tcp, uint32_t seg_ack, uint32_t type)
+ {
+ uint32_t old_cwnd = tcp->tcp_cwnd;
+ uint32_t old_cwnd_ssthresh = tcp->tcp_cwnd_ssthresh;
+ switch (type) {
+ /* Entering fast retransmit/recovery via the duplicate-ACK threshold. */
+ case CC_NDUPACK:
+ if (!IN_FASTRECOVERY(tcp->tcp_ccv.flags)) {
+ tcp->tcp_rexmit_max = tcp->tcp_snxt;
+ /* Start an ECN CWR episode covering the data now in flight. */
+ if (tcp->tcp_ecn_ok) {
+ tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
+ tcp->tcp_cwr = B_TRUE;
+ tcp->tcp_ecn_cwr_sent = B_FALSE;
+ }
+ }
+ break;
+ /* Congestion signalled by an ECN echo (TH_ECE) from the peer. */
+ case CC_ECN:
+ if (!IN_CONGRECOVERY(tcp->tcp_ccv.flags)) {
+ tcp->tcp_rexmit_max = tcp->tcp_snxt;
+ if (tcp->tcp_ecn_ok) {
+ tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
+ tcp->tcp_cwr = B_TRUE;
+ tcp->tcp_ecn_cwr_sent = B_FALSE;
+ }
+ }
+ break;
+ /* Retransmission timeout fired. */
+ case CC_RTO:
+ tcp->tcp_ccv.flags |= CCF_RTO;
+ tcp->tcp_dupack_cnt = 0;
+ tcp->tcp_ccv.t_bytes_acked = 0;
+ /*
+ * Give up on fast recovery and congestion recovery if we were
+ * attempting either.
+ */
+ EXIT_RECOVERY(tcp->tcp_ccv.flags);
+ if (CC_ALGO(tcp)->cong_signal == NULL) {
+ /*
+ * RFC5681 Section 3.1
+ * ssthresh = max (FlightSize / 2, 2*SMSS) eq (4)
+ */
+ tcp->tcp_cwnd_ssthresh = max(
+ (tcp->tcp_snxt - tcp->tcp_suna) / 2 / tcp->tcp_mss,
+ 2) * tcp->tcp_mss;
+ tcp->tcp_cwnd = tcp->tcp_mss;
+ }
+
+ if (tcp->tcp_ecn_ok) {
+ tcp->tcp_cwr = B_TRUE;
+ tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
+ tcp->tcp_ecn_cwr_sent = B_FALSE;
+ }
+ break;
+ }
+
+ /* Let the algorithm react (e.g. cut cwnd/ssthresh its own way). */
+ if (CC_ALGO(tcp)->cong_signal != NULL) {
+ tcp->tcp_ccv.curack = seg_ack;
+ CC_ALGO(tcp)->cong_signal(&tcp->tcp_ccv, type);
+ }
+
+ DTRACE_PROBE6(cwnd__cc__cong__signal, tcp_t *, tcp, uint32_t, old_cwnd,
+ uint32_t, tcp->tcp_cwnd, uint32_t, old_cwnd_ssthresh,
+ uint32_t, tcp->tcp_cwnd_ssthresh, uint32_t, type);
+ }
+
+ /*
+  * Congestion-control wrapper invoked when leaving fast recovery
+  * (i.e. once the retransmitted data has been fully acknowledged).
+  * Gives the algorithm a chance to restore/adjust cwnd via its
+  * post_recovery() hook, then resets the ABC byte-count accumulator.
+  */
+ static void
+ cc_post_recovery(tcp_t *tcp, uint32_t seg_ack)
+ {
+ uint32_t old_cwnd = tcp->tcp_cwnd;
+
+ if (CC_ALGO(tcp)->post_recovery != NULL) {
+ tcp->tcp_ccv.curack = seg_ack;
+ CC_ALGO(tcp)->post_recovery(&tcp->tcp_ccv);
+ }
+ tcp->tcp_ccv.t_bytes_acked = 0;
+
+ DTRACE_PROBE3(cwnd__cc__post__recovery, tcp_t *, tcp,
+ uint32_t, old_cwnd, uint32_t, tcp->tcp_cwnd);
+ }
+
+ /*
* Set the MSS associated with a particular tcp based on its current value,
* and a new one passed in. Observe minimums and maximums, and reset other
* state variables that we want to view as multiples of MSS.
*
* The value of MSS could be either increased or decreased.
*** 546,555 ****
--- 673,685 ----
/*
* Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been
* updated properly.
*/
TCP_SET_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial);
+
+ if (tcp->tcp_cc_algo->conn_init != NULL)
+ tcp->tcp_cc_algo->conn_init(&tcp->tcp_ccv);
}
/*
* Add a new piece to the tcp reassembly queue. If the gap at the beginning
* is filled, return as much as we can. The message passed in may be
*** 1403,1413 ****
* set based on lbolt i.e., a somewhat random number.
*/
ASSERT(ira->ira_sqp != NULL);
new_sqp = ira->ira_sqp;
! econnp = (conn_t *)tcp_get_conn(arg2, tcps);
if (econnp == NULL)
goto error2;
ASSERT(econnp->conn_netstack == lconnp->conn_netstack);
econnp->conn_sqp = new_sqp;
--- 1533,1543 ----
* set based on lbolt i.e., a somewhat random number.
*/
ASSERT(ira->ira_sqp != NULL);
new_sqp = ira->ira_sqp;
! econnp = tcp_get_conn(arg2, tcps);
if (econnp == NULL)
goto error2;
ASSERT(econnp->conn_netstack == lconnp->conn_netstack);
econnp->conn_sqp = new_sqp;
*** 2322,2333 ****
int urp;
tcp_opt_t tcpopt;
ip_pkt_t ipp;
boolean_t ofo_seg = B_FALSE; /* Out of order segment */
uint32_t cwnd;
- uint32_t add;
- int npkt;
int mss;
conn_t *connp = (conn_t *)arg;
squeue_t *sqp = (squeue_t *)arg2;
tcp_t *tcp = connp->conn_tcp;
tcp_stack_t *tcps = tcp->tcp_tcps;
--- 2452,2461 ----
*** 2599,2608 ****
--- 2727,2739 ----
* Set tcp_cwnd back to 1 MSS, per
* recommendation from
* draft-floyd-incr-init-win-01.txt,
* Increasing TCP's Initial Window.
*/
+ DTRACE_PROBE3(cwnd__retransmitted__syn,
+ tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
+ uint32_t, tcp->tcp_mss);
tcp->tcp_cwnd = tcp->tcp_mss;
}
tcp->tcp_swl1 = seg_seq;
tcp->tcp_swl2 = seg_ack;
*** 3821,3830 ****
--- 3952,3964 ----
if (tcp->tcp_rexmit) {
tcp->tcp_rexmit = B_FALSE;
tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
tcp->tcp_rexmit_max = tcp->tcp_snxt;
tcp->tcp_ms_we_have_waited = 0;
+ DTRACE_PROBE3(cwnd__retransmitted__syn,
+ tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
+ uint32_t, tcp->tcp_mss);
tcp->tcp_cwnd = mss;
}
/*
* We set the send window to zero here.
*** 3864,3901 ****
* set, reduce tcp_cwnd and tcp_ssthresh. But this should only be
* done once per window (or more loosely, per RTT).
*/
if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max))
tcp->tcp_cwr = B_FALSE;
! if (tcp->tcp_ecn_ok && (flags & TH_ECE)) {
! if (!tcp->tcp_cwr) {
! npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / mss;
! tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss;
! tcp->tcp_cwnd = npkt * mss;
/*
* If the cwnd is 0, use the timer to clock out
* new segments. This is required by the ECN spec.
*/
! if (npkt == 0) {
TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
- /*
- * This makes sure that when the ACK comes
- * back, we will increase tcp_cwnd by 1 MSS.
- */
- tcp->tcp_cwnd_cnt = 0;
- }
tcp->tcp_cwr = B_TRUE;
/*
* This marks the end of the current window of in
* flight data. That is why we don't use
* tcp_suna + tcp_swnd. Only data in flight can
* provide ECN info.
*/
tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
- tcp->tcp_ecn_cwr_sent = B_FALSE;
}
- }
mp1 = tcp->tcp_xmit_head;
if (bytes_acked == 0) {
if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) {
int dupack_cnt;
--- 3998,4024 ----
* set, reduce tcp_cwnd and tcp_ssthresh. But this should only be
* done once per window (or more loosely, per RTT).
*/
if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max))
tcp->tcp_cwr = B_FALSE;
! if (tcp->tcp_ecn_ok && (flags & TH_ECE) && !tcp->tcp_cwr) {
! cc_cong_signal(tcp, seg_ack, CC_ECN);
/*
* If the cwnd is 0, use the timer to clock out
* new segments. This is required by the ECN spec.
*/
! if (tcp->tcp_cwnd == 0)
TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
tcp->tcp_cwr = B_TRUE;
/*
* This marks the end of the current window of in
* flight data. That is why we don't use
* tcp_suna + tcp_swnd. Only data in flight can
* provide ECN info.
*/
tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
}
mp1 = tcp->tcp_xmit_head;
if (bytes_acked == 0) {
if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) {
int dupack_cnt;
*** 3912,3921 ****
--- 4035,4046 ----
if (mp1 && tcp->tcp_suna != tcp->tcp_snxt &&
! tcp->tcp_rexmit) {
/* Do Limited Transmit */
if ((dupack_cnt = ++tcp->tcp_dupack_cnt) <
tcps->tcps_dupack_fast_retransmit) {
+ cc_ack_received(tcp, seg_ack,
+ bytes_acked, CC_DUPACK);
/*
* RFC 3042
*
* What we need to do is temporarily
* increase tcp_cwnd so that new
*** 3958,3973 ****
* Adjust cwnd since the duplicate
* ack indicates that a packet was
* dropped (due to congestion.)
*/
if (!tcp->tcp_cwr) {
! npkt = ((tcp->tcp_snxt -
! tcp->tcp_suna) >> 1) / mss;
! tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
! mss;
! tcp->tcp_cwnd = (npkt +
! tcp->tcp_dupack_cnt) * mss;
}
if (tcp->tcp_ecn_ok) {
tcp->tcp_cwr = B_TRUE;
tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
tcp->tcp_ecn_cwr_sent = B_FALSE;
--- 4083,4096 ----
* Adjust cwnd since the duplicate
* ack indicates that a packet was
* dropped (due to congestion.)
*/
if (!tcp->tcp_cwr) {
! cc_cong_signal(tcp, seg_ack,
! CC_NDUPACK);
! cc_ack_received(tcp, seg_ack,
! bytes_acked, CC_DUPACK);
}
if (tcp->tcp_ecn_ok) {
tcp->tcp_cwr = B_TRUE;
tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
tcp->tcp_ecn_cwr_sent = B_FALSE;
*** 4025,4034 ****
--- 4148,4159 ----
} else {
flags |= TH_REXMIT_NEEDED;
} /* tcp_snd_sack_ok */
} else {
+ cc_ack_received(tcp, seg_ack,
+ bytes_acked, CC_DUPACK);
/*
* Here we perform congestion
* avoidance, but NOT slow start.
* This is known as the Fast
* Recovery Algorithm.
*** 4046,4055 ****
--- 4171,4184 ----
* cwnd.
*/
cwnd = tcp->tcp_cwnd + mss;
if (cwnd > tcp->tcp_cwnd_max)
cwnd = tcp->tcp_cwnd_max;
+ DTRACE_PROBE3(cwnd__fast__recovery,
+ tcp_t *, tcp,
+ uint32_t, tcp->tcp_cwnd,
+ uint32_t, cwnd);
tcp->tcp_cwnd = cwnd;
if (tcp->tcp_unsent > 0)
flags |= TH_XMIT_NEEDED;
}
}
*** 4178,4196 ****
*/
if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) {
ASSERT(tcp->tcp_rexmit == B_FALSE);
if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) {
tcp->tcp_dupack_cnt = 0;
! /*
! * Restore the orig tcp_cwnd_ssthresh after
! * fast retransmit phase.
! */
! if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
! tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh;
! }
tcp->tcp_rexmit_max = seg_ack;
- tcp->tcp_cwnd_cnt = 0;
/*
* Remove all notsack info to avoid confusion with
* the next fast retransmit/recovery phase.
*/
--- 4307,4320 ----
*/
if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) {
ASSERT(tcp->tcp_rexmit == B_FALSE);
if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) {
tcp->tcp_dupack_cnt = 0;
!
! cc_post_recovery(tcp, seg_ack);
!
tcp->tcp_rexmit_max = seg_ack;
/*
* Remove all notsack info to avoid confusion with
* the next fast retransmit/recovery phase.
*/
*** 4215,4226 ****
* original value when we started fast
* recovery. This is to prevent overly
* aggressive behaviour in sending new
* segments.
*/
! tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh +
tcps->tcps_dupack_fast_retransmit * mss;
tcp->tcp_cwnd_cnt = tcp->tcp_cwnd;
flags |= TH_REXMIT_NEEDED;
}
}
} else {
--- 4339,4354 ----
* original value when we started fast
* recovery. This is to prevent overly
* aggressive behaviour in sending new
* segments.
*/
! cwnd = tcp->tcp_cwnd_ssthresh +
tcps->tcps_dupack_fast_retransmit * mss;
+ DTRACE_PROBE3(cwnd__fast__retransmit__part__ack,
+ tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
+ uint32_t, cwnd);
+ tcp->tcp_cwnd = cwnd;
tcp->tcp_cwnd_cnt = tcp->tcp_cwnd;
flags |= TH_REXMIT_NEEDED;
}
}
} else {
*** 4277,4309 ****
* If TCP is not ECN capable or TCP is ECN capable but the
* congestion experience bit is not set, increase the tcp_cwnd as
* usual.
*/
if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) {
! cwnd = tcp->tcp_cwnd;
! add = mss;
!
! if (cwnd >= tcp->tcp_cwnd_ssthresh) {
! /*
! * This is to prevent an increase of less than 1 MSS of
! * tcp_cwnd. With partial increase, tcp_wput_data()
! * may send out tinygrams in order to preserve mblk
! * boundaries.
! *
! * By initializing tcp_cwnd_cnt to new tcp_cwnd and
! * decrementing it by 1 MSS for every ACKs, tcp_cwnd is
! * increased by 1 MSS for every RTTs.
! */
! if (tcp->tcp_cwnd_cnt <= 0) {
! tcp->tcp_cwnd_cnt = cwnd + add;
! } else {
! tcp->tcp_cwnd_cnt -= add;
! add = 0;
}
}
- tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max);
- }
/* See if the latest urgent data has been acknowledged */
if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
SEQ_GT(seg_ack, tcp->tcp_urg))
tcp->tcp_valid_bits &= ~TCP_URG_VALID;
--- 4405,4419 ----
* If TCP is not ECN capable or TCP is ECN capable but the
* congestion experience bit is not set, increase the tcp_cwnd as
* usual.
*/
if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) {
! if (IN_RECOVERY(tcp->tcp_ccv.flags)) {
! EXIT_RECOVERY(tcp->tcp_ccv.flags);
}
+ cc_ack_received(tcp, seg_ack, bytes_acked, CC_ACK);
}
/* See if the latest urgent data has been acknowledged */
if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
SEQ_GT(seg_ack, tcp->tcp_urg))
tcp->tcp_valid_bits &= ~TCP_URG_VALID;
*** 5632,5641 ****
--- 5742,5755 ----
uint32_t npkt;
npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) /
tcp->tcp_mss;
tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss;
+
+ DTRACE_PROBE3(cwnd__source__quench, tcp_t *, tcp,
+ uint32_t, tcp->tcp_cwnd,
+ uint32_t, tcp->tcp_mss);
tcp->tcp_cwnd = tcp->tcp_mss;
tcp->tcp_cwnd_cnt = 0;
}
break;
}