Print this page
11553 Want pluggable TCP congestion control algorithms
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Robert Mustacchi <robert.mustacchi@joyent.com>
@@ -168,10 +168,137 @@
static void tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
static void tcp_set_rto(tcp_t *, hrtime_t);
static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
/*
+ * CC wrapper hook functions
+ */
+/*
+ * Hand an incoming ACK to the connection's congestion control module.
+ *
+ *	tcp		connection whose CC variables (tcp_ccv) are updated
+ *	seg_ack		ACK number; copied into curack only when the module
+ *			provides an ack_received hook
+ *	bytes_acked	recorded in bytes_this_ack
+ *	type		ACK type (CC_ACK, CC_DUPACK, ...), passed to the hook
+ *
+ * Before the hook runs, CCF_CWND_LIMITED is refreshed from the cwnd/swnd
+ * comparison and, for CC_ACK only, the t_bytes_acked accumulator and
+ * CCF_ABC_SENTAWND are maintained.  A DTrace probe reports cwnd as it was
+ * on entry and as it is on exit.
+ */
+static void
+cc_ack_received(tcp_t *tcp, uint32_t seg_ack, int32_t bytes_acked,
+    uint16_t type)
+{
+	uint32_t cwnd_on_entry = tcp->tcp_cwnd;
+
+	tcp->tcp_ccv.bytes_this_ack = bytes_acked;
+
+	if (tcp->tcp_cwnd > tcp->tcp_swnd)
+		tcp->tcp_ccv.flags &= ~CCF_CWND_LIMITED;
+	else
+		tcp->tcp_ccv.flags |= CCF_CWND_LIMITED;
+
+	if (type == CC_ACK && tcp->tcp_cwnd <= tcp->tcp_cwnd_ssthresh) {
+		/* cwnd is at or below ssthresh: restart the byte count. */
+		tcp->tcp_ccv.t_bytes_acked = 0;
+		tcp->tcp_ccv.flags &= ~CCF_ABC_SENTAWND;
+	} else if (type == CC_ACK) {
+		/* cwnd is above ssthresh; any pending RTO mark is dropped. */
+		tcp->tcp_ccv.flags &= ~CCF_RTO;
+
+		/* Credit this ACK, capped at tcps_abc_l_var * MSS bytes. */
+		tcp->tcp_ccv.t_bytes_acked +=
+		    min(tcp->tcp_ccv.bytes_this_ack,
+		    tcp->tcp_tcps->tcps_abc_l_var * tcp->tcp_mss);
+
+		/* A full cwnd has been acked: flag it, keep the excess. */
+		if (tcp->tcp_ccv.t_bytes_acked >= tcp->tcp_cwnd) {
+			tcp->tcp_ccv.t_bytes_acked -= tcp->tcp_cwnd;
+			tcp->tcp_ccv.flags |= CCF_ABC_SENTAWND;
+		}
+	}
+
+	if (CC_ALGO(tcp)->ack_received != NULL) {
+		/*
+		 * curack is carried over from the FreeBSD code this was
+		 * derived from, where several of the places that set it
+		 * were marked "Find a way to live without this".  Should
+		 * FreeBSD ever remove curack from the cc variables, this
+		 * code will have to be adapted.
+		 */
+		tcp->tcp_ccv.curack = seg_ack;
+		CC_ALGO(tcp)->ack_received(&tcp->tcp_ccv, type);
+	}
+
+	DTRACE_PROBE3(cwnd__cc__ack__received, tcp_t *, tcp,
+	    uint32_t, cwnd_on_entry, uint32_t, tcp->tcp_cwnd);
+}
+
+/*
+ * Report a congestion event to the connection's congestion control module.
+ *
+ *	tcp		connection being signalled
+ *	seg_ack		ACK number; copied into curack only when the module
+ *			provides a cong_signal hook
+ *	type		CC_NDUPACK, CC_ECN or CC_RTO; any other value skips
+ *			the local bookkeeping and is passed straight to the
+ *			hook
+ *
+ * All TCP-side bookkeeping is finished before the hook is called.  A DTrace
+ * probe reports cwnd and ssthresh as they were on entry and as they are on
+ * exit, together with the signal type.
+ */
+void
+cc_cong_signal(tcp_t *tcp, uint32_t seg_ack, uint32_t type)
+{
+	uint32_t cwnd_on_entry = tcp->tcp_cwnd;
+	uint32_t ssthresh_on_entry = tcp->tcp_cwnd_ssthresh;
+	boolean_t arm_cwr = B_FALSE;
+
+	if ((type == CC_NDUPACK && !IN_FASTRECOVERY(tcp->tcp_ccv.flags)) ||
+	    (type == CC_ECN && !IN_CONGRECOVERY(tcp->tcp_ccv.flags))) {
+		/*
+		 * Not yet in the recovery state that matches this signal:
+		 * remember how far we had sent when it arrived.
+		 */
+		tcp->tcp_rexmit_max = tcp->tcp_snxt;
+		arm_cwr = B_TRUE;
+	} else if (type == CC_RTO) {
+		tcp->tcp_dupack_cnt = 0;
+		tcp->tcp_ccv.t_bytes_acked = 0;
+		tcp->tcp_ccv.flags |= CCF_RTO;
+		/*
+		 * A timeout abandons fast recovery and congestion recovery
+		 * if either was in progress.
+		 */
+		EXIT_RECOVERY(tcp->tcp_ccv.flags);
+
+		if (CC_ALGO(tcp)->cong_signal == NULL) {
+			/*
+			 * The module has no hook of its own, so apply
+			 * RFC5681 Section 3.1, eq (4):
+			 *	ssthresh = max (FlightSize / 2, 2*SMSS)
+			 * and fall back to a cwnd of one MSS.
+			 */
+			tcp->tcp_cwnd_ssthresh = max(
+			    (tcp->tcp_snxt - tcp->tcp_suna) / 2 / tcp->tcp_mss,
+			    2) * tcp->tcp_mss;
+			tcp->tcp_cwnd = tcp->tcp_mss;
+		}
+		arm_cwr = B_TRUE;
+	}
+
+	/* ECN connections also record the CWR state for this event. */
+	if (arm_cwr && tcp->tcp_ecn_ok) {
+		tcp->tcp_cwr = B_TRUE;
+		tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
+		tcp->tcp_ecn_cwr_sent = B_FALSE;
+	}
+
+	if (CC_ALGO(tcp)->cong_signal != NULL) {
+		tcp->tcp_ccv.curack = seg_ack;
+		CC_ALGO(tcp)->cong_signal(&tcp->tcp_ccv, type);
+	}
+
+	DTRACE_PROBE6(cwnd__cc__cong__signal, tcp_t *, tcp,
+	    uint32_t, cwnd_on_entry, uint32_t, tcp->tcp_cwnd,
+	    uint32_t, ssthresh_on_entry, uint32_t, tcp->tcp_cwnd_ssthresh,
+	    uint32_t, type);
+}
+
+/*
+ * Run the congestion control module's post_recovery hook, if it has one,
+ * with curack set to seg_ack.  Whether or not a hook exists, the
+ * t_bytes_acked accumulator is cleared afterwards.  A DTrace probe reports
+ * cwnd as it was on entry and as it is on exit.
+ */
+static void
+cc_post_recovery(tcp_t *tcp, uint32_t seg_ack)
+{
+	uint32_t cwnd_on_entry = tcp->tcp_cwnd;
+
+	if (CC_ALGO(tcp)->post_recovery != NULL) {
+		tcp->tcp_ccv.curack = seg_ack;
+		CC_ALGO(tcp)->post_recovery(&tcp->tcp_ccv);
+	}
+
+	/* Byte counting starts over once the hook has had its say. */
+	tcp->tcp_ccv.t_bytes_acked = 0;
+
+	DTRACE_PROBE3(cwnd__cc__post__recovery, tcp_t *, tcp,
+	    uint32_t, cwnd_on_entry, uint32_t, tcp->tcp_cwnd);
+}
+
+/*
* Set the MSS associated with a particular tcp based on its current value,
* and a new one passed in. Observe minimums and maximums, and reset other
* state variables that we want to view as multiples of MSS.
*
* The value of MSS could be either increased or decreased.
@@ -546,10 +673,13 @@
/*
* Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been
* updated properly.
*/
TCP_SET_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial);
+
+ if (tcp->tcp_cc_algo->conn_init != NULL)
+ tcp->tcp_cc_algo->conn_init(&tcp->tcp_ccv);
}
/*
* Add a new piece to the tcp reassembly queue. If the gap at the beginning
* is filled, return as much as we can. The message passed in may be
@@ -1403,11 +1533,11 @@
* set based on lbolt i.e., a somewhat random number.
*/
ASSERT(ira->ira_sqp != NULL);
new_sqp = ira->ira_sqp;
- econnp = (conn_t *)tcp_get_conn(arg2, tcps);
+ econnp = tcp_get_conn(arg2, tcps);
if (econnp == NULL)
goto error2;
ASSERT(econnp->conn_netstack == lconnp->conn_netstack);
econnp->conn_sqp = new_sqp;
@@ -2322,12 +2452,10 @@
int urp;
tcp_opt_t tcpopt;
ip_pkt_t ipp;
boolean_t ofo_seg = B_FALSE; /* Out of order segment */
uint32_t cwnd;
- uint32_t add;
- int npkt;
int mss;
conn_t *connp = (conn_t *)arg;
squeue_t *sqp = (squeue_t *)arg2;
tcp_t *tcp = connp->conn_tcp;
tcp_stack_t *tcps = tcp->tcp_tcps;
@@ -2599,10 +2727,13 @@
* Set tcp_cwnd back to 1 MSS, per
* recommendation from
* draft-floyd-incr-init-win-01.txt,
* Increasing TCP's Initial Window.
*/
+ DTRACE_PROBE3(cwnd__retransmitted__syn,
+ tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
+ uint32_t, tcp->tcp_mss);
tcp->tcp_cwnd = tcp->tcp_mss;
}
tcp->tcp_swl1 = seg_seq;
tcp->tcp_swl2 = seg_ack;
@@ -3821,10 +3952,13 @@
if (tcp->tcp_rexmit) {
tcp->tcp_rexmit = B_FALSE;
tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
tcp->tcp_rexmit_max = tcp->tcp_snxt;
tcp->tcp_ms_we_have_waited = 0;
+ DTRACE_PROBE3(cwnd__retransmitted__syn,
+ tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
+ uint32_t, tcp->tcp_mss);
tcp->tcp_cwnd = mss;
}
/*
* We set the send window to zero here.
@@ -3864,38 +3998,27 @@
* set, reduce tcp_cwnd and tcp_ssthresh. But this should only be
* done once per window (or more loosely, per RTT).
*/
if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max))
tcp->tcp_cwr = B_FALSE;
- if (tcp->tcp_ecn_ok && (flags & TH_ECE)) {
- if (!tcp->tcp_cwr) {
- npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / mss;
- tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss;
- tcp->tcp_cwnd = npkt * mss;
+ if (tcp->tcp_ecn_ok && (flags & TH_ECE) && !tcp->tcp_cwr) {
+ cc_cong_signal(tcp, seg_ack, CC_ECN);
/*
* If the cwnd is 0, use the timer to clock out
* new segments. This is required by the ECN spec.
*/
- if (npkt == 0) {
+ if (tcp->tcp_cwnd == 0)
TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
- /*
- * This makes sure that when the ACK comes
- * back, we will increase tcp_cwnd by 1 MSS.
- */
- tcp->tcp_cwnd_cnt = 0;
- }
tcp->tcp_cwr = B_TRUE;
/*
* This marks the end of the current window of in
* flight data. That is why we don't use
* tcp_suna + tcp_swnd. Only data in flight can
* provide ECN info.
*/
tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
- tcp->tcp_ecn_cwr_sent = B_FALSE;
}
- }
mp1 = tcp->tcp_xmit_head;
if (bytes_acked == 0) {
if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) {
int dupack_cnt;
@@ -3912,10 +4035,12 @@
if (mp1 && tcp->tcp_suna != tcp->tcp_snxt &&
! tcp->tcp_rexmit) {
/* Do Limited Transmit */
if ((dupack_cnt = ++tcp->tcp_dupack_cnt) <
tcps->tcps_dupack_fast_retransmit) {
+ cc_ack_received(tcp, seg_ack,
+ bytes_acked, CC_DUPACK);
/*
* RFC 3042
*
* What we need to do is temporarily
* increase tcp_cwnd so that new
@@ -3958,16 +4083,14 @@
* Adjust cwnd since the duplicate
* ack indicates that a packet was
* dropped (due to congestion.)
*/
if (!tcp->tcp_cwr) {
- npkt = ((tcp->tcp_snxt -
- tcp->tcp_suna) >> 1) / mss;
- tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
- mss;
- tcp->tcp_cwnd = (npkt +
- tcp->tcp_dupack_cnt) * mss;
+ cc_cong_signal(tcp, seg_ack,
+ CC_NDUPACK);
+ cc_ack_received(tcp, seg_ack,
+ bytes_acked, CC_DUPACK);
}
if (tcp->tcp_ecn_ok) {
tcp->tcp_cwr = B_TRUE;
tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
tcp->tcp_ecn_cwr_sent = B_FALSE;
@@ -4025,10 +4148,12 @@
} else {
flags |= TH_REXMIT_NEEDED;
} /* tcp_snd_sack_ok */
} else {
+ cc_ack_received(tcp, seg_ack,
+ bytes_acked, CC_DUPACK);
/*
* Here we perform congestion
* avoidance, but NOT slow start.
* This is known as the Fast
* Recovery Algorithm.
@@ -4046,10 +4171,14 @@
* cwnd.
*/
cwnd = tcp->tcp_cwnd + mss;
if (cwnd > tcp->tcp_cwnd_max)
cwnd = tcp->tcp_cwnd_max;
+ DTRACE_PROBE3(cwnd__fast__recovery,
+ tcp_t *, tcp,
+ uint32_t, tcp->tcp_cwnd,
+ uint32_t, cwnd);
tcp->tcp_cwnd = cwnd;
if (tcp->tcp_unsent > 0)
flags |= TH_XMIT_NEEDED;
}
}
@@ -4178,19 +4307,14 @@
*/
if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) {
ASSERT(tcp->tcp_rexmit == B_FALSE);
if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) {
tcp->tcp_dupack_cnt = 0;
- /*
- * Restore the orig tcp_cwnd_ssthresh after
- * fast retransmit phase.
- */
- if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
- tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh;
- }
+
+ cc_post_recovery(tcp, seg_ack);
+
tcp->tcp_rexmit_max = seg_ack;
- tcp->tcp_cwnd_cnt = 0;
/*
* Remove all notsack info to avoid confusion with
* the next fast retransmit/recovery phase.
*/
@@ -4215,12 +4339,16 @@
* original value when we started fast
* recovery. This is to prevent overly
* aggressive behaviour in sending new
* segments.
*/
- tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh +
+ cwnd = tcp->tcp_cwnd_ssthresh +
tcps->tcps_dupack_fast_retransmit * mss;
+ DTRACE_PROBE3(cwnd__fast__retransmit__part__ack,
+ tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
+ uint32_t, cwnd);
+ tcp->tcp_cwnd = cwnd;
tcp->tcp_cwnd_cnt = tcp->tcp_cwnd;
flags |= TH_REXMIT_NEEDED;
}
}
} else {
@@ -4277,33 +4405,15 @@
* If TCP is not ECN capable or TCP is ECN capable but the
* congestion experience bit is not set, increase the tcp_cwnd as
* usual.
*/
if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) {
- cwnd = tcp->tcp_cwnd;
- add = mss;
-
- if (cwnd >= tcp->tcp_cwnd_ssthresh) {
- /*
- * This is to prevent an increase of less than 1 MSS of
- * tcp_cwnd. With partial increase, tcp_wput_data()
- * may send out tinygrams in order to preserve mblk
- * boundaries.
- *
- * By initializing tcp_cwnd_cnt to new tcp_cwnd and
- * decrementing it by 1 MSS for every ACKs, tcp_cwnd is
- * increased by 1 MSS for every RTTs.
- */
- if (tcp->tcp_cwnd_cnt <= 0) {
- tcp->tcp_cwnd_cnt = cwnd + add;
- } else {
- tcp->tcp_cwnd_cnt -= add;
- add = 0;
+ if (IN_RECOVERY(tcp->tcp_ccv.flags)) {
+ EXIT_RECOVERY(tcp->tcp_ccv.flags);
}
+ cc_ack_received(tcp, seg_ack, bytes_acked, CC_ACK);
}
- tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max);
- }
/* See if the latest urgent data has been acknowledged */
if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
SEQ_GT(seg_ack, tcp->tcp_urg))
tcp->tcp_valid_bits &= ~TCP_URG_VALID;
@@ -5632,10 +5742,14 @@
uint32_t npkt;
npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) /
tcp->tcp_mss;
tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss;
+
+ DTRACE_PROBE3(cwnd__source__quench, tcp_t *, tcp,
+ uint32_t, tcp->tcp_cwnd,
+ uint32_t, tcp->tcp_mss);
tcp->tcp_cwnd = tcp->tcp_mss;
tcp->tcp_cwnd_cnt = 0;
}
break;
}