Print this page
11553 Want pluggable TCP congestion control algorithms
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Robert Mustacchi <robert.mustacchi@joyent.com>

@@ -168,10 +168,137 @@
 static void     tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
 static void     tcp_set_rto(tcp_t *, hrtime_t);
 static void     tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
 
 /*
+ * CC wrapper hook functions
+ */
+/*
+ * ACK hook.  Records this ACK in the connection's congestion control
+ * variables, maintains the byte-counting state for CC_ACK, and then passes
+ * the ACK to the algorithm's ack_received handler when one is provided.
+ */
+static void
+cc_ack_received(tcp_t *tcp, uint32_t seg_ack, int32_t bytes_acked,
+    uint16_t type)
+{
+        uint32_t cwnd_before = tcp->tcp_cwnd;
+
+        tcp->tcp_ccv.bytes_this_ack = bytes_acked;
+
+        /* Flag whether cwnd is no larger than the send window. */
+        if (tcp->tcp_cwnd > tcp->tcp_swnd)
+                tcp->tcp_ccv.flags &= ~CCF_CWND_LIMITED;
+        else
+                tcp->tcp_ccv.flags |= CCF_CWND_LIMITED;
+
+        if (type == CC_ACK) {
+                if (tcp->tcp_cwnd <= tcp->tcp_cwnd_ssthresh) {
+                        /* At or below ssthresh: restart byte counting. */
+                        tcp->tcp_ccv.t_bytes_acked = 0;
+                        tcp->tcp_ccv.flags &= ~CCF_ABC_SENTAWND;
+                } else {
+                        /* Clearing an already-clear bit changes nothing. */
+                        tcp->tcp_ccv.flags &= ~CCF_RTO;
+
+                        /*
+                         * Credit this ACK, capped at tcps_abc_l_var
+                         * segments' worth of bytes.
+                         */
+                        tcp->tcp_ccv.t_bytes_acked +=
+                            min(tcp->tcp_ccv.bytes_this_ack,
+                            tcp->tcp_tcps->tcps_abc_l_var * tcp->tcp_mss);
+
+                        /* At least a full cwnd of bytes has been counted. */
+                        if (tcp->tcp_ccv.t_bytes_acked >= tcp->tcp_cwnd) {
+                                tcp->tcp_ccv.flags |= CCF_ABC_SENTAWND;
+                                tcp->tcp_ccv.t_bytes_acked -= tcp->tcp_cwnd;
+                        }
+                }
+        }
+
+        if (CC_ALGO(tcp)->ack_received != NULL) {
+                /*
+                 * curack is published for the algorithm before every call.
+                 * The FreeBSD code this was derived from carried the remark
+                 * "Find a way to live without this" wherever curack was set;
+                 * should curack ever be dropped from the cc variables there,
+                 * this code will have to be adapted.
+                 */
+                tcp->tcp_ccv.curack = seg_ack;
+                CC_ALGO(tcp)->ack_received(&tcp->tcp_ccv, type);
+        }
+
+        DTRACE_PROBE3(cwnd__cc__ack__received, tcp_t *, tcp,
+            uint32_t, cwnd_before, uint32_t, tcp->tcp_cwnd);
+}
+
+/*
+ * On an ECN-capable connection, note that the congestion window is being
+ * reduced: remember the current tcp_snxt as the end of the affected window
+ * and clear the record of a CWR having been sent.  Does nothing when ECN
+ * was not negotiated.
+ */
+static void
+cc_start_cwr(tcp_t *tcp)
+{
+        if (!tcp->tcp_ecn_ok)
+                return;
+
+        tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
+        tcp->tcp_cwr = B_TRUE;
+        tcp->tcp_ecn_cwr_sent = B_FALSE;
+}
+
+/*
+ * Congestion signal hook.  Performs the per-signal bookkeeping on the tcp_t
+ * for CC_NDUPACK, CC_ECN and CC_RTO, then passes the signal to the
+ * algorithm's cong_signal handler when one is provided.
+ */
+void
+cc_cong_signal(tcp_t *tcp, uint32_t seg_ack, uint32_t type)
+{
+        uint32_t cwnd_before = tcp->tcp_cwnd;
+        uint32_t ssthresh_before = tcp->tcp_cwnd_ssthresh;
+
+        switch (type) {
+        case CC_NDUPACK:
+                /* Only act if fast recovery is not already under way. */
+                if (!IN_FASTRECOVERY(tcp->tcp_ccv.flags)) {
+                        tcp->tcp_rexmit_max = tcp->tcp_snxt;
+                        cc_start_cwr(tcp);
+                }
+                break;
+
+        case CC_ECN:
+                /* Only act if congestion recovery is not under way. */
+                if (!IN_CONGRECOVERY(tcp->tcp_ccv.flags)) {
+                        tcp->tcp_rexmit_max = tcp->tcp_snxt;
+                        cc_start_cwr(tcp);
+                }
+                break;
+
+        case CC_RTO:
+                tcp->tcp_ccv.t_bytes_acked = 0;
+                tcp->tcp_dupack_cnt = 0;
+                tcp->tcp_ccv.flags |= CCF_RTO;
+
+                /*
+                 * A timeout abandons any fast recovery or congestion
+                 * recovery that was in progress.
+                 */
+                EXIT_RECOVERY(tcp->tcp_ccv.flags);
+
+                if (CC_ALGO(tcp)->cong_signal == NULL) {
+                        /*
+                         * No algorithm handler, so apply RFC5681 Section 3.1
+                         * here:
+                         * ssthresh = max (FlightSize / 2, 2*SMSS) eq (4)
+                         * computed in whole segments, with cwnd dropping to
+                         * a single segment.
+                         */
+                        tcp->tcp_cwnd_ssthresh = tcp->tcp_mss * max(
+                            (tcp->tcp_snxt - tcp->tcp_suna) / 2 / tcp->tcp_mss,
+                            2);
+                        tcp->tcp_cwnd = tcp->tcp_mss;
+                }
+
+                cc_start_cwr(tcp);
+                break;
+
+        default:
+                /* Other signal types need no bookkeeping here. */
+                break;
+        }
+
+        if (CC_ALGO(tcp)->cong_signal != NULL) {
+                /* curack is published for the algorithm before the call. */
+                tcp->tcp_ccv.curack = seg_ack;
+                CC_ALGO(tcp)->cong_signal(&tcp->tcp_ccv, type);
+        }
+
+        DTRACE_PROBE6(cwnd__cc__cong__signal, tcp_t *, tcp,
+            uint32_t, cwnd_before, uint32_t, tcp->tcp_cwnd,
+            uint32_t, ssthresh_before, uint32_t, tcp->tcp_cwnd_ssthresh,
+            uint32_t, type);
+}
+
+/*
+ * Post-recovery hook.  Calls the algorithm's post_recovery handler when one
+ * is provided, then resets the count of bytes acknowledged.
+ */
+static void
+cc_post_recovery(tcp_t *tcp, uint32_t seg_ack)
+{
+        uint32_t cwnd_before = tcp->tcp_cwnd;
+
+        if (CC_ALGO(tcp)->post_recovery != NULL) {
+                /* curack is published for the algorithm before the call. */
+                tcp->tcp_ccv.curack = seg_ack;
+                CC_ALGO(tcp)->post_recovery(&tcp->tcp_ccv);
+        }
+
+        /* Byte counting starts afresh, with or without a handler. */
+        tcp->tcp_ccv.t_bytes_acked = 0;
+
+        DTRACE_PROBE3(cwnd__cc__post__recovery, tcp_t *, tcp,
+            uint32_t, cwnd_before, uint32_t, tcp->tcp_cwnd);
+}
+
+/*
  * Set the MSS associated with a particular tcp based on its current value,
  * and a new one passed in. Observe minimums and maximums, and reset other
  * state variables that we want to view as multiples of MSS.
  *
  * The value of MSS could be either increased or descreased.

@@ -546,10 +673,13 @@
         /*
          * Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been
          * updated properly.
          */
         TCP_SET_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial);
+
+        if (tcp->tcp_cc_algo->conn_init != NULL)
+                tcp->tcp_cc_algo->conn_init(&tcp->tcp_ccv);
 }
 
 /*
  * Add a new piece to the tcp reassembly queue.  If the gap at the beginning
  * is filled, return as much as we can.  The message passed in may be

@@ -1403,11 +1533,11 @@
          * set based on lbolt i.e., a somewhat random number.
          */
         ASSERT(ira->ira_sqp != NULL);
         new_sqp = ira->ira_sqp;
 
-        econnp = (conn_t *)tcp_get_conn(arg2, tcps);
+        econnp = tcp_get_conn(arg2, tcps);
         if (econnp == NULL)
                 goto error2;
 
         ASSERT(econnp->conn_netstack == lconnp->conn_netstack);
         econnp->conn_sqp = new_sqp;

@@ -2322,12 +2452,10 @@
         int             urp;
         tcp_opt_t       tcpopt;
         ip_pkt_t        ipp;
         boolean_t       ofo_seg = B_FALSE; /* Out of order segment */
         uint32_t        cwnd;
-        uint32_t        add;
-        int             npkt;
         int             mss;
         conn_t          *connp = (conn_t *)arg;
         squeue_t        *sqp = (squeue_t *)arg2;
         tcp_t           *tcp = connp->conn_tcp;
         tcp_stack_t     *tcps = tcp->tcp_tcps;

@@ -2599,10 +2727,13 @@
                                  * Set tcp_cwnd back to 1 MSS, per
                                  * recommendation from
                                  * draft-floyd-incr-init-win-01.txt,
                                  * Increasing TCP's Initial Window.
                                  */
+                                DTRACE_PROBE3(cwnd__retransmitted__syn,
+                                    tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
+                                    uint32_t, tcp->tcp_mss);
                                 tcp->tcp_cwnd = tcp->tcp_mss;
                         }
 
                         tcp->tcp_swl1 = seg_seq;
                         tcp->tcp_swl2 = seg_ack;

@@ -3821,10 +3952,13 @@
                 if (tcp->tcp_rexmit) {
                         tcp->tcp_rexmit = B_FALSE;
                         tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
                         tcp->tcp_rexmit_max = tcp->tcp_snxt;
                         tcp->tcp_ms_we_have_waited = 0;
+                        DTRACE_PROBE3(cwnd__retransmitted__syn,
+                            tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
+                            uint32_t, tcp->tcp_mss);
                         tcp->tcp_cwnd = mss;
                 }
 
                 /*
                  * We set the send window to zero here.

@@ -3864,38 +3998,27 @@
          * set, reduce tcp_cwnd and tcp_ssthresh.  But this should only be
          * done once per window (or more loosely, per RTT).
          */
         if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max))
                 tcp->tcp_cwr = B_FALSE;
-        if (tcp->tcp_ecn_ok && (flags & TH_ECE)) {
-                if (!tcp->tcp_cwr) {
-                        npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / mss;
-                        tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss;
-                        tcp->tcp_cwnd = npkt * mss;
+        if (tcp->tcp_ecn_ok && (flags & TH_ECE) && !tcp->tcp_cwr) {
+                cc_cong_signal(tcp, seg_ack, CC_ECN);
                         /*
                          * If the cwnd is 0, use the timer to clock out
                          * new segments.  This is required by the ECN spec.
                          */
-                        if (npkt == 0) {
+                if (tcp->tcp_cwnd == 0)
                                 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
-                                /*
-                                 * This makes sure that when the ACK comes
-                                 * back, we will increase tcp_cwnd by 1 MSS.
-                                 */
-                                tcp->tcp_cwnd_cnt = 0;
-                        }
                         tcp->tcp_cwr = B_TRUE;
                         /*
                          * This marks the end of the current window of in
                          * flight data.  That is why we don't use
                          * tcp_suna + tcp_swnd.  Only data in flight can
                          * provide ECN info.
                          */
                         tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
-                        tcp->tcp_ecn_cwr_sent = B_FALSE;
                 }
-        }
 
         mp1 = tcp->tcp_xmit_head;
         if (bytes_acked == 0) {
                 if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) {
                         int dupack_cnt;

@@ -3912,10 +4035,12 @@
                         if (mp1 && tcp->tcp_suna != tcp->tcp_snxt &&
                             ! tcp->tcp_rexmit) {
                                 /* Do Limited Transmit */
                                 if ((dupack_cnt = ++tcp->tcp_dupack_cnt) <
                                     tcps->tcps_dupack_fast_retransmit) {
+                                        cc_ack_received(tcp, seg_ack,
+                                            bytes_acked, CC_DUPACK);
                                         /*
                                          * RFC 3042
                                          *
                                          * What we need to do is temporarily
                                          * increase tcp_cwnd so that new

@@ -3958,16 +4083,14 @@
                                  * Adjust cwnd since the duplicate
                                  * ack indicates that a packet was
                                  * dropped (due to congestion.)
                                  */
                                 if (!tcp->tcp_cwr) {
-                                        npkt = ((tcp->tcp_snxt -
-                                            tcp->tcp_suna) >> 1) / mss;
-                                        tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
-                                            mss;
-                                        tcp->tcp_cwnd = (npkt +
-                                            tcp->tcp_dupack_cnt) * mss;
+                                        cc_cong_signal(tcp, seg_ack,
+                                            CC_NDUPACK);
+                                        cc_ack_received(tcp, seg_ack,
+                                            bytes_acked, CC_DUPACK);
                                 }
                                 if (tcp->tcp_ecn_ok) {
                                         tcp->tcp_cwr = B_TRUE;
                                         tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
                                         tcp->tcp_ecn_cwr_sent = B_FALSE;

@@ -4025,10 +4148,12 @@
                                 } else {
                                         flags |= TH_REXMIT_NEEDED;
                                 } /* tcp_snd_sack_ok */
 
                                 } else {
+                                        cc_ack_received(tcp, seg_ack,
+                                            bytes_acked, CC_DUPACK);
                                         /*
                                          * Here we perform congestion
                                          * avoidance, but NOT slow start.
                                          * This is known as the Fast
                                          * Recovery Algorithm.

@@ -4046,10 +4171,14 @@
                                          * cwnd.
                                          */
                                         cwnd = tcp->tcp_cwnd + mss;
                                         if (cwnd > tcp->tcp_cwnd_max)
                                                 cwnd = tcp->tcp_cwnd_max;
+                                        DTRACE_PROBE3(cwnd__fast__recovery,
+                                            tcp_t *, tcp,
+                                            uint32_t, tcp->tcp_cwnd,
+                                            uint32_t, cwnd);
                                         tcp->tcp_cwnd = cwnd;
                                         if (tcp->tcp_unsent > 0)
                                                 flags |= TH_XMIT_NEEDED;
                                         }
                                 }

@@ -4178,19 +4307,14 @@
          */
         if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) {
                 ASSERT(tcp->tcp_rexmit == B_FALSE);
                 if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) {
                         tcp->tcp_dupack_cnt = 0;
-                        /*
-                         * Restore the orig tcp_cwnd_ssthresh after
-                         * fast retransmit phase.
-                         */
-                        if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
-                                tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh;
-                        }
+
+                        cc_post_recovery(tcp, seg_ack);
+
                         tcp->tcp_rexmit_max = seg_ack;
-                        tcp->tcp_cwnd_cnt = 0;
 
                         /*
                          * Remove all notsack info to avoid confusion with
                          * the next fast retrasnmit/recovery phase.
                          */

@@ -4215,12 +4339,16 @@
                                  * original value when we started fast
                                  * recovery.  This is to prevent overly
                                  * aggressive behaviour in sending new
                                  * segments.
                                  */
-                                tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh +
+                                cwnd = tcp->tcp_cwnd_ssthresh +
                                     tcps->tcps_dupack_fast_retransmit * mss;
+                                DTRACE_PROBE3(cwnd__fast__retransmit__part__ack,
+                                    tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
+                                    uint32_t, cwnd);
+                                tcp->tcp_cwnd = cwnd;
                                 tcp->tcp_cwnd_cnt = tcp->tcp_cwnd;
                                 flags |= TH_REXMIT_NEEDED;
                         }
                 }
         } else {

@@ -4277,33 +4405,15 @@
          * If TCP is not ECN capable or TCP is ECN capable but the
          * congestion experience bit is not set, increase the tcp_cwnd as
          * usual.
          */
         if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) {
-                cwnd = tcp->tcp_cwnd;
-                add = mss;
-
-                if (cwnd >= tcp->tcp_cwnd_ssthresh) {
-                        /*
-                         * This is to prevent an increase of less than 1 MSS of
-                         * tcp_cwnd.  With partial increase, tcp_wput_data()
-                         * may send out tinygrams in order to preserve mblk
-                         * boundaries.
-                         *
-                         * By initializing tcp_cwnd_cnt to new tcp_cwnd and
-                         * decrementing it by 1 MSS for every ACKs, tcp_cwnd is
-                         * increased by 1 MSS for every RTTs.
-                         */
-                        if (tcp->tcp_cwnd_cnt <= 0) {
-                                tcp->tcp_cwnd_cnt = cwnd + add;
-                        } else {
-                                tcp->tcp_cwnd_cnt -= add;
-                                add = 0;
+                if (IN_RECOVERY(tcp->tcp_ccv.flags)) {
+                        EXIT_RECOVERY(tcp->tcp_ccv.flags);
                         }
+                cc_ack_received(tcp, seg_ack, bytes_acked, CC_ACK);
                 }
-                tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max);
-        }
 
         /* See if the latest urgent data has been acknowledged */
         if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
             SEQ_GT(seg_ack, tcp->tcp_urg))
                 tcp->tcp_valid_bits &= ~TCP_URG_VALID;

@@ -5632,10 +5742,14 @@
                         uint32_t npkt;
 
                         npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) /
                             tcp->tcp_mss;
                         tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss;
+
+                        DTRACE_PROBE3(cwnd__source__quench, tcp_t *, tcp,
+                            uint32_t, tcp->tcp_cwnd,
+                            uint32_t, tcp->tcp_mss);
                         tcp->tcp_cwnd = tcp->tcp_mss;
                         tcp->tcp_cwnd_cnt = 0;
                 }
                 break;
         }