Print this page
11553 Want pluggable TCP congestion control algorithms
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Robert Mustacchi <robert.mustacchi@joyent.com>

@@ -21,11 +21,11 @@
 
 /*
  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, Joyent Inc. All rights reserved.
  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
  * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
  */
 /* Copyright (c) 1990 Mentat Inc. */
 
 #include <sys/types.h>

@@ -72,10 +72,11 @@
 #include <net/if.h>
 #include <net/route.h>
 #include <inet/ipsec_impl.h>
 
 #include <inet/common.h>
+#include <inet/cc.h>
 #include <inet/ip.h>
 #include <inet/ip_impl.h>
 #include <inet/ip6.h>
 #include <inet/ip_ndp.h>
 #include <inet/proto_set.h>

@@ -1407,10 +1408,14 @@
          * It happens to have exactly two members of identical size
          * the following code is enough.
          */
         tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
 
+        /* Allow the CC algorithm to clean up after itself. */
+        if (tcp->tcp_cc_algo != NULL && tcp->tcp_cc_algo->cb_destroy != NULL)
+                tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv);
+
         /*
          * If this is a non-STREAM socket still holding on to an upper
          * handle, release it. As a result of fallback we might also see
          * STREAMS based conns with upper handles, in which case there is
          * nothing to do other than clearing the field.

@@ -1453,11 +1458,11 @@
  * from which it was allocated. Best results are obtained if conn is
  * allocated from listener's squeue and freed to the same. Time wait
  * collector will free up the freelist if the connection ends up sitting
  * there for too long.
  */
-void *
+conn_t *
 tcp_get_conn(void *arg, tcp_stack_t *tcps)
 {
         tcp_t                   *tcp = NULL;
         conn_t                  *connp = NULL;
         squeue_t                *sqp = (squeue_t *)arg;

@@ -1492,11 +1497,11 @@
                 connp->conn_ixa->ixa_notify_cookie = tcp;
                 ASSERT(connp->conn_ixa->ixa_notify == tcp_notify);
                 connp->conn_recv = tcp_input_data;
                 ASSERT(connp->conn_recvicmp == tcp_icmp_input);
                 ASSERT(connp->conn_verifyicmp == tcp_verifyicmp);
-                return ((void *)connp);
+                return (connp);
         }
         mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
         /*
          * Pre-allocate the tcp_rsrv_mp. This mblk will not be freed until
          * this conn_t/tcp_t is freed at ipcl_conn_destroy().

@@ -1527,11 +1532,11 @@
          * thus it is inside the squeue.
          */
         connp->conn_ixa->ixa_notify = tcp_notify;
         connp->conn_ixa->ixa_notify_cookie = tcp;
 
-        return ((void *)connp);
+        return (connp);
 }
 
 /*
  * Handle connect to IPv4 destinations, including connections for AF_INET6
  * sockets connecting to IPv4 mapped IPv6 destinations.

@@ -2296,10 +2301,15 @@
         PRESERVE(tcp->tcp_connid);
 
         ASSERT(tcp->tcp_listen_cnt == NULL);
         ASSERT(tcp->tcp_reass_tid == 0);
 
+        /* Allow the CC algorithm to clean up after itself. */
+        if (tcp->tcp_cc_algo->cb_destroy != NULL)
+                tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv);
+        tcp->tcp_cc_algo = NULL;
+
 #undef  DONTCARE
 #undef  PRESERVE
 }
 
 /*

@@ -2316,11 +2326,16 @@
             connp->conn_ipversion == IPV4_VERSION) ||
             (connp->conn_family == AF_INET6 &&
             (connp->conn_ipversion == IPV4_VERSION ||
             connp->conn_ipversion == IPV6_VERSION)));
 
+        tcp->tcp_ccv.type = IPPROTO_TCP;
+        tcp->tcp_ccv.ccvc.tcp = tcp;
+
         if (parent == NULL) {
+                tcp->tcp_cc_algo = tcps->tcps_default_cc_algo;
+
                 tcp->tcp_naglim = tcps->tcps_naglim_def;
 
                 tcp->tcp_rto_initial = tcps->tcps_rexmit_interval_initial;
                 tcp->tcp_rto_min = tcps->tcps_rexmit_interval_min;
                 tcp->tcp_rto_max = tcps->tcps_rexmit_interval_max;

@@ -2344,10 +2359,12 @@
                  * Default value of tcp_init_cwnd is 0, so no need to set here
                  * if parent is NULL.  But we need to inherit it from parent.
                  */
         } else {
                 /* Inherit various TCP parameters from the parent. */
+                tcp->tcp_cc_algo = parent->tcp_cc_algo;
+
                 tcp->tcp_naglim = parent->tcp_naglim;
 
                 tcp->tcp_rto_initial = parent->tcp_rto_initial;
                 tcp->tcp_rto_min = parent->tcp_rto_min;
                 tcp->tcp_rto_max = parent->tcp_rto_max;

@@ -2370,10 +2387,13 @@
                 tcp->tcp_ka_rinterval = parent->tcp_ka_rinterval;
 
                 tcp->tcp_init_cwnd = parent->tcp_init_cwnd;
         }
 
+        if (tcp->tcp_cc_algo->cb_init != NULL)
+                VERIFY(tcp->tcp_cc_algo->cb_init(&tcp->tcp_ccv) == 0);
+
         /*
          * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
          * will be close to tcp_rexmit_interval_initial.  By doing this, we
          * allow the algorithm to adjust slowly to large fluctuations of RTT
          * during first few transmissions of a connection as seen in slow

@@ -2614,11 +2634,11 @@
                 else
                         zoneid = crgetzoneid(credp);
         }
 
         sqp = IP_SQUEUE_GET((uint_t)gethrtime());
-        connp = (conn_t *)tcp_get_conn(sqp, tcps);
+        connp = tcp_get_conn(sqp, tcps);
         /*
          * Both tcp_get_conn and netstack_find_by_cred incremented refcnt,
          * so we drop it by one.
          */
         netstack_rele(tcps->tcps_netstack);

@@ -3805,10 +3825,13 @@
 
         mutex_init(&tcps->tcps_listener_conf_lock, NULL, MUTEX_DEFAULT, NULL);
         list_create(&tcps->tcps_listener_conf, sizeof (tcp_listener_t),
             offsetof(tcp_listener_t, tl_link));
 
+        tcps->tcps_default_cc_algo = cc_load_algo(CC_DEFAULT_ALGO_NAME);
+        VERIFY3P(tcps->tcps_default_cc_algo, !=, NULL);
+
         return (tcps);
 }
 
 /*
  * Called when the IP module is about to be unloaded.