Print this page
11553 Want pluggable TCP congestion control algorithms
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Robert Mustacchi <robert.mustacchi@joyent.com>
@@ -21,11 +21,11 @@
/*
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, Joyent Inc. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
* Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
*/
/* Copyright (c) 1990 Mentat Inc. */
#include <sys/types.h>
@@ -72,10 +72,11 @@
#include <net/if.h>
#include <net/route.h>
#include <inet/ipsec_impl.h>
#include <inet/common.h>
+#include <inet/cc.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip_ndp.h>
#include <inet/proto_set.h>
@@ -1407,10 +1408,14 @@
* It happens to have exactly two members of identical size, so
* the following code is enough.
*/
tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
+ /* Allow the CC algorithm to clean up after itself. */
+ if (tcp->tcp_cc_algo != NULL && tcp->tcp_cc_algo->cb_destroy != NULL)
+ tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv);
+
/*
* If this is a non-STREAM socket still holding on to an upper
* handle, release it. As a result of fallback we might also see
* STREAMS based conns with upper handles, in which case there is
* nothing to do other than clearing the field.
@@ -1453,11 +1458,11 @@
* from which it was allocated. Best results are obtained if conn is
* allocated from listener's squeue and freed to the same. Time wait
* collector will free up the freelist if the connection ends up sitting
* there for too long.
*/
-void *
+conn_t *
tcp_get_conn(void *arg, tcp_stack_t *tcps)
{
tcp_t *tcp = NULL;
conn_t *connp = NULL;
squeue_t *sqp = (squeue_t *)arg;
@@ -1492,11 +1497,11 @@
connp->conn_ixa->ixa_notify_cookie = tcp;
ASSERT(connp->conn_ixa->ixa_notify == tcp_notify);
connp->conn_recv = tcp_input_data;
ASSERT(connp->conn_recvicmp == tcp_icmp_input);
ASSERT(connp->conn_verifyicmp == tcp_verifyicmp);
- return ((void *)connp);
+ return (connp);
}
mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
/*
* Pre-allocate the tcp_rsrv_mp. This mblk will not be freed until
* this conn_t/tcp_t is freed at ipcl_conn_destroy().
@@ -1527,11 +1532,11 @@
* thus it is inside the squeue.
*/
connp->conn_ixa->ixa_notify = tcp_notify;
connp->conn_ixa->ixa_notify_cookie = tcp;
- return ((void *)connp);
+ return (connp);
}
/*
* Handle connect to IPv4 destinations, including connections for AF_INET6
* sockets connecting to IPv4 mapped IPv6 destinations.
@@ -2296,10 +2301,15 @@
PRESERVE(tcp->tcp_connid);
ASSERT(tcp->tcp_listen_cnt == NULL);
ASSERT(tcp->tcp_reass_tid == 0);
+ /* Allow the CC algorithm to clean up after itself. */
+ if (tcp->tcp_cc_algo->cb_destroy != NULL)
+ tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv);
+ tcp->tcp_cc_algo = NULL;
+
#undef DONTCARE
#undef PRESERVE
}
/*
@@ -2316,11 +2326,16 @@
connp->conn_ipversion == IPV4_VERSION) ||
(connp->conn_family == AF_INET6 &&
(connp->conn_ipversion == IPV4_VERSION ||
connp->conn_ipversion == IPV6_VERSION)));
+ tcp->tcp_ccv.type = IPPROTO_TCP;
+ tcp->tcp_ccv.ccvc.tcp = tcp;
+
if (parent == NULL) {
+ tcp->tcp_cc_algo = tcps->tcps_default_cc_algo;
+
tcp->tcp_naglim = tcps->tcps_naglim_def;
tcp->tcp_rto_initial = tcps->tcps_rexmit_interval_initial;
tcp->tcp_rto_min = tcps->tcps_rexmit_interval_min;
tcp->tcp_rto_max = tcps->tcps_rexmit_interval_max;
@@ -2344,10 +2359,12 @@
* Default value of tcp_init_cwnd is 0, so no need to set here
* if parent is NULL. But we need to inherit it from parent.
*/
} else {
/* Inherit various TCP parameters from the parent. */
+ tcp->tcp_cc_algo = parent->tcp_cc_algo;
+
tcp->tcp_naglim = parent->tcp_naglim;
tcp->tcp_rto_initial = parent->tcp_rto_initial;
tcp->tcp_rto_min = parent->tcp_rto_min;
tcp->tcp_rto_max = parent->tcp_rto_max;
@@ -2370,10 +2387,13 @@
tcp->tcp_ka_rinterval = parent->tcp_ka_rinterval;
tcp->tcp_init_cwnd = parent->tcp_init_cwnd;
}
+ if (tcp->tcp_cc_algo->cb_init != NULL)
+ VERIFY(tcp->tcp_cc_algo->cb_init(&tcp->tcp_ccv) == 0);
+
/*
* Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
* will be close to tcp_rexmit_interval_initial. By doing this, we
* allow the algorithm to adjust slowly to large fluctuations of RTT
* during first few transmissions of a connection as seen in slow
@@ -2614,11 +2634,11 @@
else
zoneid = crgetzoneid(credp);
}
sqp = IP_SQUEUE_GET((uint_t)gethrtime());
- connp = (conn_t *)tcp_get_conn(sqp, tcps);
+ connp = tcp_get_conn(sqp, tcps);
/*
* Both tcp_get_conn and netstack_find_by_cred incremented refcnt,
* so we drop it by one.
*/
netstack_rele(tcps->tcps_netstack);
@@ -3805,10 +3825,13 @@
mutex_init(&tcps->tcps_listener_conf_lock, NULL, MUTEX_DEFAULT, NULL);
list_create(&tcps->tcps_listener_conf, sizeof (tcp_listener_t),
offsetof(tcp_listener_t, tl_link));
+ tcps->tcps_default_cc_algo = cc_load_algo(CC_DEFAULT_ALGO_NAME);
+ VERIFY3P(tcps->tcps_default_cc_algo, !=, NULL);
+
return (tcps);
}
/*
* Called when the IP module is about to be unloaded.