Print this page
11553 Want pluggable TCP congestion control algorithms
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Robert Mustacchi <robert.mustacchi@joyent.com>
*** 21,31 ****
/*
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, Joyent Inc. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
! * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
* Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
*/
/* Copyright (c) 1990 Mentat Inc. */
#include <sys/types.h>
--- 21,31 ----
/*
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, Joyent Inc. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
! * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
* Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
*/
/* Copyright (c) 1990 Mentat Inc. */
#include <sys/types.h>
*** 72,81 ****
--- 72,82 ----
#include <net/if.h>
#include <net/route.h>
#include <inet/ipsec_impl.h>
#include <inet/common.h>
+ #include <inet/cc.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip_ndp.h>
#include <inet/proto_set.h>
*** 1407,1416 ****
--- 1408,1421 ----
* It happens to have exactly two members of identical size, so
* the following code is enough.
*/
tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
+ /* Allow the CC algorithm to clean up after itself. */
+ if (tcp->tcp_cc_algo != NULL && tcp->tcp_cc_algo->cb_destroy != NULL)
+ tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv);
+
/*
* If this is a non-STREAM socket still holding on to an upper
* handle, release it. As a result of fallback we might also see
* STREAMS based conns with upper handles, in which case there is
* nothing to do other than clearing the field.
*** 1453,1463 ****
* from which it was allocated. Best results are obtained if conn is
* allocated from listener's squeue and freed to the same. Time wait
* collector will free up the freelist if the connection ends up sitting
* there for too long.
*/
! void *
tcp_get_conn(void *arg, tcp_stack_t *tcps)
{
tcp_t *tcp = NULL;
conn_t *connp = NULL;
squeue_t *sqp = (squeue_t *)arg;
--- 1458,1468 ----
* from which it was allocated. Best results are obtained if conn is
* allocated from listener's squeue and freed to the same. Time wait
* collector will free up the freelist if the connection ends up sitting
* there for too long.
*/
! conn_t *
tcp_get_conn(void *arg, tcp_stack_t *tcps)
{
tcp_t *tcp = NULL;
conn_t *connp = NULL;
squeue_t *sqp = (squeue_t *)arg;
*** 1492,1502 ****
connp->conn_ixa->ixa_notify_cookie = tcp;
ASSERT(connp->conn_ixa->ixa_notify == tcp_notify);
connp->conn_recv = tcp_input_data;
ASSERT(connp->conn_recvicmp == tcp_icmp_input);
ASSERT(connp->conn_verifyicmp == tcp_verifyicmp);
! return ((void *)connp);
}
mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
/*
* Pre-allocate the tcp_rsrv_mp. This mblk will not be freed until
* this conn_t/tcp_t is freed at ipcl_conn_destroy().
--- 1497,1507 ----
connp->conn_ixa->ixa_notify_cookie = tcp;
ASSERT(connp->conn_ixa->ixa_notify == tcp_notify);
connp->conn_recv = tcp_input_data;
ASSERT(connp->conn_recvicmp == tcp_icmp_input);
ASSERT(connp->conn_verifyicmp == tcp_verifyicmp);
! return (connp);
}
mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
/*
* Pre-allocate the tcp_rsrv_mp. This mblk will not be freed until
* this conn_t/tcp_t is freed at ipcl_conn_destroy().
*** 1527,1537 ****
* thus it is inside the squeue.
*/
connp->conn_ixa->ixa_notify = tcp_notify;
connp->conn_ixa->ixa_notify_cookie = tcp;
! return ((void *)connp);
}
/*
* Handle connect to IPv4 destinations, including connections for AF_INET6
* sockets connecting to IPv4 mapped IPv6 destinations.
--- 1532,1542 ----
* thus it is inside the squeue.
*/
connp->conn_ixa->ixa_notify = tcp_notify;
connp->conn_ixa->ixa_notify_cookie = tcp;
! return (connp);
}
/*
* Handle connect to IPv4 destinations, including connections for AF_INET6
* sockets connecting to IPv4 mapped IPv6 destinations.
*** 2296,2305 ****
--- 2301,2315 ----
PRESERVE(tcp->tcp_connid);
ASSERT(tcp->tcp_listen_cnt == NULL);
ASSERT(tcp->tcp_reass_tid == 0);
+ /* Allow the CC algorithm to clean up after itself. */
+ if (tcp->tcp_cc_algo->cb_destroy != NULL)
+ tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv);
+ tcp->tcp_cc_algo = NULL;
+
#undef DONTCARE
#undef PRESERVE
}
/*
*** 2316,2326 ****
--- 2326,2341 ----
connp->conn_ipversion == IPV4_VERSION) ||
(connp->conn_family == AF_INET6 &&
(connp->conn_ipversion == IPV4_VERSION ||
connp->conn_ipversion == IPV6_VERSION)));
+ tcp->tcp_ccv.type = IPPROTO_TCP;
+ tcp->tcp_ccv.ccvc.tcp = tcp;
+
if (parent == NULL) {
+ tcp->tcp_cc_algo = tcps->tcps_default_cc_algo;
+
tcp->tcp_naglim = tcps->tcps_naglim_def;
tcp->tcp_rto_initial = tcps->tcps_rexmit_interval_initial;
tcp->tcp_rto_min = tcps->tcps_rexmit_interval_min;
tcp->tcp_rto_max = tcps->tcps_rexmit_interval_max;
*** 2344,2353 ****
--- 2359,2370 ----
* Default value of tcp_init_cwnd is 0, so no need to set here
* if parent is NULL. But we need to inherit it from parent.
*/
} else {
/* Inherit various TCP parameters from the parent. */
+ tcp->tcp_cc_algo = parent->tcp_cc_algo;
+
tcp->tcp_naglim = parent->tcp_naglim;
tcp->tcp_rto_initial = parent->tcp_rto_initial;
tcp->tcp_rto_min = parent->tcp_rto_min;
tcp->tcp_rto_max = parent->tcp_rto_max;
*** 2370,2379 ****
--- 2387,2399 ----
tcp->tcp_ka_rinterval = parent->tcp_ka_rinterval;
tcp->tcp_init_cwnd = parent->tcp_init_cwnd;
}
+ if (tcp->tcp_cc_algo->cb_init != NULL)
+ VERIFY(tcp->tcp_cc_algo->cb_init(&tcp->tcp_ccv) == 0);
+
/*
* Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
* will be close to tcp_rexmit_interval_initial. By doing this, we
* allow the algorithm to adjust slowly to large fluctuations of RTT
* during first few transmissions of a connection as seen in slow
*** 2614,2624 ****
else
zoneid = crgetzoneid(credp);
}
sqp = IP_SQUEUE_GET((uint_t)gethrtime());
! connp = (conn_t *)tcp_get_conn(sqp, tcps);
/*
* Both tcp_get_conn and netstack_find_by_cred incremented refcnt,
* so we drop it by one.
*/
netstack_rele(tcps->tcps_netstack);
--- 2634,2644 ----
else
zoneid = crgetzoneid(credp);
}
sqp = IP_SQUEUE_GET((uint_t)gethrtime());
! connp = tcp_get_conn(sqp, tcps);
/*
* Both tcp_get_conn and netstack_find_by_cred incremented refcnt,
* so we drop it by one.
*/
netstack_rele(tcps->tcps_netstack);
*** 3805,3814 ****
--- 3825,3837 ----
mutex_init(&tcps->tcps_listener_conf_lock, NULL, MUTEX_DEFAULT, NULL);
list_create(&tcps->tcps_listener_conf, sizeof (tcp_listener_t),
offsetof(tcp_listener_t, tl_link));
+ tcps->tcps_default_cc_algo = cc_load_algo(CC_DEFAULT_ALGO_NAME);
+ VERIFY3P(tcps->tcps_default_cc_algo, !=, NULL);
+
return (tcps);
}
/*
* Called when the IP module is about to be unloaded.