Print this page
11553 Want pluggable TCP congestion control algorithms
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Robert Mustacchi <robert.mustacchi@joyent.com>

*** 21,31 **** /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, Joyent Inc. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. ! * Copyright (c) 2013, 2016 by Delphix. All rights reserved. * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. */ /* Copyright (c) 1990 Mentat Inc. */ #include <sys/types.h> --- 21,31 ---- /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, Joyent Inc. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. ! * Copyright (c) 2013, 2017 by Delphix. All rights reserved. * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. */ /* Copyright (c) 1990 Mentat Inc. */ #include <sys/types.h>
*** 72,81 **** --- 72,82 ---- #include <net/if.h> #include <net/route.h> #include <inet/ipsec_impl.h> #include <inet/common.h> + #include <inet/cc.h> #include <inet/ip.h> #include <inet/ip_impl.h> #include <inet/ip6.h> #include <inet/ip_ndp.h> #include <inet/proto_set.h>
*** 1407,1416 **** --- 1408,1421 ---- * It happens to have exactly two members of identical size * the following code is enough. */ tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind); + /* Allow the CC algorithm to clean up after itself. */ + if (tcp->tcp_cc_algo != NULL && tcp->tcp_cc_algo->cb_destroy != NULL) + tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv); + /* * If this is a non-STREAM socket still holding on to an upper * handle, release it. As a result of fallback we might also see * STREAMS based conns with upper handles, in which case there is * nothing to do other than clearing the field.
*** 1453,1463 **** * from which it was allocated. Best results are obtained if conn is * allocated from listener's squeue and freed to the same. Time wait * collector will free up the freelist if the connection ends up sitting * there for too long. */ ! void * tcp_get_conn(void *arg, tcp_stack_t *tcps) { tcp_t *tcp = NULL; conn_t *connp = NULL; squeue_t *sqp = (squeue_t *)arg; --- 1458,1468 ---- * from which it was allocated. Best results are obtained if conn is * allocated from listener's squeue and freed to the same. Time wait * collector will free up the freelist if the connection ends up sitting * there for too long. */ ! conn_t * tcp_get_conn(void *arg, tcp_stack_t *tcps) { tcp_t *tcp = NULL; conn_t *connp = NULL; squeue_t *sqp = (squeue_t *)arg;
*** 1492,1502 **** connp->conn_ixa->ixa_notify_cookie = tcp; ASSERT(connp->conn_ixa->ixa_notify == tcp_notify); connp->conn_recv = tcp_input_data; ASSERT(connp->conn_recvicmp == tcp_icmp_input); ASSERT(connp->conn_verifyicmp == tcp_verifyicmp); ! return ((void *)connp); } mutex_exit(&tcp_time_wait->tcp_time_wait_lock); /* * Pre-allocate the tcp_rsrv_mp. This mblk will not be freed until * this conn_t/tcp_t is freed at ipcl_conn_destroy(). --- 1497,1507 ---- connp->conn_ixa->ixa_notify_cookie = tcp; ASSERT(connp->conn_ixa->ixa_notify == tcp_notify); connp->conn_recv = tcp_input_data; ASSERT(connp->conn_recvicmp == tcp_icmp_input); ASSERT(connp->conn_verifyicmp == tcp_verifyicmp); ! return (connp); } mutex_exit(&tcp_time_wait->tcp_time_wait_lock); /* * Pre-allocate the tcp_rsrv_mp. This mblk will not be freed until * this conn_t/tcp_t is freed at ipcl_conn_destroy().
*** 1527,1537 **** * thus it is inside the squeue. */ connp->conn_ixa->ixa_notify = tcp_notify; connp->conn_ixa->ixa_notify_cookie = tcp; ! return ((void *)connp); } /* * Handle connect to IPv4 destinations, including connections for AF_INET6 * sockets connecting to IPv4 mapped IPv6 destinations. --- 1532,1542 ---- * thus it is inside the squeue. */ connp->conn_ixa->ixa_notify = tcp_notify; connp->conn_ixa->ixa_notify_cookie = tcp; ! return (connp); } /* * Handle connect to IPv4 destinations, including connections for AF_INET6 * sockets connecting to IPv4 mapped IPv6 destinations.
*** 2296,2305 **** --- 2301,2315 ---- PRESERVE(tcp->tcp_connid); ASSERT(tcp->tcp_listen_cnt == NULL); ASSERT(tcp->tcp_reass_tid == 0); + /* Allow the CC algorithm to clean up after itself. */ + if (tcp->tcp_cc_algo->cb_destroy != NULL) + tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv); + tcp->tcp_cc_algo = NULL; + #undef DONTCARE #undef PRESERVE } /*
*** 2316,2326 **** --- 2326,2341 ---- connp->conn_ipversion == IPV4_VERSION) || (connp->conn_family == AF_INET6 && (connp->conn_ipversion == IPV4_VERSION || connp->conn_ipversion == IPV6_VERSION))); + tcp->tcp_ccv.type = IPPROTO_TCP; + tcp->tcp_ccv.ccvc.tcp = tcp; + if (parent == NULL) { + tcp->tcp_cc_algo = tcps->tcps_default_cc_algo; + tcp->tcp_naglim = tcps->tcps_naglim_def; tcp->tcp_rto_initial = tcps->tcps_rexmit_interval_initial; tcp->tcp_rto_min = tcps->tcps_rexmit_interval_min; tcp->tcp_rto_max = tcps->tcps_rexmit_interval_max;
*** 2344,2353 **** --- 2359,2370 ---- * Default value of tcp_init_cwnd is 0, so no need to set here * if parent is NULL. But we need to inherit it from parent. */ } else { /* Inherit various TCP parameters from the parent. */ + tcp->tcp_cc_algo = parent->tcp_cc_algo; + tcp->tcp_naglim = parent->tcp_naglim; tcp->tcp_rto_initial = parent->tcp_rto_initial; tcp->tcp_rto_min = parent->tcp_rto_min; tcp->tcp_rto_max = parent->tcp_rto_max;
*** 2370,2379 **** --- 2387,2399 ---- tcp->tcp_ka_rinterval = parent->tcp_ka_rinterval; tcp->tcp_init_cwnd = parent->tcp_init_cwnd; } + if (tcp->tcp_cc_algo->cb_init != NULL) + VERIFY(tcp->tcp_cc_algo->cb_init(&tcp->tcp_ccv) == 0); + /* * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO * will be close to tcp_rexmit_interval_initial. By doing this, we * allow the algorithm to adjust slowly to large fluctuations of RTT * during first few transmissions of a connection as seen in slow
*** 2614,2624 **** else zoneid = crgetzoneid(credp); } sqp = IP_SQUEUE_GET((uint_t)gethrtime()); ! connp = (conn_t *)tcp_get_conn(sqp, tcps); /* * Both tcp_get_conn and netstack_find_by_cred incremented refcnt, * so we drop it by one. */ netstack_rele(tcps->tcps_netstack); --- 2634,2644 ---- else zoneid = crgetzoneid(credp); } sqp = IP_SQUEUE_GET((uint_t)gethrtime()); ! connp = tcp_get_conn(sqp, tcps); /* * Both tcp_get_conn and netstack_find_by_cred incremented refcnt, * so we drop it by one. */ netstack_rele(tcps->tcps_netstack);
*** 3805,3814 **** --- 3825,3837 ---- mutex_init(&tcps->tcps_listener_conf_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&tcps->tcps_listener_conf, sizeof (tcp_listener_t), offsetof(tcp_listener_t, tl_link)); + tcps->tcps_default_cc_algo = cc_load_algo(CC_DEFAULT_ALGO_NAME); + VERIFY3P(tcps->tcps_default_cc_algo, !=, NULL); + return (tcps); } /* * Called when the IP module is about to be unloaded.