Print this page
11553 Want pluggable TCP congestion control algorithms
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Robert Mustacchi <robert.mustacchi@joyent.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/inet/tcp/tcp.c
          +++ new/usr/src/uts/common/inet/tcp/tcp.c
↓ open down ↓ 15 lines elided ↑ open up ↑
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright (c) 2011, Joyent Inc. All rights reserved.
  25   25   * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  26      - * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
       26 + * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
  27   27   * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
  28   28   */
  29   29  /* Copyright (c) 1990 Mentat Inc. */
  30   30  
  31   31  #include <sys/types.h>
  32   32  #include <sys/stream.h>
  33   33  #include <sys/strsun.h>
  34   34  #include <sys/strsubr.h>
  35   35  #include <sys/stropts.h>
  36   36  #include <sys/strlog.h>
↓ open down ↓ 30 lines elided ↑ open up ↑
  67   67  #include <sys/systm.h>
  68   68  #include <netinet/in.h>
  69   69  #include <netinet/tcp.h>
  70   70  #include <netinet/ip6.h>
  71   71  #include <netinet/icmp6.h>
  72   72  #include <net/if.h>
  73   73  #include <net/route.h>
  74   74  #include <inet/ipsec_impl.h>
  75   75  
  76   76  #include <inet/common.h>
       77 +#include <inet/cc.h>
  77   78  #include <inet/ip.h>
  78   79  #include <inet/ip_impl.h>
  79   80  #include <inet/ip6.h>
  80   81  #include <inet/ip_ndp.h>
  81   82  #include <inet/proto_set.h>
  82   83  #include <inet/mib2.h>
  83   84  #include <inet/optcom.h>
  84   85  #include <inet/snmpcom.h>
  85   86  #include <inet/kstatcom.h>
  86   87  #include <inet/tcp.h>
↓ open down ↓ 1315 lines elided ↑ open up ↑
1402 1403          }
1403 1404          ASSERT(tcp->tcp_rthdrlen == 0);
1404 1405  
1405 1406          /*
1406 1407           * The following is really blowing away a union.
1407 1408           * It happens to have exactly two members of identical size, so
1408 1409           * the following code is enough.
1409 1410           */
1410 1411          tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
1411 1412  
     1413 +        /* Allow the CC algorithm to clean up after itself. */
     1414 +        if (tcp->tcp_cc_algo != NULL && tcp->tcp_cc_algo->cb_destroy != NULL)
     1415 +                tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv);
     1416 +
1412 1417          /*
1413 1418           * If this is a non-STREAM socket still holding on to an upper
1414 1419           * handle, release it. As a result of fallback we might also see
1415 1420           * STREAMS based conns with upper handles, in which case there is
1416 1421           * nothing to do other than clearing the field.
1417 1422           */
1418 1423          if (connp->conn_upper_handle != NULL) {
1419 1424                  if (IPCL_IS_NONSTR(connp)) {
1420 1425                          (*connp->conn_upcalls->su_closed)(
1421 1426                              connp->conn_upper_handle);
↓ open down ↓ 26 lines elided ↑ open up ↑
1448 1453   * putting the conn/tcp back in freelist, we don't pay a penalty for
1449 1454   * allocating memory without checking 'q/q0' and freeing it if we can't
1450 1455   * accept the connection.
1451 1456   *
1452 1457   * Care should be taken to put the conn back in the same squeue's freelist
1453 1458   * from which it was allocated. Best results are obtained if conn is
1454 1459   * allocated from listener's squeue and freed to the same. Time wait
1455 1460   * collector will free up the freelist if the connection ends up sitting
1456 1461   * there for too long.
1457 1462   */
1458      -void *
     1463 +conn_t *
1459 1464  tcp_get_conn(void *arg, tcp_stack_t *tcps)
1460 1465  {
1461 1466          tcp_t                   *tcp = NULL;
1462 1467          conn_t                  *connp = NULL;
1463 1468          squeue_t                *sqp = (squeue_t *)arg;
1464 1469          tcp_squeue_priv_t       *tcp_time_wait;
1465 1470          netstack_t              *ns;
1466 1471          mblk_t                  *tcp_rsrv_mp = NULL;
1467 1472  
1468 1473          tcp_time_wait =
↓ open down ↓ 18 lines elided ↑ open up ↑
1487 1492                  connp->conn_netstack = ns;
1488 1493                  connp->conn_ixa->ixa_ipst = ns->netstack_ip;
1489 1494                  tcp->tcp_tcps = tcps;
1490 1495                  ipcl_globalhash_insert(connp);
1491 1496  
1492 1497                  connp->conn_ixa->ixa_notify_cookie = tcp;
1493 1498                  ASSERT(connp->conn_ixa->ixa_notify == tcp_notify);
1494 1499                  connp->conn_recv = tcp_input_data;
1495 1500                  ASSERT(connp->conn_recvicmp == tcp_icmp_input);
1496 1501                  ASSERT(connp->conn_verifyicmp == tcp_verifyicmp);
1497      -                return ((void *)connp);
     1502 +                return (connp);
1498 1503          }
1499 1504          mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1500 1505          /*
1501 1506           * Pre-allocate the tcp_rsrv_mp. This mblk will not be freed until
1502 1507           * this conn_t/tcp_t is freed at ipcl_conn_destroy().
1503 1508           */
1504 1509          tcp_rsrv_mp = allocb(0, BPRI_HI);
1505 1510          if (tcp_rsrv_mp == NULL)
1506 1511                  return (NULL);
1507 1512  
↓ open down ↓ 14 lines elided ↑ open up ↑
1522 1527          connp->conn_verifyicmp = tcp_verifyicmp;
1523 1528  
1524 1529          /*
1525 1530           * Register tcp_notify to listen to capability changes detected by IP.
1526 1531           * This upcall is made in the context of the call to conn_ip_output
1527 1532           * thus it is inside the squeue.
1528 1533           */
1529 1534          connp->conn_ixa->ixa_notify = tcp_notify;
1530 1535          connp->conn_ixa->ixa_notify_cookie = tcp;
1531 1536  
1532      -        return ((void *)connp);
     1537 +        return (connp);
1533 1538  }
1534 1539  
1535 1540  /*
1536 1541   * Handle connect to IPv4 destinations, including connections for AF_INET6
1537 1542   * sockets connecting to IPv4 mapped IPv6 destinations.
1538 1543   * Returns zero if OK, a positive errno, or a negative TLI error.
1539 1544   */
1540 1545  static int
1541 1546  tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport,
1542 1547      uint_t srcid)
↓ open down ↓ 748 lines elided ↑ open up ↑
2291 2296  
2292 2297  #ifdef DEBUG
2293 2298          DONTCARE(tcp->tcmp_stk[0]);
2294 2299  #endif
2295 2300  
2296 2301          PRESERVE(tcp->tcp_connid);
2297 2302  
2298 2303          ASSERT(tcp->tcp_listen_cnt == NULL);
2299 2304          ASSERT(tcp->tcp_reass_tid == 0);
2300 2305  
     2306 +        /* Allow the CC algorithm to clean up after itself. */
     2307 +        if (tcp->tcp_cc_algo->cb_destroy != NULL)
     2308 +                tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv);
     2309 +        tcp->tcp_cc_algo = NULL;
     2310 +
2301 2311  #undef  DONTCARE
2302 2312  #undef  PRESERVE
2303 2313  }
2304 2314  
2305 2315  /*
2306 2316   * Initialize the various fields in tcp_t.  If parent (the listener) is non
2307 2317   * NULL, certain values will be inherited from it.
2308 2318   */
2309 2319  void
2310 2320  tcp_init_values(tcp_t *tcp, tcp_t *parent)
2311 2321  {
2312 2322          tcp_stack_t     *tcps = tcp->tcp_tcps;
2313 2323          conn_t          *connp = tcp->tcp_connp;
2314 2324  
2315 2325          ASSERT((connp->conn_family == AF_INET &&
2316 2326              connp->conn_ipversion == IPV4_VERSION) ||
2317 2327              (connp->conn_family == AF_INET6 &&
2318 2328              (connp->conn_ipversion == IPV4_VERSION ||
2319 2329              connp->conn_ipversion == IPV6_VERSION)));
2320 2330  
     2331 +        tcp->tcp_ccv.type = IPPROTO_TCP;
     2332 +        tcp->tcp_ccv.ccvc.tcp = tcp;
     2333 +
2321 2334          if (parent == NULL) {
     2335 +                tcp->tcp_cc_algo = tcps->tcps_default_cc_algo;
     2336 +
2322 2337                  tcp->tcp_naglim = tcps->tcps_naglim_def;
2323 2338  
2324 2339                  tcp->tcp_rto_initial = tcps->tcps_rexmit_interval_initial;
2325 2340                  tcp->tcp_rto_min = tcps->tcps_rexmit_interval_min;
2326 2341                  tcp->tcp_rto_max = tcps->tcps_rexmit_interval_max;
2327 2342  
2328 2343                  tcp->tcp_first_ctimer_threshold =
2329 2344                      tcps->tcps_ip_notify_cinterval;
2330 2345                  tcp->tcp_second_ctimer_threshold =
2331 2346                      tcps->tcps_ip_abort_cinterval;
↓ open down ↓ 7 lines elided ↑ open up ↑
2339 2354                  tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval;
2340 2355                  tcp->tcp_ka_cnt = 0;
2341 2356                  tcp->tcp_ka_rinterval = 0;
2342 2357  
2343 2358                  /*
2344 2359                   * Default value of tcp_init_cwnd is 0, so no need to set here
2345 2360                   * if parent is NULL.  But we need to inherit it from parent.
2346 2361                   */
2347 2362          } else {
2348 2363                  /* Inherit various TCP parameters from the parent. */
     2364 +                tcp->tcp_cc_algo = parent->tcp_cc_algo;
     2365 +
2349 2366                  tcp->tcp_naglim = parent->tcp_naglim;
2350 2367  
2351 2368                  tcp->tcp_rto_initial = parent->tcp_rto_initial;
2352 2369                  tcp->tcp_rto_min = parent->tcp_rto_min;
2353 2370                  tcp->tcp_rto_max = parent->tcp_rto_max;
2354 2371  
2355 2372                  tcp->tcp_first_ctimer_threshold =
2356 2373                      parent->tcp_first_ctimer_threshold;
2357 2374                  tcp->tcp_second_ctimer_threshold =
2358 2375                      parent->tcp_second_ctimer_threshold;
↓ open down ↓ 6 lines elided ↑ open up ↑
2365 2382                      parent->tcp_fin_wait_2_flush_interval;
2366 2383  
2367 2384                  tcp->tcp_ka_interval = parent->tcp_ka_interval;
2368 2385                  tcp->tcp_ka_abort_thres = parent->tcp_ka_abort_thres;
2369 2386                  tcp->tcp_ka_cnt = parent->tcp_ka_cnt;
2370 2387                  tcp->tcp_ka_rinterval = parent->tcp_ka_rinterval;
2371 2388  
2372 2389                  tcp->tcp_init_cwnd = parent->tcp_init_cwnd;
2373 2390          }
2374 2391  
     2392 +        if (tcp->tcp_cc_algo->cb_init != NULL)
     2393 +                VERIFY(tcp->tcp_cc_algo->cb_init(&tcp->tcp_ccv) == 0);
     2394 +
2375 2395          /*
2376 2396           * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
2377 2397           * will be close to tcp_rexmit_interval_initial.  By doing this, we
2378 2398           * allow the algorithm to adjust slowly to large fluctuations of RTT
2379 2399           * during first few transmissions of a connection as seen in slow
2380 2400           * links.
2381 2401           */
2382 2402          tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2;
2383 2403          tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1;
2384 2404          tcp->tcp_rto = tcp_calculate_rto(tcp, tcps,
↓ open down ↓ 224 lines elided ↑ open up ↑
2609 2629                   * to make TCP operate as if in the global zone.
2610 2630                   */
2611 2631                  if (tcps->tcps_netstack->netstack_stackid !=
2612 2632                      GLOBAL_NETSTACKID)
2613 2633                          zoneid = GLOBAL_ZONEID;
2614 2634                  else
2615 2635                          zoneid = crgetzoneid(credp);
2616 2636          }
2617 2637  
2618 2638          sqp = IP_SQUEUE_GET((uint_t)gethrtime());
2619      -        connp = (conn_t *)tcp_get_conn(sqp, tcps);
     2639 +        connp = tcp_get_conn(sqp, tcps);
2620 2640          /*
2621 2641           * Both tcp_get_conn and netstack_find_by_cred incremented refcnt,
2622 2642           * so we drop it by one.
2623 2643           */
2624 2644          netstack_rele(tcps->tcps_netstack);
2625 2645          if (connp == NULL) {
2626 2646                  *errorp = ENOSR;
2627 2647                  return (NULL);
2628 2648          }
2629 2649          ASSERT(connp->conn_ixa->ixa_protocol == connp->conn_proto);
↓ open down ↓ 1170 lines elided ↑ open up ↑
3800 3820              KM_SLEEP);
3801 3821          for (i = 0; i < tcps->tcps_sc_cnt; i++) {
3802 3822                  tcps->tcps_sc[i] = kmem_zalloc(sizeof (tcp_stats_cpu_t),
3803 3823                      KM_SLEEP);
3804 3824          }
3805 3825  
3806 3826          mutex_init(&tcps->tcps_listener_conf_lock, NULL, MUTEX_DEFAULT, NULL);
3807 3827          list_create(&tcps->tcps_listener_conf, sizeof (tcp_listener_t),
3808 3828              offsetof(tcp_listener_t, tl_link));
3809 3829  
     3830 +        tcps->tcps_default_cc_algo = cc_load_algo(CC_DEFAULT_ALGO_NAME);
     3831 +        VERIFY3P(tcps->tcps_default_cc_algo, !=, NULL);
     3832 +
3810 3833          return (tcps);
3811 3834  }
3812 3835  
3813 3836  /*
3814 3837   * Called when the IP module is about to be unloaded.
3815 3838   */
3816 3839  void
3817 3840  tcp_ddi_g_destroy(void)
3818 3841  {
3819 3842          tcp_g_kstat_fini(tcp_g_kstat);
↓ open down ↓ 638 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX