Print this page
11553 Want pluggable TCP congestion control algorithms
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Robert Mustacchi <robert.mustacchi@joyent.com>


   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2011, Joyent Inc. All rights reserved.
  25  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  26  * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
  27  * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
  28  */
  29 /* Copyright (c) 1990 Mentat Inc. */
  30 
  31 #include <sys/types.h>
  32 #include <sys/stream.h>
  33 #include <sys/strsun.h>
  34 #include <sys/strsubr.h>
  35 #include <sys/stropts.h>
  36 #include <sys/strlog.h>
  37 #define _SUN_TPI_VERSION 2
  38 #include <sys/tihdr.h>
  39 #include <sys/timod.h>
  40 #include <sys/ddi.h>
  41 #include <sys/sunddi.h>
  42 #include <sys/suntpi.h>
  43 #include <sys/xti_inet.h>
  44 #include <sys/cmn_err.h>
  45 #include <sys/debug.h>
  46 #include <sys/sdt.h>


  57 
  58 #include <sys/errno.h>
  59 #include <sys/signal.h>
  60 #include <sys/socket.h>
  61 #include <sys/socketvar.h>
  62 #include <sys/sockio.h>
  63 #include <sys/isa_defs.h>
  64 #include <sys/md5.h>
  65 #include <sys/random.h>
  66 #include <sys/uio.h>
  67 #include <sys/systm.h>
  68 #include <netinet/in.h>
  69 #include <netinet/tcp.h>
  70 #include <netinet/ip6.h>
  71 #include <netinet/icmp6.h>
  72 #include <net/if.h>
  73 #include <net/route.h>
  74 #include <inet/ipsec_impl.h>
  75 
  76 #include <inet/common.h>

  77 #include <inet/ip.h>
  78 #include <inet/ip_impl.h>
  79 #include <inet/ip6.h>
  80 #include <inet/ip_ndp.h>
  81 #include <inet/proto_set.h>
  82 #include <inet/mib2.h>
  83 #include <inet/optcom.h>
  84 #include <inet/snmpcom.h>
  85 #include <inet/kstatcom.h>
  86 #include <inet/tcp.h>
  87 #include <inet/tcp_impl.h>
  88 #include <inet/tcp_cluster.h>
  89 #include <inet/udp_impl.h>
  90 #include <net/pfkeyv2.h>
  91 #include <inet/ipdrop.h>
  92 
  93 #include <inet/ipclassifier.h>
  94 #include <inet/ip_ire.h>
  95 #include <inet/ip_ftable.h>
  96 #include <inet/ip_if.h>


1392         if (tcp->tcp_rthdrdstopts != NULL) {
1393                 mi_free(tcp->tcp_rthdrdstopts);
1394                 tcp->tcp_rthdrdstopts = NULL;
1395                 tcp->tcp_rthdrdstoptslen = 0;
1396         }
1397         ASSERT(tcp->tcp_rthdrdstoptslen == 0);
1398         if (tcp->tcp_rthdr != NULL) {
1399                 mi_free(tcp->tcp_rthdr);
1400                 tcp->tcp_rthdr = NULL;
1401                 tcp->tcp_rthdrlen = 0;
1402         }
1403         ASSERT(tcp->tcp_rthdrlen == 0);
1404 
1405         /*
1406          * The following really blows away a union.
1407          * Since it happens to have exactly two members of identical size,
1408          * the following code is enough.
1409          */
1410         tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
1411 




1412         /*
1413          * If this is a non-STREAM socket still holding on to an upper
1414          * handle, release it. As a result of fallback we might also see
1415          * STREAMS based conns with upper handles, in which case there is
1416          * nothing to do other than clearing the field.
1417          */
1418         if (connp->conn_upper_handle != NULL) {
1419                 if (IPCL_IS_NONSTR(connp)) {
1420                         (*connp->conn_upcalls->su_closed)(
1421                             connp->conn_upper_handle);
1422                         tcp->tcp_detached = B_TRUE;
1423                 }
1424                 connp->conn_upper_handle = NULL;
1425                 connp->conn_upcalls = NULL;
1426         }
1427 }
1428 
1429 /*
1430  * tcp_get_conn/tcp_free_conn
1431  *


1438  * outside the squeue. So when the interrupt comes, we have a clean
1439  * connection sitting in the freelist. Obviously, this buys us
1440  * performance.
1441  *
1442  * 2) Defence against DOS attack. Allocating a tcp/conn in tcp_input_listener
1443  * has multiple disadvantages - tying up the squeue during alloc.
1444  * But allocating the conn/tcp in IP land is also not the best since
1445  * we can't check the 'q' and 'q0' which are protected by squeue and
1446  * blindly allocate memory which might have to be freed here if we are
1447  * not allowed to accept the connection. By using the freelist and
1448  * putting the conn/tcp back in freelist, we don't pay a penalty for
1449  * allocating memory without checking 'q/q0' and freeing it if we can't
1450  * accept the connection.
1451  *
1452  * Care should be taken to put the conn back in the same squeue's freelist
1453  * from which it was allocated. Best results are obtained if conn is
1454  * allocated from listener's squeue and freed to the same. Time wait
1455  * collector will free up the freelist if the connection ends up sitting
1456  * there for too long.
1457  */
1458 void *
1459 tcp_get_conn(void *arg, tcp_stack_t *tcps)
1460 {
             /*
              * Hand back a conn_t (as a void *) bound to the TCP stack `tcps'.
              * `arg' is the squeue whose private TCP data holds the free list.
              * A tcp_t found on that free list is recycled; otherwise a new
              * conn_t is created.  Returns NULL if either allocation on the
              * slow path fails.
              */
1461         tcp_t                   *tcp = NULL;
1462         conn_t                  *connp = NULL;
1463         squeue_t                *sqp = (squeue_t *)arg;
1464         tcp_squeue_priv_t       *tcp_time_wait;
1465         netstack_t              *ns;
1466         mblk_t                  *tcp_rsrv_mp = NULL;
1467 
1468         tcp_time_wait =
1469             *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
1470 
             /*
              * The free list head and its count are read and updated while
              * holding tcp_time_wait_lock.
              */
1471         mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
1472         tcp = tcp_time_wait->tcp_free_list;
             /* The list head is non-NULL exactly when the count is non-zero. */
1473         ASSERT((tcp != NULL) ^ (tcp_time_wait->tcp_free_list_cnt == 0));
1474         if (tcp != NULL) {
                     /*
                      * Fast path: unlink the head of the free list, then drop
                      * the lock before touching the rest of the conn.
                      */
1475                 tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
1476                 tcp_time_wait->tcp_free_list_cnt--;
1477                 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1478                 tcp->tcp_time_wait_next = NULL;
1479                 connp = tcp->tcp_connp;
1480                 connp->conn_flags |= IPCL_REUSED;
1481 
1482                 ASSERT(tcp->tcp_tcps == NULL);
1483                 ASSERT(connp->conn_netstack == NULL);
1484                 ASSERT(tcp->tcp_rsrv_mp != NULL);
                     /*
                      * Bind the recycled conn to this stack: take a hold on
                      * the netstack and record it in the conn, its ixa and
                      * the tcp_t before inserting into the global hash.
                      */
1485                 ns = tcps->tcps_netstack;
1486                 netstack_hold(ns);
1487                 connp->conn_netstack = ns;
1488                 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
1489                 tcp->tcp_tcps = tcps;
1490                 ipcl_globalhash_insert(connp);
1491 
                     /*
                      * Only the notify cookie and conn_recv are assigned here;
                      * the remaining callbacks are asserted to still be set.
                      */
1492                 connp->conn_ixa->ixa_notify_cookie = tcp;
1493                 ASSERT(connp->conn_ixa->ixa_notify == tcp_notify);
1494                 connp->conn_recv = tcp_input_data;
1495                 ASSERT(connp->conn_recvicmp == tcp_icmp_input);
1496                 ASSERT(connp->conn_verifyicmp == tcp_verifyicmp);
1497                 return ((void *)connp);
1498         }
             /* Slow path: the free list was empty, so allocate from scratch. */
1499         mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1500         /*
1501          * Pre-allocate the tcp_rsrv_mp. This mblk will not be freed until
1502          * this conn_t/tcp_t is freed at ipcl_conn_destroy().
1503          */
1504         tcp_rsrv_mp = allocb(0, BPRI_HI);
1505         if (tcp_rsrv_mp == NULL)
1506                 return (NULL);
1507 
             /*
              * The conn is requested with KM_NOSLEEP; if that fails, release
              * the mblk allocated above before returning NULL.
              */
1508         if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP,
1509             tcps->tcps_netstack)) == NULL) {
1510                 freeb(tcp_rsrv_mp);
1511                 return (NULL);
1512         }
1513 
1514         tcp = connp->conn_tcp;
1515         tcp->tcp_rsrv_mp = tcp_rsrv_mp;
1516         mutex_init(&tcp->tcp_rsrv_mp_lock, NULL, MUTEX_DEFAULT, NULL);
1517 
1518         tcp->tcp_tcps = tcps;
1519 
             /* Install the TCP receive, ICMP and ICMP-verify callbacks. */
1520         connp->conn_recv = tcp_input_data;
1521         connp->conn_recvicmp = tcp_icmp_input;
1522         connp->conn_verifyicmp = tcp_verifyicmp;
1523 
1524         /*
1525          * Register tcp_notify to listen to capability changes detected by IP.
1526          * This upcall is made in the context of the call to conn_ip_output
1527          * thus it is inside the squeue.
1528          */
1529         connp->conn_ixa->ixa_notify = tcp_notify;
1530         connp->conn_ixa->ixa_notify_cookie = tcp;
1531 
1532         return ((void *)connp);
1533 }
1534 
1535 /*
1536  * Handle connect to IPv4 destinations, including connections for AF_INET6
1537  * sockets connecting to IPv4 mapped IPv6 destinations.
1538  * Returns zero if OK, a positive errno, or a negative TLI error.
1539  */
1540 static int
1541 tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport,
1542     uint_t srcid)
1543 {
1544         ipaddr_t        dstaddr = *dstaddrp;
1545         uint16_t        lport;
1546         conn_t          *connp = tcp->tcp_connp;
1547         tcp_stack_t     *tcps = tcp->tcp_tcps;
1548         int             error;
1549 
1550         ASSERT(connp->conn_ipversion == IPV4_VERSION);
1551 
1552         /* Check for attempt to connect to INADDR_ANY */


2281         tcp->tcp_in_ack_unsent = 0;
2282         tcp->tcp_cork = B_FALSE;
2283         tcp->tcp_tconnind_started = B_FALSE;
2284 
2285         PRESERVE(tcp->tcp_squeue_bytes);
2286 
2287         tcp->tcp_closemp_used = B_FALSE;
2288 
2289         PRESERVE(tcp->tcp_rsrv_mp);
2290         PRESERVE(tcp->tcp_rsrv_mp_lock);
2291 
2292 #ifdef DEBUG
2293         DONTCARE(tcp->tcmp_stk[0]);
2294 #endif
2295 
2296         PRESERVE(tcp->tcp_connid);
2297 
2298         ASSERT(tcp->tcp_listen_cnt == NULL);
2299         ASSERT(tcp->tcp_reass_tid == 0);
2300 





2301 #undef  DONTCARE
2302 #undef  PRESERVE
2303 }
2304 
2305 /*
2306  * Initialize the various fields in tcp_t.  If parent (the listener) is non
2307  * NULL, certain values will be inherited from it.
2308  */
2309 void
2310 tcp_init_values(tcp_t *tcp, tcp_t *parent)
2311 {
2312         tcp_stack_t     *tcps = tcp->tcp_tcps;
2313         conn_t          *connp = tcp->tcp_connp;
2314 
2315         ASSERT((connp->conn_family == AF_INET &&
2316             connp->conn_ipversion == IPV4_VERSION) ||
2317             (connp->conn_family == AF_INET6 &&
2318             (connp->conn_ipversion == IPV4_VERSION ||
2319             connp->conn_ipversion == IPV6_VERSION)));
2320 



2321         if (parent == NULL) {


2322                 tcp->tcp_naglim = tcps->tcps_naglim_def;
2323 
2324                 tcp->tcp_rto_initial = tcps->tcps_rexmit_interval_initial;
2325                 tcp->tcp_rto_min = tcps->tcps_rexmit_interval_min;
2326                 tcp->tcp_rto_max = tcps->tcps_rexmit_interval_max;
2327 
2328                 tcp->tcp_first_ctimer_threshold =
2329                     tcps->tcps_ip_notify_cinterval;
2330                 tcp->tcp_second_ctimer_threshold =
2331                     tcps->tcps_ip_abort_cinterval;
2332                 tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval;
2333                 tcp->tcp_second_timer_threshold = tcps->tcps_ip_abort_interval;
2334 
2335                 tcp->tcp_fin_wait_2_flush_interval =
2336                     tcps->tcps_fin_wait_2_flush_interval;
2337 
2338                 tcp->tcp_ka_interval = tcps->tcps_keepalive_interval;
2339                 tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval;
2340                 tcp->tcp_ka_cnt = 0;
2341                 tcp->tcp_ka_rinterval = 0;
2342 
2343                 /*
2344                  * Default value of tcp_init_cwnd is 0, so no need to set here
2345                  * if parent is NULL.  But we need to inherit it from parent.
2346                  */
2347         } else {
2348                 /* Inherit various TCP parameters from the parent. */


2349                 tcp->tcp_naglim = parent->tcp_naglim;
2350 
2351                 tcp->tcp_rto_initial = parent->tcp_rto_initial;
2352                 tcp->tcp_rto_min = parent->tcp_rto_min;
2353                 tcp->tcp_rto_max = parent->tcp_rto_max;
2354 
2355                 tcp->tcp_first_ctimer_threshold =
2356                     parent->tcp_first_ctimer_threshold;
2357                 tcp->tcp_second_ctimer_threshold =
2358                     parent->tcp_second_ctimer_threshold;
2359                 tcp->tcp_first_timer_threshold =
2360                     parent->tcp_first_timer_threshold;
2361                 tcp->tcp_second_timer_threshold =
2362                     parent->tcp_second_timer_threshold;
2363 
2364                 tcp->tcp_fin_wait_2_flush_interval =
2365                     parent->tcp_fin_wait_2_flush_interval;
2366 
2367                 tcp->tcp_ka_interval = parent->tcp_ka_interval;
2368                 tcp->tcp_ka_abort_thres = parent->tcp_ka_abort_thres;
2369                 tcp->tcp_ka_cnt = parent->tcp_ka_cnt;
2370                 tcp->tcp_ka_rinterval = parent->tcp_ka_rinterval;
2371 
2372                 tcp->tcp_init_cwnd = parent->tcp_init_cwnd;
2373         }
2374 



2375         /*
2376          * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
2377          * will be close to tcp_rexmit_interval_initial.  By doing this, we
2378          * allow the algorithm to adjust slowly to large fluctuations of RTT
2379          * during first few transmissions of a connection as seen in slow
2380          * links.
2381          */
2382         tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2;
2383         tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1;
2384         tcp->tcp_rto = tcp_calculate_rto(tcp, tcps,
2385             tcps->tcps_conn_grace_period);
2386 
2387         tcp->tcp_timer_backoff = 0;
2388         tcp->tcp_ms_we_have_waited = 0;
2389         tcp->tcp_last_recv_time = ddi_get_lbolt();
2390         tcp->tcp_cwnd_max = tcps->tcps_cwnd_max_;
2391         tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
2392 
2393         tcp->tcp_maxpsz_multiplier = tcps->tcps_maxpsz_multiplier;
2394 


2599                         return (NULL);
2600                 }
2601 
2602                 ns = netstack_find_by_cred(credp);
2603                 ASSERT(ns != NULL);
2604                 tcps = ns->netstack_tcp;
2605                 ASSERT(tcps != NULL);
2606 
2607                 /*
2608                  * For exclusive stacks we set the zoneid to zero
2609                  * to make TCP operate as if in the global zone.
2610                  */
2611                 if (tcps->tcps_netstack->netstack_stackid !=
2612                     GLOBAL_NETSTACKID)
2613                         zoneid = GLOBAL_ZONEID;
2614                 else
2615                         zoneid = crgetzoneid(credp);
2616         }
2617 
2618         sqp = IP_SQUEUE_GET((uint_t)gethrtime());
2619         connp = (conn_t *)tcp_get_conn(sqp, tcps);
2620         /*
2621          * Both tcp_get_conn and netstack_find_by_cred incremented refcnt,
2622          * so we drop it by one.
2623          */
2624         netstack_rele(tcps->tcps_netstack);
2625         if (connp == NULL) {
2626                 *errorp = ENOSR;
2627                 return (NULL);
2628         }
2629         ASSERT(connp->conn_ixa->ixa_protocol == connp->conn_proto);
2630 
2631         connp->conn_sqp = sqp;
2632         connp->conn_initial_sqp = connp->conn_sqp;
2633         connp->conn_ixa->ixa_sqp = connp->conn_sqp;
2634         tcp = connp->conn_tcp;
2635 
2636         /*
2637          * Besides asking IP to set the checksum for us, have conn_ip_output
2638          * to do the following checks when necessary:
2639          *


3790          * there are many CPUs as we will be adding them 1 by 1.
3791          *
3792          * Note that tcps_sc_cnt never decreases and the tcps_sc[x] pointers
3793          * are not freed until the stack is going away.  So there is no need
3794          * to grab a lock to access the per CPU tcps_sc[x] pointer.
3795          */
3796         mutex_enter(&cpu_lock);
3797         tcps->tcps_sc_cnt = MAX(ncpus, boot_ncpus);
3798         mutex_exit(&cpu_lock);
3799         tcps->tcps_sc = kmem_zalloc(max_ncpus  * sizeof (tcp_stats_cpu_t *),
3800             KM_SLEEP);
3801         for (i = 0; i < tcps->tcps_sc_cnt; i++) {
3802                 tcps->tcps_sc[i] = kmem_zalloc(sizeof (tcp_stats_cpu_t),
3803                     KM_SLEEP);
3804         }
3805 
3806         mutex_init(&tcps->tcps_listener_conf_lock, NULL, MUTEX_DEFAULT, NULL);
3807         list_create(&tcps->tcps_listener_conf, sizeof (tcp_listener_t),
3808             offsetof(tcp_listener_t, tl_link));
3809 



3810         return (tcps);
3811 }
3812 
3813 /*
3814  * Called when the IP module is about to be unloaded.
3815  */
3816 void
3817 tcp_ddi_g_destroy(void)
3818 {
             /* Tear down the global kstat and zero the global statistics. */
3819         tcp_g_kstat_fini(tcp_g_kstat);
3820         tcp_g_kstat = NULL;
3821         bzero(&tcp_g_statistics, sizeof (tcp_g_statistics));
3822 
3823         mutex_destroy(&tcp_random_lock);
3824 
             /* Destroy the tcp_timercache and tcp_notsack_blk_cache caches. */
3825         kmem_cache_destroy(tcp_timercache);
3826         kmem_cache_destroy(tcp_notsack_blk_cache);
3827 
             /* Last step: unregister NS_TCP via netstack_unregister(). */
3828         netstack_unregister(NS_TCP);
3829 }




   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2011, Joyent Inc. All rights reserved.
  25  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  26  * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
  27  * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
  28  */
  29 /* Copyright (c) 1990 Mentat Inc. */
  30 
  31 #include <sys/types.h>
  32 #include <sys/stream.h>
  33 #include <sys/strsun.h>
  34 #include <sys/strsubr.h>
  35 #include <sys/stropts.h>
  36 #include <sys/strlog.h>
  37 #define _SUN_TPI_VERSION 2
  38 #include <sys/tihdr.h>
  39 #include <sys/timod.h>
  40 #include <sys/ddi.h>
  41 #include <sys/sunddi.h>
  42 #include <sys/suntpi.h>
  43 #include <sys/xti_inet.h>
  44 #include <sys/cmn_err.h>
  45 #include <sys/debug.h>
  46 #include <sys/sdt.h>


  57 
  58 #include <sys/errno.h>
  59 #include <sys/signal.h>
  60 #include <sys/socket.h>
  61 #include <sys/socketvar.h>
  62 #include <sys/sockio.h>
  63 #include <sys/isa_defs.h>
  64 #include <sys/md5.h>
  65 #include <sys/random.h>
  66 #include <sys/uio.h>
  67 #include <sys/systm.h>
  68 #include <netinet/in.h>
  69 #include <netinet/tcp.h>
  70 #include <netinet/ip6.h>
  71 #include <netinet/icmp6.h>
  72 #include <net/if.h>
  73 #include <net/route.h>
  74 #include <inet/ipsec_impl.h>
  75 
  76 #include <inet/common.h>
  77 #include <inet/cc.h>
  78 #include <inet/ip.h>
  79 #include <inet/ip_impl.h>
  80 #include <inet/ip6.h>
  81 #include <inet/ip_ndp.h>
  82 #include <inet/proto_set.h>
  83 #include <inet/mib2.h>
  84 #include <inet/optcom.h>
  85 #include <inet/snmpcom.h>
  86 #include <inet/kstatcom.h>
  87 #include <inet/tcp.h>
  88 #include <inet/tcp_impl.h>
  89 #include <inet/tcp_cluster.h>
  90 #include <inet/udp_impl.h>
  91 #include <net/pfkeyv2.h>
  92 #include <inet/ipdrop.h>
  93 
  94 #include <inet/ipclassifier.h>
  95 #include <inet/ip_ire.h>
  96 #include <inet/ip_ftable.h>
  97 #include <inet/ip_if.h>


1393         if (tcp->tcp_rthdrdstopts != NULL) {
1394                 mi_free(tcp->tcp_rthdrdstopts);
1395                 tcp->tcp_rthdrdstopts = NULL;
1396                 tcp->tcp_rthdrdstoptslen = 0;
1397         }
1398         ASSERT(tcp->tcp_rthdrdstoptslen == 0);
1399         if (tcp->tcp_rthdr != NULL) {
1400                 mi_free(tcp->tcp_rthdr);
1401                 tcp->tcp_rthdr = NULL;
1402                 tcp->tcp_rthdrlen = 0;
1403         }
1404         ASSERT(tcp->tcp_rthdrlen == 0);
1405 
1406         /*
1407          * The following really blows away a union.
1408          * Since it happens to have exactly two members of identical size,
1409          * the following code is enough.
1410          */
1411         tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
1412 
1413         /* Allow the CC algorithm to clean up after itself. */
1414         if (tcp->tcp_cc_algo != NULL && tcp->tcp_cc_algo->cb_destroy != NULL)
1415                 tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv);
1416 
1417         /*
1418          * If this is a non-STREAM socket still holding on to an upper
1419          * handle, release it. As a result of fallback we might also see
1420          * STREAMS based conns with upper handles, in which case there is
1421          * nothing to do other than clearing the field.
1422          */
1423         if (connp->conn_upper_handle != NULL) {
1424                 if (IPCL_IS_NONSTR(connp)) {
1425                         (*connp->conn_upcalls->su_closed)(
1426                             connp->conn_upper_handle);
1427                         tcp->tcp_detached = B_TRUE;
1428                 }
1429                 connp->conn_upper_handle = NULL;
1430                 connp->conn_upcalls = NULL;
1431         }
1432 }
1433 
1434 /*
1435  * tcp_get_conn/tcp_free_conn
1436  *


1443  * outside the squeue. So when the interrupt comes, we have a clean
1444  * connection sitting in the freelist. Obviously, this buys us
1445  * performance.
1446  *
1447  * 2) Defence against DOS attack. Allocating a tcp/conn in tcp_input_listener
1448  * has multiple disadvantages - tying up the squeue during alloc.
1449  * But allocating the conn/tcp in IP land is also not the best since
1450  * we can't check the 'q' and 'q0' which are protected by squeue and
1451  * blindly allocate memory which might have to be freed here if we are
1452  * not allowed to accept the connection. By using the freelist and
1453  * putting the conn/tcp back in freelist, we don't pay a penalty for
1454  * allocating memory without checking 'q/q0' and freeing it if we can't
1455  * accept the connection.
1456  *
1457  * Care should be taken to put the conn back in the same squeue's freelist
1458  * from which it was allocated. Best results are obtained if conn is
1459  * allocated from listener's squeue and freed to the same. Time wait
1460  * collector will free up the freelist if the connection ends up sitting
1461  * there for too long.
1462  */
1463 conn_t *
1464 tcp_get_conn(void *arg, tcp_stack_t *tcps)
1465 {
             /*
              * Hand back a conn_t bound to the TCP stack `tcps'.  `arg' is the
              * squeue whose private TCP data holds the free list.  A tcp_t
              * found on that free list is recycled; otherwise a new conn_t is
              * created.  Returns NULL if either allocation on the slow path
              * fails.
              */
1466         tcp_t                   *tcp = NULL;
1467         conn_t                  *connp = NULL;
1468         squeue_t                *sqp = (squeue_t *)arg;
1469         tcp_squeue_priv_t       *tcp_time_wait;
1470         netstack_t              *ns;
1471         mblk_t                  *tcp_rsrv_mp = NULL;
1472 
1473         tcp_time_wait =
1474             *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
1475 
             /*
              * The free list head and its count are read and updated while
              * holding tcp_time_wait_lock.
              */
1476         mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
1477         tcp = tcp_time_wait->tcp_free_list;
             /* The list head is non-NULL exactly when the count is non-zero. */
1478         ASSERT((tcp != NULL) ^ (tcp_time_wait->tcp_free_list_cnt == 0));
1479         if (tcp != NULL) {
                     /*
                      * Fast path: unlink the head of the free list, then drop
                      * the lock before touching the rest of the conn.
                      */
1480                 tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
1481                 tcp_time_wait->tcp_free_list_cnt--;
1482                 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1483                 tcp->tcp_time_wait_next = NULL;
1484                 connp = tcp->tcp_connp;
1485                 connp->conn_flags |= IPCL_REUSED;
1486 
1487                 ASSERT(tcp->tcp_tcps == NULL);
1488                 ASSERT(connp->conn_netstack == NULL);
1489                 ASSERT(tcp->tcp_rsrv_mp != NULL);
                     /*
                      * Bind the recycled conn to this stack: take a hold on
                      * the netstack and record it in the conn, its ixa and
                      * the tcp_t before inserting into the global hash.
                      */
1490                 ns = tcps->tcps_netstack;
1491                 netstack_hold(ns);
1492                 connp->conn_netstack = ns;
1493                 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
1494                 tcp->tcp_tcps = tcps;
1495                 ipcl_globalhash_insert(connp);
1496 
                     /*
                      * Only the notify cookie and conn_recv are assigned here;
                      * the remaining callbacks are asserted to still be set.
                      */
1497                 connp->conn_ixa->ixa_notify_cookie = tcp;
1498                 ASSERT(connp->conn_ixa->ixa_notify == tcp_notify);
1499                 connp->conn_recv = tcp_input_data;
1500                 ASSERT(connp->conn_recvicmp == tcp_icmp_input);
1501                 ASSERT(connp->conn_verifyicmp == tcp_verifyicmp);
1502                 return (connp);
1503         }
             /* Slow path: the free list was empty, so allocate from scratch. */
1504         mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1505         /*
1506          * Pre-allocate the tcp_rsrv_mp. This mblk will not be freed until
1507          * this conn_t/tcp_t is freed at ipcl_conn_destroy().
1508          */
1509         tcp_rsrv_mp = allocb(0, BPRI_HI);
1510         if (tcp_rsrv_mp == NULL)
1511                 return (NULL);
1512 
             /*
              * The conn is requested with KM_NOSLEEP; if that fails, release
              * the mblk allocated above before returning NULL.
              */
1513         if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP,
1514             tcps->tcps_netstack)) == NULL) {
1515                 freeb(tcp_rsrv_mp);
1516                 return (NULL);
1517         }
1518 
1519         tcp = connp->conn_tcp;
1520         tcp->tcp_rsrv_mp = tcp_rsrv_mp;
1521         mutex_init(&tcp->tcp_rsrv_mp_lock, NULL, MUTEX_DEFAULT, NULL);
1522 
1523         tcp->tcp_tcps = tcps;
1524 
             /* Install the TCP receive, ICMP and ICMP-verify callbacks. */
1525         connp->conn_recv = tcp_input_data;
1526         connp->conn_recvicmp = tcp_icmp_input;
1527         connp->conn_verifyicmp = tcp_verifyicmp;
1528 
1529         /*
1530          * Register tcp_notify to listen to capability changes detected by IP.
1531          * This upcall is made in the context of the call to conn_ip_output
1532          * thus it is inside the squeue.
1533          */
1534         connp->conn_ixa->ixa_notify = tcp_notify;
1535         connp->conn_ixa->ixa_notify_cookie = tcp;
1536 
1537         return (connp);
1538 }
1539 
1540 /*
1541  * Handle connect to IPv4 destinations, including connections for AF_INET6
1542  * sockets connecting to IPv4 mapped IPv6 destinations.
1543  * Returns zero if OK, a positive errno, or a negative TLI error.
1544  */
1545 static int
1546 tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport,
1547     uint_t srcid)
1548 {
1549         ipaddr_t        dstaddr = *dstaddrp;
1550         uint16_t        lport;
1551         conn_t          *connp = tcp->tcp_connp;
1552         tcp_stack_t     *tcps = tcp->tcp_tcps;
1553         int             error;
1554 
1555         ASSERT(connp->conn_ipversion == IPV4_VERSION);
1556 
1557         /* Check for attempt to connect to INADDR_ANY */


2286         tcp->tcp_in_ack_unsent = 0;
2287         tcp->tcp_cork = B_FALSE;
2288         tcp->tcp_tconnind_started = B_FALSE;
2289 
2290         PRESERVE(tcp->tcp_squeue_bytes);
2291 
2292         tcp->tcp_closemp_used = B_FALSE;
2293 
2294         PRESERVE(tcp->tcp_rsrv_mp);
2295         PRESERVE(tcp->tcp_rsrv_mp_lock);
2296 
2297 #ifdef DEBUG
2298         DONTCARE(tcp->tcmp_stk[0]);
2299 #endif
2300 
2301         PRESERVE(tcp->tcp_connid);
2302 
2303         ASSERT(tcp->tcp_listen_cnt == NULL);
2304         ASSERT(tcp->tcp_reass_tid == 0);
2305 
2306         /* Allow the CC algorithm to clean up after itself. */
2307         if (tcp->tcp_cc_algo->cb_destroy != NULL)
2308                 tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv);
2309         tcp->tcp_cc_algo = NULL;
2310 
2311 #undef  DONTCARE
2312 #undef  PRESERVE
2313 }
2314 
2315 /*
2316  * Initialize the various fields in tcp_t.  If parent (the listener) is non
2317  * NULL, certain values will be inherited from it.
2318  */
2319 void
2320 tcp_init_values(tcp_t *tcp, tcp_t *parent)
2321 {
2322         tcp_stack_t     *tcps = tcp->tcp_tcps;
2323         conn_t          *connp = tcp->tcp_connp;
2324 
2325         ASSERT((connp->conn_family == AF_INET &&
2326             connp->conn_ipversion == IPV4_VERSION) ||
2327             (connp->conn_family == AF_INET6 &&
2328             (connp->conn_ipversion == IPV4_VERSION ||
2329             connp->conn_ipversion == IPV6_VERSION)));
2330 
2331         tcp->tcp_ccv.type = IPPROTO_TCP;
2332         tcp->tcp_ccv.ccvc.tcp = tcp;
2333 
2334         if (parent == NULL) {
2335                 tcp->tcp_cc_algo = tcps->tcps_default_cc_algo;
2336 
2337                 tcp->tcp_naglim = tcps->tcps_naglim_def;
2338 
2339                 tcp->tcp_rto_initial = tcps->tcps_rexmit_interval_initial;
2340                 tcp->tcp_rto_min = tcps->tcps_rexmit_interval_min;
2341                 tcp->tcp_rto_max = tcps->tcps_rexmit_interval_max;
2342 
2343                 tcp->tcp_first_ctimer_threshold =
2344                     tcps->tcps_ip_notify_cinterval;
2345                 tcp->tcp_second_ctimer_threshold =
2346                     tcps->tcps_ip_abort_cinterval;
2347                 tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval;
2348                 tcp->tcp_second_timer_threshold = tcps->tcps_ip_abort_interval;
2349 
2350                 tcp->tcp_fin_wait_2_flush_interval =
2351                     tcps->tcps_fin_wait_2_flush_interval;
2352 
2353                 tcp->tcp_ka_interval = tcps->tcps_keepalive_interval;
2354                 tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval;
2355                 tcp->tcp_ka_cnt = 0;
2356                 tcp->tcp_ka_rinterval = 0;
2357 
2358                 /*
2359                  * Default value of tcp_init_cwnd is 0, so no need to set here
2360                  * if parent is NULL.  But we need to inherit it from parent.
2361                  */
2362         } else {
2363                 /* Inherit various TCP parameters from the parent. */
2364                 tcp->tcp_cc_algo = parent->tcp_cc_algo;
2365 
2366                 tcp->tcp_naglim = parent->tcp_naglim;
2367 
2368                 tcp->tcp_rto_initial = parent->tcp_rto_initial;
2369                 tcp->tcp_rto_min = parent->tcp_rto_min;
2370                 tcp->tcp_rto_max = parent->tcp_rto_max;
2371 
2372                 tcp->tcp_first_ctimer_threshold =
2373                     parent->tcp_first_ctimer_threshold;
2374                 tcp->tcp_second_ctimer_threshold =
2375                     parent->tcp_second_ctimer_threshold;
2376                 tcp->tcp_first_timer_threshold =
2377                     parent->tcp_first_timer_threshold;
2378                 tcp->tcp_second_timer_threshold =
2379                     parent->tcp_second_timer_threshold;
2380 
2381                 tcp->tcp_fin_wait_2_flush_interval =
2382                     parent->tcp_fin_wait_2_flush_interval;
2383 
2384                 tcp->tcp_ka_interval = parent->tcp_ka_interval;
2385                 tcp->tcp_ka_abort_thres = parent->tcp_ka_abort_thres;
2386                 tcp->tcp_ka_cnt = parent->tcp_ka_cnt;
2387                 tcp->tcp_ka_rinterval = parent->tcp_ka_rinterval;
2388 
2389                 tcp->tcp_init_cwnd = parent->tcp_init_cwnd;
2390         }
2391 
2392         if (tcp->tcp_cc_algo->cb_init != NULL)
2393                 VERIFY(tcp->tcp_cc_algo->cb_init(&tcp->tcp_ccv) == 0);
2394 
2395         /*
2396          * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
2397          * will be close to tcp_rexmit_interval_initial.  By doing this, we
2398          * allow the algorithm to adjust slowly to large fluctuations of RTT
2399          * during first few transmissions of a connection as seen in slow
2400          * links.
2401          */
2402         tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2;
2403         tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1;
2404         tcp->tcp_rto = tcp_calculate_rto(tcp, tcps,
2405             tcps->tcps_conn_grace_period);
2406 
2407         tcp->tcp_timer_backoff = 0;
2408         tcp->tcp_ms_we_have_waited = 0;
2409         tcp->tcp_last_recv_time = ddi_get_lbolt();
2410         tcp->tcp_cwnd_max = tcps->tcps_cwnd_max_;
2411         tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
2412 
2413         tcp->tcp_maxpsz_multiplier = tcps->tcps_maxpsz_multiplier;
2414 


2619                         return (NULL);
2620                 }
2621 
2622                 ns = netstack_find_by_cred(credp);
2623                 ASSERT(ns != NULL);
2624                 tcps = ns->netstack_tcp;
2625                 ASSERT(tcps != NULL);
2626 
2627                 /*
2628                  * For exclusive stacks we set the zoneid to zero
2629                  * to make TCP operate as if in the global zone.
2630                  */
2631                 if (tcps->tcps_netstack->netstack_stackid !=
2632                     GLOBAL_NETSTACKID)
2633                         zoneid = GLOBAL_ZONEID;
2634                 else
2635                         zoneid = crgetzoneid(credp);
2636         }
2637 
2638         sqp = IP_SQUEUE_GET((uint_t)gethrtime());
2639         connp = tcp_get_conn(sqp, tcps);
2640         /*
2641          * Both tcp_get_conn and netstack_find_by_cred incremented refcnt,
2642          * so we drop it by one.
2643          */
2644         netstack_rele(tcps->tcps_netstack);
2645         if (connp == NULL) {
2646                 *errorp = ENOSR;
2647                 return (NULL);
2648         }
2649         ASSERT(connp->conn_ixa->ixa_protocol == connp->conn_proto);
2650 
2651         connp->conn_sqp = sqp;
2652         connp->conn_initial_sqp = connp->conn_sqp;
2653         connp->conn_ixa->ixa_sqp = connp->conn_sqp;
2654         tcp = connp->conn_tcp;
2655 
2656         /*
2657          * Besides asking IP to set the checksum for us, have conn_ip_output
2658          * to do the following checks when necessary:
2659          *


3810          * there are many CPUs as we will be adding them 1 by 1.
3811          *
3812          * Note that tcps_sc_cnt never decreases and the tcps_sc[x] pointers
3813          * are not freed until the stack is going away.  So there is no need
3814          * to grab a lock to access the per CPU tcps_sc[x] pointer.
3815          */
3816         mutex_enter(&cpu_lock);
3817         tcps->tcps_sc_cnt = MAX(ncpus, boot_ncpus);
3818         mutex_exit(&cpu_lock);
3819         tcps->tcps_sc = kmem_zalloc(max_ncpus  * sizeof (tcp_stats_cpu_t *),
3820             KM_SLEEP);
3821         for (i = 0; i < tcps->tcps_sc_cnt; i++) {
3822                 tcps->tcps_sc[i] = kmem_zalloc(sizeof (tcp_stats_cpu_t),
3823                     KM_SLEEP);
3824         }
3825 
3826         mutex_init(&tcps->tcps_listener_conf_lock, NULL, MUTEX_DEFAULT, NULL);
3827         list_create(&tcps->tcps_listener_conf, sizeof (tcp_listener_t),
3828             offsetof(tcp_listener_t, tl_link));
3829 
3830         tcps->tcps_default_cc_algo = cc_load_algo(CC_DEFAULT_ALGO_NAME);
3831         VERIFY3P(tcps->tcps_default_cc_algo, !=, NULL);
3832 
3833         return (tcps);
3834 }
3835 
3836 /*
3837  * Called when the IP module is about to be unloaded.
      *
      * Undoes the module-global TCP state referenced in this function, in
      * this order:
      *   - passes the global kstat to tcp_g_kstat_fini(), clears the
      *     tcp_g_kstat pointer and zeroes tcp_g_statistics;
      *   - destroys tcp_random_lock;
      *   - destroys the tcp_timercache and tcp_notsack_blk_cache kmem caches;
      *   - unregisters NS_TCP from the netstack framework.
      *
      * Takes no arguments and returns nothing; no failure is reported to
      * the caller.
3838  */
3839 void
3840 tcp_ddi_g_destroy(void)
3841 {
              /*
               * Hand the global kstat to its fini routine, then clear the
               * pointer and zero the global statistics structure.
               */
3842         tcp_g_kstat_fini(tcp_g_kstat);
3843         tcp_g_kstat = NULL;
3844         bzero(&tcp_g_statistics, sizeof (tcp_g_statistics));
3845 
              /* Destroy the global tcp_random_lock mutex. */
3846         mutex_destroy(&tcp_random_lock);
3847 
              /* Destroy the two module-wide kmem caches. */
3848         kmem_cache_destroy(tcp_timercache);
3849         kmem_cache_destroy(tcp_notsack_blk_cache);
3850 
              /*
               * Unregister TCP from the netstack framework.  This is the last
               * step here, after the globals above have been torn down.
               * NOTE(review): presumably this drives the per-stack
               * shutdown/destroy callbacks -- confirm against the matching
               * netstack_register() call.
               */
3851         netstack_unregister(NS_TCP);
3852 }