6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2011, Joyent Inc. All rights reserved.
25 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
26 * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
27 * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
28 */
29 /* Copyright (c) 1990 Mentat Inc. */
30
31 #include <sys/types.h>
32 #include <sys/stream.h>
33 #include <sys/strsun.h>
34 #include <sys/strsubr.h>
35 #include <sys/stropts.h>
36 #include <sys/strlog.h>
37 #define _SUN_TPI_VERSION 2
38 #include <sys/tihdr.h>
39 #include <sys/timod.h>
40 #include <sys/ddi.h>
41 #include <sys/sunddi.h>
42 #include <sys/suntpi.h>
43 #include <sys/xti_inet.h>
44 #include <sys/cmn_err.h>
45 #include <sys/debug.h>
46 #include <sys/sdt.h>
57
58 #include <sys/errno.h>
59 #include <sys/signal.h>
60 #include <sys/socket.h>
61 #include <sys/socketvar.h>
62 #include <sys/sockio.h>
63 #include <sys/isa_defs.h>
64 #include <sys/md5.h>
65 #include <sys/random.h>
66 #include <sys/uio.h>
67 #include <sys/systm.h>
68 #include <netinet/in.h>
69 #include <netinet/tcp.h>
70 #include <netinet/ip6.h>
71 #include <netinet/icmp6.h>
72 #include <net/if.h>
73 #include <net/route.h>
74 #include <inet/ipsec_impl.h>
75
76 #include <inet/common.h>
77 #include <inet/ip.h>
78 #include <inet/ip_impl.h>
79 #include <inet/ip6.h>
80 #include <inet/ip_ndp.h>
81 #include <inet/proto_set.h>
82 #include <inet/mib2.h>
83 #include <inet/optcom.h>
84 #include <inet/snmpcom.h>
85 #include <inet/kstatcom.h>
86 #include <inet/tcp.h>
87 #include <inet/tcp_impl.h>
88 #include <inet/tcp_cluster.h>
89 #include <inet/udp_impl.h>
90 #include <net/pfkeyv2.h>
91 #include <inet/ipdrop.h>
92
93 #include <inet/ipclassifier.h>
94 #include <inet/ip_ire.h>
95 #include <inet/ip_ftable.h>
96 #include <inet/ip_if.h>
1392 if (tcp->tcp_rthdrdstopts != NULL) {
1393 mi_free(tcp->tcp_rthdrdstopts);
1394 tcp->tcp_rthdrdstopts = NULL;
1395 tcp->tcp_rthdrdstoptslen = 0;
1396 }
1397 ASSERT(tcp->tcp_rthdrdstoptslen == 0);
1398 if (tcp->tcp_rthdr != NULL) {
1399 mi_free(tcp->tcp_rthdr);
1400 tcp->tcp_rthdr = NULL;
1401 tcp->tcp_rthdrlen = 0;
1402 }
1403 ASSERT(tcp->tcp_rthdrlen == 0);
1404
1405 /*
1406 * The following really blows away a union.
1407 * Because it happens to have exactly two members of identical size,
1408 * the following code is enough.
1409 */
1410 tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
1411
1412 /*
1413 * If this is a non-STREAM socket still holding on to an upper
1414 * handle, release it. As a result of fallback we might also see
1415 * STREAMS based conns with upper handles, in which case there is
1416 * nothing to do other than clearing the field.
1417 */
1418 if (connp->conn_upper_handle != NULL) {
1419 if (IPCL_IS_NONSTR(connp)) {
1420 (*connp->conn_upcalls->su_closed)(
1421 connp->conn_upper_handle);
1422 tcp->tcp_detached = B_TRUE;
1423 }
1424 connp->conn_upper_handle = NULL;
1425 connp->conn_upcalls = NULL;
1426 }
1427 }
1428
1429 /*
1430 * tcp_get_conn/tcp_free_conn
1431 *
1438 * outside the squeue. So when the interrupt comes, we have a clean
1439 * connection sitting in the freelist. Obviously, this buys us
1440 * performance.
1441 *
1442 * 2) Defence against DOS attack. Allocating a tcp/conn in tcp_input_listener
1443 * has multiple disadvantages - tying up the squeue during alloc.
1444 * But allocating the conn/tcp in IP land is also not the best since
1445 * we can't check the 'q' and 'q0' which are protected by squeue and
1446 * blindly allocate memory which might have to be freed here if we are
1447 * not allowed to accept the connection. By using the freelist and
1448 * putting the conn/tcp back in freelist, we don't pay a penalty for
1449 * allocating memory without checking 'q/q0' and freeing it if we can't
1450 * accept the connection.
1451 *
1452 * Care should be taken to put the conn back in the same squeue's freelist
1453 * from which it was allocated. Best results are obtained if conn is
1454 * allocated from listener's squeue and freed to the same. Time wait
1455 * collector will free up the freelist if the connection ends up sitting
1456 * there for too long.
1457 */
/*
 * Hand back a conn_t (and its associated tcp_t) bound to the stack 'tcps'.
 *
 * arg  - the squeue_t whose private TCP state holds the free list to try.
 * tcps - the TCP stack instance the connection is attached to.
 *
 * A cached entry is taken from that squeue's tcp_free_list when one is
 * available; otherwise a new conn is created with KM_NOSLEEP.  Returns the
 * conn_t as a void *, or NULL if the reserve mblk or the conn itself cannot
 * be allocated.
 */
1458 void *
1459 tcp_get_conn(void *arg, tcp_stack_t *tcps)
1460 {
1461 tcp_t *tcp = NULL;
1462 conn_t *connp = NULL;
1463 squeue_t *sqp = (squeue_t *)arg;
1464 tcp_squeue_priv_t *tcp_time_wait;
1465 netstack_t *ns;
1466 mblk_t *tcp_rsrv_mp = NULL;
1467
/* Look up the per-squeue TCP private data; it carries the free list. */
1468 tcp_time_wait =
1469 *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
1470
/*
 * The free list head and its count are read and updated under
 * tcp_time_wait_lock.  The ASSERT checks that they agree: the list is
 * non-empty exactly when the count is non-zero.
 */
1471 mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
1472 tcp = tcp_time_wait->tcp_free_list;
1473 ASSERT((tcp != NULL) ^ (tcp_time_wait->tcp_free_list_cnt == 0));
1474 if (tcp != NULL) {
/* Fast path: unlink the head entry, then drop the lock. */
1475 tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
1476 tcp_time_wait->tcp_free_list_cnt--;
1477 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1478 tcp->tcp_time_wait_next = NULL;
1479 connp = tcp->tcp_connp;
1480 connp->conn_flags |= IPCL_REUSED;
1481
/*
 * A cached entry is expected to have no stack/netstack attached but to
 * still own its reserve mblk.  Take a netstack hold and attach the entry
 * to the caller's stack before inserting it into the global hash.
 */
1482 ASSERT(tcp->tcp_tcps == NULL);
1483 ASSERT(connp->conn_netstack == NULL);
1484 ASSERT(tcp->tcp_rsrv_mp != NULL);
1485 ns = tcps->tcps_netstack;
1486 netstack_hold(ns);
1487 connp->conn_netstack = ns;
1488 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
1489 tcp->tcp_tcps = tcps;
1490 ipcl_globalhash_insert(connp);
1491
/*
 * Only the notify cookie and conn_recv are (re)assigned here; the
 * ASSERTs expect the notify, ICMP receive and ICMP verify callbacks to
 * still be in place on a reused entry.
 */
1492 connp->conn_ixa->ixa_notify_cookie = tcp;
1493 ASSERT(connp->conn_ixa->ixa_notify == tcp_notify);
1494 connp->conn_recv = tcp_input_data;
1495 ASSERT(connp->conn_recvicmp == tcp_icmp_input);
1496 ASSERT(connp->conn_verifyicmp == tcp_verifyicmp);
1497 return ((void *)connp);
1498 }
/* Slow path: the free list was empty, so build a new conn. */
1499 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1500 /*
1501 * Pre-allocate the tcp_rsrv_mp. This mblk will not be freed until
1502 * this conn_t/tcp_t is freed at ipcl_conn_destroy().
1503 */
1504 tcp_rsrv_mp = allocb(0, BPRI_HI);
1505 if (tcp_rsrv_mp == NULL)
1506 return (NULL);
1507
/* If the conn cannot be created, give the reserve mblk back and fail. */
1508 if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP,
1509 tcps->tcps_netstack)) == NULL) {
1510 freeb(tcp_rsrv_mp);
1511 return (NULL);
1512 }
1513
1514 tcp = connp->conn_tcp;
1515 tcp->tcp_rsrv_mp = tcp_rsrv_mp;
1516 mutex_init(&tcp->tcp_rsrv_mp_lock, NULL, MUTEX_DEFAULT, NULL);
1517
/*
 * NOTE(review): unlike the reuse path, no netstack_hold() or
 * ipcl_globalhash_insert() is done here - presumably ipcl_conn_create()
 * takes care of both; confirm against its implementation.
 */
1518 tcp->tcp_tcps = tcps;
1519
/* Install the TCP receive, ICMP receive and ICMP verify callbacks. */
1520 connp->conn_recv = tcp_input_data;
1521 connp->conn_recvicmp = tcp_icmp_input;
1522 connp->conn_verifyicmp = tcp_verifyicmp;
1523
1524 /*
1525 * Register tcp_notify to listen to capability changes detected by IP.
1526 * This upcall is made in the context of the call to conn_ip_output
1527 * thus it is inside the squeue.
1528 */
1529 connp->conn_ixa->ixa_notify = tcp_notify;
1530 connp->conn_ixa->ixa_notify_cookie = tcp;
1531
1532 return ((void *)connp);
1533 }
1534
1535 /*
1536 * Handle connect to IPv4 destinations, including connections for AF_INET6
1537 * sockets connecting to IPv4 mapped IPv6 destinations.
1538 * Returns zero if OK, a positive errno, or a negative TLI error.
1539 */
1540 static int
1541 tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport,
1542 uint_t srcid)
1543 {
1544 ipaddr_t dstaddr = *dstaddrp;
1545 uint16_t lport;
1546 conn_t *connp = tcp->tcp_connp;
1547 tcp_stack_t *tcps = tcp->tcp_tcps;
1548 int error;
1549
1550 ASSERT(connp->conn_ipversion == IPV4_VERSION);
1551
1552 /* Check for attempt to connect to INADDR_ANY */
2281 tcp->tcp_in_ack_unsent = 0;
2282 tcp->tcp_cork = B_FALSE;
2283 tcp->tcp_tconnind_started = B_FALSE;
2284
2285 PRESERVE(tcp->tcp_squeue_bytes);
2286
2287 tcp->tcp_closemp_used = B_FALSE;
2288
2289 PRESERVE(tcp->tcp_rsrv_mp);
2290 PRESERVE(tcp->tcp_rsrv_mp_lock);
2291
2292 #ifdef DEBUG
2293 DONTCARE(tcp->tcmp_stk[0]);
2294 #endif
2295
2296 PRESERVE(tcp->tcp_connid);
2297
2298 ASSERT(tcp->tcp_listen_cnt == NULL);
2299 ASSERT(tcp->tcp_reass_tid == 0);
2300
2301 #undef DONTCARE
2302 #undef PRESERVE
2303 }
2304
2305 /*
2306 * Initialize the various fields in tcp_t. If parent (the listener) is non
2307 * NULL, certain values will be inherited from it.
2308 */
2309 void
2310 tcp_init_values(tcp_t *tcp, tcp_t *parent)
2311 {
2312 tcp_stack_t *tcps = tcp->tcp_tcps;
2313 conn_t *connp = tcp->tcp_connp;
2314
2315 ASSERT((connp->conn_family == AF_INET &&
2316 connp->conn_ipversion == IPV4_VERSION) ||
2317 (connp->conn_family == AF_INET6 &&
2318 (connp->conn_ipversion == IPV4_VERSION ||
2319 connp->conn_ipversion == IPV6_VERSION)));
2320
2321 if (parent == NULL) {
2322 tcp->tcp_naglim = tcps->tcps_naglim_def;
2323
2324 tcp->tcp_rto_initial = tcps->tcps_rexmit_interval_initial;
2325 tcp->tcp_rto_min = tcps->tcps_rexmit_interval_min;
2326 tcp->tcp_rto_max = tcps->tcps_rexmit_interval_max;
2327
2328 tcp->tcp_first_ctimer_threshold =
2329 tcps->tcps_ip_notify_cinterval;
2330 tcp->tcp_second_ctimer_threshold =
2331 tcps->tcps_ip_abort_cinterval;
2332 tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval;
2333 tcp->tcp_second_timer_threshold = tcps->tcps_ip_abort_interval;
2334
2335 tcp->tcp_fin_wait_2_flush_interval =
2336 tcps->tcps_fin_wait_2_flush_interval;
2337
2338 tcp->tcp_ka_interval = tcps->tcps_keepalive_interval;
2339 tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval;
2340 tcp->tcp_ka_cnt = 0;
2341 tcp->tcp_ka_rinterval = 0;
2342
2343 /*
2344 * Default value of tcp_init_cwnd is 0, so no need to set here
2345 * if parent is NULL. But we need to inherit it from parent.
2346 */
2347 } else {
2348 /* Inherit various TCP parameters from the parent. */
2349 tcp->tcp_naglim = parent->tcp_naglim;
2350
2351 tcp->tcp_rto_initial = parent->tcp_rto_initial;
2352 tcp->tcp_rto_min = parent->tcp_rto_min;
2353 tcp->tcp_rto_max = parent->tcp_rto_max;
2354
2355 tcp->tcp_first_ctimer_threshold =
2356 parent->tcp_first_ctimer_threshold;
2357 tcp->tcp_second_ctimer_threshold =
2358 parent->tcp_second_ctimer_threshold;
2359 tcp->tcp_first_timer_threshold =
2360 parent->tcp_first_timer_threshold;
2361 tcp->tcp_second_timer_threshold =
2362 parent->tcp_second_timer_threshold;
2363
2364 tcp->tcp_fin_wait_2_flush_interval =
2365 parent->tcp_fin_wait_2_flush_interval;
2366
2367 tcp->tcp_ka_interval = parent->tcp_ka_interval;
2368 tcp->tcp_ka_abort_thres = parent->tcp_ka_abort_thres;
2369 tcp->tcp_ka_cnt = parent->tcp_ka_cnt;
2370 tcp->tcp_ka_rinterval = parent->tcp_ka_rinterval;
2371
2372 tcp->tcp_init_cwnd = parent->tcp_init_cwnd;
2373 }
2374
2375 /*
2376 * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
2377 * will be close to tcp_rexmit_interval_initial. By doing this, we
2378 * allow the algorithm to adjust slowly to large fluctuations of RTT
2379 * during first few transmissions of a connection as seen in slow
2380 * links.
2381 */
2382 tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2;
2383 tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1;
2384 tcp->tcp_rto = tcp_calculate_rto(tcp, tcps,
2385 tcps->tcps_conn_grace_period);
2386
2387 tcp->tcp_timer_backoff = 0;
2388 tcp->tcp_ms_we_have_waited = 0;
2389 tcp->tcp_last_recv_time = ddi_get_lbolt();
2390 tcp->tcp_cwnd_max = tcps->tcps_cwnd_max_;
2391 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
2392
2393 tcp->tcp_maxpsz_multiplier = tcps->tcps_maxpsz_multiplier;
2394
2599 return (NULL);
2600 }
2601
2602 ns = netstack_find_by_cred(credp);
2603 ASSERT(ns != NULL);
2604 tcps = ns->netstack_tcp;
2605 ASSERT(tcps != NULL);
2606
2607 /*
2608 * For exclusive stacks we set the zoneid to zero
2609 * to make TCP operate as if in the global zone.
2610 */
2611 if (tcps->tcps_netstack->netstack_stackid !=
2612 GLOBAL_NETSTACKID)
2613 zoneid = GLOBAL_ZONEID;
2614 else
2615 zoneid = crgetzoneid(credp);
2616 }
2617
2618 sqp = IP_SQUEUE_GET((uint_t)gethrtime());
2619 connp = (conn_t *)tcp_get_conn(sqp, tcps);
2620 /*
2621 * Both tcp_get_conn and netstack_find_by_cred incremented refcnt,
2622 * so we drop it by one.
2623 */
2624 netstack_rele(tcps->tcps_netstack);
2625 if (connp == NULL) {
2626 *errorp = ENOSR;
2627 return (NULL);
2628 }
2629 ASSERT(connp->conn_ixa->ixa_protocol == connp->conn_proto);
2630
2631 connp->conn_sqp = sqp;
2632 connp->conn_initial_sqp = connp->conn_sqp;
2633 connp->conn_ixa->ixa_sqp = connp->conn_sqp;
2634 tcp = connp->conn_tcp;
2635
2636 /*
2637 * Besides asking IP to set the checksum for us, have conn_ip_output
2638 * to do the following checks when necessary:
2639 *
3790 * there are many CPUs as we will be adding them 1 by 1.
3791 *
3792 * Note that tcps_sc_cnt never decreases and the tcps_sc[x] pointers
3793 * are not freed until the stack is going away. So there is no need
3794 * to grab a lock to access the per CPU tcps_sc[x] pointer.
3795 */
3796 mutex_enter(&cpu_lock);
3797 tcps->tcps_sc_cnt = MAX(ncpus, boot_ncpus);
3798 mutex_exit(&cpu_lock);
3799 tcps->tcps_sc = kmem_zalloc(max_ncpus * sizeof (tcp_stats_cpu_t *),
3800 KM_SLEEP);
3801 for (i = 0; i < tcps->tcps_sc_cnt; i++) {
3802 tcps->tcps_sc[i] = kmem_zalloc(sizeof (tcp_stats_cpu_t),
3803 KM_SLEEP);
3804 }
3805
3806 mutex_init(&tcps->tcps_listener_conf_lock, NULL, MUTEX_DEFAULT, NULL);
3807 list_create(&tcps->tcps_listener_conf, sizeof (tcp_listener_t),
3808 offsetof(tcp_listener_t, tl_link));
3809
3810 return (tcps);
3811 }
3812
3813 /*
3814 * Called when the IP module is about to be unloaded.
3815 */
3816 void
3817 tcp_ddi_g_destroy(void)
3818 {
/*
 * Tear down the TCP module-global state touched below: the global kstat,
 * the global statistics block, tcp_random_lock, two kmem caches, and the
 * NS_TCP netstack registration.
 * NOTE(review): presumably this mirrors a matching global init routine
 * that is not visible here - confirm the ordering against it.
 */
/* Pass the global kstat to its fini routine and clear the pointer. */
3819 tcp_g_kstat_fini(tcp_g_kstat);
3820 tcp_g_kstat = NULL;
/* Zero the global statistics block. */
3821 bzero(&tcp_g_statistics, sizeof (tcp_g_statistics));
3822
3823 mutex_destroy(&tcp_random_lock);
3824
/* Destroy the timer and not-SACK-block kmem caches. */
3825 kmem_cache_destroy(tcp_timercache);
3826 kmem_cache_destroy(tcp_notsack_blk_cache);
3827
/* Finally, remove TCP's registration with the netstack framework. */
3828 netstack_unregister(NS_TCP);
3829 }
|
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2011, Joyent Inc. All rights reserved.
25 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
26 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
27 * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
28 */
29 /* Copyright (c) 1990 Mentat Inc. */
30
31 #include <sys/types.h>
32 #include <sys/stream.h>
33 #include <sys/strsun.h>
34 #include <sys/strsubr.h>
35 #include <sys/stropts.h>
36 #include <sys/strlog.h>
37 #define _SUN_TPI_VERSION 2
38 #include <sys/tihdr.h>
39 #include <sys/timod.h>
40 #include <sys/ddi.h>
41 #include <sys/sunddi.h>
42 #include <sys/suntpi.h>
43 #include <sys/xti_inet.h>
44 #include <sys/cmn_err.h>
45 #include <sys/debug.h>
46 #include <sys/sdt.h>
57
58 #include <sys/errno.h>
59 #include <sys/signal.h>
60 #include <sys/socket.h>
61 #include <sys/socketvar.h>
62 #include <sys/sockio.h>
63 #include <sys/isa_defs.h>
64 #include <sys/md5.h>
65 #include <sys/random.h>
66 #include <sys/uio.h>
67 #include <sys/systm.h>
68 #include <netinet/in.h>
69 #include <netinet/tcp.h>
70 #include <netinet/ip6.h>
71 #include <netinet/icmp6.h>
72 #include <net/if.h>
73 #include <net/route.h>
74 #include <inet/ipsec_impl.h>
75
76 #include <inet/common.h>
77 #include <inet/cc.h>
78 #include <inet/ip.h>
79 #include <inet/ip_impl.h>
80 #include <inet/ip6.h>
81 #include <inet/ip_ndp.h>
82 #include <inet/proto_set.h>
83 #include <inet/mib2.h>
84 #include <inet/optcom.h>
85 #include <inet/snmpcom.h>
86 #include <inet/kstatcom.h>
87 #include <inet/tcp.h>
88 #include <inet/tcp_impl.h>
89 #include <inet/tcp_cluster.h>
90 #include <inet/udp_impl.h>
91 #include <net/pfkeyv2.h>
92 #include <inet/ipdrop.h>
93
94 #include <inet/ipclassifier.h>
95 #include <inet/ip_ire.h>
96 #include <inet/ip_ftable.h>
97 #include <inet/ip_if.h>
1393 if (tcp->tcp_rthdrdstopts != NULL) {
1394 mi_free(tcp->tcp_rthdrdstopts);
1395 tcp->tcp_rthdrdstopts = NULL;
1396 tcp->tcp_rthdrdstoptslen = 0;
1397 }
1398 ASSERT(tcp->tcp_rthdrdstoptslen == 0);
1399 if (tcp->tcp_rthdr != NULL) {
1400 mi_free(tcp->tcp_rthdr);
1401 tcp->tcp_rthdr = NULL;
1402 tcp->tcp_rthdrlen = 0;
1403 }
1404 ASSERT(tcp->tcp_rthdrlen == 0);
1405
1406 /*
1407 * The following really blows away a union.
1408 * Because it happens to have exactly two members of identical size,
1409 * the following code is enough.
1410 */
1411 tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
1412
1413 /* Allow the CC algorithm to clean up after itself. */
1414 if (tcp->tcp_cc_algo != NULL && tcp->tcp_cc_algo->cb_destroy != NULL)
1415 tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv);
1416
1417 /*
1418 * If this is a non-STREAM socket still holding on to an upper
1419 * handle, release it. As a result of fallback we might also see
1420 * STREAMS based conns with upper handles, in which case there is
1421 * nothing to do other than clearing the field.
1422 */
1423 if (connp->conn_upper_handle != NULL) {
1424 if (IPCL_IS_NONSTR(connp)) {
1425 (*connp->conn_upcalls->su_closed)(
1426 connp->conn_upper_handle);
1427 tcp->tcp_detached = B_TRUE;
1428 }
1429 connp->conn_upper_handle = NULL;
1430 connp->conn_upcalls = NULL;
1431 }
1432 }
1433
1434 /*
1435 * tcp_get_conn/tcp_free_conn
1436 *
1443 * outside the squeue. So when the interrupt comes, we have a clean
1444 * connection sitting in the freelist. Obviously, this buys us
1445 * performance.
1446 *
1447 * 2) Defence against DOS attack. Allocating a tcp/conn in tcp_input_listener
1448 * has multiple disadvantages - tying up the squeue during alloc.
1449 * But allocating the conn/tcp in IP land is also not the best since
1450 * we can't check the 'q' and 'q0' which are protected by squeue and
1451 * blindly allocate memory which might have to be freed here if we are
1452 * not allowed to accept the connection. By using the freelist and
1453 * putting the conn/tcp back in freelist, we don't pay a penalty for
1454 * allocating memory without checking 'q/q0' and freeing it if we can't
1455 * accept the connection.
1456 *
1457 * Care should be taken to put the conn back in the same squeue's freelist
1458 * from which it was allocated. Best results are obtained if conn is
1459 * allocated from listener's squeue and freed to the same. Time wait
1460 * collector will free up the freelist if the connection ends up sitting
1461 * there for too long.
1462 */
/*
 * Hand back a conn_t (and its associated tcp_t) bound to the stack 'tcps'.
 *
 * arg  - the squeue_t whose private TCP state holds the free list to try.
 * tcps - the TCP stack instance the connection is attached to.
 *
 * A cached entry is taken from that squeue's tcp_free_list when one is
 * available; otherwise a new conn is created with KM_NOSLEEP.  Returns
 * NULL if the reserve mblk or the conn itself cannot be allocated.
 */
1463 conn_t *
1464 tcp_get_conn(void *arg, tcp_stack_t *tcps)
1465 {
1466 tcp_t *tcp = NULL;
1467 conn_t *connp = NULL;
1468 squeue_t *sqp = (squeue_t *)arg;
1469 tcp_squeue_priv_t *tcp_time_wait;
1470 netstack_t *ns;
1471 mblk_t *tcp_rsrv_mp = NULL;
1472
/* Look up the per-squeue TCP private data; it carries the free list. */
1473 tcp_time_wait =
1474 *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
1475
/*
 * The free list head and its count are read and updated under
 * tcp_time_wait_lock.  The ASSERT checks that they agree: the list is
 * non-empty exactly when the count is non-zero.
 */
1476 mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
1477 tcp = tcp_time_wait->tcp_free_list;
1478 ASSERT((tcp != NULL) ^ (tcp_time_wait->tcp_free_list_cnt == 0));
1479 if (tcp != NULL) {
/* Fast path: unlink the head entry, then drop the lock. */
1480 tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
1481 tcp_time_wait->tcp_free_list_cnt--;
1482 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1483 tcp->tcp_time_wait_next = NULL;
1484 connp = tcp->tcp_connp;
1485 connp->conn_flags |= IPCL_REUSED;
1486
/*
 * A cached entry is expected to have no stack/netstack attached but to
 * still own its reserve mblk.  Take a netstack hold and attach the entry
 * to the caller's stack before inserting it into the global hash.
 */
1487 ASSERT(tcp->tcp_tcps == NULL);
1488 ASSERT(connp->conn_netstack == NULL);
1489 ASSERT(tcp->tcp_rsrv_mp != NULL);
1490 ns = tcps->tcps_netstack;
1491 netstack_hold(ns);
1492 connp->conn_netstack = ns;
1493 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
1494 tcp->tcp_tcps = tcps;
1495 ipcl_globalhash_insert(connp);
1496
/*
 * Only the notify cookie and conn_recv are (re)assigned here; the
 * ASSERTs expect the notify, ICMP receive and ICMP verify callbacks to
 * still be in place on a reused entry.
 */
1497 connp->conn_ixa->ixa_notify_cookie = tcp;
1498 ASSERT(connp->conn_ixa->ixa_notify == tcp_notify);
1499 connp->conn_recv = tcp_input_data;
1500 ASSERT(connp->conn_recvicmp == tcp_icmp_input);
1501 ASSERT(connp->conn_verifyicmp == tcp_verifyicmp);
1502 return (connp);
1503 }
/* Slow path: the free list was empty, so build a new conn. */
1504 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1505 /*
1506 * Pre-allocate the tcp_rsrv_mp. This mblk will not be freed until
1507 * this conn_t/tcp_t is freed at ipcl_conn_destroy().
1508 */
1509 tcp_rsrv_mp = allocb(0, BPRI_HI);
1510 if (tcp_rsrv_mp == NULL)
1511 return (NULL);
1512
/* If the conn cannot be created, give the reserve mblk back and fail. */
1513 if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP,
1514 tcps->tcps_netstack)) == NULL) {
1515 freeb(tcp_rsrv_mp);
1516 return (NULL);
1517 }
1518
1519 tcp = connp->conn_tcp;
1520 tcp->tcp_rsrv_mp = tcp_rsrv_mp;
1521 mutex_init(&tcp->tcp_rsrv_mp_lock, NULL, MUTEX_DEFAULT, NULL);
1522
/*
 * NOTE(review): unlike the reuse path, no netstack_hold() or
 * ipcl_globalhash_insert() is done here - presumably ipcl_conn_create()
 * takes care of both; confirm against its implementation.
 */
1523 tcp->tcp_tcps = tcps;
1524
/* Install the TCP receive, ICMP receive and ICMP verify callbacks. */
1525 connp->conn_recv = tcp_input_data;
1526 connp->conn_recvicmp = tcp_icmp_input;
1527 connp->conn_verifyicmp = tcp_verifyicmp;
1528
1529 /*
1530 * Register tcp_notify to listen to capability changes detected by IP.
1531 * This upcall is made in the context of the call to conn_ip_output
1532 * thus it is inside the squeue.
1533 */
1534 connp->conn_ixa->ixa_notify = tcp_notify;
1535 connp->conn_ixa->ixa_notify_cookie = tcp;
1536
1537 return (connp);
1538 }
1539
1540 /*
1541 * Handle connect to IPv4 destinations, including connections for AF_INET6
1542 * sockets connecting to IPv4 mapped IPv6 destinations.
1543 * Returns zero if OK, a positive errno, or a negative TLI error.
1544 */
1545 static int
1546 tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport,
1547 uint_t srcid)
1548 {
1549 ipaddr_t dstaddr = *dstaddrp;
1550 uint16_t lport;
1551 conn_t *connp = tcp->tcp_connp;
1552 tcp_stack_t *tcps = tcp->tcp_tcps;
1553 int error;
1554
1555 ASSERT(connp->conn_ipversion == IPV4_VERSION);
1556
1557 /* Check for attempt to connect to INADDR_ANY */
2286 tcp->tcp_in_ack_unsent = 0;
2287 tcp->tcp_cork = B_FALSE;
2288 tcp->tcp_tconnind_started = B_FALSE;
2289
2290 PRESERVE(tcp->tcp_squeue_bytes);
2291
2292 tcp->tcp_closemp_used = B_FALSE;
2293
2294 PRESERVE(tcp->tcp_rsrv_mp);
2295 PRESERVE(tcp->tcp_rsrv_mp_lock);
2296
2297 #ifdef DEBUG
2298 DONTCARE(tcp->tcmp_stk[0]);
2299 #endif
2300
2301 PRESERVE(tcp->tcp_connid);
2302
2303 ASSERT(tcp->tcp_listen_cnt == NULL);
2304 ASSERT(tcp->tcp_reass_tid == 0);
2305
2306 /* Allow the CC algorithm to clean up after itself. */
2307 if (tcp->tcp_cc_algo->cb_destroy != NULL)
2308 tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv);
2309 tcp->tcp_cc_algo = NULL;
2310
2311 #undef DONTCARE
2312 #undef PRESERVE
2313 }
2314
2315 /*
2316 * Initialize the various fields in tcp_t. If parent (the listener) is non
2317 * NULL, certain values will be inherited from it.
2318 */
2319 void
2320 tcp_init_values(tcp_t *tcp, tcp_t *parent)
2321 {
2322 tcp_stack_t *tcps = tcp->tcp_tcps;
2323 conn_t *connp = tcp->tcp_connp;
2324
2325 ASSERT((connp->conn_family == AF_INET &&
2326 connp->conn_ipversion == IPV4_VERSION) ||
2327 (connp->conn_family == AF_INET6 &&
2328 (connp->conn_ipversion == IPV4_VERSION ||
2329 connp->conn_ipversion == IPV6_VERSION)));
2330
2331 tcp->tcp_ccv.type = IPPROTO_TCP;
2332 tcp->tcp_ccv.ccvc.tcp = tcp;
2333
2334 if (parent == NULL) {
2335 tcp->tcp_cc_algo = tcps->tcps_default_cc_algo;
2336
2337 tcp->tcp_naglim = tcps->tcps_naglim_def;
2338
2339 tcp->tcp_rto_initial = tcps->tcps_rexmit_interval_initial;
2340 tcp->tcp_rto_min = tcps->tcps_rexmit_interval_min;
2341 tcp->tcp_rto_max = tcps->tcps_rexmit_interval_max;
2342
2343 tcp->tcp_first_ctimer_threshold =
2344 tcps->tcps_ip_notify_cinterval;
2345 tcp->tcp_second_ctimer_threshold =
2346 tcps->tcps_ip_abort_cinterval;
2347 tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval;
2348 tcp->tcp_second_timer_threshold = tcps->tcps_ip_abort_interval;
2349
2350 tcp->tcp_fin_wait_2_flush_interval =
2351 tcps->tcps_fin_wait_2_flush_interval;
2352
2353 tcp->tcp_ka_interval = tcps->tcps_keepalive_interval;
2354 tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval;
2355 tcp->tcp_ka_cnt = 0;
2356 tcp->tcp_ka_rinterval = 0;
2357
2358 /*
2359 * Default value of tcp_init_cwnd is 0, so no need to set here
2360 * if parent is NULL. But we need to inherit it from parent.
2361 */
2362 } else {
2363 /* Inherit various TCP parameters from the parent. */
2364 tcp->tcp_cc_algo = parent->tcp_cc_algo;
2365
2366 tcp->tcp_naglim = parent->tcp_naglim;
2367
2368 tcp->tcp_rto_initial = parent->tcp_rto_initial;
2369 tcp->tcp_rto_min = parent->tcp_rto_min;
2370 tcp->tcp_rto_max = parent->tcp_rto_max;
2371
2372 tcp->tcp_first_ctimer_threshold =
2373 parent->tcp_first_ctimer_threshold;
2374 tcp->tcp_second_ctimer_threshold =
2375 parent->tcp_second_ctimer_threshold;
2376 tcp->tcp_first_timer_threshold =
2377 parent->tcp_first_timer_threshold;
2378 tcp->tcp_second_timer_threshold =
2379 parent->tcp_second_timer_threshold;
2380
2381 tcp->tcp_fin_wait_2_flush_interval =
2382 parent->tcp_fin_wait_2_flush_interval;
2383
2384 tcp->tcp_ka_interval = parent->tcp_ka_interval;
2385 tcp->tcp_ka_abort_thres = parent->tcp_ka_abort_thres;
2386 tcp->tcp_ka_cnt = parent->tcp_ka_cnt;
2387 tcp->tcp_ka_rinterval = parent->tcp_ka_rinterval;
2388
2389 tcp->tcp_init_cwnd = parent->tcp_init_cwnd;
2390 }
2391
2392 if (tcp->tcp_cc_algo->cb_init != NULL)
2393 VERIFY(tcp->tcp_cc_algo->cb_init(&tcp->tcp_ccv) == 0);
2394
2395 /*
2396 * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
2397 * will be close to tcp_rexmit_interval_initial. By doing this, we
2398 * allow the algorithm to adjust slowly to large fluctuations of RTT
2399 * during first few transmissions of a connection as seen in slow
2400 * links.
2401 */
2402 tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2;
2403 tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1;
2404 tcp->tcp_rto = tcp_calculate_rto(tcp, tcps,
2405 tcps->tcps_conn_grace_period);
2406
2407 tcp->tcp_timer_backoff = 0;
2408 tcp->tcp_ms_we_have_waited = 0;
2409 tcp->tcp_last_recv_time = ddi_get_lbolt();
2410 tcp->tcp_cwnd_max = tcps->tcps_cwnd_max_;
2411 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
2412
2413 tcp->tcp_maxpsz_multiplier = tcps->tcps_maxpsz_multiplier;
2414
2619 return (NULL);
2620 }
2621
2622 ns = netstack_find_by_cred(credp);
2623 ASSERT(ns != NULL);
2624 tcps = ns->netstack_tcp;
2625 ASSERT(tcps != NULL);
2626
2627 /*
2628 * For exclusive stacks we set the zoneid to zero
2629 * to make TCP operate as if in the global zone.
2630 */
2631 if (tcps->tcps_netstack->netstack_stackid !=
2632 GLOBAL_NETSTACKID)
2633 zoneid = GLOBAL_ZONEID;
2634 else
2635 zoneid = crgetzoneid(credp);
2636 }
2637
2638 sqp = IP_SQUEUE_GET((uint_t)gethrtime());
2639 connp = tcp_get_conn(sqp, tcps);
2640 /*
2641 * Both tcp_get_conn and netstack_find_by_cred incremented refcnt,
2642 * so we drop it by one.
2643 */
2644 netstack_rele(tcps->tcps_netstack);
2645 if (connp == NULL) {
2646 *errorp = ENOSR;
2647 return (NULL);
2648 }
2649 ASSERT(connp->conn_ixa->ixa_protocol == connp->conn_proto);
2650
2651 connp->conn_sqp = sqp;
2652 connp->conn_initial_sqp = connp->conn_sqp;
2653 connp->conn_ixa->ixa_sqp = connp->conn_sqp;
2654 tcp = connp->conn_tcp;
2655
2656 /*
2657 * Besides asking IP to set the checksum for us, have conn_ip_output
2658 * to do the following checks when necessary:
2659 *
3810 * there are many CPUs as we will be adding them 1 by 1.
3811 *
3812 * Note that tcps_sc_cnt never decreases and the tcps_sc[x] pointers
3813 * are not freed until the stack is going away. So there is no need
3814 * to grab a lock to access the per CPU tcps_sc[x] pointer.
3815 */
3816 mutex_enter(&cpu_lock);
3817 tcps->tcps_sc_cnt = MAX(ncpus, boot_ncpus);
3818 mutex_exit(&cpu_lock);
3819 tcps->tcps_sc = kmem_zalloc(max_ncpus * sizeof (tcp_stats_cpu_t *),
3820 KM_SLEEP);
3821 for (i = 0; i < tcps->tcps_sc_cnt; i++) {
3822 tcps->tcps_sc[i] = kmem_zalloc(sizeof (tcp_stats_cpu_t),
3823 KM_SLEEP);
3824 }
3825
3826 mutex_init(&tcps->tcps_listener_conf_lock, NULL, MUTEX_DEFAULT, NULL);
3827 list_create(&tcps->tcps_listener_conf, sizeof (tcp_listener_t),
3828 offsetof(tcp_listener_t, tl_link));
3829
3830 tcps->tcps_default_cc_algo = cc_load_algo(CC_DEFAULT_ALGO_NAME);
3831 VERIFY3P(tcps->tcps_default_cc_algo, !=, NULL);
3832
3833 return (tcps);
3834 }
3835
3836 /*
3837 * Called when the IP module is about to be unloaded.
3838 */
3839 void
3840 tcp_ddi_g_destroy(void)
3841 {
/*
 * Tear down the TCP module-global state touched below: the global kstat,
 * the global statistics block, tcp_random_lock, two kmem caches, and the
 * NS_TCP netstack registration.
 * NOTE(review): presumably this mirrors a matching global init routine
 * that is not visible here - confirm the ordering against it.
 */
/* Pass the global kstat to its fini routine and clear the pointer. */
3842 tcp_g_kstat_fini(tcp_g_kstat);
3843 tcp_g_kstat = NULL;
/* Zero the global statistics block. */
3844 bzero(&tcp_g_statistics, sizeof (tcp_g_statistics));
3845
3846 mutex_destroy(&tcp_random_lock);
3847
/* Destroy the timer and not-SACK-block kmem caches. */
3848 kmem_cache_destroy(tcp_timercache);
3849 kmem_cache_destroy(tcp_notsack_blk_cache);
3850
/* Finally, remove TCP's registration with the netstack framework. */
3851 netstack_unregister(NS_TCP);
3852 }
|