Print this page
11546 Track TCP round-trip time in nanoseconds
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Portions contributed by: Brandon Baker <bbaker@delphix.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>


   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2011, Joyent Inc. All rights reserved.
  25  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  26  * Copyright (c) 2013,2014 by Delphix. All rights reserved.
  27  * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
  28  */
  29 /* Copyright (c) 1990 Mentat Inc. */
  30 
  31 #include <sys/types.h>
  32 #include <sys/stream.h>
  33 #include <sys/strsun.h>
  34 #include <sys/strsubr.h>
  35 #include <sys/stropts.h>
  36 #include <sys/strlog.h>
  37 #define _SUN_TPI_VERSION 2
  38 #include <sys/tihdr.h>
  39 #include <sys/timod.h>
  40 #include <sys/ddi.h>
  41 #include <sys/sunddi.h>
  42 #include <sys/suntpi.h>
  43 #include <sys/xti_inet.h>
  44 #include <sys/cmn_err.h>
  45 #include <sys/debug.h>
  46 #include <sys/sdt.h>


 249                 ((uint_t)(accid) & (TCP_ACCEPTOR_FANOUT_SIZE - 1))
 250 #endif  /* _ILP32 */
 251 
 252 /*
 253  * Minimum number of connections which can be created per listener.  Used
 254  * when the listener connection count is in effect.
 255  */
 256 static uint32_t tcp_min_conn_listener = 2;
 257 
 258 uint32_t tcp_early_abort = 30;
 259 
 260 /* TCP timer control structure: a routine paired with its tcp_t argument */
 261 typedef struct tcpt_s {
 262         pfv_t   tcpt_pfv;       /* The routine we are to call */
 263         tcp_t   *tcpt_tcp;      /* The tcp_t we pass to tcpt_pfv */
 264 } tcpt_t;
 265 
 266 /*
 267  * Functions called directly via squeue having a prototype of edesc_t.
 268  */
 269 void            tcp_input_listener(void *arg, mblk_t *mp, void *arg2,
 270     ip_recv_attr_t *ira);
 271 void            tcp_input_data(void *arg, mblk_t *mp, void *arg2,
 272     ip_recv_attr_t *ira);
 273 static void     tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2,
 274     ip_recv_attr_t *dummy);
 275 
 276 
 277 /* Prototype for TCP functions */
 278 static void     tcp_random_init(void);
 279 int             tcp_random(void);
 280 static int      tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp,
 281                     in_port_t dstport, uint_t srcid);
 282 static int      tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp,
 283                     in_port_t dstport, uint32_t flowinfo,
 284                     uint_t srcid, uint32_t scope_id);
 285 static void     tcp_iss_init(tcp_t *tcp);
 286 static void     tcp_reinit(tcp_t *tcp);
 287 static void     tcp_reinit_values(tcp_t *tcp);
 288 
 289 static int      tcp_wsrv(queue_t *q);
 290 static void     tcp_update_lso(tcp_t *tcp, ip_xmit_attr_t *ixa);


 623          */
 624         flags |= IPDF_UNIQUE_DCE;
 625 
 626         if (!tcps->tcps_ignore_path_mtu)
 627                 connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
 628 
 629         /* Use conn_lock to satisfy ASSERT; tcp is already serialized */
 630         mutex_enter(&connp->conn_lock);
 631         error = conn_connect(connp, &uinfo, flags);
 632         mutex_exit(&connp->conn_lock);
 633         if (error != 0)
 634                 return (error);
 635 
 636         error = tcp_build_hdrs(tcp);
 637         if (error != 0)
 638                 return (error);
 639 
 640         tcp->tcp_localnet = uinfo.iulp_localnet;
 641 
 642         if (uinfo.iulp_rtt != 0) {
 643                 clock_t rto;
 644 
 645                 tcp->tcp_rtt_sa = uinfo.iulp_rtt;
 646                 tcp->tcp_rtt_sd = uinfo.iulp_rtt_sd;
 647                 rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
 648                     tcps->tcps_rexmit_interval_extra +
 649                     (tcp->tcp_rtt_sa >> 5);
 650 
 651                 TCP_SET_RTO(tcp, rto);
 652         }
 653         if (uinfo.iulp_ssthresh != 0)
 654                 tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh;
 655         else
 656                 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
 657         if (uinfo.iulp_spipe > 0) {
 658                 connp->conn_sndbuf = MIN(uinfo.iulp_spipe,
 659                     tcps->tcps_max_buf);
 660                 if (tcps->tcps_snd_lowat_fraction != 0) {
 661                         connp->conn_sndlowat = connp->conn_sndbuf /
 662                             tcps->tcps_snd_lowat_fraction;
 663                 }
 664                 (void) tcp_maxpsz_set(tcp, B_TRUE);
 665         }
 666         /*
 667          * Note that up till now, acceptor always inherits receive
 668          * window from the listener.  But if there are metrics
 669          * associated with a host, we should use that instead of
 670          * inheriting it from listener. Thus we need to pass this
 671          * info back to the caller.


2317 #endif
2318 
2319         PRESERVE(tcp->tcp_connid);
2320 
2321         ASSERT(tcp->tcp_listen_cnt == NULL);
2322         ASSERT(tcp->tcp_reass_tid == 0);
2323 
2324 #undef  DONTCARE
2325 #undef  PRESERVE
2326 }
2327 
2328 /*
2329  * Initialize the various fields in tcp_t.  If parent (the listener) is non
2330  * NULL, certain values will be inherited from it.
2331  */
2332 void
2333 tcp_init_values(tcp_t *tcp, tcp_t *parent)
2334 {
2335         tcp_stack_t     *tcps = tcp->tcp_tcps;
2336         conn_t          *connp = tcp->tcp_connp;
2337         clock_t         rto;
2338 
2339         ASSERT((connp->conn_family == AF_INET &&
2340             connp->conn_ipversion == IPV4_VERSION) ||
2341             (connp->conn_family == AF_INET6 &&
2342             (connp->conn_ipversion == IPV4_VERSION ||
2343             connp->conn_ipversion == IPV6_VERSION)));
2344 
2345         if (parent == NULL) {
2346                 tcp->tcp_naglim = tcps->tcps_naglim_def;
2347 
2348                 tcp->tcp_rto_initial = tcps->tcps_rexmit_interval_initial;
2349                 tcp->tcp_rto_min = tcps->tcps_rexmit_interval_min;
2350                 tcp->tcp_rto_max = tcps->tcps_rexmit_interval_max;
2351 
2352                 tcp->tcp_first_ctimer_threshold =
2353                     tcps->tcps_ip_notify_cinterval;
2354                 tcp->tcp_second_ctimer_threshold =
2355                     tcps->tcps_ip_abort_cinterval;
2356                 tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval;
2357                 tcp->tcp_second_timer_threshold = tcps->tcps_ip_abort_interval;


2386                     parent->tcp_second_timer_threshold;
2387 
2388                 tcp->tcp_fin_wait_2_flush_interval =
2389                     parent->tcp_fin_wait_2_flush_interval;
2390 
2391                 tcp->tcp_ka_interval = parent->tcp_ka_interval;
2392                 tcp->tcp_ka_abort_thres = parent->tcp_ka_abort_thres;
2393                 tcp->tcp_ka_cnt = parent->tcp_ka_cnt;
2394                 tcp->tcp_ka_rinterval = parent->tcp_ka_rinterval;
2395 
2396                 tcp->tcp_init_cwnd = parent->tcp_init_cwnd;
2397         }
2398 
2399         /*
2400          * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
2401          * will be close to tcp_rexmit_interval_initial.  By doing this, we
2402          * allow the algorithm to adjust slowly to large fluctuations of RTT
2403          * during first few transmissions of a connection as seen in slow
2404          * links.
2405          */
2406         tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2;
2407         tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1;
2408         rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
2409             tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) +
2410             tcps->tcps_conn_grace_period;
2411         TCP_SET_RTO(tcp, rto);
2412 
2413         tcp->tcp_timer_backoff = 0;
2414         tcp->tcp_ms_we_have_waited = 0;
2415         tcp->tcp_last_recv_time = ddi_get_lbolt();
2416         tcp->tcp_cwnd_max = tcps->tcps_cwnd_max_;
2417         tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
2418 
2419         tcp->tcp_maxpsz_multiplier = tcps->tcps_maxpsz_multiplier;
2420 
2421         /* NOTE:  ISS is now set in tcp_set_destination(). */
2422 
2423         /* Reset fusion-related fields */
2424         tcp->tcp_fused = B_FALSE;
2425         tcp->tcp_unfusable = B_FALSE;
2426         tcp->tcp_fused_sigurg = B_FALSE;
2427         tcp->tcp_loopback_peer = NULL;
2428 
2429         /* We rebuild the header template on the next connect/conn_request */
2430 
2431         connp->conn_mlp_type = mlptSingle;




   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2011, Joyent Inc. All rights reserved.
  25  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  26  * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
  27  * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
  28  */
  29 /* Copyright (c) 1990 Mentat Inc. */
  30 
  31 #include <sys/types.h>
  32 #include <sys/stream.h>
  33 #include <sys/strsun.h>
  34 #include <sys/strsubr.h>
  35 #include <sys/stropts.h>
  36 #include <sys/strlog.h>
  37 #define _SUN_TPI_VERSION 2
  38 #include <sys/tihdr.h>
  39 #include <sys/timod.h>
  40 #include <sys/ddi.h>
  41 #include <sys/sunddi.h>
  42 #include <sys/suntpi.h>
  43 #include <sys/xti_inet.h>
  44 #include <sys/cmn_err.h>
  45 #include <sys/debug.h>
  46 #include <sys/sdt.h>


 249                 ((uint_t)(accid) & (TCP_ACCEPTOR_FANOUT_SIZE - 1))
 250 #endif  /* _ILP32 */
 251 
 252 /*
 253  * Minimum number of connections which can be created per listener.  Used
 254  * when the listener connection count is in effect.
 255  */
 256 static uint32_t tcp_min_conn_listener = 2;
 257 
 258 uint32_t tcp_early_abort = 30;
 259 
 260 /* TCP timer control structure: a routine paired with its tcp_t argument */
 261 typedef struct tcpt_s {
 262         pfv_t   tcpt_pfv;       /* The routine we are to call */
 263         tcp_t   *tcpt_tcp;      /* The tcp_t we pass to tcpt_pfv */
 264 } tcpt_t;
 265 
 266 /*
 267  * Functions called directly via squeue having a prototype of edesc_t.
 268  */


 269 void            tcp_input_data(void *arg, mblk_t *mp, void *arg2,
 270     ip_recv_attr_t *ira);
 271 static void     tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2,
 272     ip_recv_attr_t *dummy);
 273 
 274 
 275 /* Prototype for TCP functions */
 276 static void     tcp_random_init(void);
 277 int             tcp_random(void);
 278 static int      tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp,
 279                     in_port_t dstport, uint_t srcid);
 280 static int      tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp,
 281                     in_port_t dstport, uint32_t flowinfo,
 282                     uint_t srcid, uint32_t scope_id);
 283 static void     tcp_iss_init(tcp_t *tcp);
 284 static void     tcp_reinit(tcp_t *tcp);
 285 static void     tcp_reinit_values(tcp_t *tcp);
 286 
 287 static int      tcp_wsrv(queue_t *q);
 288 static void     tcp_update_lso(tcp_t *tcp, ip_xmit_attr_t *ixa);


 621          */
 622         flags |= IPDF_UNIQUE_DCE;
 623 
 624         if (!tcps->tcps_ignore_path_mtu)
 625                 connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
 626 
 627         /* Use conn_lock to satisfy ASSERT; tcp is already serialized */
 628         mutex_enter(&connp->conn_lock);
 629         error = conn_connect(connp, &uinfo, flags);
 630         mutex_exit(&connp->conn_lock);
 631         if (error != 0)
 632                 return (error);
 633 
 634         error = tcp_build_hdrs(tcp);
 635         if (error != 0)
 636                 return (error);
 637 
 638         tcp->tcp_localnet = uinfo.iulp_localnet;
 639 
 640         if (uinfo.iulp_rtt != 0) {
 641                 tcp->tcp_rtt_sa = MSEC2NSEC(uinfo.iulp_rtt);
 642                 tcp->tcp_rtt_sd = MSEC2NSEC(uinfo.iulp_rtt_sd);
 643                 tcp->tcp_rto = tcp_calculate_rto(tcp, tcps, 0);






 644         }
 645         if (uinfo.iulp_ssthresh != 0)
 646                 tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh;
 647         else
 648                 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
 649         if (uinfo.iulp_spipe > 0) {
 650                 connp->conn_sndbuf = MIN(uinfo.iulp_spipe,
 651                     tcps->tcps_max_buf);
 652                 if (tcps->tcps_snd_lowat_fraction != 0) {
 653                         connp->conn_sndlowat = connp->conn_sndbuf /
 654                             tcps->tcps_snd_lowat_fraction;
 655                 }
 656                 (void) tcp_maxpsz_set(tcp, B_TRUE);
 657         }
 658         /*
 659          * Note that up till now, acceptor always inherits receive
 660          * window from the listener.  But if there are metrics
 661          * associated with a host, we should use that instead of
 662          * inheriting it from listener. Thus we need to pass this
 663          * info back to the caller.


2309 #endif
2310 
2311         PRESERVE(tcp->tcp_connid);
2312 
2313         ASSERT(tcp->tcp_listen_cnt == NULL);
2314         ASSERT(tcp->tcp_reass_tid == 0);
2315 
2316 #undef  DONTCARE
2317 #undef  PRESERVE
2318 }
2319 
2320 /*
2321  * Initialize the various fields in tcp_t.  If parent (the listener) is non
2322  * NULL, certain values will be inherited from it.
2323  */
2324 void
2325 tcp_init_values(tcp_t *tcp, tcp_t *parent)
2326 {
2327         tcp_stack_t     *tcps = tcp->tcp_tcps;
2328         conn_t          *connp = tcp->tcp_connp;

2329 
2330         ASSERT((connp->conn_family == AF_INET &&
2331             connp->conn_ipversion == IPV4_VERSION) ||
2332             (connp->conn_family == AF_INET6 &&
2333             (connp->conn_ipversion == IPV4_VERSION ||
2334             connp->conn_ipversion == IPV6_VERSION)));
2335 
2336         if (parent == NULL) {
2337                 tcp->tcp_naglim = tcps->tcps_naglim_def;
2338 
2339                 tcp->tcp_rto_initial = tcps->tcps_rexmit_interval_initial;
2340                 tcp->tcp_rto_min = tcps->tcps_rexmit_interval_min;
2341                 tcp->tcp_rto_max = tcps->tcps_rexmit_interval_max;
2342 
2343                 tcp->tcp_first_ctimer_threshold =
2344                     tcps->tcps_ip_notify_cinterval;
2345                 tcp->tcp_second_ctimer_threshold =
2346                     tcps->tcps_ip_abort_cinterval;
2347                 tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval;
2348                 tcp->tcp_second_timer_threshold = tcps->tcps_ip_abort_interval;


2377                     parent->tcp_second_timer_threshold;
2378 
2379                 tcp->tcp_fin_wait_2_flush_interval =
2380                     parent->tcp_fin_wait_2_flush_interval;
2381 
2382                 tcp->tcp_ka_interval = parent->tcp_ka_interval;
2383                 tcp->tcp_ka_abort_thres = parent->tcp_ka_abort_thres;
2384                 tcp->tcp_ka_cnt = parent->tcp_ka_cnt;
2385                 tcp->tcp_ka_rinterval = parent->tcp_ka_rinterval;
2386 
2387                 tcp->tcp_init_cwnd = parent->tcp_init_cwnd;
2388         }
2389 
2390         /*
2391          * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
2392          * will be close to tcp_rexmit_interval_initial.  By doing this, we
2393          * allow the algorithm to adjust slowly to large fluctuations of RTT
2394          * during first few transmissions of a connection as seen in slow
2395          * links.
2396          */
2397         tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2;
2398         tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1;
2399         tcp->tcp_rto = tcp_calculate_rto(tcp, tcps,
2400             tcps->tcps_conn_grace_period);


2401 
2402         tcp->tcp_timer_backoff = 0;
2403         tcp->tcp_ms_we_have_waited = 0;
2404         tcp->tcp_last_recv_time = ddi_get_lbolt();
2405         tcp->tcp_cwnd_max = tcps->tcps_cwnd_max_;
2406         tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
2407 
2408         tcp->tcp_maxpsz_multiplier = tcps->tcps_maxpsz_multiplier;
2409 
2410         /* NOTE:  ISS is now set in tcp_set_destination(). */
2411 
2412         /* Reset fusion-related fields */
2413         tcp->tcp_fused = B_FALSE;
2414         tcp->tcp_unfusable = B_FALSE;
2415         tcp->tcp_fused_sigurg = B_FALSE;
2416         tcp->tcp_loopback_peer = NULL;
2417 
2418         /* We rebuild the header template on the next connect/conn_request */
2419 
2420         connp->conn_mlp_type = mlptSingle;