Print this page
11546 Track TCP round-trip time in nanoseconds
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Portions contributed by: Brandon Baker <bbaker@delphix.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>


   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2016 Joyent, Inc.
  24  * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
  25  * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
  26  */
  27 
  28 #ifndef _INET_TCP_IMPL_H
  29 #define _INET_TCP_IMPL_H
  30 
  31 /*
  32  * TCP implementation private declarations.  These interfaces are
  33  * used to build the IP module and are not meant to be accessed
  34  * by any modules except IP itself.  They are undocumented and are
  35  * subject to change without notice.
  36  */
  37 
  38 #ifdef  __cplusplus
  39 extern "C" {
  40 #endif
  41 
  42 #ifdef _KERNEL
  43 
  44 #include <sys/cpuvar.h>
  45 #include <sys/clock_impl.h>       /* For LBOLT_FASTPATH{,64} */


 283 }
 284 
 285 /*
 286  * Set ECN capable transport (ECT) code point in IP header.
 287  *
 288  * Note that there are 2 ECT code points '01' and '10', which are called
 289  * ECT(1) and ECT(0) respectively.  Here we follow the original ECT code
 290  * point ECT(0) for TCP as described in RFC 2481.
 291  */
 292 #define TCP_SET_ECT(tcp, iph) \
 293         if ((tcp)->tcp_connp->conn_ipversion == IPV4_VERSION) { \
 294                 /* We need to clear the code point first. */ \
 295                 ((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \
 296                 ((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \
 297         } else { \
 298                 ((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF); \
 299                 ((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20); \
 300         }
 301 
 302 /*
 303  * Set tcp_rto with boundary checking.
 304  */
 305 #define TCP_SET_RTO(tcp, rto) \
 306         if ((rto) < (tcp)->tcp_rto_min)                   \
 307                 (tcp)->tcp_rto = (tcp)->tcp_rto_min;      \
 308         else if ((rto) > (tcp)->tcp_rto_max)              \
 309                 (tcp)->tcp_rto = (tcp)->tcp_rto_max;      \
 310         else                                            \
 311                 (tcp)->tcp_rto = (rto);
 312 
 313 /*
 314  * TCP options struct returned from tcp_parse_options.
 315  */
 316 typedef struct tcp_opt_s {
 317         uint32_t        tcp_opt_mss;	/* MSS option value (assumed from name; verify) */
 318         uint32_t        tcp_opt_wscale;	/* window scale option value (assumed; verify) */
 319         uint32_t        tcp_opt_ts_val;	/* timestamp option TSval (assumed; verify) */
 320         uint32_t        tcp_opt_ts_ecr;	/* timestamp option TSecr (assumed; verify) */
 321         tcp_t           *tcp;	/* NOTE(review): role not visible here; see tcp_parse_options */
 322 } tcp_opt_t;
 323 
 324 /*
 325  * Flags returned from tcp_parse_options.
 326  */
 327 #define TCP_OPT_MSS_PRESENT     1
 328 #define TCP_OPT_WSCALE_PRESENT  2
 329 #define TCP_OPT_TSTAMP_PRESENT  4
 330 #define TCP_OPT_SACK_OK_PRESENT 8
 331 #define TCP_OPT_SACK_PRESENT    16
 332 
 333 /*


 557 #define tcps_mss_max_ipv6               tcps_propinfo_tbl[47].prop_cur_uval
 558 #define tcps_rev_src_routes             tcps_propinfo_tbl[48].prop_cur_bval
 559 #define tcps_local_dack_interval        tcps_propinfo_tbl[49].prop_cur_uval
 560 #define tcps_local_dacks_max            tcps_propinfo_tbl[50].prop_cur_uval
 561 #define tcps_ecn_permitted              tcps_propinfo_tbl[51].prop_cur_uval
 562 #define tcps_rst_sent_rate_enabled      tcps_propinfo_tbl[52].prop_cur_bval
 563 #define tcps_rst_sent_rate              tcps_propinfo_tbl[53].prop_cur_uval
 564 #define tcps_push_timer_interval        tcps_propinfo_tbl[54].prop_cur_uval
 565 #define tcps_use_smss_as_mss_opt        tcps_propinfo_tbl[55].prop_cur_bval
 566 #define tcps_keepalive_abort_interval_high \
 567                                         tcps_propinfo_tbl[56].prop_max_uval
 568 #define tcps_keepalive_abort_interval \
 569                                         tcps_propinfo_tbl[56].prop_cur_uval
 570 #define tcps_keepalive_abort_interval_low \
 571                                         tcps_propinfo_tbl[56].prop_min_uval
 572 #define tcps_wroff_xtra                 tcps_propinfo_tbl[57].prop_cur_uval
 573 #define tcps_dev_flow_ctl               tcps_propinfo_tbl[58].prop_cur_bval
 574 #define tcps_reass_timeout              tcps_propinfo_tbl[59].prop_cur_uval
 575 #define tcps_iss_incr                   tcps_propinfo_tbl[65].prop_cur_uval
 576 























































 577 extern struct qinit tcp_rinitv4, tcp_rinitv6;
 578 extern boolean_t do_tcp_fusion;
 579 
 580 /*
 581  * Object to represent database of options to search passed to
 582  * {sock,tpi}optcom_req() interface routine to take care of option
 583  * management and associated methods.
 584  */
 585 extern optdb_obj_t      tcp_opt_obj;
 586 extern uint_t           tcp_max_optsize;
 587 
 588 extern int tcp_squeue_flag;
 589 
 590 extern uint_t tcp_free_list_max_cnt;
 591 
 592 /*
 593  * Functions in tcp.c.
 594  */
 595 extern void     tcp_acceptor_hash_insert(t_uscalar_t, tcp_t *);
 596 extern tcp_t    *tcp_acceptor_hash_lookup(t_uscalar_t, tcp_stack_t *);




   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2019 Joyent, Inc.
  24  * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
  25  * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
  26  */
  27 
  28 #ifndef _INET_TCP_IMPL_H
  29 #define _INET_TCP_IMPL_H
  30 
  31 /*
  32  * TCP implementation private declarations.  These interfaces are
  33  * used to build the IP module and are not meant to be accessed
  34  * by any modules except IP itself.  They are undocumented and are
  35  * subject to change without notice.
  36  */
  37 
  38 #ifdef  __cplusplus
  39 extern "C" {
  40 #endif
  41 
  42 #ifdef _KERNEL
  43 
  44 #include <sys/cpuvar.h>
  45 #include <sys/clock_impl.h>       /* For LBOLT_FASTPATH{,64} */


 283 }
 284 
 285 /*
 286  * Set ECN capable transport (ECT) code point in IP header.
 287  *
 288  * Note that there are 2 ECT code points '01' and '10', which are called
 289  * ECT(1) and ECT(0) respectively.  Here we follow the original ECT code
 290  * point ECT(0) for TCP as described in RFC 2481.
 291  */
 292 #define TCP_SET_ECT(tcp, iph) \
 293         if ((tcp)->tcp_connp->conn_ipversion == IPV4_VERSION) { \
 294                 /* We need to clear the code point first. */ \
 295                 ((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \
 296                 ((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \
 297         } else { \
 298                 ((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF); \
 299                 ((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20); \
 300         }
 301 
 302 /*











 303  * TCP options struct returned from tcp_parse_options.
 304  */
 305 typedef struct tcp_opt_s {
 306         uint32_t        tcp_opt_mss;	/* MSS option value (assumed from name; verify) */
 307         uint32_t        tcp_opt_wscale;	/* window scale option value (assumed; verify) */
 308         uint32_t        tcp_opt_ts_val;	/* timestamp option TSval (assumed; verify) */
 309         uint32_t        tcp_opt_ts_ecr;	/* timestamp option TSecr (assumed; verify) */
 310         tcp_t           *tcp;	/* NOTE(review): role not visible here; see tcp_parse_options */
 311 } tcp_opt_t;
 312 
 313 /*
 314  * Flags returned from tcp_parse_options.
 315  */
 316 #define TCP_OPT_MSS_PRESENT     1
 317 #define TCP_OPT_WSCALE_PRESENT  2
 318 #define TCP_OPT_TSTAMP_PRESENT  4
 319 #define TCP_OPT_SACK_OK_PRESENT 8
 320 #define TCP_OPT_SACK_PRESENT    16
 321 
 322 /*


 546 #define tcps_mss_max_ipv6               tcps_propinfo_tbl[47].prop_cur_uval
 547 #define tcps_rev_src_routes             tcps_propinfo_tbl[48].prop_cur_bval
 548 #define tcps_local_dack_interval        tcps_propinfo_tbl[49].prop_cur_uval
 549 #define tcps_local_dacks_max            tcps_propinfo_tbl[50].prop_cur_uval
 550 #define tcps_ecn_permitted              tcps_propinfo_tbl[51].prop_cur_uval
 551 #define tcps_rst_sent_rate_enabled      tcps_propinfo_tbl[52].prop_cur_bval
 552 #define tcps_rst_sent_rate              tcps_propinfo_tbl[53].prop_cur_uval
 553 #define tcps_push_timer_interval        tcps_propinfo_tbl[54].prop_cur_uval
 554 #define tcps_use_smss_as_mss_opt        tcps_propinfo_tbl[55].prop_cur_bval
 555 #define tcps_keepalive_abort_interval_high \
 556                                         tcps_propinfo_tbl[56].prop_max_uval
 557 #define tcps_keepalive_abort_interval \
 558                                         tcps_propinfo_tbl[56].prop_cur_uval
 559 #define tcps_keepalive_abort_interval_low \
 560                                         tcps_propinfo_tbl[56].prop_min_uval
 561 #define tcps_wroff_xtra                 tcps_propinfo_tbl[57].prop_cur_uval
 562 #define tcps_dev_flow_ctl               tcps_propinfo_tbl[58].prop_cur_bval
 563 #define tcps_reass_timeout              tcps_propinfo_tbl[59].prop_cur_uval
 564 #define tcps_iss_incr                   tcps_propinfo_tbl[65].prop_cur_uval
 565 
 566 
 567 /*
 568  * As defined in RFC 6298, the RTO is the smoothed round-trip time estimate
 569  * (SRTT) plus a multiple of the round-trip time variation (K * RTTVAR):
 570  *
 571  * RTO = SRTT + max(G, K * RTTVAR)
 572  *
 573  * K is defined in the RFC as 4, and G is the clock granularity. We constrain
 574  * the minimum mean deviation to TCP_SD_MIN when processing new RTTs, so this
 575  * becomes:
 576  *
 577  * RTO = SRTT + 4 * RTTVAR
 578  *
 579  * In practice, however, we make several additions to it. As we use a finer
 580  * grained clock than BSD and update RTO for every ACK, we add in another 1/4 of
 581  * RTT to the deviation of RTO to accommodate burstiness of 1/4 of window size:
 582  *
 583  * RTO = SRTT + (SRTT / 4) + 4 * RTTVAR
 584  *
 585  * Since tcp_rtt_sa is 8 times the SRTT, and tcp_rtt_sd is 4 times the RTTVAR,
 586  * this becomes:
 587  *
 588  * RTO = (tcp_rtt_sa / 8) + ((tcp_rtt_sa / 8) / 4) + tcp_rtt_sd
 589  * RTO = (tcp_rtt_sa / 2^3) + (tcp_rtt_sa / 2^5) + tcp_rtt_sd
 590  * RTO = (tcp_rtt_sa >> 3) + (tcp_rtt_sa >> 5) + tcp_rtt_sd
 591  *
 592  * The "tcp_rexmit_interval_extra" and "tcp_conn_grace_period" tunables are
 593  * used to help account for extreme environments where the algorithm fails to
 594  * work; by default they should be 0. (The latter tunable is only used for
 595  * calculating the initial RTO, and so is optionally passed in as "extra".) We
 596  * add them here:
 597  *
 598  * RTO = (tcp_rtt_sa >> 3) + (tcp_rtt_sa >> 5) + tcp_rtt_sd +
 599  *     tcps_rexmit_interval_extra + tcps_conn_grace_period
 600  *
 601  * We then pin the RTO within our configured boundaries (sections 2.4 and 2.5
 602  * of RFC 6298).
 603  */
 604 static __GNU_INLINE clock_t
 605 tcp_calculate_rto(tcp_t *tcp, tcp_stack_t *tcps, uint32_t extra)
 606 {
 607         clock_t rto;
 608 
 609         rto = NSEC2MSEC((tcp->tcp_rtt_sa >> 3) + (tcp->tcp_rtt_sa >> 5) +
 610             tcp->tcp_rtt_sd) + tcps->tcps_rexmit_interval_extra + extra;
 611 
 612         if (rto < tcp->tcp_rto_min) {
 613                 rto = tcp->tcp_rto_min;
 614         } else if (rto > tcp->tcp_rto_max) {
 615                 rto = tcp->tcp_rto_max;
 616         }
 617 
 618         return (rto);
 619 }
 620 
 621 extern struct qinit tcp_rinitv4, tcp_rinitv6;
 622 extern boolean_t do_tcp_fusion;
 623 
 624 /*
 625  * Object to represent database of options to search passed to
 626  * {sock,tpi}optcom_req() interface routine to take care of option
 627  * management and associated methods.
 628  */
 629 extern optdb_obj_t      tcp_opt_obj;
 630 extern uint_t           tcp_max_optsize;
 631 
 632 extern int tcp_squeue_flag;
 633 
 634 extern uint_t tcp_free_list_max_cnt;
 635 
 636 /*
 637  * Functions in tcp.c.
 638  */
 639 extern void     tcp_acceptor_hash_insert(t_uscalar_t, tcp_t *);
 640 extern tcp_t    *tcp_acceptor_hash_lookup(t_uscalar_t, tcp_stack_t *);