Print this page
11546 Track TCP round-trip time in nanoseconds
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Portions contributed by: Brandon Baker <bbaker@delphix.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>

*** 18,30 **** * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ! * Copyright 2016 Joyent, Inc. * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. ! * Copyright (c) 2013, 2014 by Delphix. All rights reserved. */ #ifndef _INET_TCP_IMPL_H #define _INET_TCP_IMPL_H --- 18,30 ---- * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ! * Copyright 2019 Joyent, Inc. * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. ! * Copyright (c) 2013, 2016 by Delphix. All rights reserved. */ #ifndef _INET_TCP_IMPL_H #define _INET_TCP_IMPL_H
*** 298,318 **** ((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF); \ ((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20); \ } /* - * Set tcp_rto with boundary checking. - */ - #define TCP_SET_RTO(tcp, rto) \ - if ((rto) < (tcp)->tcp_rto_min) \ - (tcp)->tcp_rto = (tcp)->tcp_rto_min; \ - else if ((rto) > (tcp)->tcp_rto_max) \ - (tcp)->tcp_rto = (tcp)->tcp_rto_max; \ - else \ - (tcp)->tcp_rto = (rto); - - /* * TCP options struct returned from tcp_parse_options. */ typedef struct tcp_opt_s { uint32_t tcp_opt_mss; uint32_t tcp_opt_wscale; --- 298,307 ----
*** 572,581 **** --- 561,625 ---- #define tcps_wroff_xtra tcps_propinfo_tbl[57].prop_cur_uval #define tcps_dev_flow_ctl tcps_propinfo_tbl[58].prop_cur_bval #define tcps_reass_timeout tcps_propinfo_tbl[59].prop_cur_uval #define tcps_iss_incr tcps_propinfo_tbl[65].prop_cur_uval + + /* + * As defined in RFC 6298, the RTO is the average estimate (SRTT) plus a + * multiple of the deviation estimate (K * RTTVAR): + * + * RTO = SRTT + max(G, K * RTTVAR) + * + * K is defined in the RFC as 4, and G is the clock granularity. We constrain + * the minimum mean deviation to TCP_SD_MIN when processing new RTTs, so this + * becomes: + * + * RTO = SRTT + 4 * RTTVAR + * + * In practice, however, we make several additions to it. As we use a finer + * grained clock than BSD and update RTO for every ACK, we add in another 1/4 of + * RTT to the deviation of RTO to accommodate burstiness of 1/4 of window size: + * + * RTO = SRTT + (SRTT / 4) + 4 * RTTVAR + * + * Since tcp_rtt_sa is 8 times the SRTT, and tcp_rtt_sd is 4 times the RTTVAR, + * this becomes: + * + * RTO = (tcp_rtt_sa / 8) + ((tcp_rtt_sa / 8) / 4) + tcp_rtt_sd + * RTO = (tcp_rtt_sa / 2^3) + (tcp_rtt_sa / 2^5) + tcp_rtt_sd + * RTO = (tcp_rtt_sa >> 3) + (tcp_rtt_sa >> 5) + tcp_rtt_sd + * + * The "tcp_rexmit_interval_extra" and "tcp_conn_grace_period" tunables are + * used to help account for extreme environments where the algorithm fails to + * work; by default they should be 0. (The latter tunable is only used for + * calculating the initial RTO, and so is optionally passed in as "extra".) We + * add them here: + * + * RTO = (tcp_rtt_sa >> 3) + (tcp_rtt_sa >> 5) + tcp_rtt_sd + + * tcps_rexmit_interval_extra + tcps_conn_grace_period + * + * We then pin the RTO within our configured boundaries (sections 2.4 and 2.5 + * of RFC 6298). 
+ */ + static __GNU_INLINE clock_t + tcp_calculate_rto(tcp_t *tcp, tcp_stack_t *tcps, uint32_t extra) + { + clock_t rto; + + rto = NSEC2MSEC((tcp->tcp_rtt_sa >> 3) + (tcp->tcp_rtt_sa >> 5) + + tcp->tcp_rtt_sd) + tcps->tcps_rexmit_interval_extra + extra; + + if (rto < tcp->tcp_rto_min) { + rto = tcp->tcp_rto_min; + } else if (rto > tcp->tcp_rto_max) { + rto = tcp->tcp_rto_max; + } + + return (rto); + } + extern struct qinit tcp_rinitv4, tcp_rinitv6; extern boolean_t do_tcp_fusion; /* * Object to represent database of options to search passed to