Print this page
11546 Track TCP round-trip time in nanoseconds
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Portions contributed by: Brandon Baker <bbaker@delphix.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>

@@ -18,13 +18,13 @@
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
  * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
  */
 
 #ifndef _INET_TCP_IMPL_H
 #define _INET_TCP_IMPL_H
 

@@ -298,21 +298,10 @@
                 ((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF); \
                 ((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20); \
         }
 
 /*
- * Set tcp_rto with boundary checking.
- */
-#define TCP_SET_RTO(tcp, rto) \
-        if ((rto) < (tcp)->tcp_rto_min)                 \
-                (tcp)->tcp_rto = (tcp)->tcp_rto_min;    \
-        else if ((rto) > (tcp)->tcp_rto_max)            \
-                (tcp)->tcp_rto = (tcp)->tcp_rto_max;    \
-        else                                            \
-                (tcp)->tcp_rto = (rto);
-
-/*
  * TCP options struct returned from tcp_parse_options.
  */
 typedef struct tcp_opt_s {
         uint32_t        tcp_opt_mss;
         uint32_t        tcp_opt_wscale;

@@ -572,10 +561,65 @@
 #define tcps_wroff_xtra                 tcps_propinfo_tbl[57].prop_cur_uval
 #define tcps_dev_flow_ctl               tcps_propinfo_tbl[58].prop_cur_bval
 #define tcps_reass_timeout              tcps_propinfo_tbl[59].prop_cur_uval
 #define tcps_iss_incr                   tcps_propinfo_tbl[65].prop_cur_uval
 
+
+/*
+ * As defined in RFC 6298, the RTO is the average estimate (SRTT) plus a
+ * multiple of the deviation estimate (K * RTTVAR):
+ *
+ * RTO = SRTT + max(G, K * RTTVAR)
+ *
+ * K is defined in the RFC as 4, and G is the clock granularity. We constrain
+ * the minimum mean deviation to TCP_SD_MIN when processing new RTTs, so this
+ * becomes:
+ *
+ * RTO = SRTT + 4 * RTTVAR
+ *
+ * In practice, however, we make several additions to it. As we use a finer
+ * grained clock than BSD and update RTO for every ACK, we add in another 1/4 of
+ * RTT to the deviation of RTO to accommodate burstiness of 1/4 of window size:
+ *
+ * RTO = SRTT + (SRTT / 4) + 4 * RTTVAR
+ *
+ * Since tcp_rtt_sa is 8 times the SRTT, and tcp_rtt_sd is 4 times the RTTVAR,
+ * this becomes:
+ *
+ * RTO = (tcp_rtt_sa / 8) + ((tcp_rtt_sa / 8) / 4) + tcp_rtt_sd
+ * RTO = (tcp_rtt_sa / 2^3) + (tcp_rtt_sa / 2^5) + tcp_rtt_sd
+ * RTO = (tcp_rtt_sa >> 3) + (tcp_rtt_sa >> 5) + tcp_rtt_sd
+ *
+ * The "tcp_rexmit_interval_extra" and "tcp_conn_grace_period" tunables are
+ * used to help account for extreme environments where the algorithm fails to
+ * work; by default they should be 0. (The latter tunable is only used for
+ * calculating the initial RTO, and so is optionally passed in as "extra".) We
+ * add them here:
+ *
+ * RTO = (tcp_rtt_sa >> 3) + (tcp_rtt_sa >> 5) + tcp_rtt_sd +
+ *     tcps_rexmit_interval_extra + tcps_conn_grace_period
+ *
+ * We then pin the RTO within our configured boundaries (sections 2.4 and 2.5
+ * of RFC 6298). The estimates are converted from nanoseconds to milliseconds.
+ */
+static __GNU_INLINE clock_t
+tcp_calculate_rto(tcp_t *tcp, tcp_stack_t *tcps, uint32_t extra)
+{
+        clock_t rto;
+
+        rto = NSEC2MSEC((tcp->tcp_rtt_sa >> 3) + (tcp->tcp_rtt_sa >> 5) +
+            tcp->tcp_rtt_sd) + tcps->tcps_rexmit_interval_extra + extra;
+
+        if (rto < tcp->tcp_rto_min) {
+                rto = tcp->tcp_rto_min;
+        } else if (rto > tcp->tcp_rto_max) {
+                rto = tcp->tcp_rto_max;
+        }
+
+        return (rto);
+}
+
 extern struct qinit tcp_rinitv4, tcp_rinitv6;
 extern boolean_t do_tcp_fusion;
 
 /*
  * Object to represent database of options to search passed to