Print this page
11546 Track TCP round-trip time in nanoseconds
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Portions contributed by: Brandon Baker <bbaker@delphix.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>

@@ -20,12 +20,12 @@
  */
 
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright 2016 Joyent, Inc.
- * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
+ * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
  */
 
 /* This file contains all TCP input processing functions. */
 
 #include <sys/types.h>

@@ -164,11 +164,11 @@
 static void     tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *);
 static void     tcp_process_options(tcp_t *, tcpha_t *);
 static mblk_t   *tcp_reass(tcp_t *, mblk_t *, uint32_t);
 static void     tcp_reass_elim_overlap(tcp_t *, mblk_t *);
 static void     tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
-static void     tcp_set_rto(tcp_t *, time_t);
+static void     tcp_set_rto(tcp_t *, hrtime_t);
 static void     tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
 
 /*
  * Set the MSS associated with a particular tcp based on its current value,
  * and a new one passed in. Observe minimums and maximums, and reset other

@@ -3360,11 +3360,11 @@
         /*
          * urp could be -1 when the urp field in the packet is 0
          * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent
          * byte was at seg_seq - 1, in which case we ignore the urgent flag.
          */
-        if (flags & TH_URG && urp >= 0) {
+        if ((flags & TH_URG) && urp >= 0) {
                 if (!tcp->tcp_urp_last_valid ||
                     SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) {
                         /*
                          * Non-STREAMS sockets handle the urgent data a litte
                          * differently from STREAMS based sockets. There is no

@@ -4302,40 +4302,33 @@
         /* See if the latest urgent data has been acknowledged */
         if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
             SEQ_GT(seg_ack, tcp->tcp_urg))
                 tcp->tcp_valid_bits &= ~TCP_URG_VALID;
 
-        /* Can we update the RTT estimates? */
-        if (tcp->tcp_snd_ts_ok) {
-                /* Ignore zero timestamp echo-reply. */
-                if (tcpopt.tcp_opt_ts_ecr != 0) {
-                        tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
-                            (int32_t)tcpopt.tcp_opt_ts_ecr);
-                }
-
-                /* If needed, restart the timer. */
-                if (tcp->tcp_set_timer == 1) {
-                        TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
-                        tcp->tcp_set_timer = 0;
-                }
                 /*
-                 * Update tcp_csuna in case the other side stops sending
-                 * us timestamps.
+         * Update the RTT estimates. Note that we don't use the TCP
+         * timestamp option to calculate RTT even if one is present. This is
+         * because the timestamp option's resolution (CPU tick) is
+         * too coarse to measure modern datacenter networks' microsecond
+         * latencies. The timestamp field's resolution is limited by its
+         * 4-byte width (see RFC1323), and since we always store a
+         * high-resolution nanosecond precision timestamp along with the data,
+         * there is no point in ever using the timestamp option.
                  */
-                tcp->tcp_csuna = tcp->tcp_snxt;
-        } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
+        if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
                 /*
                  * An ACK sequence we haven't seen before, so get the RTT
                  * and update the RTO. But first check if the timestamp is
                  * valid to use.
                  */
                 if ((mp1->b_next != NULL) &&
-                    SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next)))
-                        tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
-                            (int32_t)(intptr_t)mp1->b_prev);
-                else
+                    SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) {
+                        tcp_set_rto(tcp, gethrtime() -
+                            (hrtime_t)(intptr_t)mp1->b_prev);
+                } else {
                         TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
+                }
 
                 /* Remeber the last sequence to be ACKed */
                 tcp->tcp_csuna = seg_ack;
                 if (tcp->tcp_set_timer == 1) {
                         TCP_TIMER_RESTART(tcp, tcp->tcp_rto);

@@ -4360,11 +4353,11 @@
                          * old timestamp have been ack'ed.
                          */
                         if (SEQ_GT(seg_ack,
                             (uint32_t)(uintptr_t)(mp1->b_next))) {
                                 mp1->b_prev =
-                                    (mblk_t *)(uintptr_t)LBOLT_FASTPATH;
+                                    (mblk_t *)(intptr_t)gethrtime();
                                 mp1->b_next = NULL;
                         }
                         break;
                 }
                 mp1->b_next = NULL;

@@ -4837,11 +4830,11 @@
                             NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size,
                             B_TRUE);
 
                         if (mp1 != NULL) {
                                 tcp->tcp_xmit_head->b_prev =
-                                    (mblk_t *)LBOLT_FASTPATH;
+                                    (mblk_t *)(intptr_t)gethrtime();
                                 tcp->tcp_csuna = tcp->tcp_snxt;
                                 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
                                 TCPS_UPDATE_MIB(tcps, tcpRetransBytes,
                                     snd_size);
                                 tcp_send_data(tcp, mp1);

@@ -4871,14 +4864,15 @@
                         /*
                          * This will restart the timer.  Restarting the
                          * timer is used to avoid a timeout before the
                          * limited transmitted segment's ACK gets back.
                          */
-                        if (tcp->tcp_xmit_head != NULL)
+                        if (tcp->tcp_xmit_head != NULL) {
                                 tcp->tcp_xmit_head->b_prev =
-                                    (mblk_t *)LBOLT_FASTPATH;
+                                    (mblk_t *)(intptr_t)gethrtime();
                 }
+                }
 
                 /* Anything more to do? */
                 if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED|
                     TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
                         goto done;

@@ -5209,42 +5203,55 @@
         }
         ASSERT(optptr == mp->b_wptr);
         return (mp);
 }
 
-/* The minimum of smoothed mean deviation in RTO calculation. */
-#define TCP_SD_MIN      400
+/* The minimum of smoothed mean deviation in RTO calculation (nsec). */
+#define TCP_SD_MIN      400000000
 
 /*
- * Set RTO for this connection.  The formula is from Jacobson and Karels'
- * "Congestion Avoidance and Control" in SIGCOMM '88.  The variable names
- * are the same as those in Appendix A.2 of that paper.
+ * Set RTO for this connection based on a new round-trip time measurement.
+ * The formula is from Jacobson and Karels' "Congestion Avoidance and Control"
+ * in SIGCOMM '88.  The variable names are the same as those in Appendix A.2
+ * of that paper.
  *
  * m = new measurement
  * sa = smoothed RTT average (8 * average estimates).
  * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates).
  */
 static void
-tcp_set_rto(tcp_t *tcp, clock_t rtt)
+tcp_set_rto(tcp_t *tcp, hrtime_t rtt)
 {
-        long m = TICK_TO_MSEC(rtt);
-        clock_t sa = tcp->tcp_rtt_sa;
-        clock_t sv = tcp->tcp_rtt_sd;
-        clock_t rto;
+        hrtime_t m = rtt;
+        hrtime_t sa = tcp->tcp_rtt_sa;
+        hrtime_t sv = tcp->tcp_rtt_sd;
         tcp_stack_t     *tcps = tcp->tcp_tcps;
 
         TCPS_BUMP_MIB(tcps, tcpRttUpdate);
         tcp->tcp_rtt_update++;
 
         /* tcp_rtt_sa is not 0 means this is a new sample. */
         if (sa != 0) {
                 /*
-                 * Update average estimator:
-                 *      new rtt = 7/8 old rtt + 1/8 Error
+                 * Update average estimator (see section 2.3 of RFC6298):
+                 *      SRTT = 7/8 SRTT + 1/8 rtt
+                 *
+                 * We maintain tcp_rtt_sa as 8 * SRTT, so this reduces to:
+                 *      tcp_rtt_sa = 7 * SRTT + rtt
+                 *      tcp_rtt_sa = 7 * (tcp_rtt_sa / 8) + rtt
+                 *      tcp_rtt_sa = tcp_rtt_sa - (tcp_rtt_sa / 8) + rtt
+                 *      tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 8))
+                 *      tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 2^3))
+                 *      tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa >> 3))
+                 *
+                 * (rtt - tcp_rtt_sa / 8) is simply the difference
+                 * between the new rtt measurement and the existing smoothed
+                 * RTT average. This is referred to as "Error" in subsequent
+                 * calculations.
                  */
 
-                /* m is now Error in estimate. */
+                /* m is now Error. */
                 m -= sa >> 3;
                 if ((sa += m) <= 0) {
                         /*
                          * Don't allow the smoothed average to be negative.
                          * We use 0 to denote reinitialization of the

@@ -5253,11 +5260,17 @@
                         sa = 1;
                 }
 
                 /*
                  * Update deviation estimator:
-                 *      new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev)
+                 *  mdev = 3/4 mdev + 1/4 abs(Error)
+                 *
+                 * We maintain tcp_rtt_sd as 4 * mdev, so this reduces to:
+                 *  tcp_rtt_sd = 3 * mdev + abs(Error)
+                 *  tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd / 4) + abs(Error)
+                 *  tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd / 2^2) + abs(Error)
+                 *  tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd >> 2) + abs(Error)
                  */
                 if (m < 0)
                         m = -m;
                 m -= sv >> 2;
                 sv += m;

@@ -5273,37 +5286,25 @@
                 sa = m << 3;
                 sv = m << 1;
         }
         if (sv < TCP_SD_MIN) {
                 /*
-                 * We do not know that if sa captures the delay ACK
-                 * effect as in a long train of segments, a receiver
-                 * does not delay its ACKs.  So set the minimum of sv
-                 * to be TCP_SD_MIN, which is default to 400 ms, twice
-                 * of BSD DATO.  That means the minimum of mean
+                 * Since a receiver doesn't delay its ACKs during a long run of
+                 * segments, sa may not have captured the effect of delayed ACK
+                 * timeouts on the RTT.  To make sure we always account for the
+                 * possible delay (and avoid unnecessary retransmissions),
+                 * TCP_SD_MIN is set to 400ms, twice the delayed ACK timeout of
+                 * 200ms on older SunOS/BSD systems and modern Windows systems
+                 * (as of 2019).  This means that the minimum possible mean
                  * deviation is 100 ms.
-                 *
                  */
                 sv = TCP_SD_MIN;
         }
         tcp->tcp_rtt_sa = sa;
         tcp->tcp_rtt_sd = sv;
-        /*
-         * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv)
-         *
-         * Add tcp_rexmit_interval extra in case of extreme environment
-         * where the algorithm fails to work.  The default value of
-         * tcp_rexmit_interval_extra should be 0.
-         *
-         * As we use a finer grained clock than BSD and update
-         * RTO for every ACKs, add in another .25 of RTT to the
-         * deviation of RTO to accomodate burstiness of 1/4 of
-         * window size.
-         */
-        rto = (sa >> 3) + sv + tcps->tcps_rexmit_interval_extra + (sa >> 5);
 
-        TCP_SET_RTO(tcp, rto);
+        tcp->tcp_rto = tcp_calculate_rto(tcp, tcps, 0);
 
         /* Now, we can reset tcp_timer_backoff to use the new RTO... */
         tcp->tcp_timer_backoff = 0;
 }