Print this page
11546 Track TCP round-trip time in nanoseconds
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Portions contributed by: Brandon Baker <bbaker@delphix.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
@@ -20,12 +20,12 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright 2016 Joyent, Inc.
- * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
+ * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
*/
/* This file contains all TCP input processing functions. */
#include <sys/types.h>
@@ -164,11 +164,11 @@
static void tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *);
static void tcp_process_options(tcp_t *, tcpha_t *);
static mblk_t *tcp_reass(tcp_t *, mblk_t *, uint32_t);
static void tcp_reass_elim_overlap(tcp_t *, mblk_t *);
static void tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
-static void tcp_set_rto(tcp_t *, time_t);
+static void tcp_set_rto(tcp_t *, hrtime_t);
static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
/*
* Set the MSS associated with a particular tcp based on its current value,
* and a new one passed in. Observe minimums and maximums, and reset other
@@ -3360,11 +3360,11 @@
/*
* urp could be -1 when the urp field in the packet is 0
* and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent
* byte was at seg_seq - 1, in which case we ignore the urgent flag.
*/
- if (flags & TH_URG && urp >= 0) {
+ if ((flags & TH_URG) && urp >= 0) {
if (!tcp->tcp_urp_last_valid ||
SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) {
/*
* Non-STREAMS sockets handle the urgent data a litte
* differently from STREAMS based sockets. There is no
@@ -4302,40 +4302,33 @@
/* See if the latest urgent data has been acknowledged */
if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
SEQ_GT(seg_ack, tcp->tcp_urg))
tcp->tcp_valid_bits &= ~TCP_URG_VALID;
- /* Can we update the RTT estimates? */
- if (tcp->tcp_snd_ts_ok) {
- /* Ignore zero timestamp echo-reply. */
- if (tcpopt.tcp_opt_ts_ecr != 0) {
- tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
- (int32_t)tcpopt.tcp_opt_ts_ecr);
- }
-
- /* If needed, restart the timer. */
- if (tcp->tcp_set_timer == 1) {
- TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
- tcp->tcp_set_timer = 0;
- }
/*
- * Update tcp_csuna in case the other side stops sending
- * us timestamps.
+ * Update the RTT estimates. Note that we don't use the TCP
+ * timestamp option to calculate RTT even if one is present. This is
+ * because the timestamp option's resolution (CPU tick) is
+ * too coarse to measure modern datacenter networks' microsecond
+ * latencies. The timestamp field's resolution is limited by its
+ * 4-byte width (see RFC1323), and since we always store a
+ * high-resolution nanosecond presision timestamp along with the data,
+ * there is no point to ever using the timestamp option.
*/
- tcp->tcp_csuna = tcp->tcp_snxt;
- } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
+ if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
/*
* An ACK sequence we haven't seen before, so get the RTT
* and update the RTO. But first check if the timestamp is
* valid to use.
*/
if ((mp1->b_next != NULL) &&
- SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next)))
- tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
- (int32_t)(intptr_t)mp1->b_prev);
- else
+ SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) {
+ tcp_set_rto(tcp, gethrtime() -
+ (hrtime_t)(intptr_t)mp1->b_prev);
+ } else {
TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
+ }
/* Remeber the last sequence to be ACKed */
tcp->tcp_csuna = seg_ack;
if (tcp->tcp_set_timer == 1) {
TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
@@ -4360,11 +4353,11 @@
* old timestamp have been ack'ed.
*/
if (SEQ_GT(seg_ack,
(uint32_t)(uintptr_t)(mp1->b_next))) {
mp1->b_prev =
- (mblk_t *)(uintptr_t)LBOLT_FASTPATH;
+ (mblk_t *)(intptr_t)gethrtime();
mp1->b_next = NULL;
}
break;
}
mp1->b_next = NULL;
@@ -4837,11 +4830,11 @@
NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size,
B_TRUE);
if (mp1 != NULL) {
tcp->tcp_xmit_head->b_prev =
- (mblk_t *)LBOLT_FASTPATH;
+ (mblk_t *)(intptr_t)gethrtime();
tcp->tcp_csuna = tcp->tcp_snxt;
TCPS_BUMP_MIB(tcps, tcpRetransSegs);
TCPS_UPDATE_MIB(tcps, tcpRetransBytes,
snd_size);
tcp_send_data(tcp, mp1);
@@ -4871,14 +4864,15 @@
/*
* This will restart the timer. Restarting the
* timer is used to avoid a timeout before the
* limited transmitted segment's ACK gets back.
*/
- if (tcp->tcp_xmit_head != NULL)
+ if (tcp->tcp_xmit_head != NULL) {
tcp->tcp_xmit_head->b_prev =
- (mblk_t *)LBOLT_FASTPATH;
+ (mblk_t *)(intptr_t)gethrtime();
}
+ }
/* Anything more to do? */
if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED|
TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
goto done;
@@ -5209,42 +5203,55 @@
}
ASSERT(optptr == mp->b_wptr);
return (mp);
}
-/* The minimum of smoothed mean deviation in RTO calculation. */
-#define TCP_SD_MIN 400
+/* The minimum of smoothed mean deviation in RTO calculation (nsec). */
+#define TCP_SD_MIN 400000000
/*
- * Set RTO for this connection. The formula is from Jacobson and Karels'
- * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names
- * are the same as those in Appendix A.2 of that paper.
+ * Set RTO for this connection based on a new round-trip time measurement.
+ * The formula is from Jacobson and Karels' "Congestion Avoidance and Control"
+ * in SIGCOMM '88. The variable names are the same as those in Appendix A.2
+ * of that paper.
*
* m = new measurement
* sa = smoothed RTT average (8 * average estimates).
* sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates).
*/
static void
-tcp_set_rto(tcp_t *tcp, clock_t rtt)
+tcp_set_rto(tcp_t *tcp, hrtime_t rtt)
{
- long m = TICK_TO_MSEC(rtt);
- clock_t sa = tcp->tcp_rtt_sa;
- clock_t sv = tcp->tcp_rtt_sd;
- clock_t rto;
+ hrtime_t m = rtt;
+ hrtime_t sa = tcp->tcp_rtt_sa;
+ hrtime_t sv = tcp->tcp_rtt_sd;
tcp_stack_t *tcps = tcp->tcp_tcps;
TCPS_BUMP_MIB(tcps, tcpRttUpdate);
tcp->tcp_rtt_update++;
/* tcp_rtt_sa is not 0 means this is a new sample. */
if (sa != 0) {
/*
- * Update average estimator:
- * new rtt = 7/8 old rtt + 1/8 Error
+ * Update average estimator (see section 2.3 of RFC6298):
+ * SRTT = 7/8 SRTT + 1/8 rtt
+ *
+ * We maintain tcp_rtt_sa as 8 * SRTT, so this reduces to:
+ * tcp_rtt_sa = 7 * SRTT + rtt
+ * tcp_rtt_sa = 7 * (tcp_rtt_sa / 8) + rtt
+ * tcp_rtt_sa = tcp_rtt_sa - (tcp_rtt_sa / 8) + rtt
+ * tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 8))
+ * tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 2^3))
+ * tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa >> 3))
+ *
+ * (rtt - tcp_rtt_sa / 8) is simply the difference
+ * between the new rtt measurement and the existing smoothed
+ * RTT average. This is referred to as "Error" in subsequent
+ * calculations.
*/
- /* m is now Error in estimate. */
+ /* m is now Error. */
m -= sa >> 3;
if ((sa += m) <= 0) {
/*
* Don't allow the smoothed average to be negative.
* We use 0 to denote reinitialization of the
@@ -5253,11 +5260,17 @@
sa = 1;
}
/*
* Update deviation estimator:
- * new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev)
+ * mdev = 3/4 mdev + 1/4 abs(Error)
+ *
+ * We maintain tcp_rtt_sd as 4 * mdev, so this reduces to:
+ * tcp_rtt_sd = 3 * mdev + abs(Error)
+ * tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd / 4) + abs(Error)
+ * tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd / 2^2) + abs(Error)
+ * tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd >> 2) + abs(Error)
*/
if (m < 0)
m = -m;
m -= sv >> 2;
sv += m;
@@ -5273,37 +5286,25 @@
sa = m << 3;
sv = m << 1;
}
if (sv < TCP_SD_MIN) {
/*
- * We do not know that if sa captures the delay ACK
- * effect as in a long train of segments, a receiver
- * does not delay its ACKs. So set the minimum of sv
- * to be TCP_SD_MIN, which is default to 400 ms, twice
- * of BSD DATO. That means the minimum of mean
+ * Since a receiver doesn't delay its ACKs during a long run of
+ * segments, sa may not have captured the effect of delayed ACK
+ * timeouts on the RTT. To make sure we always account for the
+ * possible delay (and avoid the unnecessary retransmission),
+ * TCP_SD_MIN is set to 400ms, twice the delayed ACK timeout of
+ * 200ms on older SunOS/BSD systems and modern Windows systems
+ * (as of 2019). This means that the minimum possible mean
* deviation is 100 ms.
- *
*/
sv = TCP_SD_MIN;
}
tcp->tcp_rtt_sa = sa;
tcp->tcp_rtt_sd = sv;
- /*
- * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv)
- *
- * Add tcp_rexmit_interval extra in case of extreme environment
- * where the algorithm fails to work. The default value of
- * tcp_rexmit_interval_extra should be 0.
- *
- * As we use a finer grained clock than BSD and update
- * RTO for every ACKs, add in another .25 of RTT to the
- * deviation of RTO to accomodate burstiness of 1/4 of
- * window size.
- */
- rto = (sa >> 3) + sv + tcps->tcps_rexmit_interval_extra + (sa >> 5);
- TCP_SET_RTO(tcp, rto);
+ tcp->tcp_rto = tcp_calculate_rto(tcp, tcps, 0);
/* Now, we can reset tcp_timer_backoff to use the new RTO... */
tcp->tcp_timer_backoff = 0;
}