Print this page
11546 Track TCP round-trip time in nanoseconds
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Portions contributed by: Brandon Baker <bbaker@delphix.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>

*** 20,31 **** */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. ! * Copyright 2016 Joyent, Inc. ! * Copyright (c) 2014 by Delphix. All rights reserved. */ /* This file contains all TCP input processing functions. */ #include <sys/types.h> --- 20,31 ---- */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. ! * Copyright 2019 Joyent, Inc. ! * Copyright (c) 2014, 2016 by Delphix. All rights reserved. */ /* This file contains all TCP input processing functions. */ #include <sys/types.h>
*** 164,174 **** static void tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *); static void tcp_process_options(tcp_t *, tcpha_t *); static mblk_t *tcp_reass(tcp_t *, mblk_t *, uint32_t); static void tcp_reass_elim_overlap(tcp_t *, mblk_t *); static void tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *); ! static void tcp_set_rto(tcp_t *, time_t); static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *); /* * Set the MSS associated with a particular tcp based on its current value, * and a new one passed in. Observe minimums and maximums, and reset other --- 164,174 ---- static void tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *); static void tcp_process_options(tcp_t *, tcpha_t *); static mblk_t *tcp_reass(tcp_t *, mblk_t *, uint32_t); static void tcp_reass_elim_overlap(tcp_t *, mblk_t *); static void tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *); ! static void tcp_set_rto(tcp_t *, hrtime_t); static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *); /* * Set the MSS associated with a particular tcp based on its current value, * and a new one passed in. Observe minimums and maximums, and reset other
*** 3360,3370 **** /* * urp could be -1 when the urp field in the packet is 0 * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent * byte was at seg_seq - 1, in which case we ignore the urgent flag. */ ! if (flags & TH_URG && urp >= 0) { if (!tcp->tcp_urp_last_valid || SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) { /* * Non-STREAMS sockets handle the urgent data a litte * differently from STREAMS based sockets. There is no --- 3360,3370 ---- /* * urp could be -1 when the urp field in the packet is 0 * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent * byte was at seg_seq - 1, in which case we ignore the urgent flag. */ ! if ((flags & TH_URG) && urp >= 0) { if (!tcp->tcp_urp_last_valid || SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) { /* * Non-STREAMS sockets handle the urgent data a litte * differently from STREAMS based sockets. There is no
*** 4302,4341 **** /* See if the latest urgent data has been acknowledged */ if ((tcp->tcp_valid_bits & TCP_URG_VALID) && SEQ_GT(seg_ack, tcp->tcp_urg)) tcp->tcp_valid_bits &= ~TCP_URG_VALID; - /* Can we update the RTT estimates? */ - if (tcp->tcp_snd_ts_ok) { - /* Ignore zero timestamp echo-reply. */ - if (tcpopt.tcp_opt_ts_ecr != 0) { - tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH - - (int32_t)tcpopt.tcp_opt_ts_ecr); - } - - /* If needed, restart the timer. */ - if (tcp->tcp_set_timer == 1) { - TCP_TIMER_RESTART(tcp, tcp->tcp_rto); - tcp->tcp_set_timer = 0; - } /* ! * Update tcp_csuna in case the other side stops sending ! * us timestamps. */ ! tcp->tcp_csuna = tcp->tcp_snxt; ! } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) { /* * An ACK sequence we haven't seen before, so get the RTT * and update the RTO. But first check if the timestamp is * valid to use. */ if ((mp1->b_next != NULL) && ! SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) ! tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH - ! (int32_t)(intptr_t)mp1->b_prev); ! else TCPS_BUMP_MIB(tcps, tcpRttNoUpdate); /* Remeber the last sequence to be ACKed */ tcp->tcp_csuna = seg_ack; if (tcp->tcp_set_timer == 1) { TCP_TIMER_RESTART(tcp, tcp->tcp_rto); --- 4302,4334 ---- /* See if the latest urgent data has been acknowledged */ if ((tcp->tcp_valid_bits & TCP_URG_VALID) && SEQ_GT(seg_ack, tcp->tcp_urg)) tcp->tcp_valid_bits &= ~TCP_URG_VALID; /* ! * Update the RTT estimates. Note that we don't use the TCP ! * timestamp option to calculate RTT even if one is present. This is ! * because the timestamp option's resolution (CPU tick) is ! * too coarse to measure modern datacenter networks' microsecond ! * latencies. The timestamp field's resolution is limited by its ! * 4-byte width (see RFC1323), and since we always store a ! * high-resolution nanosecond precision timestamp along with the data, ! * there is no point to ever using the timestamp option. */ ! 
if (SEQ_GT(seg_ack, tcp->tcp_csuna)) { /* * An ACK sequence we haven't seen before, so get the RTT * and update the RTO. But first check if the timestamp is * valid to use. */ if ((mp1->b_next != NULL) && ! SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) { ! tcp_set_rto(tcp, gethrtime() - ! (hrtime_t)(intptr_t)mp1->b_prev); ! } else { TCPS_BUMP_MIB(tcps, tcpRttNoUpdate); + } /* Remeber the last sequence to be ACKed */ tcp->tcp_csuna = seg_ack; if (tcp->tcp_set_timer == 1) { TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
*** 4360,4370 **** * old timestamp have been ack'ed. */ if (SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) { mp1->b_prev = ! (mblk_t *)(uintptr_t)LBOLT_FASTPATH; mp1->b_next = NULL; } break; } mp1->b_next = NULL; --- 4353,4363 ---- * old timestamp have been ack'ed. */ if (SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) { mp1->b_prev = ! (mblk_t *)(intptr_t)gethrtime(); mp1->b_next = NULL; } break; } mp1->b_next = NULL;
*** 4837,4847 **** NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size, B_TRUE); if (mp1 != NULL) { tcp->tcp_xmit_head->b_prev = ! (mblk_t *)LBOLT_FASTPATH; tcp->tcp_csuna = tcp->tcp_snxt; TCPS_BUMP_MIB(tcps, tcpRetransSegs); TCPS_UPDATE_MIB(tcps, tcpRetransBytes, snd_size); tcp_send_data(tcp, mp1); --- 4830,4840 ---- NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size, B_TRUE); if (mp1 != NULL) { tcp->tcp_xmit_head->b_prev = ! (mblk_t *)(intptr_t)gethrtime(); tcp->tcp_csuna = tcp->tcp_snxt; TCPS_BUMP_MIB(tcps, tcpRetransSegs); TCPS_UPDATE_MIB(tcps, tcpRetransBytes, snd_size); tcp_send_data(tcp, mp1);
*** 4871,4884 **** /* * This will restart the timer. Restarting the * timer is used to avoid a timeout before the * limited transmitted segment's ACK gets back. */ ! if (tcp->tcp_xmit_head != NULL) tcp->tcp_xmit_head->b_prev = ! (mblk_t *)LBOLT_FASTPATH; } /* Anything more to do? */ if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED| TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0) goto done; --- 4864,4878 ---- /* * This will restart the timer. Restarting the * timer is used to avoid a timeout before the * limited transmitted segment's ACK gets back. */ ! if (tcp->tcp_xmit_head != NULL) { tcp->tcp_xmit_head->b_prev = ! (mblk_t *)(intptr_t)gethrtime(); } + } /* Anything more to do? */ if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED| TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0) goto done;
*** 5209,5250 **** } ASSERT(optptr == mp->b_wptr); return (mp); } ! /* The minimum of smoothed mean deviation in RTO calculation. */ ! #define TCP_SD_MIN 400 /* ! * Set RTO for this connection. The formula is from Jacobson and Karels' ! * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names ! * are the same as those in Appendix A.2 of that paper. * * m = new measurement * sa = smoothed RTT average (8 * average estimates). * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates). */ static void ! tcp_set_rto(tcp_t *tcp, clock_t rtt) { ! long m = TICK_TO_MSEC(rtt); ! clock_t sa = tcp->tcp_rtt_sa; ! clock_t sv = tcp->tcp_rtt_sd; ! clock_t rto; tcp_stack_t *tcps = tcp->tcp_tcps; TCPS_BUMP_MIB(tcps, tcpRttUpdate); tcp->tcp_rtt_update++; /* tcp_rtt_sa is not 0 means this is a new sample. */ if (sa != 0) { /* ! * Update average estimator: ! * new rtt = 7/8 old rtt + 1/8 Error */ ! /* m is now Error in estimate. */ m -= sa >> 3; if ((sa += m) <= 0) { /* * Don't allow the smoothed average to be negative. * We use 0 to denote reinitialization of the --- 5203,5257 ---- } ASSERT(optptr == mp->b_wptr); return (mp); } ! /* The minimum of smoothed mean deviation in RTO calculation (nsec). */ ! #define TCP_SD_MIN 400000000 /* ! * Set RTO for this connection based on a new round-trip time measurement. ! * The formula is from Jacobson and Karels' "Congestion Avoidance and Control" ! * in SIGCOMM '88. The variable names are the same as those in Appendix A.2 ! * of that paper. * * m = new measurement * sa = smoothed RTT average (8 * average estimates). * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates). */ static void ! tcp_set_rto(tcp_t *tcp, hrtime_t rtt) { ! hrtime_t m = rtt; ! hrtime_t sa = tcp->tcp_rtt_sa; ! hrtime_t sv = tcp->tcp_rtt_sd; tcp_stack_t *tcps = tcp->tcp_tcps; TCPS_BUMP_MIB(tcps, tcpRttUpdate); tcp->tcp_rtt_update++; /* tcp_rtt_sa is not 0 means this is a new sample. */ if (sa != 0) { /* ! 
* Update average estimator (see section 2.3 of RFC6298): ! * SRTT = 7/8 SRTT + 1/8 rtt ! * ! * We maintain tcp_rtt_sa as 8 * SRTT, so this reduces to: ! * tcp_rtt_sa = 7 * SRTT + rtt ! * tcp_rtt_sa = 7 * (tcp_rtt_sa / 8) + rtt ! * tcp_rtt_sa = tcp_rtt_sa - (tcp_rtt_sa / 8) + rtt ! * tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 8)) ! * tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 2^3)) ! * tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa >> 3)) ! * ! * (rtt - tcp_rtt_sa / 8) is simply the difference ! * between the new rtt measurement and the existing smoothed ! * RTT average. This is referred to as "Error" in subsequent ! * calculations. */ ! /* m is now Error. */ m -= sa >> 3; if ((sa += m) <= 0) { /* * Don't allow the smoothed average to be negative. * We use 0 to denote reinitialization of the
*** 5253,5263 **** sa = 1; } /* * Update deviation estimator: ! * new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev) */ if (m < 0) m = -m; m -= sv >> 2; sv += m; --- 5260,5276 ---- sa = 1; } /* * Update deviation estimator: ! * mdev = 3/4 mdev + 1/4 abs(Error) ! * ! * We maintain tcp_rtt_sd as 4 * mdev, so this reduces to: ! * tcp_rtt_sd = 3 * mdev + abs(Error) ! * tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd / 4) + abs(Error) ! * tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd / 2^2) + abs(Error) ! * tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd >> 2) + abs(Error) */ if (m < 0) m = -m; m -= sv >> 2; sv += m;
*** 5273,5309 **** sa = m << 3; sv = m << 1; } if (sv < TCP_SD_MIN) { /* ! * We do not know that if sa captures the delay ACK ! * effect as in a long train of segments, a receiver ! * does not delay its ACKs. So set the minimum of sv ! * to be TCP_SD_MIN, which is default to 400 ms, twice ! * of BSD DATO. That means the minimum of mean * deviation is 100 ms. - * */ sv = TCP_SD_MIN; } tcp->tcp_rtt_sa = sa; tcp->tcp_rtt_sd = sv; - /* - * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv) - * - * Add tcp_rexmit_interval extra in case of extreme environment - * where the algorithm fails to work. The default value of - * tcp_rexmit_interval_extra should be 0. - * - * As we use a finer grained clock than BSD and update - * RTO for every ACKs, add in another .25 of RTT to the - * deviation of RTO to accomodate burstiness of 1/4 of - * window size. - */ - rto = (sa >> 3) + sv + tcps->tcps_rexmit_interval_extra + (sa >> 5); ! TCP_SET_RTO(tcp, rto); /* Now, we can reset tcp_timer_backoff to use the new RTO... */ tcp->tcp_timer_backoff = 0; } --- 5286,5310 ---- sa = m << 3; sv = m << 1; } if (sv < TCP_SD_MIN) { /* ! * Since a receiver doesn't delay its ACKs during a long run of ! * segments, sa may not have captured the effect of delayed ACK ! * timeouts on the RTT. To make sure we always account for the ! * possible delay (and avoid the unnecessary retransmission), ! * TCP_SD_MIN is set to 400ms, twice the delayed ACK timeout of ! * 200ms on older SunOS/BSD systems and modern Windows systems ! * (as of 2019). This means that the minimum possible mean * deviation is 100 ms. */ sv = TCP_SD_MIN; } tcp->tcp_rtt_sa = sa; tcp->tcp_rtt_sd = sv; ! tcp->tcp_rto = tcp_calculate_rto(tcp, tcps, 0); /* Now, we can reset tcp_timer_backoff to use the new RTO... */ tcp->tcp_timer_backoff = 0; }