Print this page
11546 Track TCP round-trip time in nanoseconds
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Portions contributed by: Brandon Baker <bbaker@delphix.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/inet/tcp/tcp_input.c
          +++ new/usr/src/uts/common/inet/tcp/tcp_input.c
↓ open down ↓ 14 lines elided ↑ open up ↑
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  25      - * Copyright 2016 Joyent, Inc.
  26      - * Copyright (c) 2014 by Delphix. All rights reserved.
       25 + * Copyright 2019 Joyent, Inc.
       26 + * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
  27   27   */
  28   28  
  29   29  /* This file contains all TCP input processing functions. */
  30   30  
  31   31  #include <sys/types.h>
  32   32  #include <sys/stream.h>
  33   33  #include <sys/strsun.h>
  34   34  #include <sys/strsubr.h>
  35   35  #include <sys/stropts.h>
  36   36  #include <sys/strlog.h>
↓ open down ↓ 122 lines elided ↑ open up ↑
 159  159                      ip_recv_attr_t *);
 160  160  static boolean_t        tcp_drop_q0(tcp_t *);
 161  161  static void     tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *);
 162  162  static mblk_t   *tcp_input_add_ancillary(tcp_t *, mblk_t *, ip_pkt_t *,
 163  163                      ip_recv_attr_t *);
 164  164  static void     tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *);
 165  165  static void     tcp_process_options(tcp_t *, tcpha_t *);
 166  166  static mblk_t   *tcp_reass(tcp_t *, mblk_t *, uint32_t);
 167  167  static void     tcp_reass_elim_overlap(tcp_t *, mblk_t *);
 168  168  static void     tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
 169      -static void     tcp_set_rto(tcp_t *, time_t);
      169 +static void     tcp_set_rto(tcp_t *, hrtime_t);
 170  170  static void     tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
 171  171  
 172  172  /*
 173  173   * Set the MSS associated with a particular tcp based on its current value,
 174  174   * and a new one passed in. Observe minimums and maximums, and reset other
 175  175   * state variables that we want to view as multiples of MSS.
 176  176   *
 177  177   * The value of MSS could be either increased or decreased.
 178  178   */
 179  179  void
↓ open down ↓ 3175 lines elided ↑ open up ↑
3355 3355                      TH_RST|TH_ACK);
3356 3356                  ASSERT(tcp->tcp_state != TCPS_TIME_WAIT);
3357 3357                  (void) tcp_clean_death(tcp, ECONNRESET);
3358 3358                  return;
3359 3359          }
3360 3360          /*
3361 3361           * urp could be -1 when the urp field in the packet is 0
3362 3362           * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent
3363 3363           * byte was at seg_seq - 1, in which case we ignore the urgent flag.
3364 3364           */
3365      -        if (flags & TH_URG && urp >= 0) {
     3365 +        if ((flags & TH_URG) && urp >= 0) {
3366 3366                  if (!tcp->tcp_urp_last_valid ||
3367 3367                      SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) {
3368 3368                          /*
3369 3369                           * Non-STREAMS sockets handle the urgent data a little
3370 3370                           * differently from STREAMS based sockets. There is no
3371 3371                           * need to mark any mblks with the MSG{NOT,}MARKNEXT
3372 3372                           * flags to keep SIOCATMARK happy. Instead a
3373 3373                           * su_signal_oob upcall is made to update the mark.
3374 3374                           * Neither is a T_EXDATA_IND mblk needed to be
3375 3375                           * prepended to the urgent data. The urgent data is
↓ open down ↓ 921 lines elided ↑ open up ↑
4297 4297                          }
4298 4298                  }
4299 4299                  tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max);
4300 4300          }
4301 4301  
4302 4302          /* See if the latest urgent data has been acknowledged */
4303 4303          if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
4304 4304              SEQ_GT(seg_ack, tcp->tcp_urg))
4305 4305                  tcp->tcp_valid_bits &= ~TCP_URG_VALID;
4306 4306  
4307      -        /* Can we update the RTT estimates? */
4308      -        if (tcp->tcp_snd_ts_ok) {
4309      -                /* Ignore zero timestamp echo-reply. */
4310      -                if (tcpopt.tcp_opt_ts_ecr != 0) {
4311      -                        tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
4312      -                            (int32_t)tcpopt.tcp_opt_ts_ecr);
4313      -                }
4314      -
4315      -                /* If needed, restart the timer. */
4316      -                if (tcp->tcp_set_timer == 1) {
4317      -                        TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
4318      -                        tcp->tcp_set_timer = 0;
4319      -                }
     4307 +        /*
     4308 +         * Update the RTT estimates. Note that we don't use the TCP
     4309 +         * timestamp option to calculate RTT even if one is present. This is
     4310 +         * because the timestamp option's resolution (CPU tick) is
     4311 +         * too coarse to measure modern datacenter networks' microsecond
     4312 +         * latencies. The timestamp field's resolution is limited by its
     4313 +         * 4-byte width (see RFC1323), and since we always store a
     4314 +         * high-resolution nanosecond precision timestamp along with the data,
     4315 +         * there is no point to ever using the timestamp option.
     4316 +         */
     4317 +        if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
4320 4318                  /*
4321      -                 * Update tcp_csuna in case the other side stops sending
4322      -                 * us timestamps.
4323      -                 */
4324      -                tcp->tcp_csuna = tcp->tcp_snxt;
4325      -        } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
4326      -                /*
4327 4319                   * An ACK sequence we haven't seen before, so get the RTT
4328 4320                   * and update the RTO. But first check if the timestamp is
4329 4321                   * valid to use.
4330 4322                   */
4331 4323                  if ((mp1->b_next != NULL) &&
4332      -                    SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next)))
4333      -                        tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
4334      -                            (int32_t)(intptr_t)mp1->b_prev);
4335      -                else
     4324 +                    SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) {
     4325 +                        tcp_set_rto(tcp, gethrtime() -
     4326 +                            (hrtime_t)(intptr_t)mp1->b_prev);
     4327 +                } else {
4336 4328                          TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
     4329 +                }
4337 4330  
4338 4331                  /* Remember the last sequence to be ACKed */
4339 4332                  tcp->tcp_csuna = seg_ack;
4340 4333                  if (tcp->tcp_set_timer == 1) {
4341 4334                          TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
4342 4335                          tcp->tcp_set_timer = 0;
4343 4336                  }
4344 4337          } else {
4345 4338                  TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
4346 4339          }
↓ open down ↓ 8 lines elided ↑ open up ↑
4355 4348                  bytes_acked -= (int)(wptr - mp1->b_rptr);
4356 4349                  if (bytes_acked < 0) {
4357 4350                          mp1->b_rptr = wptr + bytes_acked;
4358 4351                          /*
4359 4352                           * Set a new timestamp if all the bytes timed by the
4360 4353                           * old timestamp have been ack'ed.
4361 4354                           */
4362 4355                          if (SEQ_GT(seg_ack,
4363 4356                              (uint32_t)(uintptr_t)(mp1->b_next))) {
4364 4357                                  mp1->b_prev =
4365      -                                    (mblk_t *)(uintptr_t)LBOLT_FASTPATH;
     4358 +                                    (mblk_t *)(intptr_t)gethrtime();
4366 4359                                  mp1->b_next = NULL;
4367 4360                          }
4368 4361                          break;
4369 4362                  }
4370 4363                  mp1->b_next = NULL;
4371 4364                  mp1->b_prev = NULL;
4372 4365                  mp2 = mp1;
4373 4366                  mp1 = mp1->b_cont;
4374 4367  
4375 4368                  /*
↓ open down ↓ 456 lines elided ↑ open up ↑
4832 4825                          if (snd_size > mss)
4833 4826                                  snd_size = mss;
4834 4827                          if (snd_size > tcp->tcp_swnd)
4835 4828                                  snd_size = tcp->tcp_swnd;
4836 4829                          mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size,
4837 4830                              NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size,
4838 4831                              B_TRUE);
4839 4832  
4840 4833                          if (mp1 != NULL) {
4841 4834                                  tcp->tcp_xmit_head->b_prev =
4842      -                                    (mblk_t *)LBOLT_FASTPATH;
     4835 +                                    (mblk_t *)(intptr_t)gethrtime();
4843 4836                                  tcp->tcp_csuna = tcp->tcp_snxt;
4844 4837                                  TCPS_BUMP_MIB(tcps, tcpRetransSegs);
4845 4838                                  TCPS_UPDATE_MIB(tcps, tcpRetransBytes,
4846 4839                                      snd_size);
4847 4840                                  tcp_send_data(tcp, mp1);
4848 4841                          }
4849 4842                  }
4850 4843                  if (flags & TH_NEED_SACK_REXMIT) {
4851 4844                          tcp_sack_rexmit(tcp, &flags);
4852 4845                  }
↓ open down ↓ 13 lines elided ↑ open up ↑
4866 4859                   * Adjust tcp_cwnd back to normal value after sending
4867 4860                   * new data segments.
4868 4861                   */
4869 4862                  if (flags & TH_LIMIT_XMIT) {
4870 4863                          tcp->tcp_cwnd -= mss << (tcp->tcp_dupack_cnt - 1);
4871 4864                          /*
4872 4865                           * This will restart the timer.  Restarting the
4873 4866                           * timer is used to avoid a timeout before the
4874 4867                           * limited transmitted segment's ACK gets back.
4875 4868                           */
4876      -                        if (tcp->tcp_xmit_head != NULL)
     4869 +                        if (tcp->tcp_xmit_head != NULL) {
4877 4870                                  tcp->tcp_xmit_head->b_prev =
4878      -                                    (mblk_t *)LBOLT_FASTPATH;
     4871 +                                    (mblk_t *)(intptr_t)gethrtime();
     4872 +                        }
4879 4873                  }
4880 4874  
4881 4875                  /* Anything more to do? */
4882 4876                  if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED|
4883 4877                      TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
4884 4878                          goto done;
4885 4879          }
4886 4880  ack_check:
4887 4881          if (flags & TH_SEND_URP_MARK) {
4888 4882                  ASSERT(tcp->tcp_urp_mark_mp);
↓ open down ↓ 315 lines elided ↑ open up ↑
5204 5198                  ASSERT(OK_32PTR(optptr));
5205 5199                  /* Save as last value */
5206 5200                  ip_savebuf((void **)&tcp->tcp_dstopts, &tcp->tcp_dstoptslen,
5207 5201                      (ipp->ipp_fields & IPPF_DSTOPTS),
5208 5202                      ipp->ipp_dstopts, ipp->ipp_dstoptslen);
5209 5203          }
5210 5204          ASSERT(optptr == mp->b_wptr);
5211 5205          return (mp);
5212 5206  }
5213 5207  
5214      -/* The minimum of smoothed mean deviation in RTO calculation. */
5215      -#define TCP_SD_MIN      400
     5208 +/* The minimum of smoothed mean deviation in RTO calculation (nsec). */
     5209 +#define TCP_SD_MIN      400000000
5216 5210  
5217 5211  /*
5218      - * Set RTO for this connection.  The formula is from Jacobson and Karels'
5219      - * "Congestion Avoidance and Control" in SIGCOMM '88.  The variable names
5220      - * are the same as those in Appendix A.2 of that paper.
     5212 + * Set RTO for this connection based on a new round-trip time measurement.
     5213 + * The formula is from Jacobson and Karels' "Congestion Avoidance and Control"
     5214 + * in SIGCOMM '88.  The variable names are the same as those in Appendix A.2
     5215 + * of that paper.
5221 5216   *
5222 5217   * m = new measurement
5223 5218   * sa = smoothed RTT average (8 * average estimates).
5224 5219   * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates).
5225 5220   */
5226 5221  static void
5227      -tcp_set_rto(tcp_t *tcp, clock_t rtt)
     5222 +tcp_set_rto(tcp_t *tcp, hrtime_t rtt)
5228 5223  {
5229      -        long m = TICK_TO_MSEC(rtt);
5230      -        clock_t sa = tcp->tcp_rtt_sa;
5231      -        clock_t sv = tcp->tcp_rtt_sd;
5232      -        clock_t rto;
5233      -        tcp_stack_t     *tcps = tcp->tcp_tcps;
     5224 +        hrtime_t m = rtt;
     5225 +        hrtime_t sa = tcp->tcp_rtt_sa;
     5226 +        hrtime_t sv = tcp->tcp_rtt_sd;
     5227 +        tcp_stack_t *tcps = tcp->tcp_tcps;
5234 5228  
5235 5229          TCPS_BUMP_MIB(tcps, tcpRttUpdate);
5236 5230          tcp->tcp_rtt_update++;
5237 5231  
5238 5232          /* Non-zero tcp_rtt_sa: estimates exist, so fold in the new sample. */
5239 5233          if (sa != 0) {
5240 5234                  /*
5241      -                 * Update average estimator:
5242      -                 *      new rtt = 7/8 old rtt + 1/8 Error
     5235 +                 * Update average estimator (see section 2.3 of RFC6298):
     5236 +                 *      SRTT = 7/8 SRTT + 1/8 rtt
     5237 +                 *
     5238 +                 * We maintain tcp_rtt_sa as 8 * SRTT, so this reduces to:
     5239 +                 *      tcp_rtt_sa = 7 * SRTT + rtt
     5240 +                 *      tcp_rtt_sa = 7 * (tcp_rtt_sa / 8) + rtt
     5241 +                 *      tcp_rtt_sa = tcp_rtt_sa - (tcp_rtt_sa / 8) + rtt
     5242 +                 *      tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 8))
     5243 +                 *      tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 2^3))
     5244 +                 *      tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa >> 3))
     5245 +                 *
     5246 +                 * (rtt - tcp_rtt_sa / 8) is simply the difference
     5247 +                 * between the new rtt measurement and the existing smoothed
     5248 +                 * RTT average. This is referred to as "Error" in subsequent
     5249 +                 * calculations.
5243 5250                   */
5244 5251  
5245      -                /* m is now Error in estimate. */
     5252 +                /* m is now Error. */
5246 5253                  m -= sa >> 3;
5247 5254                  if ((sa += m) <= 0) {
5248 5255                          /*
5249 5256                           * Don't allow the smoothed average to be negative.
5250 5257                           * We use 0 to denote reinitialization of the
5251 5258                           * variables.
5252 5259                           */
5253 5260                          sa = 1;
5254 5261                  }
5255 5262  
5256 5263                  /*
5257 5264                   * Update deviation estimator:
5258      -                 *      new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev)
     5265 +                 *  mdev = 3/4 mdev + 1/4 abs(Error)
     5266 +                 *
     5267 +                 * We maintain tcp_rtt_sd as 4 * mdev, so this reduces to:
     5268 +                 *  tcp_rtt_sd = 3 * mdev + abs(Error)
     5269 +                 *  tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd / 4) + abs(Error)
     5270 +                 *  tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd / 2^2) + abs(Error)
     5271 +                 *  tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd >> 2) + abs(Error)
5259 5272                   */
5260 5273                  if (m < 0)
5261 5274                          m = -m;
5262 5275                  m -= sv >> 2;
5263 5276                  sv += m;
5264 5277          } else {
5265 5278                  /*
5266 5279                   * This follows BSD's implementation.  So the reinitialized
5267 5280                   * RTO is 3 * m.  We cannot go less than 2 because if the
5268 5281                   * link is bandwidth dominated, doubling the window size
5269 5282                   * during slow start means doubling the RTT.  We want to be
5270 5283                   * more conservative when we reinitialize our estimates.  3
5271 5284                   * is just a convenient number.
5272 5285                   */
5273 5286                  sa = m << 3;
5274 5287                  sv = m << 1;
5275 5288          }
5276 5289          if (sv < TCP_SD_MIN) {
5277 5290                  /*
5278      -                 * We do not know that if sa captures the delay ACK
5279      -                 * effect as in a long train of segments, a receiver
5280      -                 * does not delay its ACKs.  So set the minimum of sv
5281      -                 * to be TCP_SD_MIN, which is default to 400 ms, twice
5282      -                 * of BSD DATO.  That means the minimum of mean
     5291 +                 * Since a receiver doesn't delay its ACKs during a long run of
     5292 +                 * segments, sa may not have captured the effect of delayed ACK
     5293 +                 * timeouts on the RTT.  To make sure we always account for the
     5294 +                 * possible delay (and avoid the unnecessary retransmission),
     5295 +                 * TCP_SD_MIN is set to 400ms, twice the delayed ACK timeout of
     5296 +                 * 200ms on older SunOS/BSD systems and modern Windows systems
     5297 +                 * (as of 2019).  This means that the minimum possible mean
5283 5298                   * deviation is 100 ms.
5284      -                 *
5285 5299                   */
5286 5300                  sv = TCP_SD_MIN;
5287 5301          }
5288 5302          tcp->tcp_rtt_sa = sa;
5289 5303          tcp->tcp_rtt_sd = sv;
5290      -        /*
5291      -         * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv)
5292      -         *
5293      -         * Add tcp_rexmit_interval extra in case of extreme environment
5294      -         * where the algorithm fails to work.  The default value of
5295      -         * tcp_rexmit_interval_extra should be 0.
5296      -         *
5297      -         * As we use a finer grained clock than BSD and update
5298      -         * RTO for every ACKs, add in another .25 of RTT to the
5299      -         * deviation of RTO to accomodate burstiness of 1/4 of
5300      -         * window size.
5301      -         */
5302      -        rto = (sa >> 3) + sv + tcps->tcps_rexmit_interval_extra + (sa >> 5);
5303 5304  
5304      -        TCP_SET_RTO(tcp, rto);
     5305 +        tcp->tcp_rto = tcp_calculate_rto(tcp, tcps, 0);
5305 5306  
5306 5307          /* Now, we can reset tcp_timer_backoff to use the new RTO... */
5307 5308          tcp->tcp_timer_backoff = 0;
5308 5309  }
5309 5310  
5310 5311  /*
5311 5312   * On a labeled system we have some protocols above TCP, such as RPC, which
5312 5313   * appear to assume that every mblk in a chain has a db_credp.
5313 5314   */
5314 5315  static void
↓ open down ↓ 472 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX