Print this page
11546 Track TCP round-trip time in nanoseconds
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Portions contributed by: Brandon Baker <bbaker@delphix.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/inet/tcp_impl.h
          +++ new/usr/src/uts/common/inet/tcp_impl.h
↓ open down ↓ 12 lines elided ↑ open up ↑
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23      - * Copyright 2016 Joyent, Inc.
       23 + * Copyright 2019 Joyent, Inc.
  24   24   * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
  25      - * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
       25 + * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
  26   26   */
  27   27  
  28   28  #ifndef _INET_TCP_IMPL_H
  29   29  #define _INET_TCP_IMPL_H
  30   30  
  31   31  /*
  32   32   * TCP implementation private declarations.  These interfaces are
  33   33   * used to build the IP module and are not meant to be accessed
  34   34   * by any modules except IP itself.  They are undocumented and are
  35   35   * subject to change without notice.
↓ open down ↓ 257 lines elided ↑ open up ↑
 293  293          if ((tcp)->tcp_connp->conn_ipversion == IPV4_VERSION) { \
 294  294                  /* We need to clear the code point first. */ \
 295  295                  ((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \
 296  296                  ((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \
 297  297          } else { \
 298  298                  ((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF); \
 299  299                  ((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20); \
 300  300          }
 301  301  
 302  302  /*
 303      - * Set tcp_rto with boundary checking.
 304      - */
 305      -#define TCP_SET_RTO(tcp, rto) \
 306      -        if ((rto) < (tcp)->tcp_rto_min)                 \
 307      -                (tcp)->tcp_rto = (tcp)->tcp_rto_min;    \
 308      -        else if ((rto) > (tcp)->tcp_rto_max)            \
 309      -                (tcp)->tcp_rto = (tcp)->tcp_rto_max;    \
 310      -        else                                            \
 311      -                (tcp)->tcp_rto = (rto);
 312      -
 313      -/*
 314  303   * TCP options struct returned from tcp_parse_options.
 315  304   */
 316  305  typedef struct tcp_opt_s {
 317  306          uint32_t        tcp_opt_mss;
 318  307          uint32_t        tcp_opt_wscale;
 319  308          uint32_t        tcp_opt_ts_val;
 320  309          uint32_t        tcp_opt_ts_ecr;
 321  310          tcp_t           *tcp;
 322  311  } tcp_opt_t;
 323  312  
↓ open down ↓ 243 lines elided ↑ open up ↑
 567  556                                          tcps_propinfo_tbl[56].prop_max_uval
 568  557  #define tcps_keepalive_abort_interval \
 569  558                                          tcps_propinfo_tbl[56].prop_cur_uval
 570  559  #define tcps_keepalive_abort_interval_low \
 571  560                                          tcps_propinfo_tbl[56].prop_min_uval
 572  561  #define tcps_wroff_xtra                 tcps_propinfo_tbl[57].prop_cur_uval
 573  562  #define tcps_dev_flow_ctl               tcps_propinfo_tbl[58].prop_cur_bval
 574  563  #define tcps_reass_timeout              tcps_propinfo_tbl[59].prop_cur_uval
 575  564  #define tcps_iss_incr                   tcps_propinfo_tbl[65].prop_cur_uval
 576  565  
      566 +
      567 +/*
      568 + * As defined in RFC 6298, the RTO is the smoothed RTT estimate (SRTT) plus a
      569 + * multiple of the RTT variation estimate (K * RTTVAR):
      570 + *
      571 + * RTO = SRTT + max(G, K * RTTVAR)
      572 + *
      573 + * K is defined in the RFC as 4, and G is the clock granularity. We constrain
      574 + * the minimum mean deviation to TCP_SD_MIN when processing new RTTs, so this
      575 + * becomes:
      576 + *
      577 + * RTO = SRTT + 4 * RTTVAR
      578 + *
      579 + * In practice, however, we make several additions to it. As we use a finer
      580 + * grained clock than BSD and update RTO for every ACK, we add in another 1/4 of
      581 + * RTT to the deviation of RTO to accommodate burstiness of 1/4 of window size:
      582 + *
      583 + * RTO = SRTT + (SRTT / 4) + 4 * RTTVAR
      584 + *
      585 + * Since tcp_rtt_sa is 8 times the SRTT, and tcp_rtt_sd is 4 times the RTTVAR,
      586 + * this becomes:
      587 + *
      588 + * RTO = (tcp_rtt_sa / 8) + ((tcp_rtt_sa / 8) / 4) + tcp_rtt_sd
      589 + * RTO = (tcp_rtt_sa / 2^3) + (tcp_rtt_sa / 2^5) + tcp_rtt_sd
      590 + * RTO = (tcp_rtt_sa >> 3) + (tcp_rtt_sa >> 5) + tcp_rtt_sd
      591 + *
      592 + * The "tcp_rexmit_interval_extra" and "tcp_conn_grace_period" tunables are
      593 + * used to help account for extreme environments where the algorithm fails to
      594 + * work; by default they should be 0. (The latter tunable is only used for
      595 + * calculating the initial RTO, and so is optionally passed in as "extra".) We
      596 + * add them here:
      597 + *
      598 + * RTO = (tcp_rtt_sa >> 3) + (tcp_rtt_sa >> 5) + tcp_rtt_sd +
      599 + *     tcps_rexmit_interval_extra + tcps_conn_grace_period
      600 + *
      601 + * We then pin the RTO within our configured boundaries (sections 2.4 and 2.5
      602 + * of RFC 6298).
      603 + */
      604 +static __GNU_INLINE clock_t
      605 +tcp_calculate_rto(tcp_t *tcp, tcp_stack_t *tcps, uint32_t extra)
      606 +{
      607 +        clock_t rto;    /* candidate RTO, clamped below before being returned */
      608 +
      609 +        rto = NSEC2MSEC((tcp->tcp_rtt_sa >> 3) + (tcp->tcp_rtt_sa >> 5) +
      610 +            tcp->tcp_rtt_sd) + tcps->tcps_rexmit_interval_extra + extra; /* only the sa/sd sum passes through NSEC2MSEC; both extras are added after the conversion (presumably already in ms -- confirm) */
      611 +
      612 +        if (rto < tcp->tcp_rto_min) {   /* pin to the connection's lower bound */
      613 +                rto = tcp->tcp_rto_min;
      614 +        } else if (rto > tcp->tcp_rto_max) {    /* pin to the connection's upper bound */
      615 +                rto = tcp->tcp_rto_max;
      616 +        }
      617 +
      618 +        return (rto);   /* the value is returned only; tcp->tcp_rto is not written here */
      619 +}
      620 +
 577  621  extern struct qinit tcp_rinitv4, tcp_rinitv6;
 578  622  extern boolean_t do_tcp_fusion;
 579  623  
 580  624  /*
 581  625   * Object to represent database of options to search passed to
 582  626   * {sock,tpi}optcom_req() interface routine to take care of option
 583  627   * management and associated methods.
 584  628   */
 585  629  extern optdb_obj_t      tcp_opt_obj;
 586  630  extern uint_t           tcp_max_optsize;
↓ open down ↓ 195 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX