11546 Track TCP round-trip time in nanoseconds
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Portions contributed by: Brandon Baker <bbaker@delphix.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
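
The pattern this change applies throughout tcp_output.c: segment send times are now recorded with gethrtime() (nanoseconds) rather than lbolt ticks, and the smoothed RTT state is kept in nanoseconds, scaled down only where an interface still expects milliseconds. A minimal sketch of that pattern, using hypothetical local names rather than the actual tcp_t fields:

    #include <sys/types.h>
    #include <sys/time.h>   /* hrtime_t, gethrtime(), NSEC2MSEC() */

    /*
     * Sketch only: stamp the send time in nanoseconds, derive the RTT as a
     * nanosecond delta when the ACK arrives, and convert to milliseconds
     * only for consumers that still take milliseconds.
     */
    hrtime_t send_time = gethrtime();
    /* ... segment goes out, matching ACK comes back ... */
    hrtime_t rtt_nsec = gethrtime() - send_time;
    uint_t rtt_msec = (uint_t)NSEC2MSEC(rtt_nsec);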

          --- old/usr/src/uts/common/inet/tcp/tcp_output.c
          +++ new/usr/src/uts/common/inet/tcp/tcp_output.c
[ 13 lines elided ]
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24      - * Copyright (c) 2014 by Delphix. All rights reserved.
       24 + * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
       25 + * Copyright 2019 Joyent, Inc.
  25   26   */
  26   27  
  27   28  /* This file contains all TCP output processing functions. */
  28   29  
  29   30  #include <sys/types.h>
  30   31  #include <sys/stream.h>
  31   32  #include <sys/strsun.h>
  32   33  #include <sys/strsubr.h>
  33   34  #include <sys/stropts.h>
  34   35  #include <sys/strlog.h>
[ 16 lines elided ]
  51   52  #include <inet/proto_set.h>
  52   53  #include <inet/ipsec_impl.h>
  53   54  #include <inet/ip_ndp.h>
  54   55  
  55   56  static mblk_t   *tcp_get_seg_mp(tcp_t *, uint32_t, int32_t *);
  56   57  static void     tcp_wput_cmdblk(queue_t *, mblk_t *);
  57   58  static void     tcp_wput_flush(tcp_t *, mblk_t *);
  58   59  static void     tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
  59   60  static int      tcp_xmit_end(tcp_t *);
  60   61  static int      tcp_send(tcp_t *, const int, const int, const int,
  61      -                    const int, int *, uint_t *, int *, mblk_t **, mblk_t *);
       62 +                    const int, int *, uint32_t *, int *, mblk_t **, mblk_t *);
  62   63  static void     tcp_xmit_early_reset(char *, mblk_t *, uint32_t, uint32_t,
  63   64                      int, ip_recv_attr_t *, ip_stack_t *, conn_t *);
  64   65  static boolean_t        tcp_send_rst_chk(tcp_stack_t *);
  65   66  static void     tcp_process_shrunk_swnd(tcp_t *, uint32_t);
  66      -static void     tcp_fill_header(tcp_t *, uchar_t *, clock_t, int);
       67 +static void     tcp_fill_header(tcp_t *, uchar_t *, int);
  67   68  
  68   69  /*
  69   70   * Functions called directly via squeue having a prototype of edesc_t.
  70   71   */
  71   72  static void     tcp_wput_nondata(void *, mblk_t *, void *, ip_recv_attr_t *);
  72   73  static void     tcp_wput_ioctl(void *, mblk_t *, void *, ip_recv_attr_t *);
  73   74  static void     tcp_wput_proto(void *, mblk_t *, void *, ip_recv_attr_t *);
  74   75  
  75   76  /*
  76   77   * This controls how tiny a write must be before we try to copy it
[ 370 lines elided ]
 447  448  
 448  449                  /* usable = MAX(usable, {1 for urgent, 0 for data}) */
 449  450                  if (usable_r > 0) {
 450  451                          usable = usable_r;
 451  452                  } else {
 452  453                          /* Bypass all other unnecessary processing. */
 453  454                          goto done;
 454  455                  }
 455  456          }
 456  457  
 457      -        local_time = (mblk_t *)now;
      458 +        local_time = (mblk_t *)(intptr_t)gethrtime();
 458  459  
 459  460          /*
 460  461           * "Our" Nagle Algorithm.  This is not the same as in the old
 461  462           * BSD.  This is more in line with the true intent of Nagle.
 462  463           *
 463  464           * The conditions are:
 464  465           * 1. The amount of unsent data (or amount of data which can be
 465  466           *    sent, whichever is smaller) is less than Nagle limit.
 466  467           * 2. The last sent size is also less than Nagle limit.
 467  468           * 3. There is unack'ed data.
[ 708 lines elided ]
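
The Nagle comment above lists three deferral conditions; the code that evaluates them falls inside the elided region. A standalone restatement of that test as a hypothetical helper (nagle_defer() and its parameters are illustrative only, not the elided code):

    #include <sys/types.h>

    /* Sketch only: defer sending while all three conditions above hold. */
    static boolean_t
    nagle_defer(uint32_t usable, uint32_t last_sent_len, uint32_t naglim,
        uint32_t suna, uint32_t snxt)
    {
            if (usable < naglim &&          /* 1. little data left to send */
                last_sent_len < naglim &&   /* 2. last segment was small */
                suna != snxt)               /* 3. unacked data outstanding */
                    return (B_TRUE);
            return (B_FALSE);
    }
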
1176 1177           *    un-acked     usable
1177 1178           *  |--------------|-----------------|
1178 1179           *  tcp_suna       tcp_snxt       tcp_suna+tcp_swnd
1179 1180           */
1180 1181          /* END CSTYLED */
1181 1182  
1182 1183          /* start sending from tcp_snxt */
1183 1184          snxt = tcp->tcp_snxt;
1184 1185  
1185 1186          /*
1186      -         * Check to see if this connection has been idled for some
1187      -         * time and no ACK is expected.  If it is, we need to slow
1188      -         * start again to get back the connection's "self-clock" as
1189      -         * described in VJ's paper.
     1187 +         * Check to see if this connection has been idle for some time and no
     1188 +         * ACK is expected. If so, then the congestion window size is no longer
     1189 +         * meaningfully tied to current network conditions.
1190 1190           *
1191      -         * Reinitialize tcp_cwnd after idle.
     1191 +         * We reinitialize tcp_cwnd, and slow start again to get back the
     1192 +         * connection's "self-clock" as described in Van Jacobson's 1988 paper
     1193 +         * "Congestion avoidance and control".
1192 1194           */
1193 1195          now = LBOLT_FASTPATH;
1194 1196          if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
1195 1197              (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
1196 1198                  TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
1197 1199          }
1198 1200  
1199 1201          usable = tcp->tcp_swnd;         /* tcp window size */
1200 1202          if (usable > tcp->tcp_cwnd)
1201 1203                  usable = tcp->tcp_cwnd; /* congestion window smaller */
[ 47 lines elided ]
1249 1251          }
1250 1252  
1251 1253          /* we have always sent something */
1252 1254          tcp->tcp_rack_cnt = 0;
1253 1255  
1254 1256          tcp->tcp_snxt = snxt + len;
1255 1257          tcp->tcp_rack = tcp->tcp_rnxt;
1256 1258  
1257 1259          if ((mp1 = dupb(mp)) == 0)
1258 1260                  goto no_memory;
1259      -        mp->b_prev = (mblk_t *)(uintptr_t)now;
     1261 +        mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
1260 1262          mp->b_next = (mblk_t *)(uintptr_t)snxt;
1261 1263  
1262 1264          /* adjust tcp header information */
1263 1265          tcpha = tcp->tcp_tcpha;
1264 1266          tcpha->tha_flags = (TH_ACK|TH_PUSH);
1265 1267  
1266 1268          sum = len + connp->conn_ht_ulp_len + connp->conn_sum;
1267 1269          sum = (sum >> 16) + (sum & 0xFFFF);
1268 1270          tcpha->tha_sum = htons(sum);
1269 1271  
[ 34 lines elided ]
1304 1306                  mp->b_cont = mp1;
1305 1307                  mp1 = mp;
1306 1308                  /* Leave room for Link Level header */
1307 1309                  rptr = &mp1->b_rptr[tcps->tcps_wroff_xtra];
1308 1310                  mp1->b_wptr = &rptr[hdrlen];
1309 1311          }
1310 1312          mp1->b_rptr = rptr;
1311 1313  
1312 1314          /* Fill in the timestamp option. */
1313 1315          if (tcp->tcp_snd_ts_ok) {
1314      -                uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
1315      -
1316      -                U32_TO_BE32(llbolt,
1317      -                    (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
     1316 +                U32_TO_BE32(now,
     1317 +                    (char *)tcpha + TCP_MIN_HEADER_LENGTH + 4);
1318 1318                  U32_TO_BE32(tcp->tcp_ts_recent,
1319      -                    (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
     1319 +                    (char *)tcpha + TCP_MIN_HEADER_LENGTH + 8);
1320 1320          } else {
1321 1321                  ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
1322 1322          }
1323 1323  
1324 1324          /* copy header into outgoing packet */
1325 1325          dst = (ipaddr_t *)rptr;
1326 1326          src = (ipaddr_t *)connp->conn_ht_iphc;
1327 1327          dst[0] = src[0];
1328 1328          dst[1] = src[1];
1329 1329          dst[2] = src[2];
[ 434 lines elided ]
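
The two U32_TO_BE32() stores above fill the TSval and TSecr fields of the TCP timestamp option; tcp_fill_header() further below writes the same offsets into the template header. For orientation, an assumed layout of that option (the RFC 7323 timestamp option padded with two NOPs; not taken from this diff) showing why the values land at TCP_MIN_HEADER_LENGTH + 4 and + 8:

    /*
     * Assumed option bytes following the 20-byte minimum TCP header:
     *
     *   +0  0x01  NOP (padding)
     *   +1  0x01  NOP (padding)
     *   +2  0x08  kind   = timestamp
     *   +3  0x0a  length = 10
     *   +4  TSval (4 bytes)  <- the lbolt-derived value written above
     *   +8  TSecr (4 bytes)  <- tcp->tcp_ts_recent
     */
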
1764 1764  /*
1765 1765   * tcp_send() is called by tcp_wput_data() and returns one of the following:
1766 1766   *
1767 1767   * -1 = failed allocation.
1768 1768   *  0 = We've either successfully sent data, or our usable send window is too
1769 1769   *      small and we'd rather wait until later before sending again.
1770 1770   */
1771 1771  static int
1772 1772  tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len,
1773 1773      const int tcp_hdr_len, const int num_sack_blk, int *usable,
1774      -    uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time)
     1774 +    uint32_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time)
1775 1775  {
1776 1776          int             num_lso_seg = 1;
1777 1777          uint_t          lso_usable;
1778 1778          boolean_t       do_lso_send = B_FALSE;
1779 1779          tcp_stack_t     *tcps = tcp->tcp_tcps;
1780 1780          conn_t          *connp = tcp->tcp_connp;
1781 1781          ip_xmit_attr_t  *ixa = connp->conn_ixa;
1782 1782  
1783 1783          /*
1784 1784           * Check LSO possibility. The value of tcp->tcp_lso indicates whether
[ 274 lines elided ]
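
tcp_send() is documented above as returning -1 on failed allocation and 0 otherwise. A hypothetical caller-side check, with the surrounding variables assumed to be those of tcp_wput_data() and shown only for illustration:

    /* Hypothetical: give up on allocation failure and let the timer retry. */
    if (tcp_send(tcp, mss, total_hdr_len, tcp_hdr_len, num_sack_blk,
        &usable, &snxt, &tail_unsent, &xmit_tail, local_time) < 0) {
            return;
    }
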
2059 2059                          /* Leave room for Link Level header */
2060 2060                          len = total_hdr_len;
2061 2061                          rptr = &mp->b_rptr[tcps->tcps_wroff_xtra];
2062 2062                          mp->b_wptr = &rptr[len];
2063 2063                  }
2064 2064  
2065 2065                  /*
2066 2066                   * Fill in the header using the template header, and add
2067 2067                   * options such as time-stamp, ECN and/or SACK, as needed.
2068 2068                   */
2069      -                tcp_fill_header(tcp, rptr, (clock_t)local_time, num_sack_blk);
     2069 +                tcp_fill_header(tcp, rptr, num_sack_blk);
2070 2070  
2071 2071                  mp->b_rptr = rptr;
2072 2072  
2073 2073                  if (*tail_unsent) {
2074 2074                          int spill = *tail_unsent;
2075 2075  
2076 2076                          mp1 = mp->b_cont;
2077 2077                          if (mp1 == NULL)
2078 2078                                  mp1 = mp;
2079 2079  
[ 197 lines elided ]
2277 2277           */
2278 2278          if (tcps->tcps_rtt_updates == 0 ||
2279 2279              tcp->tcp_rtt_update < tcps->tcps_rtt_updates)
2280 2280                  return (0);
2281 2281  
2282 2282          /*
2283 2283           * We do not have a good algorithm to update ssthresh at this time.
2284 2284           * So don't do any update.
2285 2285           */
2286 2286          bzero(&uinfo, sizeof (uinfo));
2287      -        uinfo.iulp_rtt = tcp->tcp_rtt_sa;
2288      -        uinfo.iulp_rtt_sd = tcp->tcp_rtt_sd;
     2287 +        uinfo.iulp_rtt = NSEC2MSEC(tcp->tcp_rtt_sa);
     2288 +        uinfo.iulp_rtt_sd = NSEC2MSEC(tcp->tcp_rtt_sd);
2289 2289  
2290 2290          /*
2291 2291           * Note that uinfo is kept for conn_faddr in the DCE. Could update even
2292 2292           * if source routed but we don't.
2293 2293           */
2294 2294          if (connp->conn_ipversion == IPV4_VERSION) {
2295 2295                  if (connp->conn_faddr_v4 !=  tcp->tcp_ipha->ipha_dst) {
2296 2296                          return (0);
2297 2297                  }
2298 2298                  (void) dce_update_uinfo_v4(connp->conn_faddr_v4, &uinfo, ipst);
[ 1083 lines elided ]
3382 3382  
3383 3383                  usable_swnd -= seg_len;
3384 3384                  tcp->tcp_pipe += seg_len;
3385 3385                  tcp->tcp_sack_snxt = begin + seg_len;
3386 3386  
3387 3387                  tcp_send_data(tcp, xmit_mp);
3388 3388  
3389 3389                  /*
3390 3390                   * Update the send timestamp to avoid false retransmission.
3391 3391                   */
3392      -                snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
     3392 +                snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
3393 3393  
3394 3394                  TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3395 3395                  TCPS_UPDATE_MIB(tcps, tcpRetransBytes, seg_len);
3396 3396                  TCPS_BUMP_MIB(tcps, tcpOutSackRetransSegs);
3397 3397                  /*
3398 3398                   * Update tcp_rexmit_max to extend this SACK recovery phase.
3399 3399                   * This happens when new data sent during fast recovery is
3400 3400                   * also lost.  If TCP retransmits those new data, it needs
 3401 3401                   * to extend SACK recovery phase to avoid starting another
3402 3402                   * fast retransmit/recovery unnecessarily.
[ 51 lines elided ]
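
The comment above describes extending tcp_rexmit_max so that retransmitting new data lost during fast recovery does not start a second fast retransmit. One plausible form of that update, assuming the recovery phase is bounded by tcp_rexmit_max and using the usual SEQ_GT() sequence comparison (a sketch, not the elided code):

    /* Sketch only: stretch the recovery bound to cover the data just resent. */
    if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max))
            tcp->tcp_rexmit_max = tcp->tcp_sack_snxt;
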
3454 3454                                  return;
3455 3455  
3456 3456                          tcp_send_data(tcp, xmit_mp);
3457 3457  
3458 3458                          snxt += cnt;
3459 3459                          win -= cnt;
3460 3460                          /*
3461 3461                           * Update the send timestamp to avoid false
3462 3462                           * retransmission.
3463 3463                           */
3464      -                        old_snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
     3464 +                        old_snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
3465 3465                          TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3466 3466                          TCPS_UPDATE_MIB(tcps, tcpRetransBytes, cnt);
3467 3467  
3468 3468                          tcp->tcp_rexmit_nxt = snxt;
3469 3469                  }
3470 3470                  /*
3471 3471                   * If we have transmitted all we have at the time
 3472 3472                   * we started the retransmission, we can leave
3473 3473                   * the rest of the job to tcp_wput_data().  But we
3474 3474                   * need to check the send window first.  If the
[ 139 lines elided ]
3614 3614                   */
3615 3615                  TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
3616 3616  }
3617 3617  
3618 3618  /*
3619 3619   * tcp_fill_header is called by tcp_send() to fill the outgoing TCP header
3620 3620   * with the template header, as well as other options such as time-stamp,
3621 3621   * ECN and/or SACK.
3622 3622   */
3623 3623  static void
3624      -tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk)
     3624 +tcp_fill_header(tcp_t *tcp, uchar_t *rptr, int num_sack_blk)
3625 3625  {
3626 3626          tcpha_t *tcp_tmpl, *tcpha;
3627 3627          uint32_t *dst, *src;
3628 3628          int hdrlen;
3629 3629          conn_t *connp = tcp->tcp_connp;
3630 3630  
3631 3631          ASSERT(OK_32PTR(rptr));
3632 3632  
3633 3633          /* Template header */
3634 3634          tcp_tmpl = tcp->tcp_tcpha;
[ 1 line elided ]
3636 3636          /* Header of outgoing packet */
3637 3637          tcpha = (tcpha_t *)(rptr + connp->conn_ixa->ixa_ip_hdr_length);
3638 3638  
3639 3639          /* dst and src are opaque 32-bit fields, used for copying */
3640 3640          dst = (uint32_t *)rptr;
3641 3641          src = (uint32_t *)connp->conn_ht_iphc;
3642 3642          hdrlen = connp->conn_ht_iphc_len;
3643 3643  
3644 3644          /* Fill time-stamp option if needed */
3645 3645          if (tcp->tcp_snd_ts_ok) {
3646      -                U32_TO_BE32((uint32_t)now,
     3646 +                U32_TO_BE32(LBOLT_FASTPATH,
3647 3647                      (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4);
3648 3648                  U32_TO_BE32(tcp->tcp_ts_recent,
3649 3649                      (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8);
3650 3650          } else {
3651 3651                  ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
3652 3652          }
3653 3653  
3654 3654          /*
3655 3655           * Copy the template header; is this really more efficient than
3656 3656           * calling bcopy()?  For simple IPv4/TCP, it may be the case,
[ 61 lines elided ]