Print this page
11546 Track TCP round-trip time in nanoseconds
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Portions contributed by: Brandon Baker <bbaker@delphix.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>

@@ -19,11 +19,12 @@
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
  */
 
 /* This file contains all TCP output processing functions. */
 
 #include <sys/types.h>

@@ -56,16 +57,16 @@
 static void     tcp_wput_cmdblk(queue_t *, mblk_t *);
 static void     tcp_wput_flush(tcp_t *, mblk_t *);
 static void     tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
 static int      tcp_xmit_end(tcp_t *);
 static int      tcp_send(tcp_t *, const int, const int, const int,
-                    const int, int *, uint_t *, int *, mblk_t **, mblk_t *);
+                    const int, int *, uint32_t *, int *, mblk_t **, mblk_t *);
 static void     tcp_xmit_early_reset(char *, mblk_t *, uint32_t, uint32_t,
                     int, ip_recv_attr_t *, ip_stack_t *, conn_t *);
 static boolean_t        tcp_send_rst_chk(tcp_stack_t *);
 static void     tcp_process_shrunk_swnd(tcp_t *, uint32_t);
-static void     tcp_fill_header(tcp_t *, uchar_t *, clock_t, int);
+static void     tcp_fill_header(tcp_t *, uchar_t *, int);
 
 /*
  * Functions called directly via squeue having a prototype of edesc_t.
  */
 static void     tcp_wput_nondata(void *, mblk_t *, void *, ip_recv_attr_t *);

@@ -452,11 +453,11 @@
                         /* Bypass all other unnecessary processing. */
                         goto done;
                 }
         }
 
-        local_time = (mblk_t *)now;
+        local_time = (mblk_t *)(intptr_t)gethrtime();
 
         /*
          * "Our" Nagle Algorithm.  This is not the same as in the old
          * BSD.  This is more in line with the true intent of Nagle.
          *

@@ -1181,16 +1182,17 @@
 
         /* start sending from tcp_snxt */
         snxt = tcp->tcp_snxt;
 
         /*
-         * Check to see if this connection has been idled for some
-         * time and no ACK is expected.  If it is, we need to slow
-         * start again to get back the connection's "self-clock" as
-         * described in VJ's paper.
+         * Check to see if this connection has been idle for some time and no
+         * ACK is expected. If so, then the congestion window size is no longer
+         * meaningfully tied to current network conditions.
          *
-         * Reinitialize tcp_cwnd after idle.
+         * We reinitialize tcp_cwnd, and slow start again to get back the
+         * connection's "self-clock" as described in Van Jacobson's 1988 paper
+         * "Congestion Avoidance and Control".
          */
         now = LBOLT_FASTPATH;
         if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
             (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
                 TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);

@@ -1254,11 +1256,11 @@
         tcp->tcp_snxt = snxt + len;
         tcp->tcp_rack = tcp->tcp_rnxt;
 
         if ((mp1 = dupb(mp)) == 0)
                 goto no_memory;
-        mp->b_prev = (mblk_t *)(uintptr_t)now;
+        mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
         mp->b_next = (mblk_t *)(uintptr_t)snxt;
 
         /* adjust tcp header information */
         tcpha = tcp->tcp_tcpha;
         tcpha->tha_flags = (TH_ACK|TH_PUSH);

@@ -1309,16 +1311,14 @@
         }
         mp1->b_rptr = rptr;
 
         /* Fill in the timestamp option. */
         if (tcp->tcp_snd_ts_ok) {
-                uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
-
-                U32_TO_BE32(llbolt,
-                    (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
+                U32_TO_BE32(now,
+                    (char *)tcpha + TCP_MIN_HEADER_LENGTH + 4);
                 U32_TO_BE32(tcp->tcp_ts_recent,
-                    (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
+                    (char *)tcpha + TCP_MIN_HEADER_LENGTH + 8);
         } else {
                 ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
         }
 
         /* copy header into outgoing packet */

@@ -1769,11 +1769,11 @@
  *      small and we'd rather wait until later before sending again.
  */
 static int
 tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len,
     const int tcp_hdr_len, const int num_sack_blk, int *usable,
-    uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time)
+    uint32_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time)
 {
         int             num_lso_seg = 1;
         uint_t          lso_usable;
         boolean_t       do_lso_send = B_FALSE;
         tcp_stack_t     *tcps = tcp->tcp_tcps;

@@ -2064,11 +2064,11 @@
 
                 /*
                  * Fill in the header using the template header, and add
                  * options such as time-stamp, ECN and/or SACK, as needed.
                  */
-                tcp_fill_header(tcp, rptr, (clock_t)local_time, num_sack_blk);
+                tcp_fill_header(tcp, rptr, num_sack_blk);
 
                 mp->b_rptr = rptr;
 
                 if (*tail_unsent) {
                         int spill = *tail_unsent;

@@ -2282,12 +2282,12 @@
         /*
          * We do not have a good algorithm to update ssthresh at this time.
          * So don't do any update.
          */
         bzero(&uinfo, sizeof (uinfo));
-        uinfo.iulp_rtt = tcp->tcp_rtt_sa;
-        uinfo.iulp_rtt_sd = tcp->tcp_rtt_sd;
+        uinfo.iulp_rtt = NSEC2MSEC(tcp->tcp_rtt_sa);
+        uinfo.iulp_rtt_sd = NSEC2MSEC(tcp->tcp_rtt_sd);
 
         /*
          * Note that uinfo is kept for conn_faddr in the DCE. Could update even
          * if source routed but we don't.
          */

@@ -3387,11 +3387,11 @@
                 tcp_send_data(tcp, xmit_mp);
 
                 /*
                  * Update the send timestamp to avoid false retransmission.
                  */
-                snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
+                snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
 
                 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
                 TCPS_UPDATE_MIB(tcps, tcpRetransBytes, seg_len);
                 TCPS_BUMP_MIB(tcps, tcpOutSackRetransSegs);
                 /*

@@ -3459,11 +3459,11 @@
                         win -= cnt;
                         /*
                          * Update the send timestamp to avoid false
                          * retransmission.
                          */
-                        old_snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
+                        old_snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
                         TCPS_BUMP_MIB(tcps, tcpRetransSegs);
                         TCPS_UPDATE_MIB(tcps, tcpRetransBytes, cnt);
 
                         tcp->tcp_rexmit_nxt = snxt;
                 }

@@ -3619,11 +3619,11 @@
  * tcp_fill_header is called by tcp_send() to fill the outgoing TCP header
  * with the template header, as well as other options such as time-stamp,
  * ECN and/or SACK.
  */
 static void
-tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk)
+tcp_fill_header(tcp_t *tcp, uchar_t *rptr, int num_sack_blk)
 {
         tcpha_t *tcp_tmpl, *tcpha;
         uint32_t *dst, *src;
         int hdrlen;
         conn_t *connp = tcp->tcp_connp;

@@ -3641,11 +3641,11 @@
         src = (uint32_t *)connp->conn_ht_iphc;
         hdrlen = connp->conn_ht_iphc_len;
 
         /* Fill time-stamp option if needed */
         if (tcp->tcp_snd_ts_ok) {
-                U32_TO_BE32((uint32_t)now,
+                U32_TO_BE32(LBOLT_FASTPATH,
                     (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4);
                 U32_TO_BE32(tcp->tcp_ts_recent,
                     (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8);
         } else {
                 ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);