Print this page
11546 Track TCP round-trip time in nanoseconds
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Portions contributed by: Brandon Baker <bbaker@delphix.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>


   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  25  * Copyright 2011 Joyent, Inc.  All rights reserved.
  26  * Copyright (c) 2014 by Delphix. All rights reserved.
  27  */
  28 
  29 #include <sys/types.h>
  30 #include <sys/strlog.h>
  31 #include <sys/strsun.h>
  32 #include <sys/squeue_impl.h>
  33 #include <sys/squeue.h>
  34 #include <sys/callo.h>
  35 #include <sys/strsubr.h>
  36 
  37 #include <inet/common.h>
  38 #include <inet/ip.h>
  39 #include <inet/ip_ire.h>
  40 #include <inet/ip_rts.h>
  41 #include <inet/tcp.h>
  42 #include <inet/tcp_impl.h>
  43 
  44 /*
  45  * Implementation of TCP Timers.
  46  * =============================


 739                 /*
 740                  * If the end point has not been closed, TCP can retransmit
 741                  * forever.  But if the end point is closed, the normal
 742                  * timeout applies.
 743                  */
 744                 if (second_threshold == 0) {
 745                         second_threshold = tcps->tcps_ip_abort_linterval;
 746                         dont_timeout = B_TRUE;
 747                 }
 748                 /* FALLTHRU */
 749         case TCPS_FIN_WAIT_1:
 750         case TCPS_CLOSING:
 751         case TCPS_LAST_ACK:
 752                 /* If we have data to rexmit */
 753                 if (tcp->tcp_suna != tcp->tcp_snxt) {
 754                         clock_t time_to_wait;
 755 
 756                         TCPS_BUMP_MIB(tcps, tcpTimRetrans);
 757                         if (!tcp->tcp_xmit_head)
 758                                 break;
 759                         time_to_wait = ddi_get_lbolt() -
 760                             (clock_t)tcp->tcp_xmit_head->b_prev;
 761                         time_to_wait = tcp->tcp_rto -
 762                             TICK_TO_MSEC(time_to_wait);
 763                         /*
 764                          * If the timer fires too early, 1 clock tick earlier,
 765                          * restart the timer.
 766                          */
 767                         if (time_to_wait > msec_per_tick) {
 768                                 TCP_STAT(tcps, tcp_timer_fire_early);
 769                                 TCP_TIMER_RESTART(tcp, time_to_wait);
 770                                 return;
 771                         }
 772                         /*
 773                          * When we probe zero windows, we force the swnd open.
 774                          * If our peer acks with a closed window swnd will be
 775                          * set to zero by tcp_rput(). As long as we are
 776                          * receiving acks tcp_rput will
 777                          * reset 'tcp_ms_we_have_waited' so as not to trip the
 778                          * first and second interval actions.  NOTE: the timer
 779                          * interval is allowed to continue its exponential
 780                          * backoff.
 781                          */
 782                         if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) {


 995                          */
 996                         tcp->tcp_ms_we_have_waited = second_threshold;
 997                 }
 998         } else if (ms > first_threshold) {
 999                 /*
1000                  * Should not hold the zero-copy messages for too long.
1001                  */
1002                 if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
1003                         tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
1004                             tcp->tcp_xmit_head, B_TRUE);
1005 
1006                 /*
1007                  * We have been retransmitting for too long...  The RTT
1008                  * we calculated is probably incorrect.  Reinitialize it.
1009                  * Need to compensate for 0 tcp_rtt_sa.  Reset
1010                  * tcp_rtt_update so that we won't accidentally cache a
1011                  * bad value.  But only do this if this is not a zero
1012                  * window probe.
1013                  */
1014                 if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) {
1015                         tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
1016                             (tcp->tcp_rtt_sa >> 5);
1017                         tcp->tcp_rtt_sa = 0;
1018                         tcp_ip_notify(tcp);
1019                         tcp->tcp_rtt_update = 0;
1020                 }
1021         }
1022 
1023 timer_rexmit:
1024         tcp->tcp_timer_backoff++;
1025         if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
1026             tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) <
1027             tcp->tcp_rto_min) {
1028                 /*
1029                  * This means the original RTO is tcp_rexmit_interval_min.
1030                  * So we will use tcp_rexmit_interval_min as the RTO value
1031                  * and do the backoff.
1032                  */
1033                 ms = tcp->tcp_rto_min << tcp->tcp_timer_backoff;
1034         } else {
1035                 ms <<= tcp->tcp_timer_backoff;
1036         }
1037         if (ms > tcp->tcp_rto_max) {
1038                 ms = tcp->tcp_rto_max;
1039                 /*
1040                  * ms is at max, decrement tcp_timer_backoff to avoid
1041                  * overflow.
1042                  */
1043                 tcp->tcp_timer_backoff--;
1044         }
1045         tcp->tcp_ms_we_have_waited += ms;
1046         if (tcp->tcp_zero_win_probe == 0) {
1047                 tcp->tcp_rto = ms;
1048         }
1049         TCP_TIMER_RESTART(tcp, ms);
1050         /*
1051          * This is after a timeout and tcp_rto is backed off.  Set
1052          * tcp_set_timer to 1 so that next time RTO is updated, we will
1053          * restart the timer with a correct value.
1054          */
1055         tcp->tcp_set_timer = 1;
1056         mss = tcp->tcp_snxt - tcp->tcp_suna;
1057         if (mss > tcp->tcp_mss)
1058                 mss = tcp->tcp_mss;
1059         if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0)
1060                 mss = tcp->tcp_swnd;
1061 
1062         if ((mp = tcp->tcp_xmit_head) != NULL)
1063                 mp->b_prev = (mblk_t *)ddi_get_lbolt();

1064         mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss,
1065             B_TRUE);
1066 
1067         /*
1068          * When slow start after retransmission begins, start with
1069          * this seq no.  tcp_rexmit_max marks the end of special slow
1070          * start phase.
1071          */
1072         tcp->tcp_rexmit_nxt = tcp->tcp_suna;
1073         if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
1074             (tcp->tcp_unsent == 0)) {
1075                 tcp->tcp_rexmit_max = tcp->tcp_fss;
1076         } else {
1077                 tcp->tcp_rexmit_max = tcp->tcp_snxt;
1078         }
1079         tcp->tcp_rexmit = B_TRUE;
1080         tcp->tcp_dupack_cnt = 0;
1081 
1082         /*
1083          * Remove all rexmit SACK blk to start from fresh.




   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  25  * Copyright 2011 Joyent, Inc.  All rights reserved.
  26  * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
  27  */
  28 
  29 #include <sys/types.h>
  30 #include <sys/strlog.h>
  31 #include <sys/strsun.h>
  32 #include <sys/squeue_impl.h>
  33 #include <sys/squeue.h>
  34 #include <sys/callo.h>
  35 #include <sys/strsubr.h>
  36 
  37 #include <inet/common.h>
  38 #include <inet/ip.h>
  39 #include <inet/ip_ire.h>
  40 #include <inet/ip_rts.h>
  41 #include <inet/tcp.h>
  42 #include <inet/tcp_impl.h>
  43 
  44 /*
  45  * Implementation of TCP Timers.
  46  * =============================


 739                 /*
 740                  * If the end point has not been closed, TCP can retransmit
 741                  * forever.  But if the end point is closed, the normal
 742                  * timeout applies.
 743                  */
 744                 if (second_threshold == 0) {
 745                         second_threshold = tcps->tcps_ip_abort_linterval;
 746                         dont_timeout = B_TRUE;
 747                 }
 748                 /* FALLTHRU */
 749         case TCPS_FIN_WAIT_1:
 750         case TCPS_CLOSING:
 751         case TCPS_LAST_ACK:
 752                 /* If we have data to rexmit */
 753                 if (tcp->tcp_suna != tcp->tcp_snxt) {
 754                         clock_t time_to_wait;
 755 
 756                         TCPS_BUMP_MIB(tcps, tcpTimRetrans);
 757                         if (!tcp->tcp_xmit_head)
 758                                 break;
 759                         time_to_wait = NSEC2MSEC(gethrtime() -
 760                             (hrtime_t)(intptr_t)tcp->tcp_xmit_head->b_prev);
 761                         time_to_wait = tcp->tcp_rto - time_to_wait;

 762                         /*
 763                          * If the timer fired more than one clock tick early,
 764                          * restart it for the remaining time.
 765                          */
 766                         if (time_to_wait > msec_per_tick) {
 767                                 TCP_STAT(tcps, tcp_timer_fire_early);
 768                                 TCP_TIMER_RESTART(tcp, time_to_wait);
 769                                 return;
 770                         }
 771                         /*
 772                          * When we probe zero windows, we force the swnd open.
 773                          * If our peer acks with a closed window swnd will be
 774                          * set to zero by tcp_rput(). As long as we are
 775                          * receiving acks tcp_rput will
 776                          * reset 'tcp_ms_we_have_waited' so as not to trip the
 777                          * first and second interval actions.  NOTE: the timer
 778                          * interval is allowed to continue its exponential
 779                          * backoff.
 780                          */
 781                         if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) {


 994                          */
 995                         tcp->tcp_ms_we_have_waited = second_threshold;
 996                 }
 997         } else if (ms > first_threshold) {
 998                 /*
 999                  * Should not hold the zero-copy messages for too long.
1000                  */
1001                 if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
1002                         tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
1003                             tcp->tcp_xmit_head, B_TRUE);
1004 
1005                 /*
1006                  * We have been retransmitting for too long...  The RTT
1007                  * we calculated is probably incorrect.  Reinitialize it.
1008                  * Need to compensate for 0 tcp_rtt_sa.  Reset
1009                  * tcp_rtt_update so that we won't accidentally cache a
1010                  * bad value.  But only do this if this is not a zero
1011                  * window probe.
1012                  */
1013                 if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) {
1014                         tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
1015                             (tcp->tcp_rtt_sa >> 5);
1016                         tcp->tcp_rtt_sa = 0;
1017                         tcp_ip_notify(tcp);
1018                         tcp->tcp_rtt_update = 0;
1019                 }
1020         }
1021 
1022 timer_rexmit:
1023         tcp->tcp_timer_backoff++;



1024         /*
1025          * Calculate the backed off retransmission timeout. If the shift brings
1026          * us back over the max, then we repin the value, and decrement the
1027          * backoff to avoid overflow.
1028          */
1029         ms = tcp_calculate_rto(tcp, tcps, 0) << tcp->tcp_timer_backoff;



1030         if (ms > tcp->tcp_rto_max) {
1031                 ms = tcp->tcp_rto_max;




1032                 tcp->tcp_timer_backoff--;
1033         }
1034         tcp->tcp_ms_we_have_waited += ms;
1035         if (tcp->tcp_zero_win_probe == 0) {
1036                 tcp->tcp_rto = ms;
1037         }
1038         TCP_TIMER_RESTART(tcp, ms);
1039         /*
1040          * This is after a timeout and tcp_rto is backed off.  Set
1041          * tcp_set_timer to 1 so that next time RTO is updated, we will
1042          * restart the timer with a correct value.
1043          */
1044         tcp->tcp_set_timer = 1;
1045         mss = tcp->tcp_snxt - tcp->tcp_suna;
1046         if (mss > tcp->tcp_mss)
1047                 mss = tcp->tcp_mss;
1048         if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0)
1049                 mss = tcp->tcp_swnd;
1050 
1051         if ((mp = tcp->tcp_xmit_head) != NULL) {
1052                 mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
1053         }
1054         mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss,
1055             B_TRUE);
1056 
1057         /*
1058          * When slow start after retransmission begins, start with
1059          * this seq no.  tcp_rexmit_max marks the end of special slow
1060          * start phase.
1061          */
1062         tcp->tcp_rexmit_nxt = tcp->tcp_suna;
1063         if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
1064             (tcp->tcp_unsent == 0)) {
1065                 tcp->tcp_rexmit_max = tcp->tcp_fss;
1066         } else {
1067                 tcp->tcp_rexmit_max = tcp->tcp_snxt;
1068         }
1069         tcp->tcp_rexmit = B_TRUE;
1070         tcp->tcp_dupack_cnt = 0;
1071 
1072         /*
1073          * Remove all rexmit SACK blk to start from fresh.