illumos-gate Wdiff usr/src/uts/common/inet/tcp/tcp_timers.c

Print this page

11546 Track TCP round-trip time in nanoseconds
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Portions contributed by: Brandon Baker <bbaker@delphix.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/inet/tcp/tcp_timers.c
          +++ new/usr/src/uts/common/inet/tcp/tcp_timers.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the

↓ open down ↓

15 lines elided

↑ open up ↑

  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  25   25   * Copyright 2011 Joyent, Inc.  All rights reserved.
  26      - * Copyright (c) 2014 by Delphix. All rights reserved.
       26 + * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
  27   27   */
  28   28  
  29   29  #include <sys/types.h>
  30   30  #include <sys/strlog.h>
  31   31  #include <sys/strsun.h>
  32   32  #include <sys/squeue_impl.h>
  33   33  #include <sys/squeue.h>
  34   34  #include <sys/callo.h>
  35   35  #include <sys/strsubr.h>
  36   36

  37   37  #include <inet/common.h>
  38   38  #include <inet/ip.h>
  39   39  #include <inet/ip_ire.h>
  40   40  #include <inet/ip_rts.h>
  41   41  #include <inet/tcp.h>
  42   42  #include <inet/tcp_impl.h>
  43   43  
  44   44  /*
  45   45   * Implementation of TCP Timers.
  46   46   * =============================
  47   47   *
  48   48   * INTERFACE:
  49   49   *
  50   50   * There are two basic functions and one macro dealing with tcp timers:
  51   51   *
  52   52   *      timeout_id_t    tcp_timeout(connp, func, time)
  53   53   *      clock_t         tcp_timeout_cancel(connp, timeout_id)
  54   54   *      TCP_TIMER_RESTART(tcp, intvl)
  55   55   *
  56   56   * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func'
  57   57   * after 'time' milliseconds have passed. The function called by timeout() must
  58   58   * adhere to the same restrictions as a driver soft interrupt handler - it must
  59   59   * not sleep or call other functions that might sleep. The value returned is
  60   60   * the opaque non-zero timeout identifier that can be passed to
  61   61   * tcp_timeout_cancel() to cancel the request. The call to tcp_timeout() may
  62   62   * fail in which case it returns zero. This is different from the timeout(9F)
  63   63   * function which never fails.
  64   64   *
  65   65   * The call-back function 'func' always receives 'connp' as its single
  66   66   * argument. It is always executed in the squeue corresponding to the tcp
  67   67   * structure. The tcp structure is guaranteed to be present at the time the
  68   68   * call-back is called.
  69   69   *
  70   70   * NOTE: The call-back function 'func' is never called if tcp is in
  71   71   *      the TCPS_CLOSED state.
  72   72   *
  73   73   * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout()
  74   74   * request. Locks acquired by the call-back routine should not be held across
  75   75   * the call to tcp_timeout_cancel() or a deadlock may result.
  76   76   *
  77   77   * tcp_timeout_cancel() returns -1 if the timeout request is invalid.
  78   78   * Otherwise, it returns an integer value greater than or equal to 0.
  79   79   *
  80   80   * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
  81   81   *      within squeue context corresponding to the tcp instance. Since the
  82   82   *      call-back is also called via the same squeue, there are no race
  83   83   *      conditions described in untimeout(9F) manual page since all calls are
  84   84   *      strictly serialized.
  85   85   *
  86   86   *      TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
  87   87   *      stored in tcp_timer_tid and starts a new one using
  88   88   *      MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back
  89   89   *      and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid
  90   90   *      field.
  91   91   *
  92   92   * IMPLEMENTATION:
  93   93   *
  94   94   * TCP timers are implemented using a three-stage process. The call to
  95   95   * tcp_timeout() uses timeout(9F) function to call tcp_timer_callback() function
  96   96   * when the timer expires. The tcp_timer_callback() arranges the call of the
  97   97   * tcp_timer_handler() function via squeue corresponding to the tcp
  98   98   * instance. The tcp_timer_handler() calls actual requested timeout call-back
  99   99   * and passes tcp instance as an argument to it. Information is passed between
 100  100   * stages using the tcp_timer_t structure which contains the connp pointer, the
 101  101   * tcp call-back to call and the timeout id returned by the timeout(9F).
 102  102   *
 103  103   * The tcp_timer_t structure is not used directly, it is embedded in an mblk_t -
 104  104   * like structure that is used to enter an squeue. The mp->b_rptr of this pseudo
 105  105   * mblk points to the beginning of tcp_timer_t structure. The tcp_timeout()
 106  106   * returns the pointer to this mblk.
 107  107   *
 108  108   * The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It
 109  109   * looks like a normal mblk without actual dblk attached to it.
 110  110   *
 111  111   * To optimize performance each tcp instance holds a small cache of timer
 112  112   * mblocks. In the current implementation it caches up to two timer mblocks per
 113  113   * tcp instance. The cache is preserved over tcp frees and is only freed when
 114  114   * the whole tcp structure is destroyed by its kmem destructor. Since all tcp
 115  115   * timer processing happens on a corresponding squeue, the cache manipulation
 116  116   * does not require any locks. Experiments show that most timer mblock
 117  117   * allocations are satisfied from the tcp cache and do not involve kmem calls.
 118  118   *
 119  119   * The tcp_timeout() places a refhold on the connp instance which guarantees
 120  120   * that it will be present at the time the call-back function fires. The
 121  121   * tcp_timer_handler() drops the reference after calling the call-back, so the
 122  122   * call-back function does not need to manipulate the references explicitly.
 123  123   */
 124  124  
 125  125  kmem_cache_t *tcp_timercache;
 126  126  
 127  127  static void     tcp_ip_notify(tcp_t *);
 128  128  static void     tcp_timer_callback(void *);
 129  129  static void     tcp_timer_free(tcp_t *, mblk_t *);
 130  130  static void     tcp_timer_handler(void *, mblk_t *, void *, ip_recv_attr_t *);
 131  131  
 132  132  /*
 133  133   * tim is in millisec.
 134  134   */
 135  135  timeout_id_t
 136  136  tcp_timeout(conn_t *connp, void (*f)(void *), hrtime_t tim)
 137  137  {
 138  138          mblk_t *mp;
 139  139          tcp_timer_t *tcpt;
 140  140          tcp_t *tcp = connp->conn_tcp;
 141  141  
 142  142          ASSERT(connp->conn_sqp != NULL);
 143  143  
 144  144          TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_calls);
 145  145  
 146  146          if (tcp->tcp_timercache == NULL) {
 147  147                  mp = tcp_timermp_alloc(KM_NOSLEEP | KM_PANIC);
 148  148          } else {
 149  149                  TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_cached_alloc);
 150  150                  mp = tcp->tcp_timercache;
 151  151                  tcp->tcp_timercache = mp->b_next;
 152  152                  mp->b_next = NULL;
 153  153                  ASSERT(mp->b_wptr == NULL);
 154  154          }
 155  155  
 156  156          CONN_INC_REF(connp);
 157  157          tcpt = (tcp_timer_t *)mp->b_rptr;
 158  158          tcpt->connp = connp;
 159  159          tcpt->tcpt_proc = f;
 160  160          /*
 161  161           * TCP timers are normal timeouts. Plus, they do not require more than
 162  162           * a 10 millisecond resolution. By choosing a coarser resolution and by
 163  163           * rounding up the expiration to the next resolution boundary, we can
 164  164           * batch timers in the callout subsystem to make TCP timers more
 165  165           * efficient. The roundup also protects short timers from expiring too
 166  166           * early before they have a chance to be cancelled.
 167  167           */
 168  168          tcpt->tcpt_tid = timeout_generic(CALLOUT_NORMAL, tcp_timer_callback, mp,
 169  169              tim * MICROSEC, CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
 170  170          VERIFY(!(tcpt->tcpt_tid & CALLOUT_ID_FREE));
 171  171  
 172  172          return ((timeout_id_t)mp);
 173  173  }
 174  174  
 175  175  static void
 176  176  tcp_timer_callback(void *arg)
 177  177  {
 178  178          mblk_t *mp = (mblk_t *)arg;
 179  179          tcp_timer_t *tcpt;
 180  180          conn_t  *connp;
 181  181  
 182  182          tcpt = (tcp_timer_t *)mp->b_rptr;
 183  183          connp = tcpt->connp;
 184  184          SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timer_handler, connp,
 185  185              NULL, SQ_FILL, SQTAG_TCP_TIMER);
 186  186  }
 187  187  
 188  188  /* ARGSUSED */
 189  189  static void
 190  190  tcp_timer_handler(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
 191  191  {
 192  192          tcp_timer_t *tcpt;
 193  193          conn_t *connp = (conn_t *)arg;
 194  194          tcp_t *tcp = connp->conn_tcp;
 195  195  
 196  196          tcpt = (tcp_timer_t *)mp->b_rptr;
 197  197          ASSERT(connp == tcpt->connp);
 198  198          ASSERT((squeue_t *)arg2 == connp->conn_sqp);
 199  199  
 200  200          if (tcpt->tcpt_tid & CALLOUT_ID_FREE) {
 201  201                  /*
 202  202                   * This timeout was cancelled after it was enqueued to the
 203  203                   * squeue; free the timer and return.
 204  204                   */
 205  205                  tcp_timer_free(connp->conn_tcp, mp);
 206  206                  return;
 207  207          }
 208  208  
 209  209          /*
 210  210           * If the TCP has reached the closed state, don't proceed any
 211  211           * further. This TCP logically does not exist on the system.
 212  212           * tcpt_proc could for example access queues, that have already
 213  213           * been qprocoff'ed off.
 214  214           */
 215  215          if (tcp->tcp_state != TCPS_CLOSED) {
 216  216                  (*tcpt->tcpt_proc)(connp);
 217  217          } else {
 218  218                  tcp->tcp_timer_tid = 0;
 219  219          }
 220  220  
 221  221          tcp_timer_free(connp->conn_tcp, mp);
 222  222  }
 223  223  
 224  224  /*
 225  225   * There is a potential race with untimeout and the handler firing at the same
 226  226   * time. The mblock may be freed by the handler while we are trying to use
 227  227   * it. But since both should execute on the same squeue, this race should not
 228  228   * occur.
 229  229   */
 230  230  clock_t
 231  231  tcp_timeout_cancel(conn_t *connp, timeout_id_t id)
 232  232  {
 233  233          mblk_t  *mp = (mblk_t *)id;
 234  234          tcp_timer_t *tcpt;
 235  235          clock_t delta;
 236  236  
 237  237          TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_cancel_reqs);
 238  238  
 239  239          if (mp == NULL)
 240  240                  return (-1);
 241  241  
 242  242          tcpt = (tcp_timer_t *)mp->b_rptr;
 243  243          ASSERT(tcpt->connp == connp);
 244  244  
 245  245          delta = untimeout_default(tcpt->tcpt_tid, 0);
 246  246  
 247  247          if (delta >= 0) {
 248  248                  TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_canceled);
 249  249                  tcp_timer_free(connp->conn_tcp, mp);
 250  250                  CONN_DEC_REF(connp);
 251  251          } else {
 252  252                  /*
 253  253                   * If we were unable to untimeout successfully, it has already
 254  254                   * been enqueued on the squeue; mark the ID with the free
 255  255                   * bit.  This bit can never be set in a valid identifier, and
 256  256                   * we'll use it to prevent the timeout from being executed.
 257  257                   * And note that we're within the squeue perimeter here, so
 258  258                   * we don't need to worry about racing with timer handling
 259  259                   * (which also executes within the perimeter).
 260  260                   */
 261  261                  tcpt->tcpt_tid |= CALLOUT_ID_FREE;
 262  262                  delta = 0;
 263  263          }
 264  264  
 265  265          return (TICK_TO_MSEC(delta));
 266  266  }
 267  267  
 268  268  /*
 269  269   * Allocate space for the timer event. The allocation looks like mblk, but it is
 270  270   * not a proper mblk. To avoid confusion we set b_wptr to NULL.
 271  271   *
 272  272   * Dealing with failures: If we can't allocate from the timer cache we try
 273  273   * allocating from dblock caches using allocb_tryhard(). In this case b_wptr
 274  274   * points to b_rptr.
 275  275   * If we can't allocate anything using allocb_tryhard(), we perform a last
 276  276   * attempt and use kmem_alloc_tryhard(). In this case we set b_wptr to -1 and
 277  277   * save the actual allocation size in b_datap.
 278  278   */
 279  279  mblk_t *
 280  280  tcp_timermp_alloc(int kmflags)
 281  281  {
 282  282          mblk_t *mp = (mblk_t *)kmem_cache_alloc(tcp_timercache,
 283  283              kmflags & ~KM_PANIC);
 284  284  
 285  285          if (mp != NULL) {
 286  286                  mp->b_next = mp->b_prev = NULL;
 287  287                  mp->b_rptr = (uchar_t *)(&mp[1]);
 288  288                  mp->b_wptr = NULL;
 289  289                  mp->b_datap = NULL;
 290  290                  mp->b_queue = NULL;
 291  291                  mp->b_cont = NULL;
 292  292          } else if (kmflags & KM_PANIC) {
 293  293                  /*
 294  294                   * Failed to allocate memory for the timer. Try allocating from
 295  295                   * dblock caches.
 296  296                   */
 297  297                  /* ipclassifier calls this from a constructor - hence no tcps */
 298  298                  TCP_G_STAT(tcp_timermp_allocfail);
 299  299                  mp = allocb_tryhard(sizeof (tcp_timer_t));
 300  300                  if (mp == NULL) {
 301  301                          size_t size = 0;
 302  302                          /*
 303  303                           * Memory is really low. Try tryhard allocation.
 304  304                           *
 305  305                           * ipclassifier calls this from a constructor -
 306  306                           * hence no tcps
 307  307                           */
 308  308                          TCP_G_STAT(tcp_timermp_allocdblfail);
 309  309                          mp = kmem_alloc_tryhard(sizeof (mblk_t) +
 310  310                              sizeof (tcp_timer_t), &size, kmflags);
 311  311                          mp->b_rptr = (uchar_t *)(&mp[1]);
 312  312                          mp->b_next = mp->b_prev = NULL;
 313  313                          mp->b_wptr = (uchar_t *)-1;
 314  314                          mp->b_datap = (dblk_t *)size;
 315  315                          mp->b_queue = NULL;
 316  316                          mp->b_cont = NULL;
 317  317                  }
 318  318                  ASSERT(mp->b_wptr != NULL);
 319  319          }
 320  320          /* ipclassifier calls this from a constructor - hence no tcps */
 321  321          TCP_G_DBGSTAT(tcp_timermp_alloced);
 322  322  
 323  323          return (mp);
 324  324  }
 325  325  
 326  326  /*
 327  327   * Free per-tcp timer cache.
 328  328   * It can only contain entries from tcp_timercache.
 329  329   */
 330  330  void
 331  331  tcp_timermp_free(tcp_t *tcp)
 332  332  {
 333  333          mblk_t *mp;
 334  334  
 335  335          while ((mp = tcp->tcp_timercache) != NULL) {
 336  336                  ASSERT(mp->b_wptr == NULL);
 337  337                  tcp->tcp_timercache = tcp->tcp_timercache->b_next;
 338  338                  kmem_cache_free(tcp_timercache, mp);
 339  339          }
 340  340  }
 341  341  
 342  342  /*
 343  343   * Free timer event. Put it on the per-tcp timer cache if there are not too
 344  344   * many events there already (currently at most two events are cached).
 345  345   * If the event is not allocated from the timer cache, free it right away.
 346  346   */
 347  347  static void
 348  348  tcp_timer_free(tcp_t *tcp, mblk_t *mp)
 349  349  {
 350  350          mblk_t *mp1 = tcp->tcp_timercache;
 351  351  
 352  352          if (mp->b_wptr != NULL) {
 353  353                  /*
 354  354                   * This allocation is not from a timer cache, free it right
 355  355                   * away.
 356  356                   */
 357  357                  if (mp->b_wptr != (uchar_t *)-1)
 358  358                          freeb(mp);
 359  359                  else
 360  360                          kmem_free(mp, (size_t)mp->b_datap);
 361  361          } else if (mp1 == NULL || mp1->b_next == NULL) {
 362  362                  /* Cache this timer block for future allocations */
 363  363                  mp->b_rptr = (uchar_t *)(&mp[1]);
 364  364                  mp->b_next = mp1;
 365  365                  tcp->tcp_timercache = mp;
 366  366          } else {
 367  367                  kmem_cache_free(tcp_timercache, mp);
 368  368                  TCP_DBGSTAT(tcp->tcp_tcps, tcp_timermp_freed);
 369  369          }
 370  370  }
 371  371  
 372  372  /*
 373  373   * Stop all TCP timers.
 374  374   */
 375  375  void
 376  376  tcp_timers_stop(tcp_t *tcp)
 377  377  {
 378  378          if (tcp->tcp_timer_tid != 0) {
 379  379                  (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
 380  380                  tcp->tcp_timer_tid = 0;
 381  381          }
 382  382          if (tcp->tcp_ka_tid != 0) {
 383  383                  (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid);
 384  384                  tcp->tcp_ka_tid = 0;
 385  385          }
 386  386          if (tcp->tcp_ack_tid != 0) {
 387  387                  (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
 388  388                  tcp->tcp_ack_tid = 0;
 389  389          }
 390  390          if (tcp->tcp_push_tid != 0) {
 391  391                  (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
 392  392                  tcp->tcp_push_tid = 0;
 393  393          }
 394  394          if (tcp->tcp_reass_tid != 0) {
 395  395                  (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_reass_tid);
 396  396                  tcp->tcp_reass_tid = 0;
 397  397          }
 398  398  }
 399  399  
 400  400  /*
 401  401   * Timer callback routine for keepalive probe.  We do a fake resend of
 402  402   * last ACKed byte.  Then set a timer using RTO.  When the timer expires,
 403  403   * check to see if we have heard anything from the other end for the last
 404  404   * RTO period.  If we have, set the timer to expire for another
 405  405   * tcp_ka_interval and check again.  If we have not, set a timer using
 406  406   * RTO << 1 and check again when it expires.  Keep exponentially increasing
 407  407   * the timeout if we have not heard from the other side.  If for more than
 408  408   * (tcp_ka_interval + tcp_ka_abort_thres) we have not heard anything,
 409  409   * kill the connection unless the keepalive abort threshold is 0.  In
 410  410   * that case, we will probe "forever."
 411  411   * If tcp_ka_cnt and tcp_ka_rinterval are non-zero, then we do not follow
 412  412   * the exponential backoff, but send probes tcp_ka_cnt times in regular
 413  413   * intervals of tcp_ka_rinterval milliseconds until we hear back from peer.
 414  414   * Kill the connection if we don't hear back from peer after tcp_ka_cnt
 415  415   * probes are sent.
 416  416   */
 417  417  void
 418  418  tcp_keepalive_timer(void *arg)
 419  419  {
 420  420          mblk_t  *mp;
 421  421          conn_t  *connp = (conn_t *)arg;
 422  422          tcp_t   *tcp = connp->conn_tcp;
 423  423          int32_t firetime;
 424  424          int32_t idletime;
 425  425          int32_t ka_intrvl;
 426  426          tcp_stack_t     *tcps = tcp->tcp_tcps;
 427  427  
 428  428          tcp->tcp_ka_tid = 0;
 429  429  
 430  430          if (tcp->tcp_fused)
 431  431                  return;
 432  432  
 433  433          TCPS_BUMP_MIB(tcps, tcpTimKeepalive);
 434  434          ka_intrvl = tcp->tcp_ka_interval;
 435  435  
 436  436          /*
 437  437           * Keepalive probe should only be sent if the application has not
 438  438           * done a close on the connection.
 439  439           */
 440  440          if (tcp->tcp_state > TCPS_CLOSE_WAIT) {
 441  441                  return;
 442  442          }
 443  443          /* Timer fired too early, restart it. */
 444  444          if (tcp->tcp_state < TCPS_ESTABLISHED) {
 445  445                  tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
 446  446                      ka_intrvl);
 447  447                  return;
 448  448          }
 449  449  
 450  450          idletime = TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time);
 451  451          /*
 452  452           * If we have not heard from the other side for a long
 453  453           * time, kill the connection unless the keepalive abort
 454  454           * threshold is 0.  In that case, we will probe "forever."
 455  455           */
 456  456          if (tcp->tcp_ka_abort_thres != 0 &&
 457  457              idletime > (ka_intrvl + tcp->tcp_ka_abort_thres)) {
 458  458                  TCPS_BUMP_MIB(tcps, tcpTimKeepaliveDrop);
 459  459                  (void) tcp_clean_death(tcp, tcp->tcp_client_errno ?
 460  460                      tcp->tcp_client_errno : ETIMEDOUT);
 461  461                  return;
 462  462          }
 463  463  
 464  464          if (tcp->tcp_snxt == tcp->tcp_suna &&
 465  465              idletime >= ka_intrvl) {
 466  466                  /* Fake resend of last ACKed byte. */
 467  467                  mblk_t  *mp1 = allocb(1, BPRI_LO);
 468  468  
 469  469                  if (mp1 != NULL) {
 470  470                          *mp1->b_wptr++ = '\0';
 471  471                          mp = tcp_xmit_mp(tcp, mp1, 1, NULL, NULL,
 472  472                              tcp->tcp_suna - 1, B_FALSE, NULL, B_TRUE);
 473  473                          freeb(mp1);
 474  474                          /*
 475  475                           * if allocation failed, fall through to start the
 476  476                           * timer back.
 477  477                           */
 478  478                          if (mp != NULL) {
 479  479                                  tcp_send_data(tcp, mp);
 480  480                                  TCPS_BUMP_MIB(tcps, tcpTimKeepaliveProbe);
 481  481                                  if (tcp->tcp_ka_rinterval) {
 482  482                                          firetime = tcp->tcp_ka_rinterval;
 483  483                                  } else if (tcp->tcp_ka_last_intrvl != 0) {
 484  484                                          int max;
 485  485                                          /*
 486  486                                           * We should probe again at least
 487  487                                           * in ka_intrvl, but not more than
 488  488                                           * tcp_rto_max.
 489  489                                           */
 490  490                                          max = tcp->tcp_rto_max;
 491  491                                          firetime = MIN(ka_intrvl - 1,
 492  492                                              tcp->tcp_ka_last_intrvl << 1);
 493  493                                          if (firetime > max)
 494  494                                                  firetime = max;
 495  495                                  } else {
 496  496                                          firetime = tcp->tcp_rto;
 497  497                                  }
 498  498                                  tcp->tcp_ka_tid = TCP_TIMER(tcp,
 499  499                                      tcp_keepalive_timer, firetime);
 500  500                                  tcp->tcp_ka_last_intrvl = firetime;
 501  501                                  return;
 502  502                          }
 503  503                  }
 504  504          } else {
 505  505                  tcp->tcp_ka_last_intrvl = 0;
 506  506          }
 507  507  
 508  508          /* firetime can be negative if (mp1 == NULL || mp == NULL) */
 509  509          if ((firetime = ka_intrvl - idletime) < 0) {
 510  510                  firetime = ka_intrvl;
 511  511          }
 512  512          tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, firetime);
 513  513  }
 514  514  
 515  515  void
 516  516  tcp_reass_timer(void *arg)
 517  517  {
 518  518          conn_t *connp = (conn_t *)arg;
 519  519          tcp_t *tcp = connp->conn_tcp;
 520  520  
 521  521          tcp->tcp_reass_tid = 0;
 522  522          if (tcp->tcp_reass_head == NULL)
 523  523                  return;
 524  524          ASSERT(tcp->tcp_reass_tail != NULL);
 525  525          if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
 526  526                  tcp_sack_remove(tcp->tcp_sack_list,
 527  527                      TCP_REASS_END(tcp->tcp_reass_tail), &tcp->tcp_num_sack_blk);
 528  528          }
 529  529          tcp_close_mpp(&tcp->tcp_reass_head);
 530  530          tcp->tcp_reass_tail = NULL;
 531  531          TCP_STAT(tcp->tcp_tcps, tcp_reass_timeout);
 532  532  }
 533  533  
 534  534  /* This function handles the push timeout. */
 535  535  void
 536  536  tcp_push_timer(void *arg)
 537  537  {
 538  538          conn_t  *connp = (conn_t *)arg;
 539  539          tcp_t *tcp = connp->conn_tcp;
 540  540  
 541  541          TCP_DBGSTAT(tcp->tcp_tcps, tcp_push_timer_cnt);
 542  542  
 543  543          ASSERT(tcp->tcp_listener == NULL);
 544  544  
 545  545          ASSERT(!IPCL_IS_NONSTR(connp));
 546  546  
 547  547          tcp->tcp_push_tid = 0;
 548  548  
 549  549          if (tcp->tcp_rcv_list != NULL &&
 550  550              tcp_rcv_drain(tcp) == TH_ACK_NEEDED)
 551  551                  tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
 552  552  }
 553  553  
 554  554  /*
 555  555   * This function handles delayed ACK timeout.
 556  556   */
 557  557  void
 558  558  tcp_ack_timer(void *arg)
 559  559  {
 560  560          conn_t  *connp = (conn_t *)arg;
 561  561          tcp_t *tcp = connp->conn_tcp;
 562  562          mblk_t *mp;
 563  563          tcp_stack_t     *tcps = tcp->tcp_tcps;
 564  564  
 565  565          TCP_DBGSTAT(tcps, tcp_ack_timer_cnt);
 566  566  
 567  567          tcp->tcp_ack_tid = 0;
 568  568  
 569  569          if (tcp->tcp_fused)
 570  570                  return;
 571  571  
 572  572          /*
 573  573           * Do not send ACK if there is no outstanding unack'ed data.
 574  574           */
 575  575          if (tcp->tcp_rnxt == tcp->tcp_rack) {
 576  576                  return;
 577  577          }
 578  578  
 579  579          if ((tcp->tcp_rnxt - tcp->tcp_rack) > tcp->tcp_mss) {
 580  580                  /*
 581  581                   * Make sure we don't allow deferred ACKs to result in
 582  582                   * timer-based ACKing.  If we have held off an ACK
 583  583                   * when there was more than an mss here, and the timer
 584  584                   * goes off, we have to worry about the possibility
 585  585                   * that the sender isn't doing slow-start, or is out
 586  586                   * of step with us for some other reason.  We fall
 587  587                   * permanently back in the direction of
 588  588                   * ACK-every-other-packet as suggested in RFC 1122.
 589  589                   */
 590  590                  if (tcp->tcp_rack_abs_max > 2)
 591  591                          tcp->tcp_rack_abs_max--;
 592  592                  tcp->tcp_rack_cur_max = 2;
 593  593          }
 594  594          mp = tcp_ack_mp(tcp);
 595  595  
 596  596          if (mp != NULL) {
 597  597                  BUMP_LOCAL(tcp->tcp_obsegs);
 598  598                  TCPS_BUMP_MIB(tcps, tcpOutAck);
 599  599                  TCPS_BUMP_MIB(tcps, tcpOutAckDelayed);
 600  600                  tcp_send_data(tcp, mp);
 601  601          }
 602  602  }
 603  603  
 604  604  /*
 605  605   * Notify IP that we are having trouble with this connection.  IP should
 606  606   * make note so it can potentially use a different IRE.
 607  607   */
 608  608  static void
 609  609  tcp_ip_notify(tcp_t *tcp)
 610  610  {
 611  611          conn_t          *connp = tcp->tcp_connp;
 612  612          ire_t           *ire;
 613  613  
 614  614          /*
 615  615           * Note: in the case of source routing we want to blow away the
 616  616           * route to the first source route hop.
 617  617           */
 618  618          ire = connp->conn_ixa->ixa_ire;
 619  619          if (ire != NULL && !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
 620  620                  if (ire->ire_ipversion == IPV4_VERSION) {
 621  621                          /*
 622  622                           * As per RFC 1122, we send an RTM_LOSING to inform
 623  623                           * routing protocols.
 624  624                           */
 625  625                          ip_rts_change(RTM_LOSING, ire->ire_addr,
 626  626                              ire->ire_gateway_addr, ire->ire_mask,
 627  627                              connp->conn_laddr_v4,  0, 0, 0,
 628  628                              (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA),
 629  629                              ire->ire_ipst);
 630  630                  }
 631  631                  (void) ire_no_good(ire);
 632  632          }
 633  633  }
 634  634  
 635  635  /*
 636  636   * tcp_timer is the timer service routine.  It handles the retransmission,
 637  637   * FIN_WAIT_2 flush, and zero window probe timeout events.  It figures out
 638  638   * from the state of the tcp instance what kind of action needs to be done
 639  639   * at the time it is called.
 640  640   */
 641  641  void
 642  642  tcp_timer(void *arg)
 643  643  {
 644  644          mblk_t          *mp;
 645  645          clock_t         first_threshold;
 646  646          clock_t         second_threshold;
 647  647          clock_t         ms;
 648  648          uint32_t        mss;
 649  649          conn_t          *connp = (conn_t *)arg;
 650  650          tcp_t           *tcp = connp->conn_tcp;
 651  651          tcp_stack_t     *tcps = tcp->tcp_tcps;
 652  652          boolean_t       dont_timeout = B_FALSE;
 653  653  
 654  654          tcp->tcp_timer_tid = 0;
 655  655  
 656  656          if (tcp->tcp_fused)
 657  657                  return;
 658  658  
 659  659          first_threshold =  tcp->tcp_first_timer_threshold;
 660  660          second_threshold = tcp->tcp_second_timer_threshold;
 661  661          switch (tcp->tcp_state) {
 662  662          case TCPS_IDLE:
 663  663          case TCPS_BOUND:
 664  664          case TCPS_LISTEN:
 665  665                  return;
 666  666          case TCPS_SYN_RCVD: {
 667  667                  tcp_t   *listener = tcp->tcp_listener;
 668  668  
 669  669                  if (tcp->tcp_syn_rcvd_timeout == 0 && (listener != NULL)) {
 670  670                          /* it's our first timeout */
 671  671                          tcp->tcp_syn_rcvd_timeout = 1;
 672  672                          mutex_enter(&listener->tcp_eager_lock);
 673  673                          listener->tcp_syn_rcvd_timeout++;
 674  674                          if (!tcp->tcp_dontdrop && !tcp->tcp_closemp_used) {
 675  675                                  /*
 676  676                                   * Make this eager available for drop if we
 677  677                                   * need to drop one to accomodate a new
 678  678                                   * incoming SYN request.
 679  679                                   */
 680  680                                  MAKE_DROPPABLE(listener, tcp);
 681  681                          }
 682  682                          if (!listener->tcp_syn_defense &&
 683  683                              (listener->tcp_syn_rcvd_timeout >
 684  684                              (tcps->tcps_conn_req_max_q0 >> 2)) &&
 685  685                              (tcps->tcps_conn_req_max_q0 > 200)) {
 686  686                                  /* We may be under attack. Put on a defense. */
 687  687                                  listener->tcp_syn_defense = B_TRUE;
 688  688                                  cmn_err(CE_WARN, "High TCP connect timeout "
 689  689                                      "rate! System (port %d) may be under a "
 690  690                                      "SYN flood attack!",
 691  691                                      ntohs(listener->tcp_connp->conn_lport));
 692  692  
 693  693                                  listener->tcp_ip_addr_cache = kmem_zalloc(
 694  694                                      IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t),
 695  695                                      KM_NOSLEEP);
 696  696                          }
 697  697                          mutex_exit(&listener->tcp_eager_lock);
 698  698                  } else if (listener != NULL) {
 699  699                          mutex_enter(&listener->tcp_eager_lock);
 700  700                          tcp->tcp_syn_rcvd_timeout++;
 701  701                          if (tcp->tcp_syn_rcvd_timeout > 1 &&
 702  702                              !tcp->tcp_closemp_used) {
 703  703                                  /*
 704  704                                   * This is our second timeout. Put the tcp in
 705  705                                   * the list of droppable eagers to allow it to
 706  706                                   * be dropped, if needed. We don't check
 707  707                                   * whether tcp_dontdrop is set or not to
 708  708                                   * protect ourselve from a SYN attack where a
 709  709                                   * remote host can spoof itself as one of the
 710  710                                   * good IP source and continue to hold
 711  711                                   * resources too long.
 712  712                                   */
 713  713                                  MAKE_DROPPABLE(listener, tcp);
 714  714                          }
 715  715                          mutex_exit(&listener->tcp_eager_lock);
 716  716                  }
 717  717          }
 718  718                  /* FALLTHRU */
 719  719          case TCPS_SYN_SENT:
 720  720                  first_threshold =  tcp->tcp_first_ctimer_threshold;
 721  721                  second_threshold = tcp->tcp_second_ctimer_threshold;
 722  722  
 723  723                  /*
 724  724                   * If an app has set the second_threshold to 0, it means that
 725  725                   * we need to retransmit forever, unless this is a passive
 726  726                   * open.  We need to set second_threshold back to a normal
 727  727                   * value such that later comparison with it still makes
 728  728                   * sense.  But we set dont_timeout to B_TRUE so that we will
 729  729                   * never time out.
 730  730                   */
 731  731                  if (second_threshold == 0) {
 732  732                          second_threshold = tcps->tcps_ip_abort_linterval;
 733  733                          if (tcp->tcp_active_open)
 734  734                                  dont_timeout = B_TRUE;
 735  735                  }
 736  736                  break;
 737  737          case TCPS_ESTABLISHED:
 738  738          case TCPS_CLOSE_WAIT:
 739  739                  /*
 740  740                   * If the end point has not been closed, TCP can retransmit
 741  741                   * forever.  But if the end point is closed, the normal
 742  742                   * timeout applies.
 743  743                   */

↓ open down ↓

707 lines elided

↑ open up ↑

 744  744                  if (second_threshold == 0) {
 745  745                          second_threshold = tcps->tcps_ip_abort_linterval;
 746  746                          dont_timeout = B_TRUE;
 747  747                  }
 748  748                  /* FALLTHRU */
 749  749          case TCPS_FIN_WAIT_1:
 750  750          case TCPS_CLOSING:
 751  751          case TCPS_LAST_ACK:
 752  752                  /* If we have data to rexmit */
 753  753                  if (tcp->tcp_suna != tcp->tcp_snxt) {
 754      -                        clock_t time_to_wait;
      754 +                        clock_t time_to_wait;
 755  755  
 756  756                          TCPS_BUMP_MIB(tcps, tcpTimRetrans);
 757  757                          if (!tcp->tcp_xmit_head)
 758  758                                  break;
 759      -                        time_to_wait = ddi_get_lbolt() -
 760      -                            (clock_t)tcp->tcp_xmit_head->b_prev;
 761      -                        time_to_wait = tcp->tcp_rto -
 762      -                            TICK_TO_MSEC(time_to_wait);
      759 +                        time_to_wait = NSEC2MSEC(gethrtime() -
      760 +                            (hrtime_t)(intptr_t)tcp->tcp_xmit_head->b_prev);
      761 +                        time_to_wait = tcp->tcp_rto - time_to_wait;
 763  762                          /*
 764  763                           * If the timer fires too early, 1 clock tick earlier,
 765  764                           * restart the timer.
 766  765                           */
 767  766                          if (time_to_wait > msec_per_tick) {
 768  767                                  TCP_STAT(tcps, tcp_timer_fire_early);
 769  768                                  TCP_TIMER_RESTART(tcp, time_to_wait);
 770  769                                  return;
 771  770                          }
 772  771                          /*

 773  772                           * When we probe zero windows, we force the swnd open.
 774  773                           * If our peer acks with a closed window swnd will be
 775  774                           * set to zero by tcp_rput(). As long as we are
 776  775                           * receiving acks tcp_rput will
 777  776                           * reset 'tcp_ms_we_have_waited' so as not to trip the
 778  777                           * first and second interval actions.  NOTE: the timer
 779  778                           * interval is allowed to continue its exponential
 780  779                           * backoff.
 781  780                           */
 782  781                          if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) {
 783  782                                  if (connp->conn_debug) {
 784  783                                          (void) strlog(TCP_MOD_ID, 0, 1,
 785  784                                              SL_TRACE, "tcp_timer: zero win");
 786  785                                  }
 787  786                          } else {
 788  787                                  /*
 789  788                                   * After retransmission, we need to do
 790  789                                   * slow start.  Set the ssthresh to one
 791  790                                   * half of current effective window and
 792  791                                   * cwnd to one MSS.  Also reset
 793  792                                   * tcp_cwnd_cnt.
 794  793                                   *
 795  794                                   * Note that if tcp_ssthresh is reduced because
 796  795                                   * of ECN, do not reduce it again unless it is
 797  796                                   * already one window of data away (tcp_cwr
 798  797                                   * should then be cleared) or this is a
 799  798                                   * timeout for a retransmitted segment.
 800  799                                   */
 801  800                                  uint32_t npkt;
 802  801  
 803  802                                  if (!tcp->tcp_cwr || tcp->tcp_rexmit) {
 804  803                                          npkt = ((tcp->tcp_timer_backoff ?
 805  804                                              tcp->tcp_cwnd_ssthresh :
 806  805                                              tcp->tcp_snxt -
 807  806                                              tcp->tcp_suna) >> 1) / tcp->tcp_mss;
 808  807                                          tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
 809  808                                              tcp->tcp_mss;
 810  809                                  }
 811  810                                  tcp->tcp_cwnd = tcp->tcp_mss;
 812  811                                  tcp->tcp_cwnd_cnt = 0;
 813  812                                  if (tcp->tcp_ecn_ok) {
 814  813                                          tcp->tcp_cwr = B_TRUE;
 815  814                                          tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
 816  815                                          tcp->tcp_ecn_cwr_sent = B_FALSE;
 817  816                                  }
 818  817                          }
 819  818                          break;
 820  819                  }
 821  820                  /*
 822  821                   * We have something to send yet we cannot send.  The
 823  822                   * reason can be:
 824  823                   *
 825  824                   * 1. Zero send window: we need to do zero window probe.
 826  825                   * 2. Zero cwnd: because of ECN, we need to "clock out
 827  826                   * segments.
 828  827                   * 3. SWS avoidance: receiver may have shrunk window,
 829  828                   * reset our knowledge.
 830  829                   *
 831  830                   * Note that condition 2 can happen with either 1 or
 832  831                   * 3.  But 1 and 3 are exclusive.
 833  832                   */
 834  833                  if (tcp->tcp_unsent != 0) {
 835  834                          /*
 836  835                           * Should not hold the zero-copy messages for too long.
 837  836                           */
 838  837                          if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
 839  838                                  tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
 840  839                                      tcp->tcp_xmit_head, B_TRUE);
 841  840  
 842  841                          if (tcp->tcp_cwnd == 0) {
 843  842                                  /*
 844  843                                   * Set tcp_cwnd to 1 MSS so that a
 845  844                                   * new segment can be sent out.  We
 846  845                                   * are "clocking out" new data when
 847  846                                   * the network is really congested.
 848  847                                   */
 849  848                                  ASSERT(tcp->tcp_ecn_ok);
 850  849                                  tcp->tcp_cwnd = tcp->tcp_mss;
 851  850                          }
 852  851                          if (tcp->tcp_swnd == 0) {
 853  852                                  /* Extend window for zero window probe */
 854  853                                  tcp->tcp_swnd++;
 855  854                                  tcp->tcp_zero_win_probe = B_TRUE;
 856  855                                  TCPS_BUMP_MIB(tcps, tcpOutWinProbe);
 857  856                          } else {
 858  857                                  /*
 859  858                                   * Handle timeout from sender SWS avoidance.
 860  859                                   * Reset our knowledge of the max send window
 861  860                                   * since the receiver might have reduced its
 862  861                                   * receive buffer.  Avoid setting tcp_max_swnd
 863  862                                   * to one since that will essentially disable
 864  863                                   * the SWS checks.
 865  864                                   *
 866  865                                   * Note that since we don't have a SWS
 867  866                                   * state variable, if the timeout is set
 868  867                                   * for ECN but not for SWS, this
 869  868                                   * code will also be executed.  This is
 870  869                                   * fine as tcp_max_swnd is updated
 871  870                                   * constantly and it will not affect
 872  871                                   * anything.
 873  872                                   */
 874  873                                  tcp->tcp_max_swnd = MAX(tcp->tcp_swnd, 2);
 875  874                          }
 876  875                          tcp_wput_data(tcp, NULL, B_FALSE);
 877  876                          return;
 878  877                  }
 879  878                  /* Is there a FIN that needs to be to re retransmitted? */
 880  879                  if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
 881  880                      !tcp->tcp_fin_acked)
 882  881                          break;
 883  882                  /* Nothing to do, return without restarting timer. */
 884  883                  TCP_STAT(tcps, tcp_timer_fire_miss);
 885  884                  return;
 886  885          case TCPS_FIN_WAIT_2:
 887  886                  /*
 888  887                   * User closed the TCP endpoint and peer ACK'ed our FIN.
 889  888                   * We waited some time for for peer's FIN, but it hasn't
 890  889                   * arrived.  We flush the connection now to avoid
 891  890                   * case where the peer has rebooted.
 892  891                   */
 893  892                  if (TCP_IS_DETACHED(tcp)) {
 894  893                          (void) tcp_clean_death(tcp, 0);
 895  894                  } else {
 896  895                          TCP_TIMER_RESTART(tcp,
 897  896                              tcp->tcp_fin_wait_2_flush_interval);
 898  897                  }
 899  898                  return;
 900  899          case TCPS_TIME_WAIT:
 901  900                  (void) tcp_clean_death(tcp, 0);
 902  901                  return;
 903  902          default:
 904  903                  if (connp->conn_debug) {
 905  904                          (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
 906  905                              "tcp_timer: strange state (%d) %s",
 907  906                              tcp->tcp_state, tcp_display(tcp, NULL,
 908  907                              DISP_PORT_ONLY));
 909  908                  }
 910  909                  return;
 911  910          }
 912  911  
 913  912          /*
 914  913           * If the system is under memory pressure or the max number of
 915  914           * connections have been established for the listener, be more
 916  915           * aggressive in aborting connections.
 917  916           */
 918  917          if (tcps->tcps_reclaim || (tcp->tcp_listen_cnt != NULL &&
 919  918              tcp->tcp_listen_cnt->tlc_cnt > tcp->tcp_listen_cnt->tlc_max)) {
 920  919                  second_threshold = tcp_early_abort * SECONDS;
 921  920  
 922  921                  /* We will ignore the never timeout promise in this case... */
 923  922                  dont_timeout = B_FALSE;
 924  923          }
 925  924  
 926  925          ASSERT(second_threshold != 0);
 927  926  
 928  927          if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) {
 929  928                  /*
 930  929                   * Should not hold the zero-copy messages for too long.
 931  930                   */
 932  931                  if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
 933  932                          tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
 934  933                              tcp->tcp_xmit_head, B_TRUE);
 935  934  
 936  935                  if (dont_timeout) {
 937  936                          /*
 938  937                           * Reset tcp_ms_we_have_waited to avoid overflow since
 939  938                           * we are going to retransmit forever.
 940  939                           */
 941  940                          tcp->tcp_ms_we_have_waited = second_threshold;
 942  941                          goto timer_rexmit;
 943  942                  }
 944  943  
 945  944                  /*
 946  945                   * For zero window probe, we need to send indefinitely,
 947  946                   * unless we have not heard from the other side for some
 948  947                   * time...
 949  948                   */
 950  949                  if ((tcp->tcp_zero_win_probe == 0) ||
 951  950                      (TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time) >
 952  951                      second_threshold)) {
 953  952                          TCPS_BUMP_MIB(tcps, tcpTimRetransDrop);
 954  953                          /*
 955  954                           * If TCP is in SYN_RCVD state, send back a
 956  955                           * RST|ACK as BSD does.  Note that tcp_zero_win_probe
 957  956                           * should be zero in TCPS_SYN_RCVD state.
 958  957                           */
 959  958                          if (tcp->tcp_state == TCPS_SYN_RCVD) {
 960  959                                  tcp_xmit_ctl("tcp_timer: RST sent on timeout "
 961  960                                      "in SYN_RCVD",
 962  961                                      tcp, tcp->tcp_snxt,
 963  962                                      tcp->tcp_rnxt, TH_RST | TH_ACK);
 964  963                          }
 965  964                          (void) tcp_clean_death(tcp,
 966  965                              tcp->tcp_client_errno ?
 967  966                              tcp->tcp_client_errno : ETIMEDOUT);
 968  967                          return;
 969  968                  } else {
 970  969                          /*
 971  970                           * If the system is under memory pressure, we also
 972  971                           * abort connection in zero window probing.
 973  972                           */
 974  973                          if (tcps->tcps_reclaim) {
 975  974                                  (void) tcp_clean_death(tcp,
 976  975                                      tcp->tcp_client_errno ?
 977  976                                      tcp->tcp_client_errno : ETIMEDOUT);
 978  977                                  TCP_STAT(tcps, tcp_zwin_mem_drop);
 979  978                                  return;
 980  979                          }
 981  980                          /*
 982  981                           * Set tcp_ms_we_have_waited to second_threshold
 983  982                           * so that in next timeout, we will do the above
 984  983                           * check (ddi_get_lbolt() - tcp_last_recv_time).
 985  984                           * This is also to avoid overflow.
 986  985                           *
 987  986                           * We don't need to decrement tcp_timer_backoff
 988  987                           * to avoid overflow because it will be decremented
 989  988                           * later if new timeout value is greater than
 990  989                           * tcp_rto_max.  In the case when tcp_rto_max is
 991  990                           * greater than second_threshold, it means that we
 992  991                           * will wait longer than second_threshold to send
 993  992                           * the next
 994  993                           * window probe.
 995  994                           */
 996  995                          tcp->tcp_ms_we_have_waited = second_threshold;
 997  996                  }
 998  997          } else if (ms > first_threshold) {
 999  998                  /*
1000  999                   * Should not hold the zero-copy messages for too long.
1001 1000                   */
1002 1001                  if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
1003 1002                          tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
1004 1003                              tcp->tcp_xmit_head, B_TRUE);

↓ open down ↓

232 lines elided

↑ open up ↑

1005 1004  
1006 1005                  /*
1007 1006                   * We have been retransmitting for too long...  The RTT
1008 1007                   * we calculated is probably incorrect.  Reinitialize it.
1009 1008                   * Need to compensate for 0 tcp_rtt_sa.  Reset
1010 1009                   * tcp_rtt_update so that we won't accidentally cache a
1011 1010                   * bad value.  But only do this if this is not a zero
1012 1011                   * window probe.
1013 1012                   */
1014 1013                  if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) {
1015      -                        tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
1016      -                            (tcp->tcp_rtt_sa >> 5);
     1014 +                        tcp->tcp_rtt_sd += tcp->tcp_rtt_sa >> 3 +
     1015 +                            tcp->tcp_rtt_sa >> 5;
1017 1016                          tcp->tcp_rtt_sa = 0;
1018 1017                          tcp_ip_notify(tcp);
1019 1018                          tcp->tcp_rtt_update = 0;
1020 1019                  }
1021 1020          }
1022 1021  
1023 1022  timer_rexmit:
1024 1023          tcp->tcp_timer_backoff++;
1025      -        if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
1026      -            tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) <
1027      -            tcp->tcp_rto_min) {
1028      -                /*
1029      -                 * This means the original RTO is tcp_rexmit_interval_min.
1030      -                 * So we will use tcp_rexmit_interval_min as the RTO value
1031      -                 * and do the backoff.
1032      -                 */
1033      -                ms = tcp->tcp_rto_min << tcp->tcp_timer_backoff;
1034      -        } else {
1035      -                ms <<= tcp->tcp_timer_backoff;
1036      -        }
     1024 +        /*
     1025 +         * Calculate the backed off retransmission timeout. If the shift brings
     1026 +         * us back over the max, then we repin the value, and decrement the
     1027 +         * backoff to avoid overflow.
     1028 +         */
     1029 +        ms = tcp_calculate_rto(tcp, tcps, 0) << tcp->tcp_timer_backoff;
1037 1030          if (ms > tcp->tcp_rto_max) {
1038 1031                  ms = tcp->tcp_rto_max;
1039      -                /*
1040      -                 * ms is at max, decrement tcp_timer_backoff to avoid
1041      -                 * overflow.
1042      -                 */
1043 1032                  tcp->tcp_timer_backoff--;
1044 1033          }
1045 1034          tcp->tcp_ms_we_have_waited += ms;
1046 1035          if (tcp->tcp_zero_win_probe == 0) {
1047 1036                  tcp->tcp_rto = ms;
1048 1037          }
1049 1038          TCP_TIMER_RESTART(tcp, ms);
1050 1039          /*
1051 1040           * This is after a timeout and tcp_rto is backed off.  Set
1052 1041           * tcp_set_timer to 1 so that next time RTO is updated, we will
1053 1042           * restart the timer with a correct value.
1054 1043           */
1055 1044          tcp->tcp_set_timer = 1;
1056 1045          mss = tcp->tcp_snxt - tcp->tcp_suna;
1057 1046          if (mss > tcp->tcp_mss)
1058 1047                  mss = tcp->tcp_mss;
1059 1048          if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0)
1060 1049                  mss = tcp->tcp_swnd;
1061 1050  
1062      -        if ((mp = tcp->tcp_xmit_head) != NULL)
1063      -                mp->b_prev = (mblk_t *)ddi_get_lbolt();
     1051 +        if ((mp = tcp->tcp_xmit_head) != NULL) {
     1052 +                mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
     1053 +        }
1064 1054          mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss,
1065 1055              B_TRUE);
1066 1056  
1067 1057          /*
1068 1058           * When slow start after retransmission begins, start with
1069 1059           * this seq no.  tcp_rexmit_max marks the end of special slow
1070 1060           * start phase.
1071 1061           */
1072 1062          tcp->tcp_rexmit_nxt = tcp->tcp_suna;
1073 1063          if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&

1074 1064              (tcp->tcp_unsent == 0)) {
1075 1065                  tcp->tcp_rexmit_max = tcp->tcp_fss;
1076 1066          } else {
1077 1067                  tcp->tcp_rexmit_max = tcp->tcp_snxt;
1078 1068          }
1079 1069          tcp->tcp_rexmit = B_TRUE;
1080 1070          tcp->tcp_dupack_cnt = 0;
1081 1071  
1082 1072          /*
1083 1073           * Remove all rexmit SACK blk to start from fresh.
1084 1074           */
1085 1075          if (tcp->tcp_snd_sack_ok)
1086 1076                  TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp);
1087 1077          if (mp == NULL) {
1088 1078                  return;
1089 1079          }
1090 1080  
1091 1081          tcp->tcp_csuna = tcp->tcp_snxt;
1092 1082          TCPS_BUMP_MIB(tcps, tcpRetransSegs);
1093 1083          TCPS_UPDATE_MIB(tcps, tcpRetransBytes, mss);
1094 1084          tcp_send_data(tcp, mp);
1095 1085  
1096 1086  }
1097 1087  
1098 1088  /*
1099 1089   * Handle lingering timeouts. This function is called when the SO_LINGER timeout
1100 1090   * expires.
1101 1091   */
1102 1092  void
1103 1093  tcp_close_linger_timeout(void *arg)
1104 1094  {
1105 1095          conn_t  *connp = (conn_t *)arg;
1106 1096          tcp_t   *tcp = connp->conn_tcp;
1107 1097  
1108 1098          tcp->tcp_client_errno = ETIMEDOUT;
1109 1099          tcp_stop_lingering(tcp);
1110 1100  }

↓ open down ↓

37 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX