11547 Want connstat(1M) command to display per-connection TCP statistics
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Portions contributed by: Ahmed G <ahmedg@delphix.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
--- old/usr/src/uts/common/inet/tcp/tcp_time_wait.c
+++ new/usr/src/uts/common/inet/tcp/tcp_time_wait.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright 2016 Joyent, Inc.
25 25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 26 */
27 27
28 28 /*
29 29 * This file contains functions related to TCP time wait processing. Also
30 30 * refer to the time wait handling comments in tcp_impl.h.
31 31 */
32 32
33 33 #include <sys/types.h>
34 34 #include <sys/strsun.h>
35 35 #include <sys/squeue_impl.h>
36 36 #include <sys/squeue.h>
37 37 #include <sys/callo.h>
38 38
39 39 #include <inet/common.h>
40 40 #include <inet/ip.h>
41 41 #include <inet/tcp.h>
42 42 #include <inet/tcp_impl.h>
43 43 #include <inet/tcp_cluster.h>
44 44
45 45 static void tcp_time_wait_purge(tcp_t *, tcp_squeue_priv_t *);
46 46
47 47 #define TW_BUCKET(t) \
48 48 (((t) / MSEC_TO_TICK(TCP_TIME_WAIT_DELAY)) % TCP_TIME_WAIT_BUCKETS)
49 49
50 50 #define TW_BUCKET_NEXT(b) (((b) + 1) % TCP_TIME_WAIT_BUCKETS)
51 51
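/*
 * Worked example of the bucket arithmetic above (values assumed for
 * illustration, not the kernel's actual constants): with a granularity of
 * MSEC_TO_TICK(TCP_TIME_WAIT_DELAY) == 1000 ticks and
 * TCP_TIME_WAIT_BUCKETS == 16, an expiry timestamp of 4321 ticks maps to
 * TW_BUCKET(4321) == (4321 / 1000) % 16 == 4, the bucket covering
 * [4000, 5000).  TW_BUCKET_NEXT(4) == 5, wrapping back to 0 after bucket 15.
 */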
52 52
53 53 /*
54 54 * Remove a connection from the list of detached TIME_WAIT connections.
55 55 * It returns B_FALSE if it can't remove the connection from the list
56 56 * as the connection has already been removed from the list due to an
57 57 * earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE.
58 58 */
59 59 boolean_t
60 60 tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tsp)
61 61 {
62 62 boolean_t locked = B_FALSE;
63 63
64 64 if (tsp == NULL) {
65 65 tsp = *((tcp_squeue_priv_t **)
66 66 squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP));
67 67 mutex_enter(&tsp->tcp_time_wait_lock);
68 68 locked = B_TRUE;
69 69 } else {
70 70 ASSERT(MUTEX_HELD(&tsp->tcp_time_wait_lock));
71 71 }
72 72
73 73 /* 0 means that the tcp_t has not been added to the time wait list. */
74 74 if (tcp->tcp_time_wait_expire == 0) {
75 75 ASSERT(tcp->tcp_time_wait_next == NULL);
76 76 ASSERT(tcp->tcp_time_wait_prev == NULL);
77 77 if (locked)
78 78 mutex_exit(&tsp->tcp_time_wait_lock);
79 79 return (B_FALSE);
80 80 }
81 81 ASSERT(TCP_IS_DETACHED(tcp));
82 82 ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
83 83 ASSERT(tsp->tcp_time_wait_cnt > 0);
84 84
85 85 if (tcp->tcp_time_wait_next != NULL) {
86 86 tcp->tcp_time_wait_next->tcp_time_wait_prev =
87 87 tcp->tcp_time_wait_prev;
88 88 }
89 89 if (tcp->tcp_time_wait_prev != NULL) {
90 90 tcp->tcp_time_wait_prev->tcp_time_wait_next =
91 91 tcp->tcp_time_wait_next;
92 92 } else {
93 93 unsigned int bucket;
94 94
95 95 bucket = TW_BUCKET(tcp->tcp_time_wait_expire);
96 96 ASSERT(tsp->tcp_time_wait_bucket[bucket] == tcp);
97 97 tsp->tcp_time_wait_bucket[bucket] = tcp->tcp_time_wait_next;
98 98 }
99 99 tcp->tcp_time_wait_next = NULL;
100 100 tcp->tcp_time_wait_prev = NULL;
101 101 tcp->tcp_time_wait_expire = 0;
102 102 tsp->tcp_time_wait_cnt--;
103 103
104 104 if (locked)
105 105 mutex_exit(&tsp->tcp_time_wait_lock);
106 106 return (B_TRUE);
107 107 }
108 108
109 109 /* Constants used for fast checking of a localhost address */
110 110 #if defined(_BIG_ENDIAN)
111 111 #define IPv4_LOCALHOST 0x7f000000U
112 112 #define IPv4_LH_MASK 0xffffff00U
113 113 #else
114 114 #define IPv4_LOCALHOST 0x0000007fU
115 115 #define IPv4_LH_MASK 0x00ffffffU
116 116 #endif
117 117
118 118 #define IS_LOCAL_HOST(x) ( \
119 119 ((x)->tcp_connp->conn_ipversion == IPV4_VERSION && \
120 120 ((x)->tcp_connp->conn_laddr_v4 & IPv4_LH_MASK) == IPv4_LOCALHOST) || \
121 121 ((x)->tcp_connp->conn_ipversion == IPV6_VERSION && \
122 122 IN6_IS_ADDR_LOOPBACK(&(x)->tcp_connp->conn_laddr_v6)))
123 123
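For reference, a minimal user-space sketch of the same 127.0.0.0/8 test (illustrative only, not part of this change). The kernel macro above avoids a byte swap by comparing against byte-order-specific constants; this sketch simply normalizes the address with ntohl() instead.

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

/* Is this IPv4 address (network byte order) a loopback address? */
static int
is_ipv4_loopback(uint32_t addr_nbo)
{
	return ((ntohl(addr_nbo) & 0xff000000U) == 0x7f000000U);
}

int
main(void)
{
	struct in_addr a;

	(void) inet_pton(AF_INET, "127.0.0.1", &a);
	(void) printf("%d\n", is_ipv4_loopback(a.s_addr));	/* prints 1 */
	(void) inet_pton(AF_INET, "10.0.0.1", &a);
	(void) printf("%d\n", is_ipv4_loopback(a.s_addr));	/* prints 0 */
	return (0);
}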
124 124
125 125 /*
126 126 * Add a connection to the list of detached TIME_WAIT connections
127 127 * and set its time to expire.
128 128 */
129 129 void
130 130 tcp_time_wait_append(tcp_t *tcp)
131 131 {
132 132 tcp_stack_t *tcps = tcp->tcp_tcps;
133 133 squeue_t *sqp = tcp->tcp_connp->conn_sqp;
134 134 tcp_squeue_priv_t *tsp =
135 135 *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
136 136 int64_t now, schedule;
137 137 unsigned int bucket;
138 138
139 139 tcp_timers_stop(tcp);
140 140
141 141 /* Freed above */
142 142 ASSERT(tcp->tcp_timer_tid == 0);
143 143 ASSERT(tcp->tcp_ack_tid == 0);
144 144
145 145 /* must have happened at the time of detaching the tcp */
146 146 ASSERT(TCP_IS_DETACHED(tcp));
147 147 ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
148 148 ASSERT(tcp->tcp_ptpahn == NULL);
149 149 ASSERT(tcp->tcp_flow_stopped == 0);
150 150 ASSERT(tcp->tcp_time_wait_next == NULL);
151 151 ASSERT(tcp->tcp_time_wait_prev == NULL);
152 152 ASSERT(tcp->tcp_time_wait_expire == 0);
153 153 ASSERT(tcp->tcp_listener == NULL);
154 154
155 155 TCP_DBGSTAT(tcps, tcp_time_wait);
156 156 mutex_enter(&tsp->tcp_time_wait_lock);
157 157
158 158 /*
159 159 * Immediately expire loopback connections. Since there is no worry
160 160 * about packets on the local host showing up after a long network
161 161 * delay, this is safe and allows much higher rates of connection churn
162 162 * for applications operating locally.
163 163 *
164 164 * This typically bypasses the tcp_free_list fast path due to squeue
165 165 * re-entry for the loopback close operation.
166 166 */
167 167 if (tcp->tcp_loopback) {
168 168 tcp_time_wait_purge(tcp, tsp);
169 169 mutex_exit(&tsp->tcp_time_wait_lock);
170 170 return;
171 171 }
172 172
173 173 /*
174 174 * In order to reap TIME_WAITs reliably, we should use a source of time
175 175 * that is not adjustable by the user. While it would be more accurate
176 176 * to grab this timestamp before (potentially) sleeping on the
177 177 * tcp_time_wait_lock, doing so complicates bucket addressing later.
178 178 */
179 179 now = ddi_get_lbolt64();
180 180
181 181 /*
182 182 * Each squeue uses an arbitrary time offset when scheduling
183 183 * expiration timers. This prevents the bucketing from forcing
184 184	 * tcp_time_wait_collector to run in lockstep across squeues.
185 185 *
186 186 * This offset is (re)initialized when a new TIME_WAIT connection is
187 187 * added to an squeue which has no connections waiting to expire.
188 188 */
189 189 if (tsp->tcp_time_wait_tid == 0) {
190 190 ASSERT(tsp->tcp_time_wait_cnt == 0);
191 191 tsp->tcp_time_wait_offset =
192 192 now % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
193 193 }
194 194 now -= tsp->tcp_time_wait_offset;
195 195
196 196 /*
197 197 * Use the netstack-defined timeout, rounded up to the minimum
198 198 * time_wait_collector interval.
199 199 */
200 200 schedule = now + MSEC_TO_TICK(tcps->tcps_time_wait_interval);
201 201 tcp->tcp_time_wait_expire = schedule;
202 202
203 203 /*
204 204 * Append the connection into the appropriate bucket.
205 205 */
206 206 bucket = TW_BUCKET(tcp->tcp_time_wait_expire);
207 207 tcp->tcp_time_wait_next = tsp->tcp_time_wait_bucket[bucket];
208 208 tsp->tcp_time_wait_bucket[bucket] = tcp;
209 209 if (tcp->tcp_time_wait_next != NULL) {
210 210 ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == NULL);
211 211 tcp->tcp_time_wait_next->tcp_time_wait_prev = tcp;
212 212 }
213 213 tsp->tcp_time_wait_cnt++;
214 214
215 215 /*
216 216 * Round delay up to the nearest bucket boundary.
217 217 */
218 218 schedule += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
219 219 schedule -= schedule % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
220 220
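	/*
	 * Worked example (assumed granularity of 1000 ticks): for an expiry
	 * of 4321, the two lines above compute 4321 + 1000 == 5321 and then
	 * 5321 - (5321 % 1000) == 5000, aiming the callout at the first
	 * bucket boundary after the connection expires.
	 */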
221 221 /*
222 222 * The newly inserted entry may require a tighter schedule for the
223 223 * expiration timer.
224 224 */
225 225 if (schedule < tsp->tcp_time_wait_schedule) {
226 226 callout_id_t old_tid = tsp->tcp_time_wait_tid;
227 227
228 228 tsp->tcp_time_wait_schedule = schedule;
229 229 tsp->tcp_time_wait_tid =
230 230 timeout_generic(CALLOUT_NORMAL,
231 231 tcp_time_wait_collector, sqp,
232 232 TICK_TO_NSEC(schedule - now),
233 233 CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
234 234
235 235 /*
236 236 * It is possible for the timer to fire before the untimeout
237 237 * action is able to complete. In that case, the exclusion
238 238 * offered by the tcp_time_wait_collector_active flag will
239 239 * prevent multiple collector threads from processing records
240 240 * simultaneously from the same squeue.
241 241 */
242 242 mutex_exit(&tsp->tcp_time_wait_lock);
243 243 (void) untimeout_default(old_tid, 0);
244 244 return;
245 245 }
246 246
247 247 /*
248 248 * Start a fresh timer if none exists.
249 249 */
250 250 if (tsp->tcp_time_wait_schedule == 0) {
251 251 ASSERT(tsp->tcp_time_wait_tid == 0);
252 252
253 253 tsp->tcp_time_wait_schedule = schedule;
254 254 tsp->tcp_time_wait_tid =
255 255 timeout_generic(CALLOUT_NORMAL,
256 256 tcp_time_wait_collector, sqp,
257 257 TICK_TO_NSEC(schedule - now),
258 258 CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
259 259 }
260 260 mutex_exit(&tsp->tcp_time_wait_lock);
261 261 }
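For context, here is a minimal user-space sketch of the timing-wheel insertion performed above. The structure names, bucket count, and granularity are assumptions for illustration only, not the kernel's types or values.

#include <stddef.h>
#include <stdio.h>

#define	WHEEL_BUCKETS	16	/* assumed bucket count */
#define	BUCKET_TICKS	100	/* assumed per-bucket granularity */

typedef struct twconn {
	struct twconn	*next;
	struct twconn	*prev;
	long		expire;		/* absolute expiry, in ticks */
} twconn_t;

static twconn_t *wheel[WHEEL_BUCKETS];

/* Map an absolute tick value to its bucket, as TW_BUCKET() does. */
static unsigned int
bucket_of(long ticks)
{
	return ((unsigned int)((ticks / BUCKET_TICKS) % WHEEL_BUCKETS));
}

/* Prepend a connection to the bucket covering its expiry time. */
static void
wheel_append(twconn_t *c, long now, long timeout)
{
	unsigned int b;

	c->expire = now + timeout;
	b = bucket_of(c->expire);
	c->prev = NULL;
	c->next = wheel[b];
	if (c->next != NULL)
		c->next->prev = c;
	wheel[b] = c;
}

int
main(void)
{
	twconn_t c;

	wheel_append(&c, 4321, 600);	/* expires at tick 4921 */
	(void) printf("bucket %u\n", bucket_of(c.expire));	/* bucket 1 */
	return (0);
}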
262 262
263 263 /*
264 264 * Wrapper to call tcp_close_detached() via squeue to clean up TIME-WAIT
265 265 * tcp_t. Used in tcp_time_wait_collector().
266 266 */
267 267 /* ARGSUSED */
268 268 static void
269 269 tcp_timewait_close(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
270 270 {
271 271 conn_t *connp = (conn_t *)arg;
272 272 tcp_t *tcp = connp->conn_tcp;
273 273
274 274 ASSERT(tcp != NULL);
275 275 if (tcp->tcp_state == TCPS_CLOSED) {
276 276 return;
277 277 }
278 278
279 279 ASSERT((connp->conn_family == AF_INET &&
280 280 connp->conn_ipversion == IPV4_VERSION) ||
281 281 (connp->conn_family == AF_INET6 &&
282 282 (connp->conn_ipversion == IPV4_VERSION ||
283 283 connp->conn_ipversion == IPV6_VERSION)));
284 284 ASSERT(!tcp->tcp_listener);
285 285
286 286 ASSERT(TCP_IS_DETACHED(tcp));
287 287
288 288 /*
289 289 * Because they have no upstream client to rebind or tcp_close()
290 290 * them later, we axe the connection here and now.
291 291 */
292 292 tcp_close_detached(tcp);
293 293 }
294 294
295 295
296 296 static void
297 297 tcp_time_wait_purge(tcp_t *tcp, tcp_squeue_priv_t *tsp)
298 298 {
299 299 mblk_t *mp;
300 300 conn_t *connp = tcp->tcp_connp;
301 301 kmutex_t *lock;
302 302
303 303 ASSERT(MUTEX_HELD(&tsp->tcp_time_wait_lock));
304 304 ASSERT(connp->conn_fanout != NULL);
305 305
306 306 lock = &connp->conn_fanout->connf_lock;
307 307
308 308 /*
309 309 * This is essentially a TIME_WAIT reclaim fast path optimization for
310 310	 * performance: the connection is checked under the fanout lock (so
311 311	 * that no one else can get access to the conn_t) to confirm that the
312 312	 * refcnt is 2 (one each for TCP and the classifier hash list). If that
313 313	 * is the case and clustering callbacks are not enabled, the conn can be
314 314	 * removed under the fanout lock, avoiding clean-up under the squeue.
315 315 *
316 316 * This optimization is forgone when clustering is enabled since the
317 317 * clustering callback must be made before setting the CONDEMNED flag
318 318	 * and after dropping all locks.
319 319 *
320 320 * See the comments in tcp_closei_local for additional information
321 321 * regarding the refcnt logic.
322 322 */
323 323 if (mutex_tryenter(lock)) {
324 324 mutex_enter(&connp->conn_lock);
325 325 if (connp->conn_ref == 2 && cl_inet_disconnect == NULL) {
326 326 ipcl_hash_remove_locked(connp, connp->conn_fanout);
327 327 /*
328 328			 * Set the CONDEMNED flag now so that the refcnt
329 329			 * cannot be increased by any walker.
330 330 */
331 331 connp->conn_state_flags |= CONN_CONDEMNED;
332 332 mutex_exit(&connp->conn_lock);
333 333 mutex_exit(lock);
334 334 if (tsp->tcp_free_list_cnt < tcp_free_list_max_cnt) {
335 335 /*
336 336 * Add to head of tcp_free_list
337 337 */
338 338 tcp_cleanup(tcp);
339 339 ASSERT(connp->conn_latch == NULL);
340 340 ASSERT(connp->conn_policy == NULL);
341 341 ASSERT(tcp->tcp_tcps == NULL);
342 342 ASSERT(connp->conn_netstack == NULL);
343 343
344 344 tcp->tcp_time_wait_next = tsp->tcp_free_list;
345 345 tcp->tcp_in_free_list = B_TRUE;
346 346 tsp->tcp_free_list = tcp;
347 347 tsp->tcp_free_list_cnt++;
348 348 } else {
349 349 /*
350 350 * Do not add to tcp_free_list
351 351 */
352 352 tcp_bind_hash_remove(tcp);
353 353 ixa_cleanup(tcp->tcp_connp->conn_ixa);
354 354 tcp_ipsec_cleanup(tcp);
355 355 CONN_DEC_REF(tcp->tcp_connp);
356 356 }
357 357
358 358 /*
359 359 * With the fast-path complete, we can bail.
360 360 */
361 361 return;
362 362 } else {
363 363 /*
364 364 * Fall back to slow path.
365 365 */
366 366 CONN_INC_REF_LOCKED(connp);
367 367 mutex_exit(&connp->conn_lock);
368 368 mutex_exit(lock);
369 369 }
370 370 } else {
371 371 CONN_INC_REF(connp);
372 372 }
373 373
374 374 /*
375 375 * We can reuse the closemp here since conn has detached (otherwise we
376 376 * wouldn't even be in time_wait list). It is safe to change
377 377 * tcp_closemp_used without taking a lock as no other thread can
378 378 * concurrently access it at this point in the connection lifecycle.
379 379 */
380 380 if (tcp->tcp_closemp.b_prev == NULL) {
381 381 tcp->tcp_closemp_used = B_TRUE;
382 382 } else {
383 383 cmn_err(CE_PANIC,
384 384 "tcp_timewait_collector: concurrent use of tcp_closemp: "
385 385 "connp %p tcp %p\n", (void *)connp, (void *)tcp);
386 386 }
387 387
388 388 TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
389 389 mp = &tcp->tcp_closemp;
390 390 mutex_exit(&tsp->tcp_time_wait_lock);
391 391 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timewait_close, connp, NULL,
392 392 SQ_FILL, SQTAG_TCP_TIMEWAIT);
393 393 mutex_enter(&tsp->tcp_time_wait_lock);
394 394 }
395 395
396 396 /*
397 397 * Purge any tcp_t instances associated with this squeue which have expired
398 398 * from the TIME_WAIT state.
399 399 */
400 400 void
401 401 tcp_time_wait_collector(void *arg)
402 402 {
403 403 tcp_t *tcp;
404 404 int64_t now, sched_active, sched_cur, sched_new;
405 405 unsigned int idx;
406 406
407 407 squeue_t *sqp = (squeue_t *)arg;
408 408 tcp_squeue_priv_t *tsp =
409 409 *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
410 410
411 411 mutex_enter(&tsp->tcp_time_wait_lock);
412 412
413 413 /*
414 414 * Because of timer scheduling complexity and the fact that the
415 415 * tcp_time_wait_lock is dropped during tcp_time_wait_purge, it is
416 416 * possible for multiple tcp_time_wait_collector threads to run against
417 417 * the same squeue. This flag is used to exclude other collectors from
418 418 * the squeue during execution.
419 419 */
420 420 if (tsp->tcp_time_wait_collector_active) {
421 421 mutex_exit(&tsp->tcp_time_wait_lock);
422 422 return;
423 423 }
424 424 tsp->tcp_time_wait_collector_active = B_TRUE;
425 425
426 426 /*
427 427 * After its assignment here, the value of sched_active must not be
428 428 * altered as it is used to validate the state of the
429 429 * tcp_time_wait_collector callout schedule for this squeue.
430 430 *
431 431 * The same does not hold true of sched_cur, which holds the timestamp
432 432 * of the bucket undergoing processing. While it is initially equal to
433 433 * sched_active, certain conditions below can walk it forward,
434 434 * triggering the retry loop.
435 435 */
436 436 sched_cur = sched_active = tsp->tcp_time_wait_schedule;
437 437
438 438 /*
439 439 * Purge the free list if necessary
440 440 */
441 441 if (tsp->tcp_free_list != NULL) {
442 442 TCP_G_STAT(tcp_freelist_cleanup);
443 443 while ((tcp = tsp->tcp_free_list) != NULL) {
444 444 tsp->tcp_free_list = tcp->tcp_time_wait_next;
445 445 tcp->tcp_time_wait_next = NULL;
446 446 tsp->tcp_free_list_cnt--;
447 447 ASSERT(tcp->tcp_tcps == NULL);
448 448 CONN_DEC_REF(tcp->tcp_connp);
449 449 }
450 450 ASSERT(tsp->tcp_free_list_cnt == 0);
451 451 }
452 452
453 453 /*
454 454 * If there are no connections pending, clear timer-related state to be
455 455 * reinitialized by the next caller.
456 456 */
457 457 if (tsp->tcp_time_wait_cnt == 0) {
458 458 tsp->tcp_time_wait_offset = 0;
459 459 tsp->tcp_time_wait_schedule = 0;
460 460 tsp->tcp_time_wait_tid = 0;
461 461 tsp->tcp_time_wait_collector_active = B_FALSE;
462 462 mutex_exit(&tsp->tcp_time_wait_lock);
463 463 return;
464 464 }
465 465
466 466 retry:
467 467 /*
468 468 * Grab the bucket which we were scheduled to cleanse.
469 469 */
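	/*
	 * Illustrative example (assumed granularity of 1000 ticks): when
	 * sched_cur == 5000, TW_BUCKET(sched_cur - 1) == TW_BUCKET(4999)
	 * selects the bucket covering [4000, 5000), whose entire interval
	 * has expired by the time this timer fires.  TW_BUCKET(sched_cur)
	 * would instead select the still-active [5000, 6000) bucket.
	 */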
470 470 idx = TW_BUCKET(sched_cur - 1);
471 471 now = ddi_get_lbolt64() - tsp->tcp_time_wait_offset;
472 472 tcp = tsp->tcp_time_wait_bucket[idx];
473 473
474 474 while (tcp != NULL) {
475 475 /*
476 476 * Since the bucket count is sized to prevent wrap-around
477 477		 * during typical operation and timers are scheduled to process
478 478 * buckets with only expired connections, there is only one
479 479 * reason to encounter a connection expiring in the future:
480 480 * The tcp_time_wait_collector thread has been so delayed in
481 481 * its processing that connections have wrapped around the
482 482 * timing wheel into this bucket.
483 483 *
484 484		 * In that case, the remaining entries in the bucket can be
485 485 * ignored since, being appended sequentially, they should all
486 486 * expire in the future.
487 487 */
488 488 if (now < tcp->tcp_time_wait_expire) {
489 489 break;
490 490 }
491 491
492 492 /*
493 493 * Pull the connection out of the bucket.
494 494 */
495 495 VERIFY(tcp_time_wait_remove(tcp, tsp));
496 496
497 497 /*
498 498 * Purge the connection.
499 499 *
500 500 * While tcp_time_wait_lock will be temporarily dropped as part
501 501 * of the process, there is no risk of the timer being
502 502 * (re)scheduled while the collector is running since a value
503 503 * corresponding to the past is left in tcp_time_wait_schedule.
504 504 */
505 505 tcp_time_wait_purge(tcp, tsp);
506 506
507 507 /*
508 508 * Because tcp_time_wait_remove clears the tcp_time_wait_next
509 509 * field, the next item must be grabbed directly from the
510 510 * bucket itself.
511 511 */
512 512 tcp = tsp->tcp_time_wait_bucket[idx];
513 513 }
514 514
515 515 if (tsp->tcp_time_wait_cnt == 0) {
516 516 /*
517 517		 * There is no need for the collector to schedule a new
518 518 * timer if no pending items remain. The timer state can be
519 519 * cleared only if it was untouched while the collector dropped
520 520 * its locks during tcp_time_wait_purge.
521 521 */
522 522 if (tsp->tcp_time_wait_schedule == sched_active) {
523 523 tsp->tcp_time_wait_offset = 0;
524 524 tsp->tcp_time_wait_schedule = 0;
525 525 tsp->tcp_time_wait_tid = 0;
526 526 }
527 527 tsp->tcp_time_wait_collector_active = B_FALSE;
528 528 mutex_exit(&tsp->tcp_time_wait_lock);
529 529 return;
530 530 } else {
531 531 unsigned int nidx;
532 532
533 533 /*
534 534 * Locate the next bucket containing entries.
535 535 */
536 536 sched_new = sched_cur + MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
537 537 nidx = TW_BUCKET_NEXT(idx);
538 538 while (tsp->tcp_time_wait_bucket[nidx] == NULL) {
539 539 if (nidx == idx) {
540 540 break;
541 541 }
542 542 nidx = TW_BUCKET_NEXT(nidx);
543 543 sched_new += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
544 544 }
545 545 ASSERT(tsp->tcp_time_wait_bucket[nidx] != NULL);
546 546 }
547 547
548 548 /*
549 549 * It is possible that the system is under such dire load that between
550 550 * the timer scheduling and TIME_WAIT processing delay, execution
551 551 * overran the interval allocated to this bucket.
552 552 */
553 553 now = ddi_get_lbolt64() - tsp->tcp_time_wait_offset;
554 554 if (sched_new <= now) {
555 555 /*
556 556 * Attempt to right the situation by immediately performing a
557 557 * purge on the next bucket. This loop will continue as needed
558 558 * until the schedule can be pushed out ahead of the clock.
559 559 */
560 560 sched_cur = sched_new;
561 561 DTRACE_PROBE3(tcp__time__wait__overrun,
562 562 tcp_squeue_priv_t *, tsp, int64_t, sched_new, int64_t, now);
563 563 goto retry;
564 564 }
565 565
566 566 /*
567 567 * Another thread may have snuck in to reschedule the timer while locks
568 568 * were dropped during tcp_time_wait_purge. Defer to the running timer
569 569 * if that is the case.
570 570 */
571 571 if (tsp->tcp_time_wait_schedule != sched_active) {
572 572 tsp->tcp_time_wait_collector_active = B_FALSE;
573 573 mutex_exit(&tsp->tcp_time_wait_lock);
574 574 return;
575 575 }
576 576
577 577 /*
578 578 * Schedule the next timer.
579 579 */
580 580 tsp->tcp_time_wait_schedule = sched_new;
581 581 tsp->tcp_time_wait_tid =
582 582 timeout_generic(CALLOUT_NORMAL,
583 583 tcp_time_wait_collector, sqp,
584 584 TICK_TO_NSEC(sched_new - now),
585 585 CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
586 586 tsp->tcp_time_wait_collector_active = B_FALSE;
587 587 mutex_exit(&tsp->tcp_time_wait_lock);
588 588 }
589 589
590 590 /*
591 591 * tcp_time_wait_processing() handles processing of incoming packets when
592 592 * the tcp_t is in the TIME_WAIT state.
593 593 *
594 594 * A TIME_WAIT tcp_t that has an associated open TCP end point (not in
595 595 * detached state) is never put on the time wait list.
596 596 */
597 597 void
598 598 tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq,
599 599 uint32_t seg_ack, int seg_len, tcpha_t *tcpha, ip_recv_attr_t *ira)
600 600 {
601 601 int32_t bytes_acked;
602 602 int32_t gap;
603 603 int32_t rgap;
604 604 tcp_opt_t tcpopt;
605 605 uint_t flags;
606 606 uint32_t new_swnd = 0;
607 607 conn_t *nconnp;
608 608 conn_t *connp = tcp->tcp_connp;
609 609 tcp_stack_t *tcps = tcp->tcp_tcps;
610 610
611 - BUMP_LOCAL(tcp->tcp_ibsegs);
611 + TCPS_BUMP_MIB(tcps, tcpHCInSegs);
612 612 DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);
613 613
614 614 flags = (unsigned int)tcpha->tha_flags & 0xFF;
615 615 new_swnd = ntohs(tcpha->tha_win) <<
616 616 ((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws);
617 617
618 618 boolean_t keepalive = (seg_len == 0 || seg_len == 1) &&
619 619 (seg_seq + 1 == tcp->tcp_rnxt);
620 620 if (tcp->tcp_snd_ts_ok && !(flags & TH_RST) && !keepalive) {
621 621 int options;
622 622 if (tcp->tcp_snd_sack_ok)
623 623 tcpopt.tcp = tcp;
624 624 else
625 625 tcpopt.tcp = NULL;
626 626 options = tcp_parse_options(tcpha, &tcpopt);
627 627 if (!(options & TCP_OPT_TSTAMP_PRESENT)) {
628 628 DTRACE_TCP1(droppedtimestamp, tcp_t *, tcp);
629 629 goto done;
630 630 } else if (!tcp_paws_check(tcp, &tcpopt)) {
631 631 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt,
632 632 TH_ACK);
633 633 goto done;
634 634 }
635 635 }
636 636 gap = seg_seq - tcp->tcp_rnxt;
637 637 rgap = tcp->tcp_rwnd - (gap + seg_len);
638 638 if (gap < 0) {
639 639 TCPS_BUMP_MIB(tcps, tcpInDataDupSegs);
640 640 TCPS_UPDATE_MIB(tcps, tcpInDataDupBytes,
641 641 (seg_len > -gap ? -gap : seg_len));
642 642 seg_len += gap;
643 643 if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) {
644 644 if (flags & TH_RST) {
645 645 goto done;
646 646 }
647 647 if ((flags & TH_FIN) && seg_len == -1) {
648 648 /*
649 649 * When TCP receives a duplicate FIN in
650 650 * TIME_WAIT state, restart the 2 MSL timer.
651 651 * See page 73 in RFC 793. Make sure this TCP
652 652 * is already on the TIME_WAIT list. If not,
653 653 * just restart the timer.
654 654 */
655 655 if (TCP_IS_DETACHED(tcp)) {
656 656 if (tcp_time_wait_remove(tcp, NULL) ==
657 657 B_TRUE) {
658 658 tcp_time_wait_append(tcp);
659 659 TCP_DBGSTAT(tcps,
660 660 tcp_rput_time_wait);
661 661 }
662 662 } else {
663 663 ASSERT(tcp != NULL);
664 664 TCP_TIMER_RESTART(tcp,
665 665 tcps->tcps_time_wait_interval);
666 666 }
667 667 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
668 668 tcp->tcp_rnxt, TH_ACK);
669 669 goto done;
670 670 }
671 671 flags |= TH_ACK_NEEDED;
672 672 seg_len = 0;
673 673 goto process_ack;
674 674 }
675 675
676 676 /* Fix seg_seq, and chew the gap off the front. */
677 677 seg_seq = tcp->tcp_rnxt;
678 678 }
679 679
680 680 if ((flags & TH_SYN) && gap > 0 && rgap < 0) {
681 681 /*
682 682		 * Make sure that when we accept the connection, we pick
683 683 * an ISS greater than (tcp_snxt + tcp_iss_incr/2) for the
684 684 * old connection.
685 685 *
686 686 * The next ISS generated is equal to tcp_iss_incr_extra
687 687 * + tcp_iss_incr/2 + other components depending on the
688 688 * value of tcp_strong_iss. We pre-calculate the new
689 689 * ISS here and compare with tcp_snxt to determine if
690 690 * we need to make adjustment to tcp_iss_incr_extra.
691 691 *
692 692 * The above calculation is ugly and is a
693 693 * waste of CPU cycles...
694 694 */
695 695 uint32_t new_iss = tcps->tcps_iss_incr_extra;
696 696 int32_t adj;
697 697 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
698 698
699 699 switch (tcps->tcps_strong_iss) {
700 700 case 2: {
701 701 /* Add time and MD5 components. */
702 702 uint32_t answer[4];
703 703 struct {
704 704 uint32_t ports;
705 705 in6_addr_t src;
706 706 in6_addr_t dst;
707 707 } arg;
708 708 MD5_CTX context;
709 709
710 710 mutex_enter(&tcps->tcps_iss_key_lock);
711 711 context = tcps->tcps_iss_key;
712 712 mutex_exit(&tcps->tcps_iss_key_lock);
713 713 arg.ports = connp->conn_ports;
714 714 /* We use MAPPED addresses in tcp_iss_init */
715 715 arg.src = connp->conn_laddr_v6;
716 716 arg.dst = connp->conn_faddr_v6;
717 717 MD5Update(&context, (uchar_t *)&arg,
718 718 sizeof (arg));
719 719 MD5Final((uchar_t *)answer, &context);
720 720 answer[0] ^= answer[1] ^ answer[2] ^ answer[3];
721 721 new_iss += (gethrtime() >> ISS_NSEC_SHT) + answer[0];
722 722 break;
723 723 }
724 724 case 1:
725 725 /* Add time component and min random (i.e. 1). */
726 726 new_iss += (gethrtime() >> ISS_NSEC_SHT) + 1;
727 727 break;
728 728 default:
729 729 /* Add only time component. */
730 730 new_iss += (uint32_t)gethrestime_sec() *
731 731 tcps->tcps_iss_incr;
732 732 break;
733 733 }
734 734 if ((adj = (int32_t)(tcp->tcp_snxt - new_iss)) > 0) {
735 735 /*
736 736 * New ISS not guaranteed to be tcp_iss_incr/2
737 737 * ahead of the current tcp_snxt, so add the
738 738 * difference to tcp_iss_incr_extra.
739 739 */
740 740 tcps->tcps_iss_incr_extra += adj;
741 741 }
742 742 /*
743 743 * If tcp_clean_death() can not perform the task now,
744 744 * drop the SYN packet and let the other side re-xmit.
745 745 * Otherwise pass the SYN packet back in, since the
746 746 * old tcp state has been cleaned up or freed.
747 747 */
748 748 if (tcp_clean_death(tcp, 0) == -1)
749 749 goto done;
750 750 nconnp = ipcl_classify(mp, ira, ipst);
751 751 if (nconnp != NULL) {
752 752 TCP_STAT(tcps, tcp_time_wait_syn_success);
753 753 /* Drops ref on nconnp */
754 754 tcp_reinput(nconnp, mp, ira, ipst);
755 755 return;
756 756 }
757 757 goto done;
758 758 }
759 759
760 760 /*
761 761	 * rgap is how much of the receive window remains after this segment.
762 762	 * A negative value means that many bytes arrived beyond the window.
763 763 */
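	/*
	 * Worked example with assumed numbers: if tcp_rnxt == 1000,
	 * tcp_rwnd == 500, seg_seq == 1200 and seg_len == 400, then
	 * gap == 200 and rgap == 500 - (200 + 400) == -100; the final 100
	 * bytes of the segment lie beyond the window, and seg_len is
	 * trimmed to 300 below.
	 */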
764 764 if (rgap < 0) {
765 765 TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs);
766 766 TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap);
767 767 /* Fix seg_len and make sure there is something left. */
768 768 seg_len += rgap;
769 769 if (seg_len <= 0) {
770 770 if (flags & TH_RST) {
771 771 goto done;
772 772 }
773 773 flags |= TH_ACK_NEEDED;
774 774 seg_len = 0;
775 775 goto process_ack;
776 776 }
777 777 }
778 778 /*
779 779 * Check whether we can update tcp_ts_recent. This test is from RFC
780 780 * 7323, section 5.3.
781 781 */
782 782 if (tcp->tcp_snd_ts_ok && !(flags & TH_RST) &&
783 783 TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) &&
784 784 SEQ_LEQ(seg_seq, tcp->tcp_rack)) {
785 785 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val;
786 786 tcp->tcp_last_rcv_lbolt = ddi_get_lbolt64();
787 787 }
788 788
789 789 if (seg_seq != tcp->tcp_rnxt && seg_len > 0) {
790 790 /* Always ack out of order packets */
791 791 flags |= TH_ACK_NEEDED;
792 792 seg_len = 0;
793 793 } else if (seg_len > 0) {
794 794 TCPS_BUMP_MIB(tcps, tcpInClosed);
795 795 TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
796 796 TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len);
797 + tcp->tcp_cs.tcp_in_data_inorder_segs++;
798 + tcp->tcp_cs.tcp_in_data_inorder_bytes += seg_len;
797 799 }
798 800 if (flags & TH_RST) {
799 801 (void) tcp_clean_death(tcp, 0);
800 802 goto done;
801 803 }
802 804 if (flags & TH_SYN) {
803 805 tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1,
804 806 TH_RST|TH_ACK);
805 807 /*
806 808 * Do not delete the TCP structure if it is in
807 809 * TIME_WAIT state. Refer to RFC 1122, 4.2.2.13.
808 810 */
809 811 goto done;
810 812 }
811 813 process_ack:
812 814 if (flags & TH_ACK) {
813 815 bytes_acked = (int)(seg_ack - tcp->tcp_suna);
814 816 if (bytes_acked <= 0) {
815 817 if (bytes_acked == 0 && seg_len == 0 &&
816 818 new_swnd == tcp->tcp_swnd)
817 819 TCPS_BUMP_MIB(tcps, tcpInDupAck);
818 820 } else {
819 821 /* Acks something not sent */
820 822 flags |= TH_ACK_NEEDED;
821 823 }
822 824 }
823 825 if (flags & TH_ACK_NEEDED) {
824 826 /*
825 827 * Time to send an ack for some reason.
826 828 */
827 829 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
828 830 tcp->tcp_rnxt, TH_ACK);
829 831 }
830 832 done:
831 833 freemsg(mp);
832 834 }