/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2016 Joyent, Inc.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*
 * This file contains functions related to TCP time wait processing.  Also
 * refer to the time wait handling comments in tcp_impl.h.
 */

#include <sys/types.h>
#include <sys/strsun.h>
#include <sys/squeue_impl.h>
#include <sys/squeue.h>
#include <sys/callo.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/tcp.h>
#include <inet/tcp_impl.h>
#include <inet/tcp_cluster.h>

static void tcp_time_wait_purge(tcp_t *, tcp_squeue_priv_t *);

#define TW_BUCKET(t)                                    \
        (((t) / MSEC_TO_TICK(TCP_TIME_WAIT_DELAY)) % TCP_TIME_WAIT_BUCKETS)

#define TW_BUCKET_NEXT(b)       (((b) + 1) % TCP_TIME_WAIT_BUCKETS)
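
/*
 * The two macros above implement a simple timing wheel: an expiry time (in
 * squeue-adjusted lbolt ticks) is divided by the bucket granularity
 * (TCP_TIME_WAIT_DELAY, converted to ticks) and taken modulo
 * TCP_TIME_WAIT_BUCKETS to yield an index.  Using hypothetical values for
 * illustration only: with a granularity of 10 ticks and 4 buckets, expiry
 * times 0-9 map to bucket 0, 10-19 to bucket 1, 20-29 to bucket 2, 30-39 to
 * bucket 3, and 40-49 wrap around to bucket 0 again.  TW_BUCKET_NEXT() steps
 * through the buckets with the same wrap-around.
 */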


/*
 * Remove a connection from the list of detached TIME_WAIT connections.
 * Returns B_FALSE if the connection is not on the list, i.e. it was already
 * removed by an earlier call to tcp_time_wait_remove(); otherwise it
 * returns B_TRUE.
 */
boolean_t
tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tsp)
{
        boolean_t       locked = B_FALSE;

        if (tsp == NULL) {
                tsp = *((tcp_squeue_priv_t **)
                    squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP));
                mutex_enter(&tsp->tcp_time_wait_lock);
                locked = B_TRUE;
        } else {
                ASSERT(MUTEX_HELD(&tsp->tcp_time_wait_lock));
        }

        /* 0 means that the tcp_t has not been added to the time wait list. */
        if (tcp->tcp_time_wait_expire == 0) {
                ASSERT(tcp->tcp_time_wait_next == NULL);
                ASSERT(tcp->tcp_time_wait_prev == NULL);
                if (locked)
                        mutex_exit(&tsp->tcp_time_wait_lock);
                return (B_FALSE);
        }
        ASSERT(TCP_IS_DETACHED(tcp));
        ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
        ASSERT(tsp->tcp_time_wait_cnt > 0);

        if (tcp->tcp_time_wait_next != NULL) {
                tcp->tcp_time_wait_next->tcp_time_wait_prev =
                    tcp->tcp_time_wait_prev;
        }
        if (tcp->tcp_time_wait_prev != NULL) {
                tcp->tcp_time_wait_prev->tcp_time_wait_next =
                    tcp->tcp_time_wait_next;
        } else {
                unsigned int bucket;

                bucket = TW_BUCKET(tcp->tcp_time_wait_expire);
                ASSERT(tsp->tcp_time_wait_bucket[bucket] == tcp);
                tsp->tcp_time_wait_bucket[bucket] = tcp->tcp_time_wait_next;
        }
        tcp->tcp_time_wait_next = NULL;
        tcp->tcp_time_wait_prev = NULL;
        tcp->tcp_time_wait_expire = 0;
        tsp->tcp_time_wait_cnt--;

        if (locked)
                mutex_exit(&tsp->tcp_time_wait_lock);
        return (B_TRUE);
}

/* Constants used for fast checking of a localhost address */
#if defined(_BIG_ENDIAN)
#define IPv4_LOCALHOST  0x7f000000U
#define IPv4_LH_MASK    0xffffff00U
#else
#define IPv4_LOCALHOST  0x0000007fU
#define IPv4_LH_MASK    0x00ffffffU
#endif
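
/*
 * conn_laddr_v4 is stored in network byte order, hence the endian-specific
 * constants above.  Note that the mask/compare matches addresses of the
 * form 127.0.0.x (a /24), not all of 127/8; this covers the standard
 * loopback address.
 */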

#define IS_LOCAL_HOST(x)        ( \
        ((x)->tcp_connp->conn_ipversion == IPV4_VERSION && \
        ((x)->tcp_connp->conn_laddr_v4 & IPv4_LH_MASK) == IPv4_LOCALHOST) || \
        ((x)->tcp_connp->conn_ipversion == IPV6_VERSION && \
        IN6_IS_ADDR_LOOPBACK(&(x)->tcp_connp->conn_laddr_v6)))


/*
 * Add a connection to the list of detached TIME_WAIT connections
 * and set its time to expire.
 */
void
tcp_time_wait_append(tcp_t *tcp)
{
        tcp_stack_t     *tcps = tcp->tcp_tcps;
        squeue_t        *sqp = tcp->tcp_connp->conn_sqp;
        tcp_squeue_priv_t *tsp =
            *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
        int64_t         now, schedule;
        unsigned int    bucket;

        tcp_timers_stop(tcp);

        /* Freed above */
        ASSERT(tcp->tcp_timer_tid == 0);
        ASSERT(tcp->tcp_ack_tid == 0);

        /* must have happened at the time of detaching the tcp */
        ASSERT(TCP_IS_DETACHED(tcp));
        ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
        ASSERT(tcp->tcp_ptpahn == NULL);
        ASSERT(tcp->tcp_flow_stopped == 0);
        ASSERT(tcp->tcp_time_wait_next == NULL);
        ASSERT(tcp->tcp_time_wait_prev == NULL);
        ASSERT(tcp->tcp_time_wait_expire == 0);
        ASSERT(tcp->tcp_listener == NULL);

        TCP_DBGSTAT(tcps, tcp_time_wait);
        mutex_enter(&tsp->tcp_time_wait_lock);

        /*
         * Immediately expire loopback connections.  Since there is no worry
         * about packets on the local host showing up after a long network
         * delay, this is safe and allows much higher rates of connection churn
         * for applications operating locally.
         *
         * This typically bypasses the tcp_free_list fast path due to squeue
         * re-entry for the loopback close operation.
         */
        if (tcp->tcp_loopback) {
                tcp_time_wait_purge(tcp, tsp);
                mutex_exit(&tsp->tcp_time_wait_lock);
                return;
        }

        /*
         * In order to reap TIME_WAITs reliably, we should use a source of time
         * that is not adjustable by the user.  While it would be more accurate
         * to grab this timestamp before (potentially) sleeping on the
         * tcp_time_wait_lock, doing so complicates bucket addressing later.
         */
        now = ddi_get_lbolt64();

        /*
         * Each squeue uses an arbitrary time offset when scheduling
         * expiration timers.  This prevents the bucketing from forcing
         * tcp_time_wait_collector to run in lockstep across squeues.
         *
         * This offset is (re)initialized when a new TIME_WAIT connection is
         * added to an squeue which has no connections waiting to expire.
         */
        if (tsp->tcp_time_wait_tid == 0) {
                ASSERT(tsp->tcp_time_wait_cnt == 0);
                tsp->tcp_time_wait_offset =
                    now % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
        }
        now -= tsp->tcp_time_wait_offset;

        /*
         * Use the netstack-defined timeout, rounded up to the minimum
         * time_wait_collector interval.
         */
        schedule = now + MSEC_TO_TICK(tcps->tcps_time_wait_interval);
        tcp->tcp_time_wait_expire = schedule;

        /*
         * Add the connection to the head of the appropriate bucket's list.
         */
        bucket = TW_BUCKET(tcp->tcp_time_wait_expire);
        tcp->tcp_time_wait_next = tsp->tcp_time_wait_bucket[bucket];
        tsp->tcp_time_wait_bucket[bucket] = tcp;
        if (tcp->tcp_time_wait_next != NULL) {
                ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == NULL);
                tcp->tcp_time_wait_next->tcp_time_wait_prev = tcp;
        }
        tsp->tcp_time_wait_cnt++;

        /*
         * Round delay up to the nearest bucket boundary.
         */
        schedule += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
        schedule -= schedule % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
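
        /*
         * Note that the adjustment above always moves the schedule strictly
         * past tcp_time_wait_expire: a value already on a bucket boundary
         * advances by a full bucket.  The collector is therefore never
         * scheduled to process this bucket before the connection has
         * actually expired.
         */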

        /*
         * The newly inserted entry may require a tighter schedule for the
         * expiration timer.
         */
        if (schedule < tsp->tcp_time_wait_schedule) {
                callout_id_t old_tid = tsp->tcp_time_wait_tid;

                tsp->tcp_time_wait_schedule = schedule;
                tsp->tcp_time_wait_tid =
                    timeout_generic(CALLOUT_NORMAL,
                    tcp_time_wait_collector, sqp,
                    TICK_TO_NSEC(schedule - now),
                    CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);

                /*
                 * It is possible for the timer to fire before the untimeout
                 * action is able to complete.  In that case, the exclusion
                 * offered by the tcp_time_wait_collector_active flag will
                 * prevent multiple collector threads from processing records
                 * simultaneously from the same squeue.
                 */
                mutex_exit(&tsp->tcp_time_wait_lock);
                (void) untimeout_default(old_tid, 0);
                return;
        }

        /*
         * Start a fresh timer if none exists.
         */
        if (tsp->tcp_time_wait_schedule == 0) {
                ASSERT(tsp->tcp_time_wait_tid == 0);

                tsp->tcp_time_wait_schedule = schedule;
                tsp->tcp_time_wait_tid =
                    timeout_generic(CALLOUT_NORMAL,
                    tcp_time_wait_collector, sqp,
                    TICK_TO_NSEC(schedule - now),
                    CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
        }
        mutex_exit(&tsp->tcp_time_wait_lock);
}

/*
 * Wrapper to call tcp_close_detached() via squeue to clean up TIME-WAIT
 * tcp_t.  Used in tcp_time_wait_collector().
 */
/* ARGSUSED */
static void
tcp_timewait_close(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
        conn_t  *connp = (conn_t *)arg;
        tcp_t   *tcp = connp->conn_tcp;

        ASSERT(tcp != NULL);
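        /*
         * The connection may already have been closed by the time this job
         * runs on the squeue; there is nothing to do in that case.
         */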
        if (tcp->tcp_state == TCPS_CLOSED) {
                return;
        }

        ASSERT((connp->conn_family == AF_INET &&
            connp->conn_ipversion == IPV4_VERSION) ||
            (connp->conn_family == AF_INET6 &&
            (connp->conn_ipversion == IPV4_VERSION ||
            connp->conn_ipversion == IPV6_VERSION)));
        ASSERT(!tcp->tcp_listener);

        ASSERT(TCP_IS_DETACHED(tcp));

        /*
         * Because they have no upstream client to rebind or tcp_close()
         * them later, we axe the connection here and now.
         */
        tcp_close_detached(tcp);
}


static void
tcp_time_wait_purge(tcp_t *tcp, tcp_squeue_priv_t *tsp)
{
        mblk_t *mp;
        conn_t *connp = tcp->tcp_connp;
        kmutex_t *lock;

        ASSERT(MUTEX_HELD(&tsp->tcp_time_wait_lock));
        ASSERT(connp->conn_fanout != NULL);

        lock = &connp->conn_fanout->connf_lock;

        /*
         * This is essentially a TIME_WAIT reclaim fast path optimization for
         * performance: the connection is checked under the fanout lock (so
         * that no one else can get access to the conn_t) to verify that the
         * refcnt is 2, i.e. one each for TCP and the classifier hash list.
         * If that is the case and clustering callbacks are not enabled, the
         * conn can be removed under the fanout lock, avoiding clean-up under
         * the squeue.
         *
         * This optimization is forgone when clustering is enabled, since the
         * clustering callback must be made before setting the CONDEMNED flag
         * and after dropping all locks.
         *
         * See the comments in tcp_closei_local for additional information
         * regarding the refcnt logic.
         */
        if (mutex_tryenter(lock)) {
                mutex_enter(&connp->conn_lock);
                if (connp->conn_ref == 2 && cl_inet_disconnect == NULL) {
                        ipcl_hash_remove_locked(connp, connp->conn_fanout);
                        /*
                         * Set the CONDEMNED flag right away so that the
                         * refcnt cannot increase due to any walker.
                         */
                        connp->conn_state_flags |= CONN_CONDEMNED;
                        mutex_exit(&connp->conn_lock);
                        mutex_exit(lock);
                        if (tsp->tcp_free_list_cnt < tcp_free_list_max_cnt) {
                                /*
                                 * Add to head of tcp_free_list
                                 */
                                tcp_cleanup(tcp);
                                ASSERT(connp->conn_latch == NULL);
                                ASSERT(connp->conn_policy == NULL);
                                ASSERT(tcp->tcp_tcps == NULL);
                                ASSERT(connp->conn_netstack == NULL);

                                tcp->tcp_time_wait_next = tsp->tcp_free_list;
                                tcp->tcp_in_free_list = B_TRUE;
                                tsp->tcp_free_list = tcp;
                                tsp->tcp_free_list_cnt++;
                        } else {
                                /*
                                 * Do not add to tcp_free_list
                                 */
                                tcp_bind_hash_remove(tcp);
                                ixa_cleanup(tcp->tcp_connp->conn_ixa);
                                tcp_ipsec_cleanup(tcp);
                                CONN_DEC_REF(tcp->tcp_connp);
                        }

                        /*
                         * With the fast-path complete, we can bail.
                         */
                        return;
                } else {
                        /*
                         * Fall back to slow path.
                         */
                        CONN_INC_REF_LOCKED(connp);
                        mutex_exit(&connp->conn_lock);
                        mutex_exit(lock);
                }
        } else {
                CONN_INC_REF(connp);
        }

        /*
         * We can reuse the closemp here since the conn is detached (otherwise
         * it would not be on the time-wait list).  It is safe to change
         * tcp_closemp_used without taking a lock, as no other thread can
         * concurrently access it at this point in the connection lifecycle.
         */
        if (tcp->tcp_closemp.b_prev == NULL) {
                tcp->tcp_closemp_used = B_TRUE;
        } else {
                cmn_err(CE_PANIC,
                    "tcp_timewait_collector: concurrent use of tcp_closemp: "
                    "connp %p tcp %p\n", (void *)connp, (void *)tcp);
        }

        TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
        mp = &tcp->tcp_closemp;
        mutex_exit(&tsp->tcp_time_wait_lock);
        SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timewait_close, connp, NULL,
            SQ_FILL, SQTAG_TCP_TIMEWAIT);
        mutex_enter(&tsp->tcp_time_wait_lock);
}

/*
 * Purge any tcp_t instances associated with this squeue which have expired
 * from the TIME_WAIT state.
 */
void
tcp_time_wait_collector(void *arg)
{
        tcp_t *tcp;
        int64_t now, sched_active, sched_cur, sched_new;
        unsigned int idx;

        squeue_t *sqp = (squeue_t *)arg;
        tcp_squeue_priv_t *tsp =
            *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));

        mutex_enter(&tsp->tcp_time_wait_lock);

        /*
         * Because of timer scheduling complexity and the fact that the
         * tcp_time_wait_lock is dropped during tcp_time_wait_purge, it is
         * possible for multiple tcp_time_wait_collector threads to run against
         * the same squeue.  This flag is used to exclude other collectors from
         * the squeue during execution.
         */
        if (tsp->tcp_time_wait_collector_active) {
                mutex_exit(&tsp->tcp_time_wait_lock);
                return;
        }
        tsp->tcp_time_wait_collector_active = B_TRUE;

        /*
         * After its assignment here, the value of sched_active must not be
         * altered as it is used to validate the state of the
         * tcp_time_wait_collector callout schedule for this squeue.
         *
         * The same does not hold true of sched_cur, which holds the timestamp
         * of the bucket undergoing processing.  While it is initially equal to
         * sched_active, certain conditions below can walk it forward,
         * triggering the retry loop.
         */
        sched_cur = sched_active = tsp->tcp_time_wait_schedule;

        /*
         * Purge the free list if necessary
         */
        if (tsp->tcp_free_list != NULL) {
                TCP_G_STAT(tcp_freelist_cleanup);
                while ((tcp = tsp->tcp_free_list) != NULL) {
                        tsp->tcp_free_list = tcp->tcp_time_wait_next;
                        tcp->tcp_time_wait_next = NULL;
                        tsp->tcp_free_list_cnt--;
                        ASSERT(tcp->tcp_tcps == NULL);
                        CONN_DEC_REF(tcp->tcp_connp);
                }
                ASSERT(tsp->tcp_free_list_cnt == 0);
        }

        /*
         * If there are no connections pending, clear timer-related state to be
         * reinitialized by the next caller.
         */
        if (tsp->tcp_time_wait_cnt == 0) {
                tsp->tcp_time_wait_offset = 0;
                tsp->tcp_time_wait_schedule = 0;
                tsp->tcp_time_wait_tid = 0;
                tsp->tcp_time_wait_collector_active = B_FALSE;
                mutex_exit(&tsp->tcp_time_wait_lock);
                return;
        }

retry:
        /*
         * Grab the bucket which we were scheduled to cleanse.  Since
         * sched_cur always sits exactly on a bucket boundary, subtracting a
         * single tick yields the bucket whose interval has just ended.
         */
        idx = TW_BUCKET(sched_cur - 1);
        now = ddi_get_lbolt64() - tsp->tcp_time_wait_offset;
        tcp = tsp->tcp_time_wait_bucket[idx];

        while (tcp != NULL) {
                /*
                 * Since the bucket count is sized to prevent wrap-around
                 * during typical operation and timers are scheduled to
                 * process buckets with only expired connections, there is
                 * only one reason to encounter a connection expiring in the
                 * future: the tcp_time_wait_collector thread has been so
                 * delayed in its processing that connections have wrapped
                 * around the timing wheel into this bucket.
                 *
                 * In that case, the remaining entries in the bucket can be
                 * ignored since, being appended sequentially, they should all
                 * expire in the future.
                 */
                if (now < tcp->tcp_time_wait_expire) {
                        break;
                }

                /*
                 * Pull the connection out of the bucket.
                 */
                VERIFY(tcp_time_wait_remove(tcp, tsp));

                /*
                 * Purge the connection.
                 *
                 * While tcp_time_wait_lock will be temporarily dropped as part
                 * of the process, there is no risk of the timer being
                 * (re)scheduled while the collector is running since a value
                 * corresponding to the past is left in tcp_time_wait_schedule.
                 */
                tcp_time_wait_purge(tcp, tsp);

                /*
                 * Because tcp_time_wait_remove clears the tcp_time_wait_next
                 * field, the next item must be grabbed directly from the
                 * bucket itself.
                 */
                tcp = tsp->tcp_time_wait_bucket[idx];
        }

        if (tsp->tcp_time_wait_cnt == 0) {
                /*
                 * There is no need for the collector to schedule a new timer
                 * if no pending items remain.  The timer state can be cleared
                 * only if it was untouched while the collector dropped its
                 * locks during tcp_time_wait_purge.
                 */
                if (tsp->tcp_time_wait_schedule == sched_active) {
                        tsp->tcp_time_wait_offset = 0;
                        tsp->tcp_time_wait_schedule = 0;
                        tsp->tcp_time_wait_tid = 0;
                }
                tsp->tcp_time_wait_collector_active = B_FALSE;
                mutex_exit(&tsp->tcp_time_wait_lock);
                return;
        } else {
                unsigned int nidx;

                /*
                 * Locate the next bucket containing entries.
                 */
                sched_new = sched_cur + MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
                nidx = TW_BUCKET_NEXT(idx);
                while (tsp->tcp_time_wait_bucket[nidx] == NULL) {
                        if (nidx == idx) {
                                break;
                        }
                        nidx = TW_BUCKET_NEXT(nidx);
                        sched_new += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
                }
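                /*
                 * Since tcp_time_wait_cnt is non-zero here, at least one
                 * bucket must be occupied, so the scan above is guaranteed
                 * to have stopped on a non-empty bucket.
                 */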
                ASSERT(tsp->tcp_time_wait_bucket[nidx] != NULL);
        }

        /*
         * It is possible that the system is under such dire load that between
         * the timer scheduling and TIME_WAIT processing delay, execution
         * overran the interval allocated to this bucket.
         */
        now = ddi_get_lbolt64() - tsp->tcp_time_wait_offset;
        if (sched_new <= now) {
                /*
                 * Attempt to right the situation by immediately performing a
                 * purge on the next bucket.  This loop will continue as needed
                 * until the schedule can be pushed out ahead of the clock.
                 */
                sched_cur = sched_new;
                DTRACE_PROBE3(tcp__time__wait__overrun,
                    tcp_squeue_priv_t *, tsp, int64_t, sched_new, int64_t, now);
                goto retry;
        }

        /*
         * Another thread may have snuck in to reschedule the timer while locks
         * were dropped during tcp_time_wait_purge.  Defer to the running timer
         * if that is the case.
         */
        if (tsp->tcp_time_wait_schedule != sched_active) {
                tsp->tcp_time_wait_collector_active = B_FALSE;
                mutex_exit(&tsp->tcp_time_wait_lock);
                return;
        }

        /*
         * Schedule the next timer.
         */
        tsp->tcp_time_wait_schedule = sched_new;
        tsp->tcp_time_wait_tid =
            timeout_generic(CALLOUT_NORMAL,
            tcp_time_wait_collector, sqp,
            TICK_TO_NSEC(sched_new - now),
            CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
        tsp->tcp_time_wait_collector_active = B_FALSE;
        mutex_exit(&tsp->tcp_time_wait_lock);
}

/*
 * tcp_time_wait_processing() handles processing of incoming packets when
 * the tcp_t is in the TIME_WAIT state.
 *
 * A TIME_WAIT tcp_t that has an associated open TCP end point (not in
 * detached state) is never put on the time wait list.
 */
void
tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq,
    uint32_t seg_ack, int seg_len, tcpha_t *tcpha, ip_recv_attr_t *ira)
{
        int32_t         bytes_acked;
        int32_t         gap;
        int32_t         rgap;
        tcp_opt_t       tcpopt;
        uint_t          flags;
        uint32_t        new_swnd = 0;
        conn_t          *nconnp;
        conn_t          *connp = tcp->tcp_connp;
        tcp_stack_t     *tcps = tcp->tcp_tcps;

        TCPS_BUMP_MIB(tcps, tcpHCInSegs);
        DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);

        flags = (unsigned int)tcpha->tha_flags & 0xFF;
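
        /*
         * Per RFC 7323, the window field of a segment with SYN set is never
         * scaled, so the send window scale is applied only to non-SYN
         * segments.
         */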
        new_swnd = ntohs(tcpha->tha_win) <<
            ((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws);

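        /*
         * Identify keepalive probes: at most one byte of data, sent with a
         * sequence number one below the expected next byte (so that
         * seg_seq + 1 == tcp_rnxt).  Such probes are exempted from the
         * timestamp checks below so that a probe lacking a valid timestamp
         * option still receives the proper response.
         */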
        boolean_t keepalive = (seg_len == 0 || seg_len == 1) &&
            (seg_seq + 1 == tcp->tcp_rnxt);
        if (tcp->tcp_snd_ts_ok && !(flags & TH_RST) && !keepalive) {
                int options;
                if (tcp->tcp_snd_sack_ok)
                        tcpopt.tcp = tcp;
                else
                        tcpopt.tcp = NULL;
                options = tcp_parse_options(tcpha, &tcpopt);
                if (!(options & TCP_OPT_TSTAMP_PRESENT)) {
                        DTRACE_TCP1(droppedtimestamp, tcp_t *, tcp);
                        goto done;
                } else if (!tcp_paws_check(tcp, &tcpopt)) {
                        tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt,
                            TH_ACK);
                        goto done;
                }
        }
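
        /*
         * gap is the number of bytes by which the segment starts past
         * tcp_rnxt (negative if it begins with already-received data); rgap
         * is the receive-window space remaining beyond the end of the
         * segment (negative if the segment extends past the window).
         */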
        gap = seg_seq - tcp->tcp_rnxt;
        rgap = tcp->tcp_rwnd - (gap + seg_len);
        if (gap < 0) {
                TCPS_BUMP_MIB(tcps, tcpInDataDupSegs);
                TCPS_UPDATE_MIB(tcps, tcpInDataDupBytes,
                    (seg_len > -gap ? -gap : seg_len));
                seg_len += gap;
                if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) {
                        if (flags & TH_RST) {
                                goto done;
                        }
                        if ((flags & TH_FIN) && seg_len == -1) {
                                /*
                                 * When TCP receives a duplicate FIN in
                                 * TIME_WAIT state, restart the 2 MSL timer.
                                 * See page 73 in RFC 793. Make sure this TCP
                                 * is already on the TIME_WAIT list. If not,
                                 * just restart the timer.
                                 */
                                if (TCP_IS_DETACHED(tcp)) {
                                        if (tcp_time_wait_remove(tcp, NULL) ==
                                            B_TRUE) {
                                                tcp_time_wait_append(tcp);
                                                TCP_DBGSTAT(tcps,
                                                    tcp_rput_time_wait);
                                        }
                                } else {
                                        ASSERT(tcp != NULL);
                                        TCP_TIMER_RESTART(tcp,
                                            tcps->tcps_time_wait_interval);
                                }
                                tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
                                    tcp->tcp_rnxt, TH_ACK);
                                goto done;
                        }
                        flags |= TH_ACK_NEEDED;
                        seg_len = 0;
                        goto process_ack;
                }

                /* Fix seg_seq, and chew the gap off the front. */
                seg_seq = tcp->tcp_rnxt;
        }

        if ((flags & TH_SYN) && gap > 0 && rgap < 0) {
                /*
                 * Make sure that when we accept the connection, we pick
                 * an ISS greater than (tcp_snxt + tcp_iss_incr/2) for the
                 * old connection.
                 *
                 * The next ISS generated is equal to tcp_iss_incr_extra
                 * + tcp_iss_incr/2 + other components depending on the
                 * value of tcp_strong_iss.  We pre-calculate the new
                 * ISS here and compare with tcp_snxt to determine if
                 * we need to make adjustment to tcp_iss_incr_extra.
                 *
                 * The above calculation is ugly and is a
                 * waste of CPU cycles...
                 */
                uint32_t new_iss = tcps->tcps_iss_incr_extra;
                int32_t adj;
                ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;

                switch (tcps->tcps_strong_iss) {
                case 2: {
                        /* Add time and MD5 components. */
                        uint32_t answer[4];
                        struct {
                                uint32_t ports;
                                in6_addr_t src;
                                in6_addr_t dst;
                        } arg;
                        MD5_CTX context;

                        mutex_enter(&tcps->tcps_iss_key_lock);
                        context = tcps->tcps_iss_key;
                        mutex_exit(&tcps->tcps_iss_key_lock);
                        arg.ports = connp->conn_ports;
                        /* We use MAPPED addresses in tcp_iss_init */
                        arg.src = connp->conn_laddr_v6;
                        arg.dst = connp->conn_faddr_v6;
                        MD5Update(&context, (uchar_t *)&arg,
                            sizeof (arg));
                        MD5Final((uchar_t *)answer, &context);
                        answer[0] ^= answer[1] ^ answer[2] ^ answer[3];
                        new_iss += (gethrtime() >> ISS_NSEC_SHT) + answer[0];
                        break;
                }
                case 1:
                        /* Add time component and min random (i.e. 1). */
                        new_iss += (gethrtime() >> ISS_NSEC_SHT) + 1;
                        break;
                default:
                        /* Add only time component. */
                        new_iss += (uint32_t)gethrestime_sec() *
                            tcps->tcps_iss_incr;
                        break;
                }
                if ((adj = (int32_t)(tcp->tcp_snxt - new_iss)) > 0) {
                        /*
                         * New ISS not guaranteed to be tcp_iss_incr/2
                         * ahead of the current tcp_snxt, so add the
                         * difference to tcp_iss_incr_extra.
                         */
                        tcps->tcps_iss_incr_extra += adj;
                }
                /*
                 * If tcp_clean_death() can not perform the task now,
                 * drop the SYN packet and let the other side re-xmit.
                 * Otherwise pass the SYN packet back in, since the
                 * old tcp state has been cleaned up or freed.
                 */
                if (tcp_clean_death(tcp, 0) == -1)
                        goto done;
                nconnp = ipcl_classify(mp, ira, ipst);
                if (nconnp != NULL) {
                        TCP_STAT(tcps, tcp_time_wait_syn_success);
                        /* Drops ref on nconnp */
                        tcp_reinput(nconnp, mp, ira, ipst);
                        return;
                }
                goto done;
        }

        /*
         * A negative rgap means that part of the segment lies beyond the
         * receive window; its magnitude is the number of bytes received out
         * of window.
         */
        if (rgap < 0) {
                TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs);
                TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap);
                /* Fix seg_len and make sure there is something left. */
                seg_len += rgap;
                if (seg_len <= 0) {
                        if (flags & TH_RST) {
                                goto done;
                        }
                        flags |= TH_ACK_NEEDED;
                        seg_len = 0;
                        goto process_ack;
                }
        }
        /*
         * Check whether we can update tcp_ts_recent. This test is from RFC
         * 7323, section 5.3.
         */
        if (tcp->tcp_snd_ts_ok && !(flags & TH_RST) &&
            TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) &&
            SEQ_LEQ(seg_seq, tcp->tcp_rack)) {
                tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val;
                tcp->tcp_last_rcv_lbolt = ddi_get_lbolt64();
        }

        if (seg_seq != tcp->tcp_rnxt && seg_len > 0) {
                /* Always ack out of order packets */
                flags |= TH_ACK_NEEDED;
                seg_len = 0;
        } else if (seg_len > 0) {
                TCPS_BUMP_MIB(tcps, tcpInClosed);
                TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
                TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len);
                tcp->tcp_cs.tcp_in_data_inorder_segs++;
                tcp->tcp_cs.tcp_in_data_inorder_bytes += seg_len;
        }
        if (flags & TH_RST) {
                (void) tcp_clean_death(tcp, 0);
                goto done;
        }
        if (flags & TH_SYN) {
                tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1,
                    TH_RST|TH_ACK);
                /*
                 * Do not delete the TCP structure if it is in
                 * TIME_WAIT state.  Refer to RFC 1122, 4.2.2.13.
                 */
                goto done;
        }
process_ack:
        if (flags & TH_ACK) {
                bytes_acked = (int)(seg_ack - tcp->tcp_suna);
                if (bytes_acked <= 0) {
                        if (bytes_acked == 0 && seg_len == 0 &&
                            new_swnd == tcp->tcp_swnd)
                                TCPS_BUMP_MIB(tcps, tcpInDupAck);
                } else {
                        /* Acks something not sent */
                        flags |= TH_ACK_NEEDED;
                }
        }
        if (flags & TH_ACK_NEEDED) {
                /*
                 * Time to send an ack for some reason.
                 */
                tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
                    tcp->tcp_rnxt, TH_ACK);
        }
done:
        freemsg(mp);
}