1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2016 Joyent, Inc. 25 * Copyright (c) 2016 by Delphix. All rights reserved. 26 */ 27 28 /* 29 * This file contains functions related to TCP time wait processing. Also 30 * refer to the time wait handling comments in tcp_impl.h. 31 */ 32 33 #include <sys/types.h> 34 #include <sys/strsun.h> 35 #include <sys/squeue_impl.h> 36 #include <sys/squeue.h> 37 #include <sys/callo.h> 38 39 #include <inet/common.h> 40 #include <inet/ip.h> 41 #include <inet/tcp.h> 42 #include <inet/tcp_impl.h> 43 #include <inet/tcp_cluster.h> 44 45 static void tcp_time_wait_purge(tcp_t *, tcp_squeue_priv_t *); 46 47 #define TW_BUCKET(t) \ 48 (((t) / MSEC_TO_TICK(TCP_TIME_WAIT_DELAY)) % TCP_TIME_WAIT_BUCKETS) 49 50 #define TW_BUCKET_NEXT(b) (((b) + 1) % TCP_TIME_WAIT_BUCKETS) 51 52 53 /* 54 * Remove a connection from the list of detached TIME_WAIT connections. 55 * It returns B_FALSE if it can't remove the connection from the list 56 * as the connection has already been removed from the list due to an 57 * earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE. 58 */ 59 boolean_t 60 tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tsp) 61 { 62 boolean_t locked = B_FALSE; 63 64 if (tsp == NULL) { 65 tsp = *((tcp_squeue_priv_t **) 66 squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP)); 67 mutex_enter(&tsp->tcp_time_wait_lock); 68 locked = B_TRUE; 69 } else { 70 ASSERT(MUTEX_HELD(&tsp->tcp_time_wait_lock)); 71 } 72 73 /* 0 means that the tcp_t has not been added to the time wait list. */ 74 if (tcp->tcp_time_wait_expire == 0) { 75 ASSERT(tcp->tcp_time_wait_next == NULL); 76 ASSERT(tcp->tcp_time_wait_prev == NULL); 77 if (locked) 78 mutex_exit(&tsp->tcp_time_wait_lock); 79 return (B_FALSE); 80 } 81 ASSERT(TCP_IS_DETACHED(tcp)); 82 ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); 83 ASSERT(tsp->tcp_time_wait_cnt > 0); 84 85 if (tcp->tcp_time_wait_next != NULL) { 86 tcp->tcp_time_wait_next->tcp_time_wait_prev = 87 tcp->tcp_time_wait_prev; 88 } 89 if (tcp->tcp_time_wait_prev != NULL) { 90 tcp->tcp_time_wait_prev->tcp_time_wait_next = 91 tcp->tcp_time_wait_next; 92 } else { 93 unsigned int bucket; 94 95 bucket = TW_BUCKET(tcp->tcp_time_wait_expire); 96 ASSERT(tsp->tcp_time_wait_bucket[bucket] == tcp); 97 tsp->tcp_time_wait_bucket[bucket] = tcp->tcp_time_wait_next; 98 } 99 tcp->tcp_time_wait_next = NULL; 100 tcp->tcp_time_wait_prev = NULL; 101 tcp->tcp_time_wait_expire = 0; 102 tsp->tcp_time_wait_cnt--; 103 104 if (locked) 105 mutex_exit(&tsp->tcp_time_wait_lock); 106 return (B_TRUE); 107 } 108 109 /* Constants used for fast checking of a localhost address */ 110 #if defined(_BIG_ENDIAN) 111 #define IPv4_LOCALHOST 0x7f000000U 112 #define IPv4_LH_MASK 0xffffff00U 113 #else 114 #define IPv4_LOCALHOST 0x0000007fU 115 #define IPv4_LH_MASK 0x00ffffffU 116 #endif 117 118 #define IS_LOCAL_HOST(x) ( \ 119 ((x)->tcp_connp->conn_ipversion == IPV4_VERSION && \ 120 ((x)->tcp_connp->conn_laddr_v4 & IPv4_LH_MASK) == IPv4_LOCALHOST) || \ 121 ((x)->tcp_connp->conn_ipversion == IPV6_VERSION && \ 122 IN6_IS_ADDR_LOOPBACK(&(x)->tcp_connp->conn_laddr_v6))) 123 124 125 /* 126 * Add a connection to the list of detached TIME_WAIT connections 127 * and set its time to expire. 128 */ 129 void 130 tcp_time_wait_append(tcp_t *tcp) 131 { 132 tcp_stack_t *tcps = tcp->tcp_tcps; 133 squeue_t *sqp = tcp->tcp_connp->conn_sqp; 134 tcp_squeue_priv_t *tsp = 135 *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP)); 136 int64_t now, schedule; 137 unsigned int bucket; 138 139 tcp_timers_stop(tcp); 140 141 /* Freed above */ 142 ASSERT(tcp->tcp_timer_tid == 0); 143 ASSERT(tcp->tcp_ack_tid == 0); 144 145 /* must have happened at the time of detaching the tcp */ 146 ASSERT(TCP_IS_DETACHED(tcp)); 147 ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); 148 ASSERT(tcp->tcp_ptpahn == NULL); 149 ASSERT(tcp->tcp_flow_stopped == 0); 150 ASSERT(tcp->tcp_time_wait_next == NULL); 151 ASSERT(tcp->tcp_time_wait_prev == NULL); 152 ASSERT(tcp->tcp_time_wait_expire == 0); 153 ASSERT(tcp->tcp_listener == NULL); 154 155 TCP_DBGSTAT(tcps, tcp_time_wait); 156 mutex_enter(&tsp->tcp_time_wait_lock); 157 158 /* 159 * Immediately expire loopback connections. Since there is no worry 160 * about packets on the local host showing up after a long network 161 * delay, this is safe and allows much higher rates of connection churn 162 * for applications operating locally. 163 * 164 * This typically bypasses the tcp_free_list fast path due to squeue 165 * re-entry for the loopback close operation. 166 */ 167 if (tcp->tcp_loopback) { 168 tcp_time_wait_purge(tcp, tsp); 169 mutex_exit(&tsp->tcp_time_wait_lock); 170 return; 171 } 172 173 /* 174 * In order to reap TIME_WAITs reliably, we should use a source of time 175 * that is not adjustable by the user. While it would be more accurate 176 * to grab this timestamp before (potentially) sleeping on the 177 * tcp_time_wait_lock, doing so complicates bucket addressing later. 178 */ 179 now = ddi_get_lbolt64(); 180 181 /* 182 * Each squeue uses an arbitrary time offset when scheduling 183 * expiration timers. This prevents the bucketing from forcing 184 * tcp_time_wait_collector to run in locksetup across squeues. 185 * 186 * This offset is (re)initialized when a new TIME_WAIT connection is 187 * added to an squeue which has no connections waiting to expire. 188 */ 189 if (tsp->tcp_time_wait_tid == 0) { 190 ASSERT(tsp->tcp_time_wait_cnt == 0); 191 tsp->tcp_time_wait_offset = 192 now % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY); 193 } 194 now -= tsp->tcp_time_wait_offset; 195 196 /* 197 * Use the netstack-defined timeout, rounded up to the minimum 198 * time_wait_collector interval. 199 */ 200 schedule = now + MSEC_TO_TICK(tcps->tcps_time_wait_interval); 201 tcp->tcp_time_wait_expire = schedule; 202 203 /* 204 * Append the connection into the appropriate bucket. 205 */ 206 bucket = TW_BUCKET(tcp->tcp_time_wait_expire); 207 tcp->tcp_time_wait_next = tsp->tcp_time_wait_bucket[bucket]; 208 tsp->tcp_time_wait_bucket[bucket] = tcp; 209 if (tcp->tcp_time_wait_next != NULL) { 210 ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == NULL); 211 tcp->tcp_time_wait_next->tcp_time_wait_prev = tcp; 212 } 213 tsp->tcp_time_wait_cnt++; 214 215 /* 216 * Round delay up to the nearest bucket boundary. 217 */ 218 schedule += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY); 219 schedule -= schedule % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY); 220 221 /* 222 * The newly inserted entry may require a tighter schedule for the 223 * expiration timer. 224 */ 225 if (schedule < tsp->tcp_time_wait_schedule) { 226 callout_id_t old_tid = tsp->tcp_time_wait_tid; 227 228 tsp->tcp_time_wait_schedule = schedule; 229 tsp->tcp_time_wait_tid = 230 timeout_generic(CALLOUT_NORMAL, 231 tcp_time_wait_collector, sqp, 232 TICK_TO_NSEC(schedule - now), 233 CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP); 234 235 /* 236 * It is possible for the timer to fire before the untimeout 237 * action is able to complete. In that case, the exclusion 238 * offered by the tcp_time_wait_collector_active flag will 239 * prevent multiple collector threads from processing records 240 * simultaneously from the same squeue. 241 */ 242 mutex_exit(&tsp->tcp_time_wait_lock); 243 (void) untimeout_default(old_tid, 0); 244 return; 245 } 246 247 /* 248 * Start a fresh timer if none exists. 249 */ 250 if (tsp->tcp_time_wait_schedule == 0) { 251 ASSERT(tsp->tcp_time_wait_tid == 0); 252 253 tsp->tcp_time_wait_schedule = schedule; 254 tsp->tcp_time_wait_tid = 255 timeout_generic(CALLOUT_NORMAL, 256 tcp_time_wait_collector, sqp, 257 TICK_TO_NSEC(schedule - now), 258 CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP); 259 } 260 mutex_exit(&tsp->tcp_time_wait_lock); 261 } 262 263 /* 264 * Wrapper to call tcp_close_detached() via squeue to clean up TIME-WAIT 265 * tcp_t. Used in tcp_time_wait_collector(). 266 */ 267 /* ARGSUSED */ 268 static void 269 tcp_timewait_close(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 270 { 271 conn_t *connp = (conn_t *)arg; 272 tcp_t *tcp = connp->conn_tcp; 273 274 ASSERT(tcp != NULL); 275 if (tcp->tcp_state == TCPS_CLOSED) { 276 return; 277 } 278 279 ASSERT((connp->conn_family == AF_INET && 280 connp->conn_ipversion == IPV4_VERSION) || 281 (connp->conn_family == AF_INET6 && 282 (connp->conn_ipversion == IPV4_VERSION || 283 connp->conn_ipversion == IPV6_VERSION))); 284 ASSERT(!tcp->tcp_listener); 285 286 ASSERT(TCP_IS_DETACHED(tcp)); 287 288 /* 289 * Because they have no upstream client to rebind or tcp_close() 290 * them later, we axe the connection here and now. 291 */ 292 tcp_close_detached(tcp); 293 } 294 295 296 static void 297 tcp_time_wait_purge(tcp_t *tcp, tcp_squeue_priv_t *tsp) 298 { 299 mblk_t *mp; 300 conn_t *connp = tcp->tcp_connp; 301 kmutex_t *lock; 302 303 ASSERT(MUTEX_HELD(&tsp->tcp_time_wait_lock)); 304 ASSERT(connp->conn_fanout != NULL); 305 306 lock = &connp->conn_fanout->connf_lock; 307 308 /* 309 * This is essentially a TIME_WAIT reclaim fast path optimization for 310 * performance where the connection is checked under the fanout lock 311 * (so that no one else can get access to the conn_t) that the refcnt 312 * is 2 (one each for TCP and the classifier hash list). That is the 313 * case and clustering callbacks are not enabled, the conn can be 314 * removed under the fanout lock and avoid clean-up under the squeue. 315 * 316 * This optimization is forgone when clustering is enabled since the 317 * clustering callback must be made before setting the CONDEMNED flag 318 * and after dropping all locks 319 * 320 * See the comments in tcp_closei_local for additional information 321 * regarding the refcnt logic. 322 */ 323 if (mutex_tryenter(lock)) { 324 mutex_enter(&connp->conn_lock); 325 if (connp->conn_ref == 2 && cl_inet_disconnect == NULL) { 326 ipcl_hash_remove_locked(connp, connp->conn_fanout); 327 /* 328 * Set the CONDEMNED flag now itself so that the refcnt 329 * cannot increase due to any walker. 330 */ 331 connp->conn_state_flags |= CONN_CONDEMNED; 332 mutex_exit(&connp->conn_lock); 333 mutex_exit(lock); 334 if (tsp->tcp_free_list_cnt < tcp_free_list_max_cnt) { 335 /* 336 * Add to head of tcp_free_list 337 */ 338 tcp_cleanup(tcp); 339 ASSERT(connp->conn_latch == NULL); 340 ASSERT(connp->conn_policy == NULL); 341 ASSERT(tcp->tcp_tcps == NULL); 342 ASSERT(connp->conn_netstack == NULL); 343 344 tcp->tcp_time_wait_next = tsp->tcp_free_list; 345 tcp->tcp_in_free_list = B_TRUE; 346 tsp->tcp_free_list = tcp; 347 tsp->tcp_free_list_cnt++; 348 } else { 349 /* 350 * Do not add to tcp_free_list 351 */ 352 tcp_bind_hash_remove(tcp); 353 ixa_cleanup(tcp->tcp_connp->conn_ixa); 354 tcp_ipsec_cleanup(tcp); 355 CONN_DEC_REF(tcp->tcp_connp); 356 } 357 358 /* 359 * With the fast-path complete, we can bail. 360 */ 361 return; 362 } else { 363 /* 364 * Fall back to slow path. 365 */ 366 CONN_INC_REF_LOCKED(connp); 367 mutex_exit(&connp->conn_lock); 368 mutex_exit(lock); 369 } 370 } else { 371 CONN_INC_REF(connp); 372 } 373 374 /* 375 * We can reuse the closemp here since conn has detached (otherwise we 376 * wouldn't even be in time_wait list). It is safe to change 377 * tcp_closemp_used without taking a lock as no other thread can 378 * concurrently access it at this point in the connection lifecycle. 379 */ 380 if (tcp->tcp_closemp.b_prev == NULL) { 381 tcp->tcp_closemp_used = B_TRUE; 382 } else { 383 cmn_err(CE_PANIC, 384 "tcp_timewait_collector: concurrent use of tcp_closemp: " 385 "connp %p tcp %p\n", (void *)connp, (void *)tcp); 386 } 387 388 TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); 389 mp = &tcp->tcp_closemp; 390 mutex_exit(&tsp->tcp_time_wait_lock); 391 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timewait_close, connp, NULL, 392 SQ_FILL, SQTAG_TCP_TIMEWAIT); 393 mutex_enter(&tsp->tcp_time_wait_lock); 394 } 395 396 /* 397 * Purge any tcp_t instances associated with this squeue which have expired 398 * from the TIME_WAIT state. 399 */ 400 void 401 tcp_time_wait_collector(void *arg) 402 { 403 tcp_t *tcp; 404 int64_t now, sched_active, sched_cur, sched_new; 405 unsigned int idx; 406 407 squeue_t *sqp = (squeue_t *)arg; 408 tcp_squeue_priv_t *tsp = 409 *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP)); 410 411 mutex_enter(&tsp->tcp_time_wait_lock); 412 413 /* 414 * Because of timer scheduling complexity and the fact that the 415 * tcp_time_wait_lock is dropped during tcp_time_wait_purge, it is 416 * possible for multiple tcp_time_wait_collector threads to run against 417 * the same squeue. This flag is used to exclude other collectors from 418 * the squeue during execution. 419 */ 420 if (tsp->tcp_time_wait_collector_active) { 421 mutex_exit(&tsp->tcp_time_wait_lock); 422 return; 423 } 424 tsp->tcp_time_wait_collector_active = B_TRUE; 425 426 /* 427 * After its assignment here, the value of sched_active must not be 428 * altered as it is used to validate the state of the 429 * tcp_time_wait_collector callout schedule for this squeue. 430 * 431 * The same does not hold true of sched_cur, which holds the timestamp 432 * of the bucket undergoing processing. While it is initially equal to 433 * sched_active, certain conditions below can walk it forward, 434 * triggering the retry loop. 435 */ 436 sched_cur = sched_active = tsp->tcp_time_wait_schedule; 437 438 /* 439 * Purge the free list if necessary 440 */ 441 if (tsp->tcp_free_list != NULL) { 442 TCP_G_STAT(tcp_freelist_cleanup); 443 while ((tcp = tsp->tcp_free_list) != NULL) { 444 tsp->tcp_free_list = tcp->tcp_time_wait_next; 445 tcp->tcp_time_wait_next = NULL; 446 tsp->tcp_free_list_cnt--; 447 ASSERT(tcp->tcp_tcps == NULL); 448 CONN_DEC_REF(tcp->tcp_connp); 449 } 450 ASSERT(tsp->tcp_free_list_cnt == 0); 451 } 452 453 /* 454 * If there are no connections pending, clear timer-related state to be 455 * reinitialized by the next caller. 456 */ 457 if (tsp->tcp_time_wait_cnt == 0) { 458 tsp->tcp_time_wait_offset = 0; 459 tsp->tcp_time_wait_schedule = 0; 460 tsp->tcp_time_wait_tid = 0; 461 tsp->tcp_time_wait_collector_active = B_FALSE; 462 mutex_exit(&tsp->tcp_time_wait_lock); 463 return; 464 } 465 466 retry: 467 /* 468 * Grab the bucket which we were scheduled to cleanse. 469 */ 470 idx = TW_BUCKET(sched_cur - 1); 471 now = ddi_get_lbolt64() - tsp->tcp_time_wait_offset; 472 tcp = tsp->tcp_time_wait_bucket[idx]; 473 474 while (tcp != NULL) { 475 /* 476 * Since the bucket count is sized to prevent wrap-around 477 * during typical operation and timers are schedule to process 478 * buckets with only expired connections, there is only one 479 * reason to encounter a connection expiring in the future: 480 * The tcp_time_wait_collector thread has been so delayed in 481 * its processing that connections have wrapped around the 482 * timing wheel into this bucket. 483 * 484 * In that case, the remaining entires in the bucket can be 485 * ignored since, being appended sequentially, they should all 486 * expire in the future. 487 */ 488 if (now < tcp->tcp_time_wait_expire) { 489 break; 490 } 491 492 /* 493 * Pull the connection out of the bucket. 494 */ 495 VERIFY(tcp_time_wait_remove(tcp, tsp)); 496 497 /* 498 * Purge the connection. 499 * 500 * While tcp_time_wait_lock will be temporarily dropped as part 501 * of the process, there is no risk of the timer being 502 * (re)scheduled while the collector is running since a value 503 * corresponding to the past is left in tcp_time_wait_schedule. 504 */ 505 tcp_time_wait_purge(tcp, tsp); 506 507 /* 508 * Because tcp_time_wait_remove clears the tcp_time_wait_next 509 * field, the next item must be grabbed directly from the 510 * bucket itself. 511 */ 512 tcp = tsp->tcp_time_wait_bucket[idx]; 513 } 514 515 if (tsp->tcp_time_wait_cnt == 0) { 516 /* 517 * There is not a need for the collector to schedule a new 518 * timer if no pending items remain. The timer state can be 519 * cleared only if it was untouched while the collector dropped 520 * its locks during tcp_time_wait_purge. 521 */ 522 if (tsp->tcp_time_wait_schedule == sched_active) { 523 tsp->tcp_time_wait_offset = 0; 524 tsp->tcp_time_wait_schedule = 0; 525 tsp->tcp_time_wait_tid = 0; 526 } 527 tsp->tcp_time_wait_collector_active = B_FALSE; 528 mutex_exit(&tsp->tcp_time_wait_lock); 529 return; 530 } else { 531 unsigned int nidx; 532 533 /* 534 * Locate the next bucket containing entries. 535 */ 536 sched_new = sched_cur + MSEC_TO_TICK(TCP_TIME_WAIT_DELAY); 537 nidx = TW_BUCKET_NEXT(idx); 538 while (tsp->tcp_time_wait_bucket[nidx] == NULL) { 539 if (nidx == idx) { 540 break; 541 } 542 nidx = TW_BUCKET_NEXT(nidx); 543 sched_new += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY); 544 } 545 ASSERT(tsp->tcp_time_wait_bucket[nidx] != NULL); 546 } 547 548 /* 549 * It is possible that the system is under such dire load that between 550 * the timer scheduling and TIME_WAIT processing delay, execution 551 * overran the interval allocated to this bucket. 552 */ 553 now = ddi_get_lbolt64() - tsp->tcp_time_wait_offset; 554 if (sched_new <= now) { 555 /* 556 * Attempt to right the situation by immediately performing a 557 * purge on the next bucket. This loop will continue as needed 558 * until the schedule can be pushed out ahead of the clock. 559 */ 560 sched_cur = sched_new; 561 DTRACE_PROBE3(tcp__time__wait__overrun, 562 tcp_squeue_priv_t *, tsp, int64_t, sched_new, int64_t, now); 563 goto retry; 564 } 565 566 /* 567 * Another thread may have snuck in to reschedule the timer while locks 568 * were dropped during tcp_time_wait_purge. Defer to the running timer 569 * if that is the case. 570 */ 571 if (tsp->tcp_time_wait_schedule != sched_active) { 572 tsp->tcp_time_wait_collector_active = B_FALSE; 573 mutex_exit(&tsp->tcp_time_wait_lock); 574 return; 575 } 576 577 /* 578 * Schedule the next timer. 579 */ 580 tsp->tcp_time_wait_schedule = sched_new; 581 tsp->tcp_time_wait_tid = 582 timeout_generic(CALLOUT_NORMAL, 583 tcp_time_wait_collector, sqp, 584 TICK_TO_NSEC(sched_new - now), 585 CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP); 586 tsp->tcp_time_wait_collector_active = B_FALSE; 587 mutex_exit(&tsp->tcp_time_wait_lock); 588 } 589 590 /* 591 * tcp_time_wait_processing() handles processing of incoming packets when 592 * the tcp_t is in the TIME_WAIT state. 593 * 594 * A TIME_WAIT tcp_t that has an associated open TCP end point (not in 595 * detached state) is never put on the time wait list. 596 */ 597 void 598 tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, 599 uint32_t seg_ack, int seg_len, tcpha_t *tcpha, ip_recv_attr_t *ira) 600 { 601 int32_t bytes_acked; 602 int32_t gap; 603 int32_t rgap; 604 tcp_opt_t tcpopt; 605 uint_t flags; 606 uint32_t new_swnd = 0; 607 conn_t *nconnp; 608 conn_t *connp = tcp->tcp_connp; 609 tcp_stack_t *tcps = tcp->tcp_tcps; 610 611 BUMP_LOCAL(tcp->tcp_ibsegs); 612 DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp); 613 614 flags = (unsigned int)tcpha->tha_flags & 0xFF; 615 new_swnd = ntohs(tcpha->tha_win) << 616 ((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws); 617 618 boolean_t keepalive = (seg_len == 0 || seg_len == 1) && 619 (seg_seq + 1 == tcp->tcp_rnxt); 620 if (tcp->tcp_snd_ts_ok && !(flags & TH_RST) && !keepalive) { 621 int options; 622 if (tcp->tcp_snd_sack_ok) 623 tcpopt.tcp = tcp; 624 else 625 tcpopt.tcp = NULL; 626 options = tcp_parse_options(tcpha, &tcpopt); 627 if (!(options & TCP_OPT_TSTAMP_PRESENT)) { 628 DTRACE_TCP1(droppedtimestamp, tcp_t *, tcp); 629 goto done; 630 } else if (!tcp_paws_check(tcp, &tcpopt)) { 631 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, 632 TH_ACK); 633 goto done; 634 } 635 } 636 gap = seg_seq - tcp->tcp_rnxt; 637 rgap = tcp->tcp_rwnd - (gap + seg_len); 638 if (gap < 0) { 639 TCPS_BUMP_MIB(tcps, tcpInDataDupSegs); 640 TCPS_UPDATE_MIB(tcps, tcpInDataDupBytes, 641 (seg_len > -gap ? -gap : seg_len)); 642 seg_len += gap; 643 if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) { 644 if (flags & TH_RST) { 645 goto done; 646 } 647 if ((flags & TH_FIN) && seg_len == -1) { 648 /* 649 * When TCP receives a duplicate FIN in 650 * TIME_WAIT state, restart the 2 MSL timer. 651 * See page 73 in RFC 793. Make sure this TCP 652 * is already on the TIME_WAIT list. If not, 653 * just restart the timer. 654 */ 655 if (TCP_IS_DETACHED(tcp)) { 656 if (tcp_time_wait_remove(tcp, NULL) == 657 B_TRUE) { 658 tcp_time_wait_append(tcp); 659 TCP_DBGSTAT(tcps, 660 tcp_rput_time_wait); 661 } 662 } else { 663 ASSERT(tcp != NULL); 664 TCP_TIMER_RESTART(tcp, 665 tcps->tcps_time_wait_interval); 666 } 667 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, 668 tcp->tcp_rnxt, TH_ACK); 669 goto done; 670 } 671 flags |= TH_ACK_NEEDED; 672 seg_len = 0; 673 goto process_ack; 674 } 675 676 /* Fix seg_seq, and chew the gap off the front. */ 677 seg_seq = tcp->tcp_rnxt; 678 } 679 680 if ((flags & TH_SYN) && gap > 0 && rgap < 0) { 681 /* 682 * Make sure that when we accept the connection, pick 683 * an ISS greater than (tcp_snxt + tcp_iss_incr/2) for the 684 * old connection. 685 * 686 * The next ISS generated is equal to tcp_iss_incr_extra 687 * + tcp_iss_incr/2 + other components depending on the 688 * value of tcp_strong_iss. We pre-calculate the new 689 * ISS here and compare with tcp_snxt to determine if 690 * we need to make adjustment to tcp_iss_incr_extra. 691 * 692 * The above calculation is ugly and is a 693 * waste of CPU cycles... 694 */ 695 uint32_t new_iss = tcps->tcps_iss_incr_extra; 696 int32_t adj; 697 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 698 699 switch (tcps->tcps_strong_iss) { 700 case 2: { 701 /* Add time and MD5 components. */ 702 uint32_t answer[4]; 703 struct { 704 uint32_t ports; 705 in6_addr_t src; 706 in6_addr_t dst; 707 } arg; 708 MD5_CTX context; 709 710 mutex_enter(&tcps->tcps_iss_key_lock); 711 context = tcps->tcps_iss_key; 712 mutex_exit(&tcps->tcps_iss_key_lock); 713 arg.ports = connp->conn_ports; 714 /* We use MAPPED addresses in tcp_iss_init */ 715 arg.src = connp->conn_laddr_v6; 716 arg.dst = connp->conn_faddr_v6; 717 MD5Update(&context, (uchar_t *)&arg, 718 sizeof (arg)); 719 MD5Final((uchar_t *)answer, &context); 720 answer[0] ^= answer[1] ^ answer[2] ^ answer[3]; 721 new_iss += (gethrtime() >> ISS_NSEC_SHT) + answer[0]; 722 break; 723 } 724 case 1: 725 /* Add time component and min random (i.e. 1). */ 726 new_iss += (gethrtime() >> ISS_NSEC_SHT) + 1; 727 break; 728 default: 729 /* Add only time component. */ 730 new_iss += (uint32_t)gethrestime_sec() * 731 tcps->tcps_iss_incr; 732 break; 733 } 734 if ((adj = (int32_t)(tcp->tcp_snxt - new_iss)) > 0) { 735 /* 736 * New ISS not guaranteed to be tcp_iss_incr/2 737 * ahead of the current tcp_snxt, so add the 738 * difference to tcp_iss_incr_extra. 739 */ 740 tcps->tcps_iss_incr_extra += adj; 741 } 742 /* 743 * If tcp_clean_death() can not perform the task now, 744 * drop the SYN packet and let the other side re-xmit. 745 * Otherwise pass the SYN packet back in, since the 746 * old tcp state has been cleaned up or freed. 747 */ 748 if (tcp_clean_death(tcp, 0) == -1) 749 goto done; 750 nconnp = ipcl_classify(mp, ira, ipst); 751 if (nconnp != NULL) { 752 TCP_STAT(tcps, tcp_time_wait_syn_success); 753 /* Drops ref on nconnp */ 754 tcp_reinput(nconnp, mp, ira, ipst); 755 return; 756 } 757 goto done; 758 } 759 760 /* 761 * rgap is the amount of stuff received out of window. A negative 762 * value is the amount out of window. 763 */ 764 if (rgap < 0) { 765 TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs); 766 TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap); 767 /* Fix seg_len and make sure there is something left. */ 768 seg_len += rgap; 769 if (seg_len <= 0) { 770 if (flags & TH_RST) { 771 goto done; 772 } 773 flags |= TH_ACK_NEEDED; 774 seg_len = 0; 775 goto process_ack; 776 } 777 } 778 /* 779 * Check whether we can update tcp_ts_recent. This test is from RFC 780 * 7323, section 5.3. 781 */ 782 if (tcp->tcp_snd_ts_ok && !(flags & TH_RST) && 783 TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) && 784 SEQ_LEQ(seg_seq, tcp->tcp_rack)) { 785 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 786 tcp->tcp_last_rcv_lbolt = ddi_get_lbolt64(); 787 } 788 789 if (seg_seq != tcp->tcp_rnxt && seg_len > 0) { 790 /* Always ack out of order packets */ 791 flags |= TH_ACK_NEEDED; 792 seg_len = 0; 793 } else if (seg_len > 0) { 794 TCPS_BUMP_MIB(tcps, tcpInClosed); 795 TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs); 796 TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len); 797 } 798 if (flags & TH_RST) { 799 (void) tcp_clean_death(tcp, 0); 800 goto done; 801 } 802 if (flags & TH_SYN) { 803 tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1, 804 TH_RST|TH_ACK); 805 /* 806 * Do not delete the TCP structure if it is in 807 * TIME_WAIT state. Refer to RFC 1122, 4.2.2.13. 808 */ 809 goto done; 810 } 811 process_ack: 812 if (flags & TH_ACK) { 813 bytes_acked = (int)(seg_ack - tcp->tcp_suna); 814 if (bytes_acked <= 0) { 815 if (bytes_acked == 0 && seg_len == 0 && 816 new_swnd == tcp->tcp_swnd) 817 TCPS_BUMP_MIB(tcps, tcpInDupAck); 818 } else { 819 /* Acks something not sent */ 820 flags |= TH_ACK_NEEDED; 821 } 822 } 823 if (flags & TH_ACK_NEEDED) { 824 /* 825 * Time to send an ack for some reason. 826 */ 827 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, 828 tcp->tcp_rnxt, TH_ACK); 829 } 830 done: 831 freemsg(mp); 832 }