11547 Want connstat(1M) command to display per-connection TCP statistics
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Portions contributed by: Ahmed G <ahmedg@delphix.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
--- old/usr/src/uts/common/inet/tcp/tcp_time_wait.c
+++ new/usr/src/uts/common/inet/tcp/tcp_time_wait.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright 2016 Joyent, Inc.
25 25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 26 */
27 27
28 28 /*
29 29 * This file contains functions related to TCP time wait processing. Also
30 30 * refer to the time wait handling comments in tcp_impl.h.
31 31 */
32 32
33 33 #include <sys/types.h>
34 34 #include <sys/strsun.h>
35 35 #include <sys/squeue_impl.h>
36 36 #include <sys/squeue.h>
37 37 #include <sys/callo.h>
38 38
39 39 #include <inet/common.h>
40 40 #include <inet/ip.h>
41 41 #include <inet/tcp.h>
42 42 #include <inet/tcp_impl.h>
43 43 #include <inet/tcp_cluster.h>
44 44
45 45 static void tcp_time_wait_purge(tcp_t *, tcp_squeue_priv_t *);
46 46
47 47 #define TW_BUCKET(t) \
48 48 (((t) / MSEC_TO_TICK(TCP_TIME_WAIT_DELAY)) % TCP_TIME_WAIT_BUCKETS)
49 49
50 50 #define TW_BUCKET_NEXT(b) (((b) + 1) % TCP_TIME_WAIT_BUCKETS)
51 51
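/*
 * Worked example of the bucket arithmetic above (values assumed for
 * illustration, not the kernel's actual constants): with a granularity of
 * MSEC_TO_TICK(TCP_TIME_WAIT_DELAY) == 1000 ticks and
 * TCP_TIME_WAIT_BUCKETS == 16, an expiry timestamp of 4321 ticks maps to
 * TW_BUCKET(4321) == (4321 / 1000) % 16 == 4, the bucket covering
 * [4000, 5000).  TW_BUCKET_NEXT(4) == 5, wrapping back to 0 after bucket 15.
 */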
52 52
53 53 /*
54 54 * Remove a connection from the list of detached TIME_WAIT connections.
55 55 * It returns B_FALSE if it can't remove the connection from the list
56 56 * as the connection has already been removed from the list due to an
57 57 * earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE.
58 58 */
59 59 boolean_t
60 60 tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tsp)
61 61 {
62 62 boolean_t locked = B_FALSE;
63 63
64 64 if (tsp == NULL) {
65 65 tsp = *((tcp_squeue_priv_t **)
66 66 squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP));
67 67 mutex_enter(&tsp->tcp_time_wait_lock);
68 68 locked = B_TRUE;
69 69 } else {
70 70 ASSERT(MUTEX_HELD(&tsp->tcp_time_wait_lock));
71 71 }
72 72
73 73 /* 0 means that the tcp_t has not been added to the time wait list. */
74 74 if (tcp->tcp_time_wait_expire == 0) {
75 75 ASSERT(tcp->tcp_time_wait_next == NULL);
76 76 ASSERT(tcp->tcp_time_wait_prev == NULL);
77 77 if (locked)
78 78 mutex_exit(&tsp->tcp_time_wait_lock);
79 79 return (B_FALSE);
80 80 }
81 81 ASSERT(TCP_IS_DETACHED(tcp));
82 82 ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
83 83 ASSERT(tsp->tcp_time_wait_cnt > 0);
84 84
85 85 if (tcp->tcp_time_wait_next != NULL) {
86 86 tcp->tcp_time_wait_next->tcp_time_wait_prev =
87 87 tcp->tcp_time_wait_prev;
88 88 }
89 89 if (tcp->tcp_time_wait_prev != NULL) {
90 90 tcp->tcp_time_wait_prev->tcp_time_wait_next =
91 91 tcp->tcp_time_wait_next;
92 92 } else {
93 93 unsigned int bucket;
94 94
95 95 bucket = TW_BUCKET(tcp->tcp_time_wait_expire);
96 96 ASSERT(tsp->tcp_time_wait_bucket[bucket] == tcp);
97 97 tsp->tcp_time_wait_bucket[bucket] = tcp->tcp_time_wait_next;
98 98 }
99 99 tcp->tcp_time_wait_next = NULL;
100 100 tcp->tcp_time_wait_prev = NULL;
101 101 tcp->tcp_time_wait_expire = 0;
102 102 tsp->tcp_time_wait_cnt--;
103 103
104 104 if (locked)
105 105 mutex_exit(&tsp->tcp_time_wait_lock);
106 106 return (B_TRUE);
107 107 }
108 108
109 109 /* Constants used for fast checking of a localhost address */
110 110 #if defined(_BIG_ENDIAN)
111 111 #define IPv4_LOCALHOST 0x7f000000U
112 112 #define IPv4_LH_MASK 0xffffff00U
113 113 #else
114 114 #define IPv4_LOCALHOST 0x0000007fU
115 115 #define IPv4_LH_MASK 0x00ffffffU
116 116 #endif
117 117
118 118 #define IS_LOCAL_HOST(x) ( \
119 119 ((x)->tcp_connp->conn_ipversion == IPV4_VERSION && \
120 120 ((x)->tcp_connp->conn_laddr_v4 & IPv4_LH_MASK) == IPv4_LOCALHOST) || \
121 121 ((x)->tcp_connp->conn_ipversion == IPV6_VERSION && \
122 122 IN6_IS_ADDR_LOOPBACK(&(x)->tcp_connp->conn_laddr_v6)))
123 123
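For reference, a minimal user-space sketch of the same 127.0.0.0/8 test (illustrative only, not part of this change). The kernel macro above avoids a byte swap by comparing against byte-order-specific constants; this sketch simply normalizes the address with ntohl() instead.

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

/* Is this IPv4 address (network byte order) a loopback address? */
static int
is_ipv4_loopback(uint32_t addr_nbo)
{
	return ((ntohl(addr_nbo) & 0xff000000U) == 0x7f000000U);
}

int
main(void)
{
	struct in_addr a;

	(void) inet_pton(AF_INET, "127.0.0.1", &a);
	(void) printf("%d\n", is_ipv4_loopback(a.s_addr));	/* prints 1 */
	(void) inet_pton(AF_INET, "10.0.0.1", &a);
	(void) printf("%d\n", is_ipv4_loopback(a.s_addr));	/* prints 0 */
	return (0);
}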
124 124
125 125 /*
126 126 * Add a connection to the list of detached TIME_WAIT connections
127 127 * and set its time to expire.
128 128 */
129 129 void
130 130 tcp_time_wait_append(tcp_t *tcp)
131 131 {
132 132 tcp_stack_t *tcps = tcp->tcp_tcps;
133 133 squeue_t *sqp = tcp->tcp_connp->conn_sqp;
134 134 tcp_squeue_priv_t *tsp =
135 135 *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
136 136 int64_t now, schedule;
137 137 unsigned int bucket;
138 138
139 139 tcp_timers_stop(tcp);
140 140
141 141 /* Freed above */
142 142 ASSERT(tcp->tcp_timer_tid == 0);
143 143 ASSERT(tcp->tcp_ack_tid == 0);
144 144
145 145 /* must have happened at the time of detaching the tcp */
146 146 ASSERT(TCP_IS_DETACHED(tcp));
147 147 ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
148 148 ASSERT(tcp->tcp_ptpahn == NULL);
149 149 ASSERT(tcp->tcp_flow_stopped == 0);
150 150 ASSERT(tcp->tcp_time_wait_next == NULL);
151 151 ASSERT(tcp->tcp_time_wait_prev == NULL);
152 152 ASSERT(tcp->tcp_time_wait_expire == 0);
153 153 ASSERT(tcp->tcp_listener == NULL);
154 154
155 155 TCP_DBGSTAT(tcps, tcp_time_wait);
156 156 mutex_enter(&tsp->tcp_time_wait_lock);
157 157
158 158 /*
159 159 * Immediately expire loopback connections. Since there is no worry
160 160 * about packets on the local host showing up after a long network
161 161 * delay, this is safe and allows much higher rates of connection churn
162 162 * for applications operating locally.
163 163 *
164 164 * This typically bypasses the tcp_free_list fast path due to squeue
165 165 * re-entry for the loopback close operation.
166 166 */
167 167 if (tcp->tcp_loopback) {
168 168 tcp_time_wait_purge(tcp, tsp);
169 169 mutex_exit(&tsp->tcp_time_wait_lock);
170 170 return;
171 171 }
172 172
173 173 /*
174 174 * In order to reap TIME_WAITs reliably, we should use a source of time
175 175 * that is not adjustable by the user. While it would be more accurate
176 176 * to grab this timestamp before (potentially) sleeping on the
177 177 * tcp_time_wait_lock, doing so complicates bucket addressing later.
178 178 */
179 179 now = ddi_get_lbolt64();
180 180
181 181 /*
182 182 * Each squeue uses an arbitrary time offset when scheduling
183 183 * expiration timers. This prevents the bucketing from forcing
184 184	 * tcp_time_wait_collector to run in lockstep across squeues.
185 185 *
186 186 * This offset is (re)initialized when a new TIME_WAIT connection is
187 187 * added to an squeue which has no connections waiting to expire.
188 188 */
189 189 if (tsp->tcp_time_wait_tid == 0) {
190 190 ASSERT(tsp->tcp_time_wait_cnt == 0);
191 191 tsp->tcp_time_wait_offset =
192 192 now % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
193 193 }
194 194 now -= tsp->tcp_time_wait_offset;
195 195
196 196 /*
197 197 * Use the netstack-defined timeout, rounded up to the minimum
198 198 * time_wait_collector interval.
199 199 */
200 200 schedule = now + MSEC_TO_TICK(tcps->tcps_time_wait_interval);
201 201 tcp->tcp_time_wait_expire = schedule;
202 202
203 203 /*
204 204 * Append the connection into the appropriate bucket.
205 205 */
206 206 bucket = TW_BUCKET(tcp->tcp_time_wait_expire);
207 207 tcp->tcp_time_wait_next = tsp->tcp_time_wait_bucket[bucket];
208 208 tsp->tcp_time_wait_bucket[bucket] = tcp;
209 209 if (tcp->tcp_time_wait_next != NULL) {
210 210 ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == NULL);
211 211 tcp->tcp_time_wait_next->tcp_time_wait_prev = tcp;
212 212 }
213 213 tsp->tcp_time_wait_cnt++;
214 214
215 215 /*
216 216 * Round delay up to the nearest bucket boundary.
217 217 */
218 218 schedule += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
219 219 schedule -= schedule % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
220 220
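	/*
	 * Worked example (assumed granularity of 1000 ticks): for an expiry
	 * of 4321, the two lines above compute 4321 + 1000 == 5321 and then
	 * 5321 - (5321 % 1000) == 5000, aiming the callout at the first
	 * bucket boundary after the connection expires.
	 */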
221 221 /*
222 222 * The newly inserted entry may require a tighter schedule for the
223 223 * expiration timer.
224 224 */
225 225 if (schedule < tsp->tcp_time_wait_schedule) {
226 226 callout_id_t old_tid = tsp->tcp_time_wait_tid;
227 227
228 228 tsp->tcp_time_wait_schedule = schedule;
229 229 tsp->tcp_time_wait_tid =
230 230 timeout_generic(CALLOUT_NORMAL,
231 231 tcp_time_wait_collector, sqp,
232 232 TICK_TO_NSEC(schedule - now),
233 233 CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
234 234
235 235 /*
236 236 * It is possible for the timer to fire before the untimeout
237 237 * action is able to complete. In that case, the exclusion
238 238 * offered by the tcp_time_wait_collector_active flag will
239 239 * prevent multiple collector threads from processing records
240 240 * simultaneously from the same squeue.
241 241 */
242 242 mutex_exit(&tsp->tcp_time_wait_lock);
243 243 (void) untimeout_default(old_tid, 0);
244 244 return;
245 245 }
246 246
247 247 /*
248 248 * Start a fresh timer if none exists.
249 249 */
250 250 if (tsp->tcp_time_wait_schedule == 0) {
251 251 ASSERT(tsp->tcp_time_wait_tid == 0);
252 252
253 253 tsp->tcp_time_wait_schedule = schedule;
254 254 tsp->tcp_time_wait_tid =
255 255 timeout_generic(CALLOUT_NORMAL,
256 256 tcp_time_wait_collector, sqp,
257 257 TICK_TO_NSEC(schedule - now),
258 258 CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
259 259 }
260 260 mutex_exit(&tsp->tcp_time_wait_lock);
261 261 }
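For context, here is a minimal user-space sketch of the timing-wheel insertion performed above. The structure names, bucket count, and granularity are assumptions for illustration only, not the kernel's types or values.

#include <stddef.h>
#include <stdio.h>

#define	WHEEL_BUCKETS	16	/* assumed bucket count */
#define	BUCKET_TICKS	100	/* assumed per-bucket granularity */

typedef struct twconn {
	struct twconn	*next;
	struct twconn	*prev;
	long		expire;		/* absolute expiry, in ticks */
} twconn_t;

static twconn_t *wheel[WHEEL_BUCKETS];

/* Map an absolute tick value to its bucket, as TW_BUCKET() does. */
static unsigned int
bucket_of(long ticks)
{
	return ((unsigned int)((ticks / BUCKET_TICKS) % WHEEL_BUCKETS));
}

/* Prepend a connection to the bucket covering its expiry time. */
static void
wheel_append(twconn_t *c, long now, long timeout)
{
	unsigned int b;

	c->expire = now + timeout;
	b = bucket_of(c->expire);
	c->prev = NULL;
	c->next = wheel[b];
	if (c->next != NULL)
		c->next->prev = c;
	wheel[b] = c;
}

int
main(void)
{
	twconn_t c;

	wheel_append(&c, 4321, 600);	/* expires at tick 4921 */
	(void) printf("bucket %u\n", bucket_of(c.expire));	/* bucket 1 */
	return (0);
}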
262 262
263 263 /*
264 264 * Wrapper to call tcp_close_detached() via squeue to clean up TIME-WAIT
265 265 * tcp_t. Used in tcp_time_wait_collector().
266 266 */
267 267 /* ARGSUSED */
268 268 static void
269 269 tcp_timewait_close(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
270 270 {
271 271 conn_t *connp = (conn_t *)arg;
272 272 tcp_t *tcp = connp->conn_tcp;
273 273
274 274 ASSERT(tcp != NULL);
275 275 if (tcp->tcp_state == TCPS_CLOSED) {
276 276 return;
277 277 }
278 278
279 279 ASSERT((connp->conn_family == AF_INET &&
280 280 connp->conn_ipversion == IPV4_VERSION) ||
281 281 (connp->conn_family == AF_INET6 &&
282 282 (connp->conn_ipversion == IPV4_VERSION ||
283 283 connp->conn_ipversion == IPV6_VERSION)));
284 284 ASSERT(!tcp->tcp_listener);
285 285
286 286 ASSERT(TCP_IS_DETACHED(tcp));
287 287
288 288 /*
289 289 * Because they have no upstream client to rebind or tcp_close()
290 290 * them later, we axe the connection here and now.
291 291 */
292 292 tcp_close_detached(tcp);
293 293 }
294 294
295 295
296 296 static void
297 297 tcp_time_wait_purge(tcp_t *tcp, tcp_squeue_priv_t *tsp)
298 298 {
299 299 mblk_t *mp;
300 300 conn_t *connp = tcp->tcp_connp;
301 301 kmutex_t *lock;
302 302
303 303 ASSERT(MUTEX_HELD(&tsp->tcp_time_wait_lock));
304 304 ASSERT(connp->conn_fanout != NULL);
305 305
306 306 lock = &connp->conn_fanout->connf_lock;
307 307
308 308 /*
309 309 * This is essentially a TIME_WAIT reclaim fast path optimization for
310 310	 * performance: the connection is checked under the fanout lock (so
311 311	 * that no one else can get access to the conn_t) to confirm that the
312 312	 * refcnt is 2 (one each for TCP and the classifier hash list). If that
313 313	 * is the case and clustering callbacks are not enabled, the conn can be
314 314	 * removed under the fanout lock, avoiding clean-up under the squeue.
315 315 *
316 316 * This optimization is forgone when clustering is enabled since the
317 317 * clustering callback must be made before setting the CONDEMNED flag
318 318	 * and after dropping all locks.
319 319 *
320 320 * See the comments in tcp_closei_local for additional information
321 321 * regarding the refcnt logic.
322 322 */
323 323 if (mutex_tryenter(lock)) {
324 324 mutex_enter(&connp->conn_lock);
325 325 if (connp->conn_ref == 2 && cl_inet_disconnect == NULL) {
326 326 ipcl_hash_remove_locked(connp, connp->conn_fanout);
327 327 /*
328 328			 * Set the CONDEMNED flag now so that the refcnt
329 329			 * cannot be increased by any walker.
330 330 */
331 331 connp->conn_state_flags |= CONN_CONDEMNED;
332 332 mutex_exit(&connp->conn_lock);
333 333 mutex_exit(lock);
334 334 if (tsp->tcp_free_list_cnt < tcp_free_list_max_cnt) {
335 335 /*
336 336 * Add to head of tcp_free_list
337 337 */
338 338 tcp_cleanup(tcp);
339 339 ASSERT(connp->conn_latch == NULL);
340 340 ASSERT(connp->conn_policy == NULL);
341 341 ASSERT(tcp->tcp_tcps == NULL);
342 342 ASSERT(connp->conn_netstack == NULL);
343 343
344 344 tcp->tcp_time_wait_next = tsp->tcp_free_list;
345 345 tcp->tcp_in_free_list = B_TRUE;
346 346 tsp->tcp_free_list = tcp;
347 347 tsp->tcp_free_list_cnt++;
348 348 } else {
349 349 /*
350 350 * Do not add to tcp_free_list
351 351 */
352 352 tcp_bind_hash_remove(tcp);
353 353 ixa_cleanup(tcp->tcp_connp->conn_ixa);
354 354 tcp_ipsec_cleanup(tcp);
355 355 CONN_DEC_REF(tcp->tcp_connp);
356 356 }
357 357
358 358 /*
359 359 * With the fast-path complete, we can bail.
360 360 */
361 361 return;
362 362 } else {
363 363 /*
364 364 * Fall back to slow path.
365 365 */
366 366 CONN_INC_REF_LOCKED(connp);
367 367 mutex_exit(&connp->conn_lock);
368 368 mutex_exit(lock);
369 369 }
370 370 } else {
371 371 CONN_INC_REF(connp);
372 372 }
373 373
374 374 /*
375 375 * We can reuse the closemp here since conn has detached (otherwise we
376 376 * wouldn't even be in time_wait list). It is safe to change
377 377 * tcp_closemp_used without taking a lock as no other thread can
378 378 * concurrently access it at this point in the connection lifecycle.
379 379 */
380 380 if (tcp->tcp_closemp.b_prev == NULL) {
381 381 tcp->tcp_closemp_used = B_TRUE;
382 382 } else {
383 383 cmn_err(CE_PANIC,
384 384 "tcp_timewait_collector: concurrent use of tcp_closemp: "
385 385 "connp %p tcp %p\n", (void *)connp, (void *)tcp);
386 386 }
387 387
388 388 TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
389 389 mp = &tcp->tcp_closemp;
390 390 mutex_exit(&tsp->tcp_time_wait_lock);
391 391 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timewait_close, connp, NULL,
392 392 SQ_FILL, SQTAG_TCP_TIMEWAIT);
393 393 mutex_enter(&tsp->tcp_time_wait_lock);
394 394 }
395 395
396 396 /*
397 397 * Purge any tcp_t instances associated with this squeue which have expired
398 398 * from the TIME_WAIT state.
399 399 */
400 400 void
401 401 tcp_time_wait_collector(void *arg)
402 402 {
403 403 tcp_t *tcp;
404 404 int64_t now, sched_active, sched_cur, sched_new;
405 405 unsigned int idx;
406 406
407 407 squeue_t *sqp = (squeue_t *)arg;
408 408 tcp_squeue_priv_t *tsp =
409 409 *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
410 410
411 411 mutex_enter(&tsp->tcp_time_wait_lock);
412 412
413 413 /*
414 414 * Because of timer scheduling complexity and the fact that the
415 415 * tcp_time_wait_lock is dropped during tcp_time_wait_purge, it is
416 416 * possible for multiple tcp_time_wait_collector threads to run against
417 417 * the same squeue. This flag is used to exclude other collectors from
418 418 * the squeue during execution.
419 419 */
420 420 if (tsp->tcp_time_wait_collector_active) {
421 421 mutex_exit(&tsp->tcp_time_wait_lock);
422 422 return;
423 423 }
424 424 tsp->tcp_time_wait_collector_active = B_TRUE;
425 425
426 426 /*
427 427 * After its assignment here, the value of sched_active must not be
428 428 * altered as it is used to validate the state of the
429 429 * tcp_time_wait_collector callout schedule for this squeue.
430 430 *
431 431 * The same does not hold true of sched_cur, which holds the timestamp
432 432 * of the bucket undergoing processing. While it is initially equal to
433 433 * sched_active, certain conditions below can walk it forward,
434 434 * triggering the retry loop.
435 435 */
436 436 sched_cur = sched_active = tsp->tcp_time_wait_schedule;
437 437
438 438 /*
439 439 * Purge the free list if necessary
440 440 */
441 441 if (tsp->tcp_free_list != NULL) {
442 442 TCP_G_STAT(tcp_freelist_cleanup);
443 443 while ((tcp = tsp->tcp_free_list) != NULL) {
444 444 tsp->tcp_free_list = tcp->tcp_time_wait_next;
445 445 tcp->tcp_time_wait_next = NULL;
446 446 tsp->tcp_free_list_cnt--;
447 447 ASSERT(tcp->tcp_tcps == NULL);
448 448 CONN_DEC_REF(tcp->tcp_connp);
449 449 }
450 450 ASSERT(tsp->tcp_free_list_cnt == 0);
451 451 }
452 452
453 453 /*
454 454 * If there are no connections pending, clear timer-related state to be
455 455 * reinitialized by the next caller.
456 456 */
457 457 if (tsp->tcp_time_wait_cnt == 0) {
458 458 tsp->tcp_time_wait_offset = 0;
459 459 tsp->tcp_time_wait_schedule = 0;
460 460 tsp->tcp_time_wait_tid = 0;
461 461 tsp->tcp_time_wait_collector_active = B_FALSE;
462 462 mutex_exit(&tsp->tcp_time_wait_lock);
463 463 return;
464 464 }
465 465
466 466 retry:
467 467 /*
468 468 * Grab the bucket which we were scheduled to cleanse.
469 469 */
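	/*
	 * Illustrative example (assumed granularity of 1000 ticks): when
	 * sched_cur == 5000, TW_BUCKET(sched_cur - 1) == TW_BUCKET(4999)
	 * selects the bucket covering [4000, 5000), whose entire interval
	 * has expired by the time this timer fires.  TW_BUCKET(sched_cur)
	 * would instead select the still-active [5000, 6000) bucket.
	 */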
470 470 idx = TW_BUCKET(sched_cur - 1);
471 471 now = ddi_get_lbolt64() - tsp->tcp_time_wait_offset;
472 472 tcp = tsp->tcp_time_wait_bucket[idx];
473 473
474 474 while (tcp != NULL) {
475 475 /*
476 476 * Since the bucket count is sized to prevent wrap-around
477 477		 * during typical operation and timers are scheduled to process
478 478 * buckets with only expired connections, there is only one
479 479 * reason to encounter a connection expiring in the future:
480 480 * The tcp_time_wait_collector thread has been so delayed in
481 481 * its processing that connections have wrapped around the
482 482 * timing wheel into this bucket.
483 483 *
484 484		 * In that case, the remaining entries in the bucket can be
485 485 * ignored since, being appended sequentially, they should all
486 486 * expire in the future.
487 487 */
488 488 if (now < tcp->tcp_time_wait_expire) {
489 489 break;
490 490 }
491 491
492 492 /*
493 493 * Pull the connection out of the bucket.
494 494 */
495 495 VERIFY(tcp_time_wait_remove(tcp, tsp));
496 496
497 497 /*
498 498 * Purge the connection.
499 499 *
500 500 * While tcp_time_wait_lock will be temporarily dropped as part
501 501 * of the process, there is no risk of the timer being
502 502 * (re)scheduled while the collector is running since a value
503 503 * corresponding to the past is left in tcp_time_wait_schedule.
504 504 */
505 505 tcp_time_wait_purge(tcp, tsp);
506 506
507 507 /*
508 508 * Because tcp_time_wait_remove clears the tcp_time_wait_next
509 509 * field, the next item must be grabbed directly from the
510 510 * bucket itself.
511 511 */
512 512 tcp = tsp->tcp_time_wait_bucket[idx];
513 513 }
514 514
515 515 if (tsp->tcp_time_wait_cnt == 0) {
516 516 /*
517 517		 * There is no need for the collector to schedule a new
518 518 * timer if no pending items remain. The timer state can be
519 519 * cleared only if it was untouched while the collector dropped
520 520 * its locks during tcp_time_wait_purge.
521 521 */
522 522 if (tsp->tcp_time_wait_schedule == sched_active) {
523 523 tsp->tcp_time_wait_offset = 0;
524 524 tsp->tcp_time_wait_schedule = 0;
525 525 tsp->tcp_time_wait_tid = 0;
526 526 }
527 527 tsp->tcp_time_wait_collector_active = B_FALSE;
528 528 mutex_exit(&tsp->tcp_time_wait_lock);
529 529 return;
530 530 } else {
531 531 unsigned int nidx;
532 532
533 533 /*
534 534 * Locate the next bucket containing entries.
535 535 */
536 536 sched_new = sched_cur + MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
537 537 nidx = TW_BUCKET_NEXT(idx);
538 538 while (tsp->tcp_time_wait_bucket[nidx] == NULL) {
539 539 if (nidx == idx) {
540 540 break;
541 541 }
542 542 nidx = TW_BUCKET_NEXT(nidx);
543 543 sched_new += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
544 544 }
545 545 ASSERT(tsp->tcp_time_wait_bucket[nidx] != NULL);
546 546 }
547 547
548 548 /*
549 549 * It is possible that the system is under such dire load that between
550 550 * the timer scheduling and TIME_WAIT processing delay, execution
551 551 * overran the interval allocated to this bucket.
552 552 */
553 553 now = ddi_get_lbolt64() - tsp->tcp_time_wait_offset;
554 554 if (sched_new <= now) {
555 555 /*
556 556 * Attempt to right the situation by immediately performing a
557 557 * purge on the next bucket. This loop will continue as needed
558 558 * until the schedule can be pushed out ahead of the clock.
559 559 */
560 560 sched_cur = sched_new;
561 561 DTRACE_PROBE3(tcp__time__wait__overrun,
562 562 tcp_squeue_priv_t *, tsp, int64_t, sched_new, int64_t, now);
563 563 goto retry;
564 564 }
565 565
566 566 /*
567 567 * Another thread may have snuck in to reschedule the timer while locks
568 568 * were dropped during tcp_time_wait_purge. Defer to the running timer
569 569 * if that is the case.
570 570 */
571 571 if (tsp->tcp_time_wait_schedule != sched_active) {
572 572 tsp->tcp_time_wait_collector_active = B_FALSE;
573 573 mutex_exit(&tsp->tcp_time_wait_lock);
574 574 return;
575 575 }
576 576
577 577 /*
578 578 * Schedule the next timer.
579 579 */
580 580 tsp->tcp_time_wait_schedule = sched_new;
581 581 tsp->tcp_time_wait_tid =
582 582 timeout_generic(CALLOUT_NORMAL,
583 583 tcp_time_wait_collector, sqp,
584 584 TICK_TO_NSEC(sched_new - now),
585 585 CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
586 586 tsp->tcp_time_wait_collector_active = B_FALSE;
587 587 mutex_exit(&tsp->tcp_time_wait_lock);
588 588 }
589 589
590 590 /*
591 591 * tcp_time_wait_processing() handles processing of incoming packets when
592 592 * the tcp_t is in the TIME_WAIT state.
593 593 *
594 594 * A TIME_WAIT tcp_t that has an associated open TCP end point (not in
595 595 * detached state) is never put on the time wait list.
596 596 */
597 597 void
598 598 tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq,
599 599 uint32_t seg_ack, int seg_len, tcpha_t *tcpha, ip_recv_attr_t *ira)
600 600 {
601 601 int32_t bytes_acked;
602 602 int32_t gap;
603 603 int32_t rgap;
604 604 tcp_opt_t tcpopt;
605 605 uint_t flags;
606 606 uint32_t new_swnd = 0;
607 607 conn_t *nconnp;
608 608 conn_t *connp = tcp->tcp_connp;
609 609 tcp_stack_t *tcps = tcp->tcp_tcps;
610 610
611 - BUMP_LOCAL(tcp->tcp_ibsegs);
611 + TCPS_BUMP_MIB(tcps, tcpHCInSegs);
612 612 DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);
613 613
614 614 flags = (unsigned int)tcpha->tha_flags & 0xFF;
615 615 new_swnd = ntohs(tcpha->tha_win) <<
616 616 ((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws);
617 617
618 618 boolean_t keepalive = (seg_len == 0 || seg_len == 1) &&
619 619 (seg_seq + 1 == tcp->tcp_rnxt);
620 620 if (tcp->tcp_snd_ts_ok && !(flags & TH_RST) && !keepalive) {
621 621 int options;
622 622 if (tcp->tcp_snd_sack_ok)
623 623 tcpopt.tcp = tcp;
624 624 else
625 625 tcpopt.tcp = NULL;
626 626 options = tcp_parse_options(tcpha, &tcpopt);
627 627 if (!(options & TCP_OPT_TSTAMP_PRESENT)) {
628 628 DTRACE_TCP1(droppedtimestamp, tcp_t *, tcp);
629 629 goto done;
630 630 } else if (!tcp_paws_check(tcp, &tcpopt)) {
631 631 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt,
632 632 TH_ACK);
633 633 goto done;
634 634 }
635 635 }
636 636 gap = seg_seq - tcp->tcp_rnxt;
637 637 rgap = tcp->tcp_rwnd - (gap + seg_len);
638 638 if (gap < 0) {
639 639 TCPS_BUMP_MIB(tcps, tcpInDataDupSegs);
640 640 TCPS_UPDATE_MIB(tcps, tcpInDataDupBytes,
641 641 (seg_len > -gap ? -gap : seg_len));
642 642 seg_len += gap;
643 643 if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) {
644 644 if (flags & TH_RST) {
645 645 goto done;
646 646 }
647 647 if ((flags & TH_FIN) && seg_len == -1) {
648 648 /*
649 649 * When TCP receives a duplicate FIN in
650 650 * TIME_WAIT state, restart the 2 MSL timer.
651 651 * See page 73 in RFC 793. Make sure this TCP
652 652 * is already on the TIME_WAIT list. If not,
653 653 * just restart the timer.
654 654 */
655 655 if (TCP_IS_DETACHED(tcp)) {
656 656 if (tcp_time_wait_remove(tcp, NULL) ==
657 657 B_TRUE) {
658 658 tcp_time_wait_append(tcp);
659 659 TCP_DBGSTAT(tcps,
660 660 tcp_rput_time_wait);
661 661 }
662 662 } else {
663 663 ASSERT(tcp != NULL);
664 664 TCP_TIMER_RESTART(tcp,
665 665 tcps->tcps_time_wait_interval);
666 666 }
667 667 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
668 668 tcp->tcp_rnxt, TH_ACK);
669 669 goto done;
670 670 }
671 671 flags |= TH_ACK_NEEDED;
672 672 seg_len = 0;
673 673 goto process_ack;
674 674 }
675 675
676 676 /* Fix seg_seq, and chew the gap off the front. */
677 677 seg_seq = tcp->tcp_rnxt;
678 678 }
679 679
680 680 if ((flags & TH_SYN) && gap > 0 && rgap < 0) {
681 681 /*
682 682		 * Make sure that when we accept the connection, we pick
683 683 * an ISS greater than (tcp_snxt + tcp_iss_incr/2) for the
684 684 * old connection.
685 685 *
686 686 * The next ISS generated is equal to tcp_iss_incr_extra
687 687 * + tcp_iss_incr/2 + other components depending on the
688 688 * value of tcp_strong_iss. We pre-calculate the new
689 689 * ISS here and compare with tcp_snxt to determine if
690 690 * we need to make adjustment to tcp_iss_incr_extra.
691 691 *
692 692 * The above calculation is ugly and is a
693 693 * waste of CPU cycles...
694 694 */
695 695 uint32_t new_iss = tcps->tcps_iss_incr_extra;
696 696 int32_t adj;
697 697 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
698 698
699 699 switch (tcps->tcps_strong_iss) {
700 700 case 2: {
701 701 /* Add time and MD5 components. */
702 702 uint32_t answer[4];
703 703 struct {
704 704 uint32_t ports;
705 705 in6_addr_t src;
706 706 in6_addr_t dst;
707 707 } arg;
708 708 MD5_CTX context;
709 709
710 710 mutex_enter(&tcps->tcps_iss_key_lock);
711 711 context = tcps->tcps_iss_key;
712 712 mutex_exit(&tcps->tcps_iss_key_lock);
713 713 arg.ports = connp->conn_ports;
714 714 /* We use MAPPED addresses in tcp_iss_init */
715 715 arg.src = connp->conn_laddr_v6;
716 716 arg.dst = connp->conn_faddr_v6;
717 717 MD5Update(&context, (uchar_t *)&arg,
718 718 sizeof (arg));
719 719 MD5Final((uchar_t *)answer, &context);
720 720 answer[0] ^= answer[1] ^ answer[2] ^ answer[3];
721 721 new_iss += (gethrtime() >> ISS_NSEC_SHT) + answer[0];
722 722 break;
723 723 }
724 724 case 1:
725 725 /* Add time component and min random (i.e. 1). */
726 726 new_iss += (gethrtime() >> ISS_NSEC_SHT) + 1;
727 727 break;
728 728 default:
729 729 /* Add only time component. */
730 730 new_iss += (uint32_t)gethrestime_sec() *
731 731 tcps->tcps_iss_incr;
732 732 break;
733 733 }
734 734 if ((adj = (int32_t)(tcp->tcp_snxt - new_iss)) > 0) {
735 735 /*
736 736 * New ISS not guaranteed to be tcp_iss_incr/2
737 737 * ahead of the current tcp_snxt, so add the
738 738 * difference to tcp_iss_incr_extra.
739 739 */
740 740 tcps->tcps_iss_incr_extra += adj;
741 741 }
742 742 /*
743 743 * If tcp_clean_death() can not perform the task now,
744 744 * drop the SYN packet and let the other side re-xmit.
745 745 * Otherwise pass the SYN packet back in, since the
746 746 * old tcp state has been cleaned up or freed.
747 747 */
748 748 if (tcp_clean_death(tcp, 0) == -1)
749 749 goto done;
750 750 nconnp = ipcl_classify(mp, ira, ipst);
751 751 if (nconnp != NULL) {
752 752 TCP_STAT(tcps, tcp_time_wait_syn_success);
753 753 /* Drops ref on nconnp */
754 754 tcp_reinput(nconnp, mp, ira, ipst);
755 755 return;
756 756 }
757 757 goto done;
758 758 }
759 759
760 760 /*
761 761	 * rgap is how much of the receive window remains after this segment.
762 762	 * A negative value means that many bytes arrived beyond the window.
763 763 */
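	/*
	 * Worked example with assumed numbers: if tcp_rnxt == 1000,
	 * tcp_rwnd == 500, seg_seq == 1200 and seg_len == 400, then
	 * gap == 200 and rgap == 500 - (200 + 400) == -100; the final 100
	 * bytes of the segment lie beyond the window, and seg_len is
	 * trimmed to 300 below.
	 */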
764 764 if (rgap < 0) {
765 765 TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs);
766 766 TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap);
767 767 /* Fix seg_len and make sure there is something left. */
768 768 seg_len += rgap;
769 769 if (seg_len <= 0) {
770 770 if (flags & TH_RST) {
771 771 goto done;
772 772 }
773 773 flags |= TH_ACK_NEEDED;
774 774 seg_len = 0;
775 775 goto process_ack;
776 776 }
777 777 }
778 778 /*
779 779 * Check whether we can update tcp_ts_recent. This test is from RFC
780 780 * 7323, section 5.3.
781 781 */
782 782 if (tcp->tcp_snd_ts_ok && !(flags & TH_RST) &&
783 783 TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) &&
784 784 SEQ_LEQ(seg_seq, tcp->tcp_rack)) {
785 785 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val;
786 786 tcp->tcp_last_rcv_lbolt = ddi_get_lbolt64();
787 787 }
788 788
789 789 if (seg_seq != tcp->tcp_rnxt && seg_len > 0) {
790 790 /* Always ack out of order packets */
791 791 flags |= TH_ACK_NEEDED;
792 792 seg_len = 0;
793 793 } else if (seg_len > 0) {
794 794 TCPS_BUMP_MIB(tcps, tcpInClosed);
795 795 TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
796 796 TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len);
797 + tcp->tcp_cs.tcp_in_data_inorder_segs++;
798 + tcp->tcp_cs.tcp_in_data_inorder_bytes += seg_len;
797 799 }
798 800 if (flags & TH_RST) {
799 801 (void) tcp_clean_death(tcp, 0);
800 802 goto done;
801 803 }
802 804 if (flags & TH_SYN) {
803 805 tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1,
804 806 TH_RST|TH_ACK);
805 807 /*
806 808 * Do not delete the TCP structure if it is in
807 809 * TIME_WAIT state. Refer to RFC 1122, 4.2.2.13.
808 810 */
809 811 goto done;
810 812 }
811 813 process_ack:
812 814 if (flags & TH_ACK) {
813 815 bytes_acked = (int)(seg_ack - tcp->tcp_suna);
814 816 if (bytes_acked <= 0) {
815 817 if (bytes_acked == 0 && seg_len == 0 &&
816 818 new_swnd == tcp->tcp_swnd)
817 819 TCPS_BUMP_MIB(tcps, tcpInDupAck);
818 820 } else {
819 821 /* Acks something not sent */
820 822 flags |= TH_ACK_NEEDED;
821 823 }
822 824 }
823 825 if (flags & TH_ACK_NEEDED) {
824 826 /*
825 827 * Time to send an ack for some reason.
826 828 */
827 829 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
828 830 tcp->tcp_rnxt, TH_ACK);
829 831 }
830 832 done:
831 833 freemsg(mp);
832 834 }