Print this page
11546 Track TCP round-trip time in nanoseconds
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Portions contributed by: Brandon Baker <bbaker@delphix.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/inet/tcp/tcp_timers.c
+++ new/usr/src/uts/common/inet/tcp/tcp_timers.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
↓ open down ↓ |
15 lines elided |
↑ open up ↑ |
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
25 25 * Copyright 2011 Joyent, Inc. All rights reserved.
26 - * Copyright (c) 2014 by Delphix. All rights reserved.
26 + * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
27 27 */
28 28
29 29 #include <sys/types.h>
30 30 #include <sys/strlog.h>
31 31 #include <sys/strsun.h>
32 32 #include <sys/squeue_impl.h>
33 33 #include <sys/squeue.h>
34 34 #include <sys/callo.h>
35 35 #include <sys/strsubr.h>
36 36
37 37 #include <inet/common.h>
38 38 #include <inet/ip.h>
39 39 #include <inet/ip_ire.h>
40 40 #include <inet/ip_rts.h>
41 41 #include <inet/tcp.h>
42 42 #include <inet/tcp_impl.h>
43 43
44 44 /*
45 45 * Implementation of TCP Timers.
46 46 * =============================
47 47 *
48 48 * INTERFACE:
49 49 *
50 50 * There are two basic functions dealing with tcp timers:
51 51 *
52 52 * timeout_id_t tcp_timeout(connp, func, time)
53 53 * clock_t tcp_timeout_cancel(connp, timeout_id)
54 54 * TCP_TIMER_RESTART(tcp, intvl)
55 55 *
56 56 * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func'
57 57 * after 'time' ticks passed. The function called by timeout() must adhere to
58 58 * the same restrictions as a driver soft interrupt handler - it must not sleep
59 59 * or call other functions that might sleep. The value returned is the opaque
60 60 * non-zero timeout identifier that can be passed to tcp_timeout_cancel() to
61 61 * cancel the request. The call to tcp_timeout() may fail in which case it
62 62 * returns zero. This is different from the timeout(9F) function which never
63 63 * fails.
64 64 *
65 65 * The call-back function 'func' always receives 'connp' as its single
66 66 * argument. It is always executed in the squeue corresponding to the tcp
67 67 * structure. The tcp structure is guaranteed to be present at the time the
68 68 * call-back is called.
69 69 *
70 70 * NOTE: The call-back function 'func' is never called if tcp is in
71 71 * the TCPS_CLOSED state.
72 72 *
73 73 * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout()
74 74 * request. Locks acquired by the call-back routine should not be held across
75 75 * the call to tcp_timeout_cancel() or a deadlock may result.
76 76 *
77 77 * tcp_timeout_cancel() returns -1 if the timeout request is invalid.
78 78 * Otherwise, it returns an integer value greater than or equal to 0.
79 79 *
80 80 * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
81 81 * within squeue context corresponding to the tcp instance. Since the
82 82 * call-back is also called via the same squeue, there are no race
83 83 * conditions described in untimeout(9F) manual page since all calls are
84 84 * strictly serialized.
85 85 *
86 86 * TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
87 87 * stored in tcp_timer_tid and starts a new one using
88 88 * MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back
89 89 * and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid
90 90 * field.
91 91 *
92 92 * IMPLEMENTATION:
93 93 *
94 94 * TCP timers are implemented using a three-stage process. The call to
95 95 * tcp_timeout() uses timeout(9F) function to call tcp_timer_callback() function
96 96 * when the timer expires. The tcp_timer_callback() arranges the call of the
97 97 * tcp_timer_handler() function via squeue corresponding to the tcp
98 98 * instance. The tcp_timer_handler() calls actual requested timeout call-back
99 99 * and passes tcp instance as an argument to it. Information is passed between
100 100 * stages using the tcp_timer_t structure which contains the connp pointer, the
101 101 * tcp call-back to call and the timeout id returned by the timeout(9F).
102 102 *
103 103 * The tcp_timer_t structure is not used directly, it is embedded in an mblk_t -
104 104 * like structure that is used to enter an squeue. The mp->b_rptr of this pseudo
105 105 * mblk points to the beginning of tcp_timer_t structure. The tcp_timeout()
106 106 * returns the pointer to this mblk.
107 107 *
108 108 * The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It
109 109 * looks like a normal mblk without actual dblk attached to it.
110 110 *
111 111 * To optimize performance each tcp instance holds a small cache of timer
112 112 * mblocks. In the current implementation it caches up to two timer mblocks per
113 113 * tcp instance. The cache is preserved over tcp frees and is only freed when
114 114 * the whole tcp structure is destroyed by its kmem destructor. Since all tcp
115 115 * timer processing happens on a corresponding squeue, the cache manipulation
116 116 * does not require any locks. Experiments show that the majority of timer mblock
117 117 * allocations are satisfied from the tcp cache and do not involve kmem calls.
118 118 *
119 119 * The tcp_timeout() places a refhold on the connp instance which guarantees
120 120 * that it will be present at the time the call-back function fires. The
121 121 * tcp_timer_handler() drops the reference after calling the call-back, so the
122 122 * call-back function does not need to manipulate the references explicitly.
123 123 */
124 124
125 125 kmem_cache_t *tcp_timercache;
126 126
127 127 static void tcp_ip_notify(tcp_t *);
128 128 static void tcp_timer_callback(void *);
129 129 static void tcp_timer_free(tcp_t *, mblk_t *);
130 130 static void tcp_timer_handler(void *, mblk_t *, void *, ip_recv_attr_t *);
131 131
132 132 /*
133 133 * tim is in millisec.
134 134 */
135 135 timeout_id_t
136 136 tcp_timeout(conn_t *connp, void (*f)(void *), hrtime_t tim)
137 137 {
138 138 mblk_t *mp;
139 139 tcp_timer_t *tcpt;
140 140 tcp_t *tcp = connp->conn_tcp;
141 141
142 142 ASSERT(connp->conn_sqp != NULL);
143 143
144 144 TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_calls);
145 145
146 146 if (tcp->tcp_timercache == NULL) {
147 147 mp = tcp_timermp_alloc(KM_NOSLEEP | KM_PANIC);
148 148 } else {
149 149 TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_cached_alloc);
150 150 mp = tcp->tcp_timercache;
151 151 tcp->tcp_timercache = mp->b_next;
152 152 mp->b_next = NULL;
153 153 ASSERT(mp->b_wptr == NULL);
154 154 }
155 155
156 156 CONN_INC_REF(connp);
157 157 tcpt = (tcp_timer_t *)mp->b_rptr;
158 158 tcpt->connp = connp;
159 159 tcpt->tcpt_proc = f;
160 160 /*
161 161 * TCP timers are normal timeouts. Plus, they do not require more than
162 162 * a 10 millisecond resolution. By choosing a coarser resolution and by
163 163 * rounding up the expiration to the next resolution boundary, we can
164 164 * batch timers in the callout subsystem to make TCP timers more
165 165 * efficient. The roundup also protects short timers from expiring too
166 166 * early before they have a chance to be cancelled.
167 167 */
168 168 tcpt->tcpt_tid = timeout_generic(CALLOUT_NORMAL, tcp_timer_callback, mp,
169 169 tim * MICROSEC, CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
170 170 VERIFY(!(tcpt->tcpt_tid & CALLOUT_ID_FREE));
171 171
172 172 return ((timeout_id_t)mp);
173 173 }
174 174
175 175 static void
176 176 tcp_timer_callback(void *arg)
177 177 {
178 178 mblk_t *mp = (mblk_t *)arg;
179 179 tcp_timer_t *tcpt;
180 180 conn_t *connp;
181 181
182 182 tcpt = (tcp_timer_t *)mp->b_rptr;
183 183 connp = tcpt->connp;
184 184 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timer_handler, connp,
185 185 NULL, SQ_FILL, SQTAG_TCP_TIMER);
186 186 }
187 187
188 188 /* ARGSUSED */
189 189 static void
190 190 tcp_timer_handler(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
191 191 {
192 192 tcp_timer_t *tcpt;
193 193 conn_t *connp = (conn_t *)arg;
194 194 tcp_t *tcp = connp->conn_tcp;
195 195
196 196 tcpt = (tcp_timer_t *)mp->b_rptr;
197 197 ASSERT(connp == tcpt->connp);
198 198 ASSERT((squeue_t *)arg2 == connp->conn_sqp);
199 199
200 200 if (tcpt->tcpt_tid & CALLOUT_ID_FREE) {
201 201 /*
202 202 * This timeout was cancelled after it was enqueued to the
203 203 * squeue; free the timer and return.
204 204 */
205 205 tcp_timer_free(connp->conn_tcp, mp);
206 206 return;
207 207 }
208 208
209 209 /*
210 210 * If the TCP has reached the closed state, don't proceed any
211 211 * further. This TCP logically does not exist on the system.
212 212 * tcpt_proc could for example access queues, that have already
213 213 * been qprocoff'ed off.
214 214 */
215 215 if (tcp->tcp_state != TCPS_CLOSED) {
216 216 (*tcpt->tcpt_proc)(connp);
217 217 } else {
218 218 tcp->tcp_timer_tid = 0;
219 219 }
220 220
221 221 tcp_timer_free(connp->conn_tcp, mp);
222 222 }
223 223
224 224 /*
225 225 * There is a potential race with untimeout and the handler firing at the same
226 226 * time. The mblock may be freed by the handler while we are trying to use
227 227 * it. But since both should execute on the same squeue, this race should not
228 228 * occur.
229 229 */
230 230 clock_t
231 231 tcp_timeout_cancel(conn_t *connp, timeout_id_t id)
232 232 {
233 233 mblk_t *mp = (mblk_t *)id;
234 234 tcp_timer_t *tcpt;
235 235 clock_t delta;
236 236
237 237 TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_cancel_reqs);
238 238
239 239 if (mp == NULL)
240 240 return (-1);
241 241
242 242 tcpt = (tcp_timer_t *)mp->b_rptr;
243 243 ASSERT(tcpt->connp == connp);
244 244
245 245 delta = untimeout_default(tcpt->tcpt_tid, 0);
246 246
247 247 if (delta >= 0) {
248 248 TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_canceled);
249 249 tcp_timer_free(connp->conn_tcp, mp);
250 250 CONN_DEC_REF(connp);
251 251 } else {
252 252 /*
253 253 * If we were unable to untimeout successfully, it has already
254 254 * been enqueued on the squeue; mark the ID with the free
255 255 * bit. This bit can never be set in a valid identifier, and
256 256 * we'll use it to prevent the timeout from being executed.
257 257 * And note that we're within the squeue perimeter here, so
258 258 * we don't need to worry about racing with timer handling
259 259 * (which also executes within the perimeter).
260 260 */
261 261 tcpt->tcpt_tid |= CALLOUT_ID_FREE;
262 262 delta = 0;
263 263 }
264 264
265 265 return (TICK_TO_MSEC(delta));
266 266 }
267 267
268 268 /*
269 269 * Allocate space for the timer event. The allocation looks like an mblk, but it is
270 270 * not a proper mblk. To avoid confusion we set b_wptr to NULL.
271 271 *
272 272 * Dealing with failures: If we can't allocate from the timer cache we try
273 273 * allocating from dblock caches using allocb_tryhard(). In this case b_wptr
274 274 * points to b_rptr.
275 275 * If we can't allocate anything using allocb_tryhard(), we perform a last
276 276 * attempt and use kmem_alloc_tryhard(). In this case we set b_wptr to -1 and
277 277 * save the actual allocation size in b_datap.
278 278 */
279 279 mblk_t *
280 280 tcp_timermp_alloc(int kmflags)
281 281 {
282 282 mblk_t *mp = (mblk_t *)kmem_cache_alloc(tcp_timercache,
283 283 kmflags & ~KM_PANIC);
284 284
285 285 if (mp != NULL) {
286 286 mp->b_next = mp->b_prev = NULL;
287 287 mp->b_rptr = (uchar_t *)(&mp[1]);
288 288 mp->b_wptr = NULL;
289 289 mp->b_datap = NULL;
290 290 mp->b_queue = NULL;
291 291 mp->b_cont = NULL;
292 292 } else if (kmflags & KM_PANIC) {
293 293 /*
294 294 * Failed to allocate memory for the timer. Try allocating from
295 295 * dblock caches.
296 296 */
297 297 /* ipclassifier calls this from a constructor - hence no tcps */
298 298 TCP_G_STAT(tcp_timermp_allocfail);
299 299 mp = allocb_tryhard(sizeof (tcp_timer_t));
300 300 if (mp == NULL) {
301 301 size_t size = 0;
302 302 /*
303 303 * Memory is really low. Try tryhard allocation.
304 304 *
305 305 * ipclassifier calls this from a constructor -
306 306 * hence no tcps
307 307 */
308 308 TCP_G_STAT(tcp_timermp_allocdblfail);
309 309 mp = kmem_alloc_tryhard(sizeof (mblk_t) +
310 310 sizeof (tcp_timer_t), &size, kmflags);
311 311 mp->b_rptr = (uchar_t *)(&mp[1]);
312 312 mp->b_next = mp->b_prev = NULL;
313 313 mp->b_wptr = (uchar_t *)-1;
314 314 mp->b_datap = (dblk_t *)size;
315 315 mp->b_queue = NULL;
316 316 mp->b_cont = NULL;
317 317 }
318 318 ASSERT(mp->b_wptr != NULL);
319 319 }
320 320 /* ipclassifier calls this from a constructor - hence no tcps */
321 321 TCP_G_DBGSTAT(tcp_timermp_alloced);
322 322
323 323 return (mp);
324 324 }
325 325
326 326 /*
327 327 * Free per-tcp timer cache.
328 328 * It can only contain entries from tcp_timercache.
329 329 */
330 330 void
331 331 tcp_timermp_free(tcp_t *tcp)
332 332 {
333 333 mblk_t *mp;
334 334
335 335 while ((mp = tcp->tcp_timercache) != NULL) {
336 336 ASSERT(mp->b_wptr == NULL);
337 337 tcp->tcp_timercache = tcp->tcp_timercache->b_next;
338 338 kmem_cache_free(tcp_timercache, mp);
339 339 }
340 340 }
341 341
342 342 /*
343 343 * Free timer event. Put it on the per-tcp timer cache if there are not too many
344 344 * events there already (currently at most two events are cached).
345 345 * If the event is not allocated from the timer cache, free it right away.
346 346 */
347 347 static void
348 348 tcp_timer_free(tcp_t *tcp, mblk_t *mp)
349 349 {
350 350 mblk_t *mp1 = tcp->tcp_timercache;
351 351
352 352 if (mp->b_wptr != NULL) {
353 353 /*
354 354 * This allocation is not from a timer cache, free it right
355 355 * away.
356 356 */
357 357 if (mp->b_wptr != (uchar_t *)-1)
358 358 freeb(mp);
359 359 else
360 360 kmem_free(mp, (size_t)mp->b_datap);
361 361 } else if (mp1 == NULL || mp1->b_next == NULL) {
362 362 /* Cache this timer block for future allocations */
363 363 mp->b_rptr = (uchar_t *)(&mp[1]);
364 364 mp->b_next = mp1;
365 365 tcp->tcp_timercache = mp;
366 366 } else {
367 367 kmem_cache_free(tcp_timercache, mp);
368 368 TCP_DBGSTAT(tcp->tcp_tcps, tcp_timermp_freed);
369 369 }
370 370 }
371 371
372 372 /*
373 373 * Stop all TCP timers.
374 374 */
375 375 void
376 376 tcp_timers_stop(tcp_t *tcp)
377 377 {
378 378 if (tcp->tcp_timer_tid != 0) {
379 379 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
380 380 tcp->tcp_timer_tid = 0;
381 381 }
382 382 if (tcp->tcp_ka_tid != 0) {
383 383 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid);
384 384 tcp->tcp_ka_tid = 0;
385 385 }
386 386 if (tcp->tcp_ack_tid != 0) {
387 387 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
388 388 tcp->tcp_ack_tid = 0;
389 389 }
390 390 if (tcp->tcp_push_tid != 0) {
391 391 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
392 392 tcp->tcp_push_tid = 0;
393 393 }
394 394 if (tcp->tcp_reass_tid != 0) {
395 395 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_reass_tid);
396 396 tcp->tcp_reass_tid = 0;
397 397 }
398 398 }
399 399
400 400 /*
401 401 * Timer callback routine for keepalive probe. We do a fake resend of
402 402 * last ACKed byte. Then set a timer using RTO. When the timer expires,
403 403 * check to see if we have heard anything from the other end for the last
404 404 * RTO period. If we have, set the timer to expire for another
405 405 * tcp_keepalive_intrvl and check again. If we have not, set a timer using
406 406 * RTO << 1 and check again when it expires. Keep exponentially increasing
407 407 * the timeout if we have not heard from the other side. If for more than
408 408 * (tcp_ka_interval + tcp_ka_abort_thres) we have not heard anything,
409 409 * kill the connection unless the keepalive abort threshold is 0. In
410 410 * that case, we will probe "forever."
411 411 * If tcp_ka_cnt and tcp_ka_rinterval are non-zero, then we do not follow
412 412 * the exponential backoff, but send probes tcp_ka_cnt times in regular
413 413 * intervals of tcp_ka_rinterval milliseconds until we hear back from peer.
414 414 * Kill the connection if we don't hear back from peer after tcp_ka_cnt
415 415 * probes are sent.
416 416 */
417 417 void
418 418 tcp_keepalive_timer(void *arg)
419 419 {
420 420 mblk_t *mp;
421 421 conn_t *connp = (conn_t *)arg;
422 422 tcp_t *tcp = connp->conn_tcp;
423 423 int32_t firetime;
424 424 int32_t idletime;
425 425 int32_t ka_intrvl;
426 426 tcp_stack_t *tcps = tcp->tcp_tcps;
427 427
428 428 tcp->tcp_ka_tid = 0;
429 429
430 430 if (tcp->tcp_fused)
431 431 return;
432 432
433 433 TCPS_BUMP_MIB(tcps, tcpTimKeepalive);
434 434 ka_intrvl = tcp->tcp_ka_interval;
435 435
436 436 /*
437 437 * Keepalive probe should only be sent if the application has not
438 438 * done a close on the connection.
439 439 */
440 440 if (tcp->tcp_state > TCPS_CLOSE_WAIT) {
441 441 return;
442 442 }
443 443 /* Timer fired too early, restart it. */
444 444 if (tcp->tcp_state < TCPS_ESTABLISHED) {
445 445 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
446 446 ka_intrvl);
447 447 return;
448 448 }
449 449
450 450 idletime = TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time);
451 451 /*
452 452 * If we have not heard from the other side for a long
453 453 * time, kill the connection unless the keepalive abort
454 454 * threshold is 0. In that case, we will probe "forever."
455 455 */
456 456 if (tcp->tcp_ka_abort_thres != 0 &&
457 457 idletime > (ka_intrvl + tcp->tcp_ka_abort_thres)) {
458 458 TCPS_BUMP_MIB(tcps, tcpTimKeepaliveDrop);
459 459 (void) tcp_clean_death(tcp, tcp->tcp_client_errno ?
460 460 tcp->tcp_client_errno : ETIMEDOUT);
461 461 return;
462 462 }
463 463
464 464 if (tcp->tcp_snxt == tcp->tcp_suna &&
465 465 idletime >= ka_intrvl) {
466 466 /* Fake resend of last ACKed byte. */
467 467 mblk_t *mp1 = allocb(1, BPRI_LO);
468 468
469 469 if (mp1 != NULL) {
470 470 *mp1->b_wptr++ = '\0';
471 471 mp = tcp_xmit_mp(tcp, mp1, 1, NULL, NULL,
472 472 tcp->tcp_suna - 1, B_FALSE, NULL, B_TRUE);
473 473 freeb(mp1);
474 474 /*
475 475 * if allocation failed, fall through to start the
476 476 * timer back.
477 477 */
478 478 if (mp != NULL) {
479 479 tcp_send_data(tcp, mp);
480 480 TCPS_BUMP_MIB(tcps, tcpTimKeepaliveProbe);
481 481 if (tcp->tcp_ka_rinterval) {
482 482 firetime = tcp->tcp_ka_rinterval;
483 483 } else if (tcp->tcp_ka_last_intrvl != 0) {
484 484 int max;
485 485 /*
486 486 * We should probe again at least
487 487 * in ka_intrvl, but not more than
488 488 * tcp_rto_max.
489 489 */
490 490 max = tcp->tcp_rto_max;
491 491 firetime = MIN(ka_intrvl - 1,
492 492 tcp->tcp_ka_last_intrvl << 1);
493 493 if (firetime > max)
494 494 firetime = max;
495 495 } else {
496 496 firetime = tcp->tcp_rto;
497 497 }
498 498 tcp->tcp_ka_tid = TCP_TIMER(tcp,
499 499 tcp_keepalive_timer, firetime);
500 500 tcp->tcp_ka_last_intrvl = firetime;
501 501 return;
502 502 }
503 503 }
504 504 } else {
505 505 tcp->tcp_ka_last_intrvl = 0;
506 506 }
507 507
508 508 /* firetime can be negative if (mp1 == NULL || mp == NULL) */
509 509 if ((firetime = ka_intrvl - idletime) < 0) {
510 510 firetime = ka_intrvl;
511 511 }
512 512 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, firetime);
513 513 }
514 514
515 515 void
516 516 tcp_reass_timer(void *arg)
517 517 {
518 518 conn_t *connp = (conn_t *)arg;
519 519 tcp_t *tcp = connp->conn_tcp;
520 520
521 521 tcp->tcp_reass_tid = 0;
522 522 if (tcp->tcp_reass_head == NULL)
523 523 return;
524 524 ASSERT(tcp->tcp_reass_tail != NULL);
525 525 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
526 526 tcp_sack_remove(tcp->tcp_sack_list,
527 527 TCP_REASS_END(tcp->tcp_reass_tail), &tcp->tcp_num_sack_blk);
528 528 }
529 529 tcp_close_mpp(&tcp->tcp_reass_head);
530 530 tcp->tcp_reass_tail = NULL;
531 531 TCP_STAT(tcp->tcp_tcps, tcp_reass_timeout);
532 532 }
533 533
534 534 /* This function handles the push timeout. */
535 535 void
536 536 tcp_push_timer(void *arg)
537 537 {
538 538 conn_t *connp = (conn_t *)arg;
539 539 tcp_t *tcp = connp->conn_tcp;
540 540
541 541 TCP_DBGSTAT(tcp->tcp_tcps, tcp_push_timer_cnt);
542 542
543 543 ASSERT(tcp->tcp_listener == NULL);
544 544
545 545 ASSERT(!IPCL_IS_NONSTR(connp));
546 546
547 547 tcp->tcp_push_tid = 0;
548 548
549 549 if (tcp->tcp_rcv_list != NULL &&
550 550 tcp_rcv_drain(tcp) == TH_ACK_NEEDED)
551 551 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
552 552 }
553 553
554 554 /*
555 555 * This function handles delayed ACK timeout.
556 556 */
557 557 void
558 558 tcp_ack_timer(void *arg)
559 559 {
560 560 conn_t *connp = (conn_t *)arg;
561 561 tcp_t *tcp = connp->conn_tcp;
562 562 mblk_t *mp;
563 563 tcp_stack_t *tcps = tcp->tcp_tcps;
564 564
565 565 TCP_DBGSTAT(tcps, tcp_ack_timer_cnt);
566 566
567 567 tcp->tcp_ack_tid = 0;
568 568
569 569 if (tcp->tcp_fused)
570 570 return;
571 571
572 572 /*
573 573 * Do not send ACK if there is no outstanding unack'ed data.
574 574 */
575 575 if (tcp->tcp_rnxt == tcp->tcp_rack) {
576 576 return;
577 577 }
578 578
579 579 if ((tcp->tcp_rnxt - tcp->tcp_rack) > tcp->tcp_mss) {
580 580 /*
581 581 * Make sure we don't allow deferred ACKs to result in
582 582 * timer-based ACKing. If we have held off an ACK
583 583 * when there was more than an mss here, and the timer
584 584 * goes off, we have to worry about the possibility
585 585 * that the sender isn't doing slow-start, or is out
586 586 * of step with us for some other reason. We fall
587 587 * permanently back in the direction of
588 588 * ACK-every-other-packet as suggested in RFC 1122.
589 589 */
590 590 if (tcp->tcp_rack_abs_max > 2)
591 591 tcp->tcp_rack_abs_max--;
592 592 tcp->tcp_rack_cur_max = 2;
593 593 }
594 594 mp = tcp_ack_mp(tcp);
595 595
596 596 if (mp != NULL) {
597 597 BUMP_LOCAL(tcp->tcp_obsegs);
598 598 TCPS_BUMP_MIB(tcps, tcpOutAck);
599 599 TCPS_BUMP_MIB(tcps, tcpOutAckDelayed);
600 600 tcp_send_data(tcp, mp);
601 601 }
602 602 }
603 603
604 604 /*
605 605 * Notify IP that we are having trouble with this connection. IP should
606 606 * make note so it can potentially use a different IRE.
607 607 */
608 608 static void
609 609 tcp_ip_notify(tcp_t *tcp)
610 610 {
611 611 conn_t *connp = tcp->tcp_connp;
612 612 ire_t *ire;
613 613
614 614 /*
615 615 * Note: in the case of source routing we want to blow away the
616 616 * route to the first source route hop.
617 617 */
618 618 ire = connp->conn_ixa->ixa_ire;
619 619 if (ire != NULL && !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
620 620 if (ire->ire_ipversion == IPV4_VERSION) {
621 621 /*
622 622 * As per RFC 1122, we send an RTM_LOSING to inform
623 623 * routing protocols.
624 624 */
625 625 ip_rts_change(RTM_LOSING, ire->ire_addr,
626 626 ire->ire_gateway_addr, ire->ire_mask,
627 627 connp->conn_laddr_v4, 0, 0, 0,
628 628 (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA),
629 629 ire->ire_ipst);
630 630 }
631 631 (void) ire_no_good(ire);
632 632 }
633 633 }
634 634
635 635 /*
636 636 * tcp_timer is the timer service routine. It handles the retransmission,
637 637 * FIN_WAIT_2 flush, and zero window probe timeout events. It figures out
638 638 * from the state of the tcp instance what kind of action needs to be done
639 639 * at the time it is called.
640 640 */
641 641 void
642 642 tcp_timer(void *arg)
643 643 {
644 644 mblk_t *mp;
645 645 clock_t first_threshold;
646 646 clock_t second_threshold;
647 647 clock_t ms;
648 648 uint32_t mss;
649 649 conn_t *connp = (conn_t *)arg;
650 650 tcp_t *tcp = connp->conn_tcp;
651 651 tcp_stack_t *tcps = tcp->tcp_tcps;
652 652 boolean_t dont_timeout = B_FALSE;
653 653
654 654 tcp->tcp_timer_tid = 0;
655 655
656 656 if (tcp->tcp_fused)
657 657 return;
658 658
659 659 first_threshold = tcp->tcp_first_timer_threshold;
660 660 second_threshold = tcp->tcp_second_timer_threshold;
661 661 switch (tcp->tcp_state) {
662 662 case TCPS_IDLE:
663 663 case TCPS_BOUND:
664 664 case TCPS_LISTEN:
665 665 return;
666 666 case TCPS_SYN_RCVD: {
667 667 tcp_t *listener = tcp->tcp_listener;
668 668
669 669 if (tcp->tcp_syn_rcvd_timeout == 0 && (listener != NULL)) {
670 670 /* it's our first timeout */
671 671 tcp->tcp_syn_rcvd_timeout = 1;
672 672 mutex_enter(&listener->tcp_eager_lock);
673 673 listener->tcp_syn_rcvd_timeout++;
674 674 if (!tcp->tcp_dontdrop && !tcp->tcp_closemp_used) {
675 675 /*
676 676 * Make this eager available for drop if we
677 677 * need to drop one to accommodate a new
678 678 * incoming SYN request.
679 679 */
680 680 MAKE_DROPPABLE(listener, tcp);
681 681 }
682 682 if (!listener->tcp_syn_defense &&
683 683 (listener->tcp_syn_rcvd_timeout >
684 684 (tcps->tcps_conn_req_max_q0 >> 2)) &&
685 685 (tcps->tcps_conn_req_max_q0 > 200)) {
686 686 /* We may be under attack. Put on a defense. */
687 687 listener->tcp_syn_defense = B_TRUE;
688 688 cmn_err(CE_WARN, "High TCP connect timeout "
689 689 "rate! System (port %d) may be under a "
690 690 "SYN flood attack!",
691 691 ntohs(listener->tcp_connp->conn_lport));
692 692
693 693 listener->tcp_ip_addr_cache = kmem_zalloc(
694 694 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t),
695 695 KM_NOSLEEP);
696 696 }
697 697 mutex_exit(&listener->tcp_eager_lock);
698 698 } else if (listener != NULL) {
699 699 mutex_enter(&listener->tcp_eager_lock);
700 700 tcp->tcp_syn_rcvd_timeout++;
701 701 if (tcp->tcp_syn_rcvd_timeout > 1 &&
702 702 !tcp->tcp_closemp_used) {
703 703 /*
704 704 * This is our second timeout. Put the tcp in
705 705 * the list of droppable eagers to allow it to
706 706 * be dropped, if needed. We don't check
707 707 * whether tcp_dontdrop is set or not to
708 708 * protect ourselves from a SYN attack where a
709 709 * remote host can spoof itself as one of the
710 710 * good IP sources and continue to hold
711 711 * resources too long.
712 712 */
713 713 MAKE_DROPPABLE(listener, tcp);
714 714 }
715 715 mutex_exit(&listener->tcp_eager_lock);
716 716 }
717 717 }
718 718 /* FALLTHRU */
719 719 case TCPS_SYN_SENT:
720 720 first_threshold = tcp->tcp_first_ctimer_threshold;
721 721 second_threshold = tcp->tcp_second_ctimer_threshold;
722 722
723 723 /*
724 724 * If an app has set the second_threshold to 0, it means that
725 725 * we need to retransmit forever, unless this is a passive
726 726 * open. We need to set second_threshold back to a normal
727 727 * value such that later comparison with it still makes
728 728 * sense. But we set dont_timeout to B_TRUE so that we will
729 729 * never time out.
730 730 */
731 731 if (second_threshold == 0) {
732 732 second_threshold = tcps->tcps_ip_abort_linterval;
733 733 if (tcp->tcp_active_open)
734 734 dont_timeout = B_TRUE;
735 735 }
736 736 break;
737 737 case TCPS_ESTABLISHED:
738 738 case TCPS_CLOSE_WAIT:
739 739 /*
740 740 * If the end point has not been closed, TCP can retransmit
741 741 * forever. But if the end point is closed, the normal
742 742 * timeout applies.
743 743 */
↓ open down ↓ |
707 lines elided |
↑ open up ↑ |
744 744 if (second_threshold == 0) {
745 745 second_threshold = tcps->tcps_ip_abort_linterval;
746 746 dont_timeout = B_TRUE;
747 747 }
748 748 /* FALLTHRU */
749 749 case TCPS_FIN_WAIT_1:
750 750 case TCPS_CLOSING:
751 751 case TCPS_LAST_ACK:
752 752 /* If we have data to rexmit */
753 753 if (tcp->tcp_suna != tcp->tcp_snxt) {
754 - clock_t time_to_wait;
754 + clock_t time_to_wait;
755 755
756 756 TCPS_BUMP_MIB(tcps, tcpTimRetrans);
757 757 if (!tcp->tcp_xmit_head)
758 758 break;
759 - time_to_wait = ddi_get_lbolt() -
760 - (clock_t)tcp->tcp_xmit_head->b_prev;
761 - time_to_wait = tcp->tcp_rto -
762 - TICK_TO_MSEC(time_to_wait);
759 + time_to_wait = NSEC2MSEC(gethrtime() -
760 + (hrtime_t)(intptr_t)tcp->tcp_xmit_head->b_prev);
761 + time_to_wait = tcp->tcp_rto - time_to_wait;
763 762 /*
764 763 * If the timer fired too early (more than 1 clock tick
765 764 * before the RTO elapsed), restart it for the remainder.
766 765 */
767 766 if (time_to_wait > msec_per_tick) {
768 767 TCP_STAT(tcps, tcp_timer_fire_early);
769 768 TCP_TIMER_RESTART(tcp, time_to_wait);
770 769 return;
771 770 }
772 771 /*
773 772 * When we probe zero windows, we force the swnd open.
774 773 * If our peer acks with a closed window swnd will be
775 774 * set to zero by tcp_rput(). As long as we are
776 775 * receiving acks tcp_rput will
777 776 * reset 'tcp_ms_we_have_waited' so as not to trip the
778 777 * first and second interval actions. NOTE: the timer
779 778 * interval is allowed to continue its exponential
780 779 * backoff.
781 780 */
782 781 if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) {
783 782 if (connp->conn_debug) {
784 783 (void) strlog(TCP_MOD_ID, 0, 1,
785 784 SL_TRACE, "tcp_timer: zero win");
786 785 }
787 786 } else {
788 787 /*
789 788 * After retransmission, we need to do
790 789 * slow start. Set the ssthresh to one
791 790 * half of current effective window and
792 791 * cwnd to one MSS. Also reset
793 792 * tcp_cwnd_cnt.
794 793 *
795 794 * Note that if tcp_ssthresh is reduced because
796 795 * of ECN, do not reduce it again unless it is
797 796 * already one window of data away (tcp_cwr
798 797 * should then be cleared) or this is a
799 798 * timeout for a retransmitted segment.
800 799 */
801 800 uint32_t npkt;
802 801
803 802 if (!tcp->tcp_cwr || tcp->tcp_rexmit) {
804 803 npkt = ((tcp->tcp_timer_backoff ?
805 804 tcp->tcp_cwnd_ssthresh :
806 805 tcp->tcp_snxt -
807 806 tcp->tcp_suna) >> 1) / tcp->tcp_mss;
808 807 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
809 808 tcp->tcp_mss;
810 809 }
811 810 tcp->tcp_cwnd = tcp->tcp_mss;
812 811 tcp->tcp_cwnd_cnt = 0;
813 812 if (tcp->tcp_ecn_ok) {
814 813 tcp->tcp_cwr = B_TRUE;
815 814 tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
816 815 tcp->tcp_ecn_cwr_sent = B_FALSE;
817 816 }
818 817 }
819 818 break;
820 819 }
821 820 /*
822 821 * We have something to send yet we cannot send. The
823 822 * reason can be:
824 823 *
825 824 * 1. Zero send window: we need to do zero window probe.
826 825 * 2. Zero cwnd: because of ECN, we need to "clock out"
827 826 * segments.
828 827 * 3. SWS avoidance: receiver may have shrunk window,
829 828 * reset our knowledge.
830 829 *
831 830 * Note that condition 2 can happen with either 1 or
832 831 * 3. But 1 and 3 are exclusive.
833 832 */
834 833 if (tcp->tcp_unsent != 0) {
835 834 /*
836 835 * Should not hold the zero-copy messages for too long.
837 836 */
838 837 if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
839 838 tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
840 839 tcp->tcp_xmit_head, B_TRUE);
841 840
842 841 if (tcp->tcp_cwnd == 0) {
843 842 /*
844 843 * Set tcp_cwnd to 1 MSS so that a
845 844 * new segment can be sent out. We
846 845 * are "clocking out" new data when
847 846 * the network is really congested.
848 847 */
849 848 ASSERT(tcp->tcp_ecn_ok);
850 849 tcp->tcp_cwnd = tcp->tcp_mss;
851 850 }
852 851 if (tcp->tcp_swnd == 0) {
853 852 /* Extend window for zero window probe */
854 853 tcp->tcp_swnd++;
855 854 tcp->tcp_zero_win_probe = B_TRUE;
856 855 TCPS_BUMP_MIB(tcps, tcpOutWinProbe);
857 856 } else {
858 857 /*
859 858 * Handle timeout from sender SWS avoidance.
860 859 * Reset our knowledge of the max send window
861 860 * since the receiver might have reduced its
862 861 * receive buffer. Avoid setting tcp_max_swnd
863 862 * to one since that will essentially disable
864 863 * the SWS checks.
865 864 *
866 865 * Note that since we don't have a SWS
867 866 * state variable, if the timeout is set
868 867 * for ECN but not for SWS, this
869 868 * code will also be executed. This is
870 869 * fine as tcp_max_swnd is updated
871 870 * constantly and it will not affect
872 871 * anything.
873 872 */
874 873 tcp->tcp_max_swnd = MAX(tcp->tcp_swnd, 2);
875 874 }
876 875 tcp_wput_data(tcp, NULL, B_FALSE);
877 876 return;
878 877 }
879 878 /* Is there a FIN that needs to be retransmitted? */
880 879 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
881 880 !tcp->tcp_fin_acked)
882 881 break;
883 882 /* Nothing to do, return without restarting timer. */
884 883 TCP_STAT(tcps, tcp_timer_fire_miss);
885 884 return;
886 885 case TCPS_FIN_WAIT_2:
887 886 /*
888 887 * User closed the TCP endpoint and peer ACK'ed our FIN.
889 888 * We waited some time for the peer's FIN, but it hasn't
890 889 * arrived. We flush the connection now to avoid the
891 890 * case where the peer has rebooted.
892 891 */
893 892 if (TCP_IS_DETACHED(tcp)) {
894 893 (void) tcp_clean_death(tcp, 0);
895 894 } else {
896 895 TCP_TIMER_RESTART(tcp,
897 896 tcp->tcp_fin_wait_2_flush_interval);
898 897 }
899 898 return;
900 899 case TCPS_TIME_WAIT:
901 900 (void) tcp_clean_death(tcp, 0);
902 901 return;
903 902 default:
904 903 if (connp->conn_debug) {
905 904 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
906 905 "tcp_timer: strange state (%d) %s",
907 906 tcp->tcp_state, tcp_display(tcp, NULL,
908 907 DISP_PORT_ONLY));
909 908 }
910 909 return;
911 910 }
912 911
913 912 /*
914 913 * If the system is under memory pressure or the max number of
915 914 * connections have been established for the listener, be more
916 915 * aggressive in aborting connections.
917 916 */
918 917 if (tcps->tcps_reclaim || (tcp->tcp_listen_cnt != NULL &&
919 918 tcp->tcp_listen_cnt->tlc_cnt > tcp->tcp_listen_cnt->tlc_max)) {
920 919 second_threshold = tcp_early_abort * SECONDS;
921 920
922 921 /* We will ignore the never timeout promise in this case... */
923 922 dont_timeout = B_FALSE;
924 923 }
925 924
926 925 ASSERT(second_threshold != 0);
927 926
928 927 if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) {
929 928 /*
930 929 * Should not hold the zero-copy messages for too long.
931 930 */
932 931 if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
933 932 tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
934 933 tcp->tcp_xmit_head, B_TRUE);
935 934
936 935 if (dont_timeout) {
937 936 /*
938 937 * Reset tcp_ms_we_have_waited to avoid overflow since
939 938 * we are going to retransmit forever.
940 939 */
941 940 tcp->tcp_ms_we_have_waited = second_threshold;
942 941 goto timer_rexmit;
943 942 }
944 943
945 944 /*
946 945 * For zero window probe, we need to send indefinitely,
947 946 * unless we have not heard from the other side for some
948 947 * time...
949 948 */
950 949 if ((tcp->tcp_zero_win_probe == 0) ||
951 950 (TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time) >
952 951 second_threshold)) {
953 952 TCPS_BUMP_MIB(tcps, tcpTimRetransDrop);
954 953 /*
955 954 * If TCP is in SYN_RCVD state, send back a
956 955 * RST|ACK as BSD does. Note that tcp_zero_win_probe
957 956 * should be zero in TCPS_SYN_RCVD state.
958 957 */
959 958 if (tcp->tcp_state == TCPS_SYN_RCVD) {
960 959 tcp_xmit_ctl("tcp_timer: RST sent on timeout "
961 960 "in SYN_RCVD",
962 961 tcp, tcp->tcp_snxt,
963 962 tcp->tcp_rnxt, TH_RST | TH_ACK);
964 963 }
965 964 (void) tcp_clean_death(tcp,
966 965 tcp->tcp_client_errno ?
967 966 tcp->tcp_client_errno : ETIMEDOUT);
968 967 return;
969 968 } else {
970 969 /*
971 970 * If the system is under memory pressure, we also
972 971 * abort connection in zero window probing.
973 972 */
974 973 if (tcps->tcps_reclaim) {
975 974 (void) tcp_clean_death(tcp,
976 975 tcp->tcp_client_errno ?
977 976 tcp->tcp_client_errno : ETIMEDOUT);
978 977 TCP_STAT(tcps, tcp_zwin_mem_drop);
979 978 return;
980 979 }
981 980 /*
982 981 * Set tcp_ms_we_have_waited to second_threshold
983 982 * so that in next timeout, we will do the above
984 983 * check (ddi_get_lbolt() - tcp_last_recv_time).
985 984 * This is also to avoid overflow.
986 985 *
987 986 * We don't need to decrement tcp_timer_backoff
988 987 * to avoid overflow because it will be decremented
989 988 * later if new timeout value is greater than
990 989 * tcp_rto_max. In the case when tcp_rto_max is
991 990 * greater than second_threshold, it means that we
992 991 * will wait longer than second_threshold to send
993 992 * the next
994 993 * window probe.
995 994 */
996 995 tcp->tcp_ms_we_have_waited = second_threshold;
997 996 }
998 997 } else if (ms > first_threshold) {
999 998 /*
1000 999 * Should not hold the zero-copy messages for too long.
1001 1000 */
1002 1001 if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
1003 1002 tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
1004 1003 tcp->tcp_xmit_head, B_TRUE);
↓ open down ↓ |
232 lines elided |
↑ open up ↑ |
1005 1004
1006 1005 /*
1007 1006 * We have been retransmitting for too long... The RTT
1008 1007 * we calculated is probably incorrect. Reinitialize it.
1009 1008 * Need to compensate for 0 tcp_rtt_sa. Reset
1010 1009 * tcp_rtt_update so that we won't accidentally cache a
1011 1010 * bad value. But only do this if this is not a zero
1012 1011 * window probe.
1013 1012 */
1014 1013 if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) {
1015 - tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
1016 - (tcp->tcp_rtt_sa >> 5);
1014 + tcp->tcp_rtt_sd += tcp->tcp_rtt_sa >> 3 +
1015 + tcp->tcp_rtt_sa >> 5;
1017 1016 tcp->tcp_rtt_sa = 0;
1018 1017 tcp_ip_notify(tcp);
1019 1018 tcp->tcp_rtt_update = 0;
1020 1019 }
1021 1020 }
1022 1021
1023 1022 timer_rexmit:
1024 1023 tcp->tcp_timer_backoff++;
1025 - if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
1026 - tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) <
1027 - tcp->tcp_rto_min) {
1028 - /*
1029 - * This means the original RTO is tcp_rexmit_interval_min.
1030 - * So we will use tcp_rexmit_interval_min as the RTO value
1031 - * and do the backoff.
1032 - */
1033 - ms = tcp->tcp_rto_min << tcp->tcp_timer_backoff;
1034 - } else {
1035 - ms <<= tcp->tcp_timer_backoff;
1036 - }
1024 + /*
1025 + * Calculate the backed off retransmission timeout. If the shift brings
1026 + * us back over the max, then we repin the value, and decrement the
1027 + * backoff to avoid overflow.
1028 + */
1029 + ms = tcp_calculate_rto(tcp, tcps, 0) << tcp->tcp_timer_backoff;
1037 1030 if (ms > tcp->tcp_rto_max) {
1038 1031 ms = tcp->tcp_rto_max;
1039 - /*
1040 - * ms is at max, decrement tcp_timer_backoff to avoid
1041 - * overflow.
1042 - */
1043 1032 tcp->tcp_timer_backoff--;
1044 1033 }
1045 1034 tcp->tcp_ms_we_have_waited += ms;
1046 1035 if (tcp->tcp_zero_win_probe == 0) {
1047 1036 tcp->tcp_rto = ms;
1048 1037 }
1049 1038 TCP_TIMER_RESTART(tcp, ms);
1050 1039 /*
1051 1040 * This is after a timeout and tcp_rto is backed off. Set
1052 1041 * tcp_set_timer to 1 so that next time RTO is updated, we will
1053 1042 * restart the timer with a correct value.
1054 1043 */
1055 1044 tcp->tcp_set_timer = 1;
1056 1045 mss = tcp->tcp_snxt - tcp->tcp_suna;
1057 1046 if (mss > tcp->tcp_mss)
1058 1047 mss = tcp->tcp_mss;
1059 1048 if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0)
1060 1049 mss = tcp->tcp_swnd;
1061 1050
1062 - if ((mp = tcp->tcp_xmit_head) != NULL)
1063 - mp->b_prev = (mblk_t *)ddi_get_lbolt();
1051 + if ((mp = tcp->tcp_xmit_head) != NULL) {
1052 + mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
1053 + }
1064 1054 mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss,
1065 1055 B_TRUE);
1066 1056
1067 1057 /*
1068 1058 * When slow start after retransmission begins, start with
1069 1059 * this seq no. tcp_rexmit_max marks the end of special slow
1070 1060 * start phase.
1071 1061 */
1072 1062 tcp->tcp_rexmit_nxt = tcp->tcp_suna;
1073 1063 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
1074 1064 (tcp->tcp_unsent == 0)) {
1075 1065 tcp->tcp_rexmit_max = tcp->tcp_fss;
1076 1066 } else {
1077 1067 tcp->tcp_rexmit_max = tcp->tcp_snxt;
1078 1068 }
1079 1069 tcp->tcp_rexmit = B_TRUE;
1080 1070 tcp->tcp_dupack_cnt = 0;
1081 1071
1082 1072 /*
1083 1073 * Remove all rexmit SACK blk to start from fresh.
1084 1074 */
1085 1075 if (tcp->tcp_snd_sack_ok)
1086 1076 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp);
1087 1077 if (mp == NULL) {
1088 1078 return;
1089 1079 }
1090 1080
1091 1081 tcp->tcp_csuna = tcp->tcp_snxt;
1092 1082 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
1093 1083 TCPS_UPDATE_MIB(tcps, tcpRetransBytes, mss);
1094 1084 tcp_send_data(tcp, mp);
1095 1085
1096 1086 }
1097 1087
1098 1088 /*
1099 1089 * Handle lingering timeouts. This function is called when the SO_LINGER timeout
1100 1090 * expires. arg is treated as the conn_t of the lingering connection; the handler records ETIMEDOUT in tcp_client_errno and then calls tcp_stop_lingering() on its tcp_t.
1101 1091 */
1102 1092 void
1103 1093 tcp_close_linger_timeout(void *arg)
1104 1094 {
1105 1095 conn_t *connp = (conn_t *)arg;
1106 1096 tcp_t *tcp = connp->conn_tcp;
1107 1097
1108 1098 tcp->tcp_client_errno = ETIMEDOUT;
1109 1099 tcp_stop_lingering(tcp);
1110 1100 }
↓ open down ↓ |
37 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX