11553 Want pluggable TCP congestion control algorithms
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Robert Mustacchi <robert.mustacchi@joyent.com>

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  25  * Copyright 2011 Joyent, Inc.  All rights reserved.
  26  * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
  27  */
  28 
  29 #include <sys/types.h>
  30 #include <sys/strlog.h>
  31 #include <sys/strsun.h>
  32 #include <sys/squeue_impl.h>
  33 #include <sys/squeue.h>
  34 #include <sys/callo.h>
  35 #include <sys/strsubr.h>
  36 
  37 #include <inet/common.h>
  38 #include <inet/ip.h>
  39 #include <inet/ip_ire.h>
  40 #include <inet/ip_rts.h>
  41 #include <inet/tcp.h>
  42 #include <inet/tcp_impl.h>
  43 
  44 /*
  45  * Implementation of TCP Timers.
  46  * =============================
  47  *
  48  * INTERFACE:
  49  *
  50  * There are two basic functions and one macro dealing with tcp timers:
  51  *
  52  *      timeout_id_t    tcp_timeout(connp, func, time)
  53  *      clock_t         tcp_timeout_cancel(connp, timeout_id)
  54  *      TCP_TIMER_RESTART(tcp, intvl)
  55  *
  56  * tcp_timeout() starts a timer for the 'tcp' instance, arranging to call
  57  * 'func' after 'time' milliseconds have passed. The function called by
  58  * timeout() must adhere to the same restrictions as a driver soft interrupt
  59  * handler - it must not sleep or call other functions that might sleep. The
  60  * value returned is the opaque non-zero timeout identifier that can be passed
  61  * to tcp_timeout_cancel() to cancel the request. The call to tcp_timeout() may
  62  * fail in which case it returns zero. This is different from the timeout(9F)
  63  * function which never fails.
  64  *
  65  * The call-back function 'func' always receives 'connp' as its single
  66  * argument. It is always executed in the squeue corresponding to the tcp
  67  * structure. The tcp structure is guaranteed to be present at the time the
  68  * call-back is called.
  69  *
  70  * NOTE: The call-back function 'func' is never called if tcp is in
  71  *      the TCPS_CLOSED state.
  72  *
  73  * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout()
  74  * request. locks acquired by the call-back routine should not be held across
  75  * the call to tcp_timeout_cancel() or a deadlock may result.
  76  *
  77  * tcp_timeout_cancel() returns -1 if the timeout request is invalid.
  78  * Otherwise, it returns an integer value greater than or equal to 0.
  79  *
  80  * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
  81  *      within squeue context corresponding to the tcp instance. Since the
  82  *      call-back is also called via the same squeue, there are no race
  83  *      conditions described in untimeout(9F) manual page since all calls are
  84  *      strictly serialized.
  85  *
  86  *      TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
  87  *      stored in tcp_timer_tid and starts a new one using
  88  *      MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back
  89  *      and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid
  90  *      field.
  91  *
  92  * IMPLEMENTATION:
  93  *
  94  * TCP timers are implemented using a three-stage process. The call to
  95  * tcp_timeout() uses timeout(9F) function to call tcp_timer_callback() function
  96  * when the timer expires. The tcp_timer_callback() arranges the call of the
  97  * tcp_timer_handler() function via squeue corresponding to the tcp
  98  * instance. The tcp_timer_handler() calls actual requested timeout call-back
  99  * and passes tcp instance as an argument to it. Information is passed between
 100  * stages using the tcp_timer_t structure which contains the connp pointer, the
 101  * tcp call-back to call and the timeout id returned by the timeout(9F).
 102  *
 103  * The tcp_timer_t structure is not used directly, it is embedded in an mblk_t -
 104  * like structure that is used to enter an squeue. The mp->b_rptr of this pseudo
 105  * mblk points to the beginning of tcp_timer_t structure. The tcp_timeout()
 106  * returns the pointer to this mblk.
 107  *
 108  * The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It
 109  * looks like a normal mblk without actual dblk attached to it.
 110  *
 111  * To optimize performance each tcp instance holds a small cache of timer
 112  * mblocks. In the current implementation it caches up to two timer mblocks per
 113  * tcp instance. The cache is preserved over tcp frees and is only freed when
 114  * the whole tcp structure is destroyed by its kmem destructor. Since all tcp
 115  * timer processing happens on a corresponding squeue, the cache manipulation
 116  * does not require any locks. Experiments show that majority of timer mblocks
 117  * allocations are satisfied from the tcp cache and do not involve kmem calls.
 118  *
 119  * The tcp_timeout() places a refhold on the connp instance which guarantees
 120  * that it will be present at the time the call-back function fires. The
 121  * tcp_timer_handler() drops the reference after calling the call-back, so the
 122  * call-back function does not need to manipulate the references explicitly.
 123  */
 124 
 125 kmem_cache_t *tcp_timercache;
 126 
 127 static void     tcp_ip_notify(tcp_t *);
 128 static void     tcp_timer_callback(void *);
 129 static void     tcp_timer_free(tcp_t *, mblk_t *);
 130 static void     tcp_timer_handler(void *, mblk_t *, void *, ip_recv_attr_t *);
 131 
 132 /*
 133  * tim is in millisec.
 134  */
 135 timeout_id_t
 136 tcp_timeout(conn_t *connp, void (*f)(void *), hrtime_t tim)
 137 {
 138         mblk_t *mp;
 139         tcp_timer_t *tcpt;
 140         tcp_t *tcp = connp->conn_tcp;
 141 
 142         ASSERT(connp->conn_sqp != NULL);
 143 
 144         TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_calls);
 145 
 146         if (tcp->tcp_timercache == NULL) {
 147                 mp = tcp_timermp_alloc(KM_NOSLEEP | KM_PANIC);
 148         } else {
 149                 TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_cached_alloc);
 150                 mp = tcp->tcp_timercache;
 151                 tcp->tcp_timercache = mp->b_next;
 152                 mp->b_next = NULL;
 153                 ASSERT(mp->b_wptr == NULL);
 154         }
 155 
 156         CONN_INC_REF(connp);
 157         tcpt = (tcp_timer_t *)mp->b_rptr;
 158         tcpt->connp = connp;
 159         tcpt->tcpt_proc = f;
 160         /*
 161          * TCP timers are normal timeouts. Plus, they do not require more than
 162          * a 10 millisecond resolution. By choosing a coarser resolution and by
 163          * rounding up the expiration to the next resolution boundary, we can
 164          * batch timers in the callout subsystem to make TCP timers more
 165          * efficient. The roundup also protects short timers from expiring too
 166          * early before they have a chance to be cancelled.
 167          */
 168         tcpt->tcpt_tid = timeout_generic(CALLOUT_NORMAL, tcp_timer_callback, mp,
 169             tim * MICROSEC, CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
 170         VERIFY(!(tcpt->tcpt_tid & CALLOUT_ID_FREE));
 171 
 172         return ((timeout_id_t)mp);
 173 }
 174 
 175 static void
 176 tcp_timer_callback(void *arg)
 177 {
 178         mblk_t *mp = (mblk_t *)arg;
 179         tcp_timer_t *tcpt;
 180         conn_t  *connp;
 181 
 182         tcpt = (tcp_timer_t *)mp->b_rptr;
 183         connp = tcpt->connp;
 184         SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timer_handler, connp,
 185             NULL, SQ_FILL, SQTAG_TCP_TIMER);
 186 }
 187 
 188 /* ARGSUSED */
 189 static void
 190 tcp_timer_handler(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
 191 {
 192         tcp_timer_t *tcpt;
 193         conn_t *connp = (conn_t *)arg;
 194         tcp_t *tcp = connp->conn_tcp;
 195 
 196         tcpt = (tcp_timer_t *)mp->b_rptr;
 197         ASSERT(connp == tcpt->connp);
 198         ASSERT((squeue_t *)arg2 == connp->conn_sqp);
 199 
 200         if (tcpt->tcpt_tid & CALLOUT_ID_FREE) {
 201                 /*
 202                  * This timeout was cancelled after it was enqueued to the
 203                  * squeue; free the timer and return.
 204                  */
 205                 tcp_timer_free(connp->conn_tcp, mp);
 206                 return;
 207         }
 208 
 209         /*
 210          * If the TCP has reached the closed state, don't proceed any
 211          * further. This TCP logically does not exist on the system.
 212          * tcpt_proc could for example access queues, that have already
 213          * been qprocoff'ed off.
 214          */
 215         if (tcp->tcp_state != TCPS_CLOSED) {
 216                 (*tcpt->tcpt_proc)(connp);
 217         } else {
 218                 tcp->tcp_timer_tid = 0;
 219         }
 220 
 221         tcp_timer_free(connp->conn_tcp, mp);
 222 }
 223 
 224 /*
 225  * There is potential race with untimeout and the handler firing at the same
 226  * time. The mblock may be freed by the handler while we are trying to use
 227  * it. But since both should execute on the same squeue, this race should not
 228  * occur.
 229  */
 230 clock_t
 231 tcp_timeout_cancel(conn_t *connp, timeout_id_t id)
 232 {
 233         mblk_t  *mp = (mblk_t *)id;
 234         tcp_timer_t *tcpt;
 235         clock_t delta;
 236 
 237         TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_cancel_reqs);
 238 
 239         if (mp == NULL)
 240                 return (-1);
 241 
 242         tcpt = (tcp_timer_t *)mp->b_rptr;
 243         ASSERT(tcpt->connp == connp);
 244 
 245         delta = untimeout_default(tcpt->tcpt_tid, 0);
 246 
 247         if (delta >= 0) {
 248                 TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_canceled);
 249                 tcp_timer_free(connp->conn_tcp, mp);
 250                 CONN_DEC_REF(connp);
 251         } else {
 252                 /*
 253                  * If we were unable to untimeout successfully, it has already
 254                  * been enqueued on the squeue; mark the ID with the free
 255                  * bit.  This bit can never be set in a valid identifier, and
 256                  * we'll use it to prevent the timeout from being executed.
 257                  * And note that we're within the squeue perimeter here, so
 258                  * we don't need to worry about racing with timer handling
 259                  * (which also executes within the perimeter).
 260                  */
 261                 tcpt->tcpt_tid |= CALLOUT_ID_FREE;
 262                 delta = 0;
 263         }
 264 
 265         return (TICK_TO_MSEC(delta));
 266 }
 267 
 268 /*
 269  * Allocate space for the timer event. The allocation looks like mblk, but it is
 270  * not a proper mblk. To avoid confusion we set b_wptr to NULL.
 271  *
 272  * Dealing with failures: If we can't allocate from the timer cache we try
 273  * allocating from dblock caches using allocb_tryhard(). In this case b_wptr
 274  * points to b_rptr.
 275  * If we can't allocate anything using allocb_tryhard(), we perform a last
 276  * attempt and use kmem_alloc_tryhard(). In this case we set b_wptr to -1 and
 277  * save the actual allocation size in b_datap.
 278  */
 279 mblk_t *
 280 tcp_timermp_alloc(int kmflags)
 281 {
 282         mblk_t *mp = (mblk_t *)kmem_cache_alloc(tcp_timercache,
 283             kmflags & ~KM_PANIC);
 284 
 285         if (mp != NULL) {
 286                 mp->b_next = mp->b_prev = NULL;
 287                 mp->b_rptr = (uchar_t *)(&mp[1]);
 288                 mp->b_wptr = NULL;
 289                 mp->b_datap = NULL;
 290                 mp->b_queue = NULL;
 291                 mp->b_cont = NULL;
 292         } else if (kmflags & KM_PANIC) {
 293                 /*
 294                  * Failed to allocate memory for the timer. Try allocating from
 295                  * dblock caches.
 296                  */
 297                 /* ipclassifier calls this from a constructor - hence no tcps */
 298                 TCP_G_STAT(tcp_timermp_allocfail);
 299                 mp = allocb_tryhard(sizeof (tcp_timer_t));
 300                 if (mp == NULL) {
 301                         size_t size = 0;
 302                         /*
 303                          * Memory is really low. Try tryhard allocation.
 304                          *
 305                          * ipclassifier calls this from a constructor -
 306                          * hence no tcps
 307                          */
 308                         TCP_G_STAT(tcp_timermp_allocdblfail);
 309                         mp = kmem_alloc_tryhard(sizeof (mblk_t) +
 310                             sizeof (tcp_timer_t), &size, kmflags);
 311                         mp->b_rptr = (uchar_t *)(&mp[1]);
 312                         mp->b_next = mp->b_prev = NULL;
 313                         mp->b_wptr = (uchar_t *)-1;
 314                         mp->b_datap = (dblk_t *)size;
 315                         mp->b_queue = NULL;
 316                         mp->b_cont = NULL;
 317                 }
 318                 ASSERT(mp->b_wptr != NULL);
 319         }
 320         /* ipclassifier calls this from a constructor - hence no tcps */
 321         TCP_G_DBGSTAT(tcp_timermp_alloced);
 322 
 323         return (mp);
 324 }
 325 
 326 /*
 327  * Free per-tcp timer cache.
 328  * It can only contain entries from tcp_timercache.
 329  */
 330 void
 331 tcp_timermp_free(tcp_t *tcp)
 332 {
 333         mblk_t *mp;
 334 
 335         while ((mp = tcp->tcp_timercache) != NULL) {
 336                 ASSERT(mp->b_wptr == NULL);
 337                 tcp->tcp_timercache = tcp->tcp_timercache->b_next;
 338                 kmem_cache_free(tcp_timercache, mp);
 339         }
 340 }
 341 
 342 /*
 343  * Free timer event. Put it on the per-tcp timer cache if there is not too many
 344  * events there already (currently at most two events are cached).
 345  * If the event is not allocated from the timer cache, free it right away.
 346  */
 347 static void
 348 tcp_timer_free(tcp_t *tcp, mblk_t *mp)
 349 {
 350         mblk_t *mp1 = tcp->tcp_timercache;
 351 
 352         if (mp->b_wptr != NULL) {
 353                 /*
 354                  * This allocation is not from a timer cache, free it right
 355                  * away.
 356                  */
 357                 if (mp->b_wptr != (uchar_t *)-1)
 358                         freeb(mp);
 359                 else
 360                         kmem_free(mp, (size_t)mp->b_datap);
 361         } else if (mp1 == NULL || mp1->b_next == NULL) {
 362                 /* Cache this timer block for future allocations */
 363                 mp->b_rptr = (uchar_t *)(&mp[1]);
 364                 mp->b_next = mp1;
 365                 tcp->tcp_timercache = mp;
 366         } else {
 367                 kmem_cache_free(tcp_timercache, mp);
 368                 TCP_DBGSTAT(tcp->tcp_tcps, tcp_timermp_freed);
 369         }
 370 }
 371 
 372 /*
 373  * Stop all TCP timers.
 374  */
 375 void
 376 tcp_timers_stop(tcp_t *tcp)
 377 {
 378         if (tcp->tcp_timer_tid != 0) {
 379                 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
 380                 tcp->tcp_timer_tid = 0;
 381         }
 382         if (tcp->tcp_ka_tid != 0) {
 383                 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid);
 384                 tcp->tcp_ka_tid = 0;
 385         }
 386         if (tcp->tcp_ack_tid != 0) {
 387                 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
 388                 tcp->tcp_ack_tid = 0;
 389         }
 390         if (tcp->tcp_push_tid != 0) {
 391                 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
 392                 tcp->tcp_push_tid = 0;
 393         }
 394         if (tcp->tcp_reass_tid != 0) {
 395                 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_reass_tid);
 396                 tcp->tcp_reass_tid = 0;
 397         }
 398 }
 399 
 400 /*
 401  * Timer callback routine for keepalive probe.  We do a fake resend of
 402  * last ACKed byte.  Then set a timer using RTO.  When the timer expires,
 403  * check to see if we have heard anything from the other end for the last
 404  * RTO period.  If we have, set the timer to expire for another
 405  * tcp_keepalive_intrvl and check again.  If we have not, set a timer using
 406  * RTO << 1 and check again when it expires.  Keep exponentially increasing
 407  * the timeout if we have not heard from the other side.  If for more than
 408  * (tcp_ka_interval + tcp_ka_abort_thres) we have not heard anything,
 409  * kill the connection unless the keepalive abort threshold is 0.  In
 410  * that case, we will probe "forever."
 411  * If tcp_ka_cnt and tcp_ka_rinterval are non-zero, then we do not follow
 412  * the exponential backoff, but send probes tcp_ka_cnt times in regular
 413  * intervals of tcp_ka_rinterval milliseconds until we hear back from peer.
 414  * Kill the connection if we don't hear back from peer after tcp_ka_cnt
 415  * probes are sent.
 416  */
 417 void
 418 tcp_keepalive_timer(void *arg)
 419 {
 420         mblk_t  *mp;
 421         conn_t  *connp = (conn_t *)arg;
 422         tcp_t   *tcp = connp->conn_tcp;
 423         int32_t firetime;
 424         int32_t idletime;
 425         int32_t ka_intrvl;
 426         tcp_stack_t     *tcps = tcp->tcp_tcps;
 427 
 428         tcp->tcp_ka_tid = 0;
 429 
 430         if (tcp->tcp_fused)
 431                 return;
 432 
 433         TCPS_BUMP_MIB(tcps, tcpTimKeepalive);
 434         ka_intrvl = tcp->tcp_ka_interval;
 435 
 436         /*
 437          * Keepalive probe should only be sent if the application has not
 438          * done a close on the connection.
 439          */
 440         if (tcp->tcp_state > TCPS_CLOSE_WAIT) {
 441                 return;
 442         }
 443         /* Timer fired too early, restart it. */
 444         if (tcp->tcp_state < TCPS_ESTABLISHED) {
 445                 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
 446                     ka_intrvl);
 447                 return;
 448         }
 449 
 450         idletime = TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time);
 451         /*
 452          * If we have not heard from the other side for a long
 453          * time, kill the connection unless the keepalive abort
 454          * threshold is 0.  In that case, we will probe "forever."
 455          */
 456         if (tcp->tcp_ka_abort_thres != 0 &&
 457             idletime > (ka_intrvl + tcp->tcp_ka_abort_thres)) {
 458                 TCPS_BUMP_MIB(tcps, tcpTimKeepaliveDrop);
 459                 (void) tcp_clean_death(tcp, tcp->tcp_client_errno ?
 460                     tcp->tcp_client_errno : ETIMEDOUT);
 461                 return;
 462         }
 463 
 464         if (tcp->tcp_snxt == tcp->tcp_suna &&
 465             idletime >= ka_intrvl) {
 466                 /* Fake resend of last ACKed byte. */
 467                 mblk_t  *mp1 = allocb(1, BPRI_LO);
 468 
 469                 if (mp1 != NULL) {
 470                         *mp1->b_wptr++ = '\0';
 471                         mp = tcp_xmit_mp(tcp, mp1, 1, NULL, NULL,
 472                             tcp->tcp_suna - 1, B_FALSE, NULL, B_TRUE);
 473                         freeb(mp1);
 474                         /*
 475                          * if allocation failed, fall through to start the
 476                          * timer back.
 477                          */
 478                         if (mp != NULL) {
 479                                 tcp_send_data(tcp, mp);
 480                                 TCPS_BUMP_MIB(tcps, tcpTimKeepaliveProbe);
 481                                 if (tcp->tcp_ka_rinterval) {
 482                                         firetime = tcp->tcp_ka_rinterval;
 483                                 } else if (tcp->tcp_ka_last_intrvl != 0) {
 484                                         int max;
 485                                         /*
 486                                          * We should probe again at least
 487                                          * in ka_intrvl, but not more than
 488                                          * tcp_rto_max.
 489                                          */
 490                                         max = tcp->tcp_rto_max;
 491                                         firetime = MIN(ka_intrvl - 1,
 492                                             tcp->tcp_ka_last_intrvl << 1);
 493                                         if (firetime > max)
 494                                                 firetime = max;
 495                                 } else {
 496                                         firetime = tcp->tcp_rto;
 497                                 }
 498                                 tcp->tcp_ka_tid = TCP_TIMER(tcp,
 499                                     tcp_keepalive_timer, firetime);
 500                                 tcp->tcp_ka_last_intrvl = firetime;
 501                                 return;
 502                         }
 503                 }
 504         } else {
 505                 tcp->tcp_ka_last_intrvl = 0;
 506         }
 507 
 508         /* firetime can be negative if (mp1 == NULL || mp == NULL) */
 509         if ((firetime = ka_intrvl - idletime) < 0) {
 510                 firetime = ka_intrvl;
 511         }
 512         tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, firetime);
 513 }
 514 
 515 void
 516 tcp_reass_timer(void *arg)
 517 {
 518         conn_t *connp = (conn_t *)arg;
 519         tcp_t *tcp = connp->conn_tcp;
 520 
 521         tcp->tcp_reass_tid = 0;
 522         if (tcp->tcp_reass_head == NULL)
 523                 return;
 524         ASSERT(tcp->tcp_reass_tail != NULL);
 525         if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
 526                 tcp_sack_remove(tcp->tcp_sack_list,
 527                     TCP_REASS_END(tcp->tcp_reass_tail), &tcp->tcp_num_sack_blk);
 528         }
 529         tcp_close_mpp(&tcp->tcp_reass_head);
 530         tcp->tcp_reass_tail = NULL;
 531         TCP_STAT(tcp->tcp_tcps, tcp_reass_timeout);
 532 }
 533 
 534 /* This function handles the push timeout. */
 535 void
 536 tcp_push_timer(void *arg)
 537 {
 538         conn_t  *connp = (conn_t *)arg;
 539         tcp_t *tcp = connp->conn_tcp;
 540 
 541         TCP_DBGSTAT(tcp->tcp_tcps, tcp_push_timer_cnt);
 542 
 543         ASSERT(tcp->tcp_listener == NULL);
 544 
 545         ASSERT(!IPCL_IS_NONSTR(connp));
 546 
 547         tcp->tcp_push_tid = 0;
 548 
 549         if (tcp->tcp_rcv_list != NULL &&
 550             tcp_rcv_drain(tcp) == TH_ACK_NEEDED)
 551                 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
 552 }
 553 
 554 /*
 555  * This function handles delayed ACK timeout.
 556  */
 557 void
 558 tcp_ack_timer(void *arg)
 559 {
 560         conn_t  *connp = (conn_t *)arg;
 561         tcp_t *tcp = connp->conn_tcp;
 562         mblk_t *mp;
 563         tcp_stack_t     *tcps = tcp->tcp_tcps;
 564 
 565         TCP_DBGSTAT(tcps, tcp_ack_timer_cnt);
 566 
 567         tcp->tcp_ack_tid = 0;
 568 
 569         if (tcp->tcp_fused)
 570                 return;
 571 
 572         /*
 573          * Do not send ACK if there is no outstanding unack'ed data.
 574          */
 575         if (tcp->tcp_rnxt == tcp->tcp_rack) {
 576                 return;
 577         }
 578 
 579         if ((tcp->tcp_rnxt - tcp->tcp_rack) > tcp->tcp_mss) {
 580                 /*
 581                  * Make sure we don't allow deferred ACKs to result in
 582                  * timer-based ACKing.  If we have held off an ACK
 583                  * when there was more than an mss here, and the timer
 584                  * goes off, we have to worry about the possibility
 585                  * that the sender isn't doing slow-start, or is out
 586                  * of step with us for some other reason.  We fall
 587                  * permanently back in the direction of
 588                  * ACK-every-other-packet as suggested in RFC 1122.
 589                  */
 590                 if (tcp->tcp_rack_abs_max > 2)
 591                         tcp->tcp_rack_abs_max--;
 592                 tcp->tcp_rack_cur_max = 2;
 593         }
 594         mp = tcp_ack_mp(tcp);
 595 
 596         if (mp != NULL) {
 597                 TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
 598                 TCPS_BUMP_MIB(tcps, tcpOutAck);
 599                 TCPS_BUMP_MIB(tcps, tcpOutAckDelayed);
 600                 tcp_send_data(tcp, mp);
 601         }
 602 }
 603 
 604 /*
 605  * Notify IP that we are having trouble with this connection.  IP should
 606  * make note so it can potentially use a different IRE.
 607  */
 608 static void
 609 tcp_ip_notify(tcp_t *tcp)
 610 {
 611         conn_t          *connp = tcp->tcp_connp;
 612         ire_t           *ire;
 613 
 614         /*
 615          * Note: in the case of source routing we want to blow away the
 616          * route to the first source route hop.
 617          */
 618         ire = connp->conn_ixa->ixa_ire;
 619         if (ire != NULL && !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
 620                 if (ire->ire_ipversion == IPV4_VERSION) {
 621                         /*
 622                          * As per RFC 1122, we send an RTM_LOSING to inform
 623                          * routing protocols.
 624                          */
 625                         ip_rts_change(RTM_LOSING, ire->ire_addr,
 626                             ire->ire_gateway_addr, ire->ire_mask,
 627                             connp->conn_laddr_v4,  0, 0, 0,
 628                             (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA),
 629                             ire->ire_ipst);
 630                 }
 631                 (void) ire_no_good(ire);
 632         }
 633 }
 634 
 635 /*
 636  * tcp_timer is the timer service routine.  It handles the retransmission,
 637  * FIN_WAIT_2 flush, and zero window probe timeout events.  It figures out
 638  * from the state of the tcp instance what kind of action needs to be done
 639  * at the time it is called.
 640  */
 641 void
 642 tcp_timer(void *arg)
 643 {
 644         mblk_t          *mp;
 645         clock_t         first_threshold;
 646         clock_t         second_threshold;
 647         clock_t         ms;
 648         uint32_t        mss;
 649         conn_t          *connp = (conn_t *)arg;
 650         tcp_t           *tcp = connp->conn_tcp;
 651         tcp_stack_t     *tcps = tcp->tcp_tcps;
 652         boolean_t       dont_timeout = B_FALSE;
 653 
 654         tcp->tcp_timer_tid = 0;
 655 
 656         if (tcp->tcp_fused)
 657                 return;
 658 
 659         first_threshold =  tcp->tcp_first_timer_threshold;
 660         second_threshold = tcp->tcp_second_timer_threshold;
 661         switch (tcp->tcp_state) {
 662         case TCPS_IDLE:
 663         case TCPS_BOUND:
 664         case TCPS_LISTEN:
 665                 return;
 666         case TCPS_SYN_RCVD: {
 667                 tcp_t   *listener = tcp->tcp_listener;
 668 
 669                 if (tcp->tcp_syn_rcvd_timeout == 0 && (listener != NULL)) {
 670                         /* it's our first timeout */
 671                         tcp->tcp_syn_rcvd_timeout = 1;
 672                         mutex_enter(&listener->tcp_eager_lock);
 673                         listener->tcp_syn_rcvd_timeout++;
 674                         if (!tcp->tcp_dontdrop && !tcp->tcp_closemp_used) {
 675                                 /*
 676                                  * Make this eager available for drop if we
  677                                  * need to drop one to accommodate a new
 678                                  * incoming SYN request.
 679                                  */
 680                                 MAKE_DROPPABLE(listener, tcp);
 681                         }
 682                         if (!listener->tcp_syn_defense &&
 683                             (listener->tcp_syn_rcvd_timeout >
 684                             (tcps->tcps_conn_req_max_q0 >> 2)) &&
 685                             (tcps->tcps_conn_req_max_q0 > 200)) {
 686                                 /* We may be under attack. Put on a defense. */
 687                                 listener->tcp_syn_defense = B_TRUE;
 688                                 cmn_err(CE_WARN, "High TCP connect timeout "
 689                                     "rate! System (port %d) may be under a "
 690                                     "SYN flood attack!",
 691                                     ntohs(listener->tcp_connp->conn_lport));
 692 
 693                                 listener->tcp_ip_addr_cache = kmem_zalloc(
 694                                     IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t),
 695                                     KM_NOSLEEP);
 696                         }
 697                         mutex_exit(&listener->tcp_eager_lock);
 698                 } else if (listener != NULL) {
 699                         mutex_enter(&listener->tcp_eager_lock);
 700                         tcp->tcp_syn_rcvd_timeout++;
 701                         if (tcp->tcp_syn_rcvd_timeout > 1 &&
 702                             !tcp->tcp_closemp_used) {
 703                                 /*
 704                                  * This is our second timeout. Put the tcp in
 705                                  * the list of droppable eagers to allow it to
 706                                  * be dropped, if needed. We don't check
 707                                  * whether tcp_dontdrop is set or not to
  708                                  * protect ourselves from a SYN attack where a
 709                                  * remote host can spoof itself as one of the
 710                                  * good IP source and continue to hold
 711                                  * resources too long.
 712                                  */
 713                                 MAKE_DROPPABLE(listener, tcp);
 714                         }
 715                         mutex_exit(&listener->tcp_eager_lock);
 716                 }
 717         }
 718                 /* FALLTHRU */
 719         case TCPS_SYN_SENT:
 720                 first_threshold =  tcp->tcp_first_ctimer_threshold;
 721                 second_threshold = tcp->tcp_second_ctimer_threshold;
 722 
 723                 /*
 724                  * If an app has set the second_threshold to 0, it means that
 725                  * we need to retransmit forever, unless this is a passive
 726                  * open.  We need to set second_threshold back to a normal
 727                  * value such that later comparison with it still makes
 728                  * sense.  But we set dont_timeout to B_TRUE so that we will
 729                  * never time out.
 730                  */
 731                 if (second_threshold == 0) {
 732                         second_threshold = tcps->tcps_ip_abort_linterval;
 733                         if (tcp->tcp_active_open)
 734                                 dont_timeout = B_TRUE;
 735                 }
 736                 break;
 737         case TCPS_ESTABLISHED:
 738         case TCPS_CLOSE_WAIT:
 739                 /*
 740                  * If the end point has not been closed, TCP can retransmit
 741                  * forever.  But if the end point is closed, the normal
 742                  * timeout applies.
 743                  */
 744                 if (second_threshold == 0) {
 745                         second_threshold = tcps->tcps_ip_abort_linterval;
 746                         dont_timeout = B_TRUE;
 747                 }
 748                 /* FALLTHRU */
 749         case TCPS_FIN_WAIT_1:
 750         case TCPS_CLOSING:
 751         case TCPS_LAST_ACK:
 752                 /* If we have data to rexmit */
 753                 if (tcp->tcp_suna != tcp->tcp_snxt) {
 754                         clock_t time_to_wait;
 755 
 756                         TCPS_BUMP_MIB(tcps, tcpTimRetrans);
 757                         if (!tcp->tcp_xmit_head)
 758                                 break;
 759                         time_to_wait = NSEC2MSEC(gethrtime() -
 760                             (hrtime_t)(intptr_t)tcp->tcp_xmit_head->b_prev);
 761                         time_to_wait = tcp->tcp_rto - time_to_wait;
 762                         /*
 763                          * If the timer fires too early, 1 clock tick earlier,
 764                          * restart the timer.
 765                          */
 766                         if (time_to_wait > msec_per_tick) {
 767                                 TCP_STAT(tcps, tcp_timer_fire_early);
 768                                 TCP_TIMER_RESTART(tcp, time_to_wait);
 769                                 return;
 770                         }
 771                         /*
 772                          * When we probe zero windows, we force the swnd open.
 773                          * If our peer acks with a closed window swnd will be
 774                          * set to zero by tcp_rput(). As long as we are
 775                          * receiving acks tcp_rput will
 776                          * reset 'tcp_ms_we_have_waited' so as not to trip the
 777                          * first and second interval actions.  NOTE: the timer
 778                          * interval is allowed to continue its exponential
 779                          * backoff.
 780                          */
 781                         if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) {
 782                                 if (connp->conn_debug) {
 783                                         (void) strlog(TCP_MOD_ID, 0, 1,
 784                                             SL_TRACE, "tcp_timer: zero win");
 785                                 }
 786                         } else {
 787                                 /*
 788                                  * After retransmission, we need to do
 789                                  * slow start.  Set the ssthresh to one
 790                                  * half of current effective window and
 791                                  * cwnd to one MSS.  Also reset
 792                                  * tcp_cwnd_cnt.
 793                                  *
 794                                  * Note that if tcp_ssthresh is reduced because
 795                                  * of ECN, do not reduce it again unless it is
 796                                  * already one window of data away (tcp_cwr
 797                                  * should then be cleared) or this is a
 798                                  * timeout for a retransmitted segment.
 799                                  */
 800                                 uint32_t npkt;
 801 
 802                                 if (!tcp->tcp_cwr || tcp->tcp_rexmit) {
 803                                         npkt = ((tcp->tcp_timer_backoff ?
 804                                             tcp->tcp_cwnd_ssthresh :
 805                                             tcp->tcp_snxt -
 806                                             tcp->tcp_suna) >> 1) / tcp->tcp_mss;
 807                                         tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
 808                                             tcp->tcp_mss;
 809                                 }
 810                                 tcp->tcp_cwnd = tcp->tcp_mss;
 811                                 tcp->tcp_cwnd_cnt = 0;
 812                                 if (tcp->tcp_ecn_ok) {
 813                                         tcp->tcp_cwr = B_TRUE;
 814                                         tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
 815                                         tcp->tcp_ecn_cwr_sent = B_FALSE;
 816                                 }
 817                         }
 818                         break;
 819                 }
 820                 /*
 821                  * We have something to send yet we cannot send.  The
 822                  * reason can be:
 823                  *
 824                  * 1. Zero send window: we need to do zero window probe.
 825                  * 2. Zero cwnd: because of ECN, we need to "clock out"
 826                  * segments.
 827                  * 3. SWS avoidance: receiver may have shrunk window,
 828                  * reset our knowledge.
 829                  *
 830                  * Note that condition 2 can happen with either 1 or
 831                  * 3.  But 1 and 3 are exclusive.
 832                  */
 833                 if (tcp->tcp_unsent != 0) {
 834                         /*
 835                          * Should not hold the zero-copy messages for too long.
 836                          */
 837                         if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
 838                                 tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
 839                                     tcp->tcp_xmit_head, B_TRUE);
 840 
 841                         if (tcp->tcp_cwnd == 0) {
 842                                 /*
 843                                  * Set tcp_cwnd to 1 MSS so that a
 844                                  * new segment can be sent out.  We
 845                                  * are "clocking out" new data when
 846                                  * the network is really congested.
 847                                  */
 848                                 ASSERT(tcp->tcp_ecn_ok);
 849                                 tcp->tcp_cwnd = tcp->tcp_mss;
 850                         }
 851                         if (tcp->tcp_swnd == 0) {
 852                                 /* Extend window for zero window probe */
 853                                 tcp->tcp_swnd++;
 854                                 tcp->tcp_zero_win_probe = B_TRUE;
 855                                 TCPS_BUMP_MIB(tcps, tcpOutWinProbe);
 856                                 tcp->tcp_cs.tcp_out_zwnd_probes++;
 857                         } else {
 858                                 /*
 859                                  * Handle timeout from sender SWS avoidance.
 860                                  * Reset our knowledge of the max send window
 861                                  * since the receiver might have reduced its
 862                                  * receive buffer.  Avoid setting tcp_max_swnd
 863                                  * to one since that will essentially disable
 864                                  * the SWS checks.
 865                                  *
 866                                  * Note that since we don't have a SWS
 867                                  * state variable, if the timeout is set
 868                                  * for ECN but not for SWS, this
 869                                  * code will also be executed.  This is
 870                                  * fine as tcp_max_swnd is updated
 871                                  * constantly and it will not affect
 872                                  * anything.
 873                                  */
 874                                 tcp->tcp_max_swnd = MAX(tcp->tcp_swnd, 2);
 875                         }
 876                         tcp_wput_data(tcp, NULL, B_FALSE);
 877                         return;
 878                 }
 879                 /* Is there a FIN that needs to be retransmitted? */
 880                 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
 881                     !tcp->tcp_fin_acked)
 882                         break;
 883                 /* Nothing to do, return without restarting timer. */
 884                 TCP_STAT(tcps, tcp_timer_fire_miss);
 885                 return;
 886         case TCPS_FIN_WAIT_2:
 887                 /*
 888                  * User closed the TCP endpoint and peer ACK'ed our FIN.
 889                  * We waited some time for the peer's FIN, but it hasn't
 890                  * arrived.  We flush the connection now to avoid the
 891                  * case where the peer has rebooted.
 892                  */
 893                 if (TCP_IS_DETACHED(tcp)) {
 894                         (void) tcp_clean_death(tcp, 0);
 895                 } else {
 896                         TCP_TIMER_RESTART(tcp,
 897                             tcp->tcp_fin_wait_2_flush_interval);
 898                 }
 899                 return;
 900         case TCPS_TIME_WAIT:
 901                 (void) tcp_clean_death(tcp, 0);
 902                 return;
 903         default:
 904                 if (connp->conn_debug) {
 905                         (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
 906                             "tcp_timer: strange state (%d) %s",
 907                             tcp->tcp_state, tcp_display(tcp, NULL,
 908                             DISP_PORT_ONLY));
 909                 }
 910                 return;
 911         }
 912 
 913         /*
 914          * If the system is under memory pressure or the max number of
 915          * connections have been established for the listener, be more
 916          * aggressive in aborting connections.
 917          */
 918         if (tcps->tcps_reclaim || (tcp->tcp_listen_cnt != NULL &&
 919             tcp->tcp_listen_cnt->tlc_cnt > tcp->tcp_listen_cnt->tlc_max)) {
 920                 second_threshold = tcp_early_abort * SECONDS;
 921 
 922                 /* We will ignore the never timeout promise in this case... */
 923                 dont_timeout = B_FALSE;
 924         }
 925 
 926         ASSERT(second_threshold != 0);
 927 
 928         if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) {
 929                 /*
 930                  * Should not hold the zero-copy messages for too long.
 931                  */
 932                 if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
 933                         tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
 934                             tcp->tcp_xmit_head, B_TRUE);
 935 
 936                 if (dont_timeout) {
 937                         /*
 938                          * Reset tcp_ms_we_have_waited to avoid overflow since
 939                          * we are going to retransmit forever.
 940                          */
 941                         tcp->tcp_ms_we_have_waited = second_threshold;
 942                         goto timer_rexmit;
 943                 }
 944 
 945                 /*
 946                  * For zero window probe, we need to send indefinitely,
 947                  * unless we have not heard from the other side for some
 948                  * time...
 949                  */
 950                 if ((tcp->tcp_zero_win_probe == 0) ||
 951                     (TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time) >
 952                     second_threshold)) {
 953                         TCPS_BUMP_MIB(tcps, tcpTimRetransDrop);
 954                         /*
 955                          * If TCP is in SYN_RCVD state, send back a
 956                          * RST|ACK as BSD does.  Note that tcp_zero_win_probe
 957                          * should be zero in TCPS_SYN_RCVD state.
 958                          */
 959                         if (tcp->tcp_state == TCPS_SYN_RCVD) {
 960                                 tcp_xmit_ctl("tcp_timer: RST sent on timeout "
 961                                     "in SYN_RCVD",
 962                                     tcp, tcp->tcp_snxt,
 963                                     tcp->tcp_rnxt, TH_RST | TH_ACK);
 964                         }
 965                         (void) tcp_clean_death(tcp,
 966                             tcp->tcp_client_errno ?
 967                             tcp->tcp_client_errno : ETIMEDOUT);
 968                         return;
 969                 } else {
 970                         /*
 971                          * If the system is under memory pressure, we also
 972                          * abort connection in zero window probing.
 973                          */
 974                         if (tcps->tcps_reclaim) {
 975                                 (void) tcp_clean_death(tcp,
 976                                     tcp->tcp_client_errno ?
 977                                     tcp->tcp_client_errno : ETIMEDOUT);
 978                                 TCP_STAT(tcps, tcp_zwin_mem_drop);
 979                                 return;
 980                         }
 981                         /*
 982                          * Set tcp_ms_we_have_waited to second_threshold
 983                          * so that in next timeout, we will do the above
 984                          * check (ddi_get_lbolt() - tcp_last_recv_time).
 985                          * This is also to avoid overflow.
 986                          *
 987                          * We don't need to decrement tcp_timer_backoff
 988                          * to avoid overflow because it will be decremented
 989                          * later if new timeout value is greater than
 990                          * tcp_rto_max.  In the case when tcp_rto_max is
 991                          * greater than second_threshold, it means that we
 992                          * will wait longer than second_threshold to send
 993                          * the next
 994                          * window probe.
 995                          */
 996                         tcp->tcp_ms_we_have_waited = second_threshold;
 997                 }
 998         } else if (ms > first_threshold) {
 999                 /*
1000                  * Should not hold the zero-copy messages for too long.
1001                  */
1002                 if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
1003                         tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
1004                             tcp->tcp_xmit_head, B_TRUE);
1005 
1006                 /*
1007                  * We have been retransmitting for too long...  The RTT
1008                  * we calculated is probably incorrect.  Reinitialize it.
1009                  * Need to compensate for 0 tcp_rtt_sa.  Reset
1010                  * tcp_rtt_update so that we won't accidentally cache a
1011                  * bad value.  But only do this if this is not a zero
1012                  * window probe.
1013                  */
1014                 if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) {
1015                         tcp->tcp_rtt_sd += tcp->tcp_rtt_sa >> 3 +
1016                             tcp->tcp_rtt_sa >> 5;
1017                         tcp->tcp_rtt_sa = 0;
1018                         tcp_ip_notify(tcp);
1019                         tcp->tcp_rtt_update = 0;
1020                 }
1021         }
1022 
1023 timer_rexmit:
1024         tcp->tcp_timer_backoff++;
1025         /*
1026          * Calculate the backed off retransmission timeout. If the shift brings
1027          * us back over the max, then we repin the value, and decrement the
1028          * backoff to avoid overflow.
1029          */
1030         ms = tcp_calculate_rto(tcp, tcps, 0) << tcp->tcp_timer_backoff;
1031         if (ms > tcp->tcp_rto_max) {
1032                 ms = tcp->tcp_rto_max;
1033                 tcp->tcp_timer_backoff--;
1034         }
1035         tcp->tcp_ms_we_have_waited += ms;
1036         if (tcp->tcp_zero_win_probe == 0) {
1037                 tcp->tcp_rto = ms;
1038         }
1039         TCP_TIMER_RESTART(tcp, ms);
1040         /*
1041          * This is after a timeout and tcp_rto is backed off.  Set
1042          * tcp_set_timer to 1 so that next time RTO is updated, we will
1043          * restart the timer with a correct value.
1044          */
1045         tcp->tcp_set_timer = 1;
1046         mss = tcp->tcp_snxt - tcp->tcp_suna;
1047         if (mss > tcp->tcp_mss)
1048                 mss = tcp->tcp_mss;
1049         if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0)
1050                 mss = tcp->tcp_swnd;
1051 
1052         if ((mp = tcp->tcp_xmit_head) != NULL) {
1053                 mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
1054         }
1055         mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss,
1056             B_TRUE);
1057 
1058         /*
1059          * When slow start after retransmission begins, start with
1060          * this seq no.  tcp_rexmit_max marks the end of special slow
1061          * start phase.
1062          */
1063         tcp->tcp_rexmit_nxt = tcp->tcp_suna;
1064         if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
1065             (tcp->tcp_unsent == 0)) {
1066                 tcp->tcp_rexmit_max = tcp->tcp_fss;
1067         } else {
1068                 tcp->tcp_rexmit_max = tcp->tcp_snxt;
1069         }
1070         tcp->tcp_rexmit = B_TRUE;
1071         tcp->tcp_dupack_cnt = 0;
1072 
1073         /*
1074          * Remove all rexmit SACK blk to start from fresh.
1075          */
1076         if (tcp->tcp_snd_sack_ok)
1077                 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp);
1078         if (mp == NULL) {
1079                 return;
1080         }
1081 
1082         tcp->tcp_csuna = tcp->tcp_snxt;
1083         TCPS_BUMP_MIB(tcps, tcpRetransSegs);
1084         TCPS_UPDATE_MIB(tcps, tcpRetransBytes, mss);
1085         tcp->tcp_cs.tcp_out_retrans_segs++;
1086         tcp->tcp_cs.tcp_out_retrans_bytes += mss;
1087         tcp_send_data(tcp, mp);
1088 
1089 }
1090 
1091 /*
1092  * Handle lingering timeouts. This function is called when the SO_LINGER timeout
1093  * expires.
1094  */
1095 void
1096 tcp_close_linger_timeout(void *arg)
1097 {
1098         conn_t  *connp = (conn_t *)arg;
1099         tcp_t   *tcp = connp->conn_tcp;
1100 
1101         tcp->tcp_client_errno = ETIMEDOUT;
1102         tcp_stop_lingering(tcp);
1103 }
--- EOF ---