Print this page
11547 Want connstat(1M) command to display per-connection TCP statistics
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Portions contributed by: Ahmed G <ahmedg@delphix.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>


 542          * care of that.
 543          */
 544         tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss));
 545 
 546         /*
 547          * Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been
 548          * updated properly.
 549          */
 550         TCP_SET_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial);
 551 }
 552 
 553 /*
 554  * Add a new piece to the tcp reassembly queue.  If the gap at the beginning
 555  * is filled, return as much as we can.  The message passed in may be
 556  * multi-part, chained using b_cont.  "start" is the starting sequence
 557  * number for this piece.
 558  */
 559 static mblk_t *
 560 tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start)
 561 {
 562         uint32_t        end;
 563         mblk_t          *mp1;
 564         mblk_t          *mp2;
 565         mblk_t          *next_mp;
 566         uint32_t        u1;
 567         tcp_stack_t     *tcps = tcp->tcp_tcps;
 568 
 569 
 570         /* Walk through all the new pieces. */
 571         do {
 572                 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
 573                     (uintptr_t)INT_MAX);
 574                 end = start + (int)(mp->b_wptr - mp->b_rptr);
 575                 next_mp = mp->b_cont;
 576                 if (start == end) {
 577                         /* Empty.  Blast it. */
 578                         freeb(mp);
 579                         continue;
 580                 }

 581                 mp->b_cont = NULL;
 582                 TCP_REASS_SET_SEQ(mp, start);
 583                 TCP_REASS_SET_END(mp, end);
 584                 mp1 = tcp->tcp_reass_tail;
 585                 if (!mp1) {
 586                         tcp->tcp_reass_tail = mp;






 587                         tcp->tcp_reass_head = mp;
 588                         TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs);
 589                         TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes,
 590                             end - start);
 591                         continue;
 592                 }
 593                 /* New stuff completely beyond tail? */
 594                 if (SEQ_GEQ(start, TCP_REASS_END(mp1))) {
 595                         /* Link it on end. */
 596                         mp1->b_cont = mp;
 597                         tcp->tcp_reass_tail = mp;
 598                         TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs);
 599                         TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes,
 600                             end - start);

 601                         continue;
 602                 }
 603                 mp1 = tcp->tcp_reass_head;
 604                 u1 = TCP_REASS_SEQ(mp1);
 605                 /* New stuff at the front? */
 606                 if (SEQ_LT(start, u1)) {
 607                         /* Yes... Check for overlap. */
 608                         mp->b_cont = mp1;
 609                         tcp->tcp_reass_head = mp;
 610                         tcp_reass_elim_overlap(tcp, mp);
 611                         continue;
 612                 }
 613                 /*
 614                  * The new piece fits somewhere between the head and tail.
 615                  * We find our slot, where mp1 precedes us and mp2 trails.
 616                  */
 617                 for (; (mp2 = mp1->b_cont) != NULL; mp1 = mp2) {
 618                         u1 = TCP_REASS_SEQ(mp2);
 619                         if (SEQ_LEQ(start, u1))
 620                                 break;


2397 
2398         if (tcp->tcp_state == TCPS_TIME_WAIT) {
2399                 tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack,
2400                     seg_len, tcpha, ira);
2401                 return;
2402         }
2403 
2404         if (sqp != NULL) {
2405                 /*
2406                  * This is the correct place to update tcp_last_recv_time. Note
2407                  * that it is also updated for tcp structures that belong to
2408                  * global and listener queues which do not really need updating.
2409                  * But that should not cause any harm.  And it is updated for
2410                  * all kinds of incoming segments, not only for data segments.
2411                  */
2412                 tcp->tcp_last_recv_time = LBOLT_FASTPATH;
2413         }
2414 
2415         flags = (unsigned int)tcpha->tha_flags & 0xFF;
2416 
2417         BUMP_LOCAL(tcp->tcp_ibsegs);
2418         DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);
2419 
2420         if ((flags & TH_URG) && sqp != NULL) {
2421                 /*
2422                  * TCP can't handle urgent pointers that arrive before
2423                  * the connection has been accept()ed since it can't
2424                  * buffer OOB data.  Discard segment if this happens.
2425                  *
2426                  * We can't just rely on a non-null tcp_listener to indicate
2427                  * that the accept() has completed since unlinking of the
2428                  * eager and completion of the accept are not atomic.
2429                  * tcp_detached, when it is not set (B_FALSE) indicates
2430                  * that the accept() has completed.
2431                  *
2432                  * Nor can it reassemble urgent pointers, so discard
2433                  * if it's not the next segment expected.
2434                  *
2435                  * Otherwise, collapse chain into one mblk (discard if
2436                  * that fails).  This makes sure the headers, retransmitted
2437                  * data, and new data all are in the same mblk.


2642                         if (tcp->tcp_loopback) {
2643                                 mblk_t *ack_mp;
2644 
2645                                 ASSERT(!tcp->tcp_unfusable);
2646                                 ASSERT(mp1 != NULL);
2647                                 /*
2648                                  * For loopback, we always get a pure SYN-ACK
2649                                  * and only need to send back the final ACK
2650                                  * with no data (this is because the other
2651                                  * tcp is ours and we don't do T/TCP).  This
2652                                  * final ACK triggers the passive side to
2653                                  * perform fusion in ESTABLISHED state.
2654                                  */
2655                                 if ((ack_mp = tcp_ack_mp(tcp)) != NULL) {
2656                                         if (tcp->tcp_ack_tid != 0) {
2657                                                 (void) TCP_TIMER_CANCEL(tcp,
2658                                                     tcp->tcp_ack_tid);
2659                                                 tcp->tcp_ack_tid = 0;
2660                                         }
2661                                         tcp_send_data(tcp, ack_mp);
2662                                         BUMP_LOCAL(tcp->tcp_obsegs);
2663                                         TCPS_BUMP_MIB(tcps, tcpOutAck);
2664 
2665                                         if (!IPCL_IS_NONSTR(connp)) {
2666                                                 /* Send up T_CONN_CON */
2667                                                 if (ira->ira_cred != NULL) {
2668                                                         mblk_setcred(mp1,
2669                                                             ira->ira_cred,
2670                                                             ira->ira_cpid);
2671                                                 }
2672                                                 putnext(connp->conn_rq, mp1);
2673                                         } else {
2674                                                 (*sockupcalls->su_connected)
2675                                                     (connp->conn_upper_handle,
2676                                                     tcp->tcp_connid,
2677                                                     ira->ira_cred,
2678                                                     ira->ira_cpid);
2679                                                 freemsg(mp1);
2680                                         }
2681 
2682                                         freemsg(mp);


3031                         mp2 = mp;
3032                         mp = mp->b_cont;
3033                         freeb(mp2);
3034                 } while (gap < 0);
3035                 /*
3036                  * If the urgent data has already been acknowledged, we
3037                  * should ignore TH_URG below
3038                  */
3039                 if (urp < 0)
3040                         flags &= ~TH_URG;
3041         }
3042         /*
3043          * rgap is the amount of stuff received out of window.  A negative
3044          * value is the amount out of window.
3045          */
3046         if (rgap < 0) {
3047                 mblk_t  *mp2;
3048 
3049                 if (tcp->tcp_rwnd == 0) {
3050                         TCPS_BUMP_MIB(tcps, tcpInWinProbe);

3051                 } else {
3052                         TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs);
3053                         TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap);
3054                 }
3055 
3056                 /*
3057                  * seg_len does not include the FIN, so if more than
3058                  * just the FIN is out of window, we act like we don't
3059                  * see it.  (If just the FIN is out of window, rgap
3060                  * will be zero and we will go ahead and acknowledge
3061                  * the FIN.)
3062                  */
3063                 flags &= ~TH_FIN;
3064 
3065                 /* Fix seg_len and make sure there is something left. */
3066                 seg_len += rgap;
3067                 if (seg_len <= 0) {
3068                         /*
3069                          * Resets are only valid if they lie within our offered
3070                          * window.  If the RST bit is set, we just ignore this


3280                                  *
3281                                  * But TCP should not perform fast retransmit
3282                                  * because of the ack number.  TCP uses
3283                                  * seg_len == 0 to determine if it is a pure
3284                                  * ACK.  And this is not a pure ACK.
3285                                  */
3286                                 seg_len = 0;
3287                                 ofo_seg = B_TRUE;
3288 
3289                                 if (tcps->tcps_reass_timeout != 0 &&
3290                                     tcp->tcp_reass_tid == 0) {
3291                                         tcp->tcp_reass_tid = TCP_TIMER(tcp,
3292                                             tcp_reass_timer,
3293                                             tcps->tcps_reass_timeout);
3294                                 }
3295                         }
3296                 }
3297         } else if (seg_len > 0) {
3298                 TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
3299                 TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len);



3300                 /*
3301                  * If an out of order FIN was received before, and the seq
3302                  * num and len of the new segment match that of the FIN,
3303                  * put the FIN flag back in.
3304                  */
3305                 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) &&
3306                     seg_seq + seg_len == tcp->tcp_ofo_fin_seq) {
3307                         flags |= TH_FIN;
3308                         tcp->tcp_valid_bits &= ~TCP_OFO_FIN_VALID;
3309                 }
3310         }
3311         if ((flags & (TH_RST | TH_SYN | TH_URG | TH_ACK)) != TH_ACK) {
3312         if (flags & TH_RST) {
3313                 freemsg(mp);
3314                 switch (tcp->tcp_state) {
3315                 case TCPS_SYN_RCVD:
3316                         (void) tcp_clean_death(tcp, ECONNREFUSED);
3317                         break;
3318                 case TCPS_ESTABLISHED:
3319                 case TCPS_FIN_WAIT_1:


4129                          * greater than 0, check if the number of such
4130                          * bogus ACKs is greater than that count.  If yes,
4131                          * don't send back any ACK.  This prevents TCP from
4132                          * getting into an ACK storm if somehow an attacker
4133                          * successfully spoofs an acceptable segment to our
4134                          * peer.  If this continues (count > 2 X threshold),
4135                          * we should abort this connection.
4136                          */
4137                         if (tcp_drop_ack_unsent_cnt > 0 &&
4138                             ++tcp->tcp_in_ack_unsent >
4139                             tcp_drop_ack_unsent_cnt) {
4140                                 TCP_STAT(tcps, tcp_in_ack_unsent_drop);
4141                                 if (tcp->tcp_in_ack_unsent > 2 *
4142                                     tcp_drop_ack_unsent_cnt) {
4143                                         (void) tcp_clean_death(tcp, EPROTO);
4144                                 }
4145                                 return;
4146                         }
4147                         mp = tcp_ack_mp(tcp);
4148                         if (mp != NULL) {
4149                                 BUMP_LOCAL(tcp->tcp_obsegs);
4150                                 TCPS_BUMP_MIB(tcps, tcpOutAck);
4151                                 tcp_send_data(tcp, mp);
4152                         }
4153                         return;
4154                 }
4155         } else if (tcp->tcp_is_wnd_shrnk && SEQ_GEQ(seg_ack,
4156             tcp->tcp_snxt_shrunk)) {
4157                         tcp->tcp_is_wnd_shrnk = B_FALSE;
4158         }
4159 
4160         /*
4161          * TCP gets a new ACK, update the notsack'ed list to delete those
4162          * blocks that are covered by this ACK.
4163          */
4164         if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
4165                 tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack,
4166                     &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list));
4167         }
4168 
4169         /*


4820             TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) {
4821                 if (flags & TH_REXMIT_NEEDED) {
4822                         uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna;
4823 
4824                         TCPS_BUMP_MIB(tcps, tcpOutFastRetrans);
4825                         if (snd_size > mss)
4826                                 snd_size = mss;
4827                         if (snd_size > tcp->tcp_swnd)
4828                                 snd_size = tcp->tcp_swnd;
4829                         mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size,
4830                             NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size,
4831                             B_TRUE);
4832 
4833                         if (mp1 != NULL) {
4834                                 tcp->tcp_xmit_head->b_prev =
4835                                     (mblk_t *)(intptr_t)gethrtime();
4836                                 tcp->tcp_csuna = tcp->tcp_snxt;
4837                                 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
4838                                 TCPS_UPDATE_MIB(tcps, tcpRetransBytes,
4839                                     snd_size);


4840                                 tcp_send_data(tcp, mp1);
4841                         }
4842                 }
4843                 if (flags & TH_NEED_SACK_REXMIT) {
4844                         tcp_sack_rexmit(tcp, &flags);
4845                 }
4846                 /*
4847                  * For TH_LIMIT_XMIT, tcp_wput_data() is called to send
4848                  * out new segment.  Note that tcp_rexmit should not be
4849                  * set, otherwise TH_LIMIT_XMIT should not be set.
4850                  */
4851                 if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) {
4852                         if (!tcp->tcp_rexmit) {
4853                                 tcp_wput_data(tcp, NULL, B_FALSE);
4854                         } else {
4855                                 tcp_ss_rexmit(tcp);
4856                         }
4857                 }
4858                 /*
4859                  * Adjust tcp_cwnd back to normal value after sending


4895                         tcp_setcred_data(mp1, ira);
4896 
4897                 putnext(connp->conn_rq, mp1);
4898 #ifdef DEBUG
4899                 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
4900                     "tcp_rput: sending zero-length %s %s",
4901                     ((mp1->b_flag & MSGMARKNEXT) ? "MSGMARKNEXT" :
4902                     "MSGNOTMARKNEXT"),
4903                     tcp_display(tcp, NULL, DISP_PORT_ONLY));
4904 #endif /* DEBUG */
4905                 flags &= ~TH_SEND_URP_MARK;
4906         }
4907         if (flags & TH_ACK_NEEDED) {
4908                 /*
4909                  * Time to send an ack for some reason.
4910                  */
4911                 mp1 = tcp_ack_mp(tcp);
4912 
4913                 if (mp1 != NULL) {
4914                         tcp_send_data(tcp, mp1);
4915                         BUMP_LOCAL(tcp->tcp_obsegs);
4916                         TCPS_BUMP_MIB(tcps, tcpOutAck);
4917                 }
4918                 if (tcp->tcp_ack_tid != 0) {
4919                         (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
4920                         tcp->tcp_ack_tid = 0;
4921                 }
4922         }
4923         if (flags & TH_ACK_TIMER_NEEDED) {
4924                 /*
4925                  * Arrange for deferred ACK or push wait timeout.
4926                  * Start timer if it is not already running.
4927                  */
4928                 if (tcp->tcp_ack_tid == 0) {
4929                         tcp->tcp_ack_tid = TCP_TIMER(tcp, tcp_ack_timer,
4930                             tcp->tcp_localnet ?
4931                             tcps->tcps_local_dack_interval :
4932                             tcps->tcps_deferred_ack_interval);
4933                 }
4934         }
4935         if (flags & TH_ORDREL_NEEDED) {


5211 /*
5212  * Set RTO for this connection based on a new round-trip time measurement.
5213  * The formula is from Jacobson and Karels' "Congestion Avoidance and Control"
5214  * in SIGCOMM '88.  The variable names are the same as those in Appendix A.2
5215  * of that paper.
5216  *
5217  * m = new measurement
5218  * sa = smoothed RTT average (8 * average estimates).
5219  * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates).
5220  */
5221 static void
5222 tcp_set_rto(tcp_t *tcp, hrtime_t rtt)
5223 {
5224         hrtime_t m = rtt;
5225         hrtime_t sa = tcp->tcp_rtt_sa;
5226         hrtime_t sv = tcp->tcp_rtt_sd;
5227         tcp_stack_t *tcps = tcp->tcp_tcps;
5228 
5229         TCPS_BUMP_MIB(tcps, tcpRttUpdate);
5230         tcp->tcp_rtt_update++;


5231 
5232         /* A non-zero tcp_rtt_sa means this is a new sample. */
5233         if (sa != 0) {
5234                 /*
5235                  * Update average estimator (see section 2.3 of RFC6298):
5236                  *      SRTT = 7/8 SRTT + 1/8 rtt
5237                  *
5238                  * We maintain tcp_rtt_sa as 8 * SRTT, so this reduces to:
5239                  *      tcp_rtt_sa = 7 * SRTT + rtt
5240                  *      tcp_rtt_sa = 7 * (tcp_rtt_sa / 8) + rtt
5241                  *      tcp_rtt_sa = tcp_rtt_sa - (tcp_rtt_sa / 8) + rtt
5242                  *      tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 8))
5243                  *      tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 2^3))
5244                  *      tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa >> 3))
5245                  *
5246                  * (rtt - tcp_rtt_sa / 8) is simply the difference
5247                  * between the new rtt measurement and the existing smoothed
5248                  * RTT average. This is referred to as "Error" in subsequent
5249                  * calculations.
5250                  */




 542          * care of that.
 543          */
 544         tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss));
 545 
 546         /*
 547          * Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been
 548          * updated properly.
 549          */
 550         TCP_SET_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial);
 551 }
 552 
 553 /*
 554  * Add a new piece to the tcp reassembly queue.  If the gap at the beginning
 555  * is filled, return as much as we can.  The message passed in may be
 556  * multi-part, chained using b_cont.  "start" is the starting sequence
 557  * number for this piece.
 558  */
 559 static mblk_t *
 560 tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start)
 561 {
 562         uint32_t        end, bytes;
 563         mblk_t          *mp1;
 564         mblk_t          *mp2;
 565         mblk_t          *next_mp;
 566         uint32_t        u1;
 567         tcp_stack_t     *tcps = tcp->tcp_tcps;
 568 
 569 
 570         /* Walk through all the new pieces. */
 571         do {
 572                 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
 573                     (uintptr_t)INT_MAX);
 574                 end = start + (int)(mp->b_wptr - mp->b_rptr);
 575                 next_mp = mp->b_cont;
 576                 if (start == end) {
 577                         /* Empty.  Blast it. */
 578                         freeb(mp);
 579                         continue;
 580                 }
 581                 bytes = end - start;
 582                 mp->b_cont = NULL;
 583                 TCP_REASS_SET_SEQ(mp, start);
 584                 TCP_REASS_SET_END(mp, end);
 585                 mp1 = tcp->tcp_reass_tail;
 586                 if (mp1 == NULL || SEQ_GEQ(start, TCP_REASS_END(mp1))) {
 587                         if (mp1 != NULL) {
 588                                 /*
 589                                  * New stuff is beyond the tail; link it on the
 590                                  * end.
 591                                  */
 592                                 mp1->b_cont = mp;
 593                         } else {
 594                                 tcp->tcp_reass_head = mp;




 595                         }




 596                         tcp->tcp_reass_tail = mp;
 597                         TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs);
 598                         TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes, bytes);
 599                         tcp->tcp_cs.tcp_in_data_unorder_segs++;
 600                         tcp->tcp_cs.tcp_in_data_unorder_bytes += bytes;
 601                         continue;
 602                 }
 603                 mp1 = tcp->tcp_reass_head;
 604                 u1 = TCP_REASS_SEQ(mp1);
 605                 /* New stuff at the front? */
 606                 if (SEQ_LT(start, u1)) {
 607                         /* Yes... Check for overlap. */
 608                         mp->b_cont = mp1;
 609                         tcp->tcp_reass_head = mp;
 610                         tcp_reass_elim_overlap(tcp, mp);
 611                         continue;
 612                 }
 613                 /*
 614                  * The new piece fits somewhere between the head and tail.
 615                  * We find our slot, where mp1 precedes us and mp2 trails.
 616                  */
 617                 for (; (mp2 = mp1->b_cont) != NULL; mp1 = mp2) {
 618                         u1 = TCP_REASS_SEQ(mp2);
 619                         if (SEQ_LEQ(start, u1))
 620                                 break;


2397 
2398         if (tcp->tcp_state == TCPS_TIME_WAIT) {
2399                 tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack,
2400                     seg_len, tcpha, ira);
2401                 return;
2402         }
2403 
2404         if (sqp != NULL) {
2405                 /*
2406                  * This is the correct place to update tcp_last_recv_time. Note
2407                  * that it is also updated for tcp structures that belong to
2408                  * global and listener queues which do not really need updating.
2409                  * But that should not cause any harm.  And it is updated for
2410                  * all kinds of incoming segments, not only for data segments.
2411                  */
2412                 tcp->tcp_last_recv_time = LBOLT_FASTPATH;
2413         }
2414 
2415         flags = (unsigned int)tcpha->tha_flags & 0xFF;
2416 
2417         TCPS_BUMP_MIB(tcps, tcpHCInSegs);
2418         DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);
2419 
2420         if ((flags & TH_URG) && sqp != NULL) {
2421                 /*
2422                  * TCP can't handle urgent pointers that arrive before
2423                  * the connection has been accept()ed since it can't
2424                  * buffer OOB data.  Discard segment if this happens.
2425                  *
2426                  * We can't just rely on a non-null tcp_listener to indicate
2427                  * that the accept() has completed since unlinking of the
2428                  * eager and completion of the accept are not atomic.
2429                  * tcp_detached, when it is not set (B_FALSE) indicates
2430                  * that the accept() has completed.
2431                  *
2432                  * Nor can it reassemble urgent pointers, so discard
2433                  * if it's not the next segment expected.
2434                  *
2435                  * Otherwise, collapse chain into one mblk (discard if
2436                  * that fails).  This makes sure the headers, retransmitted
2437                  * data, and new data all are in the same mblk.


2642                         if (tcp->tcp_loopback) {
2643                                 mblk_t *ack_mp;
2644 
2645                                 ASSERT(!tcp->tcp_unfusable);
2646                                 ASSERT(mp1 != NULL);
2647                                 /*
2648                                  * For loopback, we always get a pure SYN-ACK
2649                                  * and only need to send back the final ACK
2650                                  * with no data (this is because the other
2651                                  * tcp is ours and we don't do T/TCP).  This
2652                                  * final ACK triggers the passive side to
2653                                  * perform fusion in ESTABLISHED state.
2654                                  */
2655                                 if ((ack_mp = tcp_ack_mp(tcp)) != NULL) {
2656                                         if (tcp->tcp_ack_tid != 0) {
2657                                                 (void) TCP_TIMER_CANCEL(tcp,
2658                                                     tcp->tcp_ack_tid);
2659                                                 tcp->tcp_ack_tid = 0;
2660                                         }
2661                                         tcp_send_data(tcp, ack_mp);
2662                                         TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
2663                                         TCPS_BUMP_MIB(tcps, tcpOutAck);
2664 
2665                                         if (!IPCL_IS_NONSTR(connp)) {
2666                                                 /* Send up T_CONN_CON */
2667                                                 if (ira->ira_cred != NULL) {
2668                                                         mblk_setcred(mp1,
2669                                                             ira->ira_cred,
2670                                                             ira->ira_cpid);
2671                                                 }
2672                                                 putnext(connp->conn_rq, mp1);
2673                                         } else {
2674                                                 (*sockupcalls->su_connected)
2675                                                     (connp->conn_upper_handle,
2676                                                     tcp->tcp_connid,
2677                                                     ira->ira_cred,
2678                                                     ira->ira_cpid);
2679                                                 freemsg(mp1);
2680                                         }
2681 
2682                                         freemsg(mp);


3031                         mp2 = mp;
3032                         mp = mp->b_cont;
3033                         freeb(mp2);
3034                 } while (gap < 0);
3035                 /*
3036                  * If the urgent data has already been acknowledged, we
3037                  * should ignore TH_URG below
3038                  */
3039                 if (urp < 0)
3040                         flags &= ~TH_URG;
3041         }
3042         /*
3043          * rgap is the amount of stuff received out of window.  A negative
3044          * value is the amount out of window.
3045          */
3046         if (rgap < 0) {
3047                 mblk_t  *mp2;
3048 
3049                 if (tcp->tcp_rwnd == 0) {
3050                         TCPS_BUMP_MIB(tcps, tcpInWinProbe);
3051                         tcp->tcp_cs.tcp_in_zwnd_probes++;
3052                 } else {
3053                         TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs);
3054                         TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap);
3055                 }
3056 
3057                 /*
3058                  * seg_len does not include the FIN, so if more than
3059                  * just the FIN is out of window, we act like we don't
3060                  * see it.  (If just the FIN is out of window, rgap
3061                  * will be zero and we will go ahead and acknowledge
3062                  * the FIN.)
3063                  */
3064                 flags &= ~TH_FIN;
3065 
3066                 /* Fix seg_len and make sure there is something left. */
3067                 seg_len += rgap;
3068                 if (seg_len <= 0) {
3069                         /*
3070                          * Resets are only valid if they lie within our offered
3071                          * window.  If the RST bit is set, we just ignore this


3281                                  *
3282                                  * But TCP should not perform fast retransmit
3283                                  * because of the ack number.  TCP uses
3284                                  * seg_len == 0 to determine if it is a pure
3285                                  * ACK.  And this is not a pure ACK.
3286                                  */
3287                                 seg_len = 0;
3288                                 ofo_seg = B_TRUE;
3289 
3290                                 if (tcps->tcps_reass_timeout != 0 &&
3291                                     tcp->tcp_reass_tid == 0) {
3292                                         tcp->tcp_reass_tid = TCP_TIMER(tcp,
3293                                             tcp_reass_timer,
3294                                             tcps->tcps_reass_timeout);
3295                                 }
3296                         }
3297                 }
3298         } else if (seg_len > 0) {
3299                 TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
3300                 TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len);
3301                 tcp->tcp_cs.tcp_in_data_inorder_segs++;
3302                 tcp->tcp_cs.tcp_in_data_inorder_bytes += seg_len;
3303 
3304                 /*
3305                  * If an out of order FIN was received before, and the seq
3306                  * num and len of the new segment match that of the FIN,
3307                  * put the FIN flag back in.
3308                  */
3309                 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) &&
3310                     seg_seq + seg_len == tcp->tcp_ofo_fin_seq) {
3311                         flags |= TH_FIN;
3312                         tcp->tcp_valid_bits &= ~TCP_OFO_FIN_VALID;
3313                 }
3314         }
3315         if ((flags & (TH_RST | TH_SYN | TH_URG | TH_ACK)) != TH_ACK) {
3316         if (flags & TH_RST) {
3317                 freemsg(mp);
3318                 switch (tcp->tcp_state) {
3319                 case TCPS_SYN_RCVD:
3320                         (void) tcp_clean_death(tcp, ECONNREFUSED);
3321                         break;
3322                 case TCPS_ESTABLISHED:
3323                 case TCPS_FIN_WAIT_1:


4133                          * greater than 0, check if the number of such
4134                          * bogus ACKs is greater than that count.  If yes,
4135                          * don't send back any ACK.  This prevents TCP from
4136                          * getting into an ACK storm if somehow an attacker
4137                          * successfully spoofs an acceptable segment to our
4138                          * peer.  If this continues (count > 2 X threshold),
4139                          * we should abort this connection.
4140                          */
4141                         if (tcp_drop_ack_unsent_cnt > 0 &&
4142                             ++tcp->tcp_in_ack_unsent >
4143                             tcp_drop_ack_unsent_cnt) {
4144                                 TCP_STAT(tcps, tcp_in_ack_unsent_drop);
4145                                 if (tcp->tcp_in_ack_unsent > 2 *
4146                                     tcp_drop_ack_unsent_cnt) {
4147                                         (void) tcp_clean_death(tcp, EPROTO);
4148                                 }
4149                                 return;
4150                         }
4151                         mp = tcp_ack_mp(tcp);
4152                         if (mp != NULL) {
4153                                 TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
4154                                 TCPS_BUMP_MIB(tcps, tcpOutAck);
4155                                 tcp_send_data(tcp, mp);
4156                         }
4157                         return;
4158                 }
4159         } else if (tcp->tcp_is_wnd_shrnk && SEQ_GEQ(seg_ack,
4160             tcp->tcp_snxt_shrunk)) {
4161                         tcp->tcp_is_wnd_shrnk = B_FALSE;
4162         }
4163 
4164         /*
4165          * TCP gets a new ACK, update the notsack'ed list to delete those
4166          * blocks that are covered by this ACK.
4167          */
4168         if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
4169                 tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack,
4170                     &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list));
4171         }
4172 
4173         /*


4824             TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) {
4825                 if (flags & TH_REXMIT_NEEDED) {
4826                         uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna;
4827 
4828                         TCPS_BUMP_MIB(tcps, tcpOutFastRetrans);
4829                         if (snd_size > mss)
4830                                 snd_size = mss;
4831                         if (snd_size > tcp->tcp_swnd)
4832                                 snd_size = tcp->tcp_swnd;
4833                         mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size,
4834                             NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size,
4835                             B_TRUE);
4836 
4837                         if (mp1 != NULL) {
4838                                 tcp->tcp_xmit_head->b_prev =
4839                                     (mblk_t *)(intptr_t)gethrtime();
4840                                 tcp->tcp_csuna = tcp->tcp_snxt;
4841                                 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
4842                                 TCPS_UPDATE_MIB(tcps, tcpRetransBytes,
4843                                     snd_size);
4844                                 tcp->tcp_cs.tcp_out_retrans_segs++;
4845                                 tcp->tcp_cs.tcp_out_retrans_bytes += snd_size;
4846                                 tcp_send_data(tcp, mp1);
4847                         }
4848                 }
4849                 if (flags & TH_NEED_SACK_REXMIT) {
4850                         tcp_sack_rexmit(tcp, &flags);
4851                 }
4852                 /*
4853                  * For TH_LIMIT_XMIT, tcp_wput_data() is called to send
4854                  * out new segment.  Note that tcp_rexmit should not be
4855                  * set, otherwise TH_LIMIT_XMIT should not be set.
4856                  */
4857                 if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) {
4858                         if (!tcp->tcp_rexmit) {
4859                                 tcp_wput_data(tcp, NULL, B_FALSE);
4860                         } else {
4861                                 tcp_ss_rexmit(tcp);
4862                         }
4863                 }
4864                 /*
4865                  * Adjust tcp_cwnd back to normal value after sending


4901                         tcp_setcred_data(mp1, ira);
4902 
4903                 putnext(connp->conn_rq, mp1);
4904 #ifdef DEBUG
4905                 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
4906                     "tcp_rput: sending zero-length %s %s",
4907                     ((mp1->b_flag & MSGMARKNEXT) ? "MSGMARKNEXT" :
4908                     "MSGNOTMARKNEXT"),
4909                     tcp_display(tcp, NULL, DISP_PORT_ONLY));
4910 #endif /* DEBUG */
4911                 flags &= ~TH_SEND_URP_MARK;
4912         }
4913         if (flags & TH_ACK_NEEDED) {
4914                 /*
4915                  * Time to send an ack for some reason.
4916                  */
4917                 mp1 = tcp_ack_mp(tcp);
4918 
4919                 if (mp1 != NULL) {
4920                         tcp_send_data(tcp, mp1);
4921                         TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
4922                         TCPS_BUMP_MIB(tcps, tcpOutAck);
4923                 }
4924                 if (tcp->tcp_ack_tid != 0) {
4925                         (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
4926                         tcp->tcp_ack_tid = 0;
4927                 }
4928         }
4929         if (flags & TH_ACK_TIMER_NEEDED) {
4930                 /*
4931                  * Arrange for deferred ACK or push wait timeout.
4932                  * Start timer if it is not already running.
4933                  */
4934                 if (tcp->tcp_ack_tid == 0) {
4935                         tcp->tcp_ack_tid = TCP_TIMER(tcp, tcp_ack_timer,
4936                             tcp->tcp_localnet ?
4937                             tcps->tcps_local_dack_interval :
4938                             tcps->tcps_deferred_ack_interval);
4939                 }
4940         }
4941         if (flags & TH_ORDREL_NEEDED) {


5217 /*
5218  * Set RTO for this connection based on a new round-trip time measurement.
5219  * The formula is from Jacobson and Karels' "Congestion Avoidance and Control"
5220  * in SIGCOMM '88.  The variable names are the same as those in Appendix A.2
5221  * of that paper.
5222  *
5223  * m = new measurement
5224  * sa = smoothed RTT average (8 * average estimates).
5225  * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates).
5226  */
5227 static void
5228 tcp_set_rto(tcp_t *tcp, hrtime_t rtt)
5229 {
5230         hrtime_t m = rtt;
5231         hrtime_t sa = tcp->tcp_rtt_sa;
5232         hrtime_t sv = tcp->tcp_rtt_sd;
5233         tcp_stack_t *tcps = tcp->tcp_tcps;
5234 
5235         TCPS_BUMP_MIB(tcps, tcpRttUpdate);
5236         tcp->tcp_rtt_update++;
5237         tcp->tcp_rtt_sum += m;
5238         tcp->tcp_rtt_cnt++;
5239 
5240         /* A non-zero tcp_rtt_sa means this is a new sample. */
5241         if (sa != 0) {
5242                 /*
5243                  * Update average estimator (see section 2.3 of RFC6298):
5244                  *      SRTT = 7/8 SRTT + 1/8 rtt
5245                  *
5246                  * We maintain tcp_rtt_sa as 8 * SRTT, so this reduces to:
5247                  *      tcp_rtt_sa = 7 * SRTT + rtt
5248                  *      tcp_rtt_sa = 7 * (tcp_rtt_sa / 8) + rtt
5249                  *      tcp_rtt_sa = tcp_rtt_sa - (tcp_rtt_sa / 8) + rtt
5250                  *      tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 8))
5251                  *      tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 2^3))
5252                  *      tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa >> 3))
5253                  *
5254                  * (rtt - tcp_rtt_sa / 8) is simply the difference
5255                  * between the new rtt measurement and the existing smoothed
5256                  * RTT average. This is referred to as "Error" in subsequent
5257                  * calculations.
5258                  */