542 * care of that.
543 */
544 tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss));
545
546 /*
547 * Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been
548 * updated properly.
549 */
550 TCP_SET_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial);
551 }
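
The macro above seeds the congestion window once the final MSS is known. As a rough standalone sketch of the conventional policy behind such a macro (an RFC 3390-style initial window capped by the slow-start tunable; the helper name and exact clamping here are illustrative assumptions, not this stack's definition):

#include <stdint.h>

#define MIN(a, b)	((a) < (b) ? (a) : (b))
#define MAX(a, b)	((a) > (b) ? (a) : (b))

/*
 * Hypothetical helper: RFC 3390 initial window, capped by a
 * slow-start tunable given in segments.
 */
static uint32_t
init_cwnd(uint32_t mss, uint32_t slow_start_initial)
{
	uint32_t iw = MIN(4 * mss, MAX(2 * mss, 4380));	/* RFC 3390 */
	uint32_t cap = slow_start_initial * mss;

	return (cap != 0 && cap < iw ? cap : iw);
}

For example, with mss = 1460 and slow_start_initial = 4, iw = 4380 and cap = 5840, so the initial cwnd would be 4380 (three segments' worth).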
552
553 /*
554 * Add a new piece to the tcp reassembly queue. If the gap at the beginning
555 * is filled, return as much as we can. The message passed in may be
556 * multi-part, chained using b_cont. "start" is the starting sequence
557 * number for this piece.
558 */
559 static mblk_t *
560 tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start)
561 {
562 uint32_t end, bytes;
563 mblk_t *mp1;
564 mblk_t *mp2;
565 mblk_t *next_mp;
566 uint32_t u1;
567 tcp_stack_t *tcps = tcp->tcp_tcps;
568
569
570 /* Walk through all the new pieces. */
571 do {
572 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
573 (uintptr_t)INT_MAX);
574 end = start + (int)(mp->b_wptr - mp->b_rptr);
575 next_mp = mp->b_cont;
576 if (start == end) {
577 /* Empty. Blast it. */
578 freeb(mp);
579 continue;
580 }
581 bytes = end - start;
582 mp->b_cont = NULL;
583 TCP_REASS_SET_SEQ(mp, start);
584 TCP_REASS_SET_END(mp, end);
585 mp1 = tcp->tcp_reass_tail;
586 if (mp1 == NULL || SEQ_GEQ(start, TCP_REASS_END(mp1))) {
587 if (mp1 != NULL) {
588 /*
589 * New stuff is beyond the tail; link it on the
590 * end.
591 */
592 mp1->b_cont = mp;
593 } else {
594 tcp->tcp_reass_head = mp;
595 }
596 tcp->tcp_reass_tail = mp;
597 TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs);
598 TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes, bytes);
599 tcp->tcp_cs.tcp_in_data_unorder_segs++;
600 tcp->tcp_cs.tcp_in_data_unorder_bytes += bytes;
601 continue;
602 }
603 mp1 = tcp->tcp_reass_head;
604 u1 = TCP_REASS_SEQ(mp1);
605 /* New stuff at the front? */
606 if (SEQ_LT(start, u1)) {
607 /* Yes... Check for overlap. */
608 mp->b_cont = mp1;
609 tcp->tcp_reass_head = mp;
610 tcp_reass_elim_overlap(tcp, mp);
611 continue;
612 }
613 /*
614 * The new piece fits somewhere between the head and tail.
615 * We find our slot, where mp1 precedes us and mp2 trails.
616 */
617 for (; (mp2 = mp1->b_cont) != NULL; mp1 = mp2) {
618 u1 = TCP_REASS_SEQ(mp2);
619 if (SEQ_LEQ(start, u1))
620 break;
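
The reassembly walk above relies throughout on wraparound-safe sequence comparisons (SEQ_LT, SEQ_LEQ, SEQ_GEQ). The conventional signed-difference idiom behind such macros, shown here as a sketch rather than a copy of this stack's definitions:

#include <stdint.h>

/*
 * Modular 32-bit sequence-space comparisons: the signed difference
 * is negative exactly when a precedes b, even across a wrap.
 */
#define SEQ_LT(a, b)	((int32_t)((a) - (b)) < 0)
#define SEQ_LEQ(a, b)	((int32_t)((a) - (b)) <= 0)
#define SEQ_GT(a, b)	((int32_t)((a) - (b)) > 0)
#define SEQ_GEQ(a, b)	((int32_t)((a) - (b)) >= 0)

/*
 * Example: SEQ_LT(0xfffffff0U, 0x10U) is true, since 0xfffffff0
 * precedes 0x10 once the sequence space wraps.
 */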
2397
2398 if (tcp->tcp_state == TCPS_TIME_WAIT) {
2399 tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack,
2400 seg_len, tcpha, ira);
2401 return;
2402 }
2403
2404 if (sqp != NULL) {
2405 /*
2406 * This is the correct place to update tcp_last_recv_time. Note
2407 * that it is also updated for tcp structures that belong to the
2408 * global and listener queues, which do not really need updating;
2409 * that should not cause any harm. It is updated for all kinds of
2410 * incoming segments, not only data segments.
2411 */
2412 tcp->tcp_last_recv_time = LBOLT_FASTPATH;
2413 }
2414
2415 flags = (unsigned int)tcpha->tha_flags & 0xFF;
2416
2417 TCPS_BUMP_MIB(tcps, tcpHCInSegs);
2418 DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);
2419
2420 if ((flags & TH_URG) && sqp != NULL) {
2421 /*
2422 * TCP can't handle urgent pointers that arrive before
2423 * the connection has been accept()ed since it can't
2424 * buffer OOB data. Discard segment if this happens.
2425 *
2426 * We can't just rely on a non-null tcp_listener to indicate
2427 * that the accept() has completed since unlinking of the
2428 * eager and completion of the accept are not atomic.
2429 * tcp_detached, when it is not set (B_FALSE), indicates
2430 * that the accept() has completed.
2431 *
2432 * Nor can it reassemble urgent pointers, so discard
2433 * if it's not the next segment expected.
2434 *
2435 * Otherwise, collapse chain into one mblk (discard if
2436 * that fails). This makes sure the headers, retransmitted
2437 * data, and new data all are in the same mblk.
2642 if (tcp->tcp_loopback) {
2643 mblk_t *ack_mp;
2644
2645 ASSERT(!tcp->tcp_unfusable);
2646 ASSERT(mp1 != NULL);
2647 /*
2648 * For loopback, we always get a pure SYN-ACK
2649 * and only need to send back the final ACK
2650 * with no data (this is because the other
2651 * tcp is ours and we don't do T/TCP). This
2652 * final ACK triggers the passive side to
2653 * perform fusion in ESTABLISHED state.
2654 */
2655 if ((ack_mp = tcp_ack_mp(tcp)) != NULL) {
2656 if (tcp->tcp_ack_tid != 0) {
2657 (void) TCP_TIMER_CANCEL(tcp,
2658 tcp->tcp_ack_tid);
2659 tcp->tcp_ack_tid = 0;
2660 }
2661 tcp_send_data(tcp, ack_mp);
2662 TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
2663 TCPS_BUMP_MIB(tcps, tcpOutAck);
2664
2665 if (!IPCL_IS_NONSTR(connp)) {
2666 /* Send up T_CONN_CON */
2667 if (ira->ira_cred != NULL) {
2668 mblk_setcred(mp1,
2669 ira->ira_cred,
2670 ira->ira_cpid);
2671 }
2672 putnext(connp->conn_rq, mp1);
2673 } else {
2674 (*sockupcalls->su_connected)
2675 (connp->conn_upper_handle,
2676 tcp->tcp_connid,
2677 ira->ira_cred,
2678 ira->ira_cpid);
2679 freemsg(mp1);
2680 }
2681
2682 freemsg(mp);
3031 mp2 = mp;
3032 mp = mp->b_cont;
3033 freeb(mp2);
3034 } while (gap < 0);
3035 /*
3036 * If the urgent data has already been acknowledged, we
3037 * should ignore TH_URG below
3038 */
3039 if (urp < 0)
3040 flags &= ~TH_URG;
3041 }
3042 /*
3043 * rgap measures how the segment fits within the receive window;
3044 * a negative value means -rgap bytes arrived beyond the window.
3045 */
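/*
 * Worked example (illustrative numbers; rgap itself is computed
 * before this excerpt, conventionally as rwnd - (gap + seg_len)):
 * with rwnd = 1000, gap = 0 and seg_len = 1200, rgap = -200, so the
 * trim below leaves seg_len = 1000 and 200 bytes are counted as
 * past-window.
 */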
3046 if (rgap < 0) {
3047 mblk_t *mp2;
3048
3049 if (tcp->tcp_rwnd == 0) {
3050 TCPS_BUMP_MIB(tcps, tcpInWinProbe);
3051 tcp->tcp_cs.tcp_in_zwnd_probes++;
3052 } else {
3053 TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs);
3054 TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap);
3055 }
3056
3057 /*
3058 * seg_len does not include the FIN, so if more than
3059 * just the FIN is out of window, we act like we don't
3060 * see it. (If just the FIN is out of window, rgap
3061 * will be zero and we will go ahead and acknowledge
3062 * the FIN.)
3063 */
3064 flags &= ~TH_FIN;
3065
3066 /* Fix seg_len and make sure there is something left. */
3067 seg_len += rgap;
3068 if (seg_len <= 0) {
3069 /*
3070 * Resets are only valid if they lie within our offered
3071 * window. If the RST bit is set, we just ignore this
3281 *
3282 * But TCP should not perform fast retransmit
3283 * because of the ack number. TCP uses
3284 * seg_len == 0 to determine if it is a pure
3285 * ACK. And this is not a pure ACK.
3286 */
3287 seg_len = 0;
3288 ofo_seg = B_TRUE;
3289
3290 if (tcps->tcps_reass_timeout != 0 &&
3291 tcp->tcp_reass_tid == 0) {
3292 tcp->tcp_reass_tid = TCP_TIMER(tcp,
3293 tcp_reass_timer,
3294 tcps->tcps_reass_timeout);
3295 }
3296 }
3297 }
3298 } else if (seg_len > 0) {
3299 TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
3300 TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len);
3301 tcp->tcp_cs.tcp_in_data_inorder_segs++;
3302 tcp->tcp_cs.tcp_in_data_inorder_bytes += seg_len;
3303
3304 /*
3305 * If an out of order FIN was received before, and the seq
3306 * num and len of the new segment match that of the FIN,
3307 * put the FIN flag back in.
3308 */
3309 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) &&
3310 seg_seq + seg_len == tcp->tcp_ofo_fin_seq) {
3311 flags |= TH_FIN;
3312 tcp->tcp_valid_bits &= ~TCP_OFO_FIN_VALID;
3313 }
3314 }
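/*
 * Example of the check above (assuming, as the test implies, that
 * tcp_ofo_fin_seq records the sequence number of the out-of-order
 * FIN): a FIN arrives out of order at sequence 2000, so
 * tcp_ofo_fin_seq = 2000. When the in-order segment covering
 * [1900, 2000) later arrives, seg_seq + seg_len == 2000 and
 * TH_FIN is restored.
 */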
3315 if ((flags & (TH_RST | TH_SYN | TH_URG | TH_ACK)) != TH_ACK) {
3316 if (flags & TH_RST) {
3317 freemsg(mp);
3318 switch (tcp->tcp_state) {
3319 case TCPS_SYN_RCVD:
3320 (void) tcp_clean_death(tcp, ECONNREFUSED);
3321 break;
3322 case TCPS_ESTABLISHED:
3323 case TCPS_FIN_WAIT_1:
4133 * greater than 0, check if the number of such
4134 * bogus ACKs is greater than that count. If yes,
4135 * don't send back any ACK. This prevents TCP from
4136 * getting into an ACK storm if somehow an attacker
4137 * successfully spoofs an acceptable segment to our
4138 * peer. If this continues (count > 2 * threshold),
4139 * we should abort this connection.
4140 */
4141 if (tcp_drop_ack_unsent_cnt > 0 &&
4142 ++tcp->tcp_in_ack_unsent >
4143 tcp_drop_ack_unsent_cnt) {
4144 TCP_STAT(tcps, tcp_in_ack_unsent_drop);
4145 if (tcp->tcp_in_ack_unsent > 2 *
4146 tcp_drop_ack_unsent_cnt) {
4147 (void) tcp_clean_death(tcp, EPROTO);
4148 }
4149 return;
4150 }
4151 mp = tcp_ack_mp(tcp);
4152 if (mp != NULL) {
4153 TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
4154 TCPS_BUMP_MIB(tcps, tcpOutAck);
4155 tcp_send_data(tcp, mp);
4156 }
4157 return;
4158 }
4159 } else if (tcp->tcp_is_wnd_shrnk && SEQ_GEQ(seg_ack,
4160 tcp->tcp_snxt_shrunk)) {
4161 tcp->tcp_is_wnd_shrnk = B_FALSE;
4162 }
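/*
 * The throttle above, distilled (illustrative, standalone sketch;
 * "threshold" plays the role of tcp_drop_ack_unsent_cnt):
 *
 *	static int			// 0 = send ACK, 1 = drop, 2 = abort
 *	ack_unsent_policy(uint32_t *cnt, uint32_t threshold)
 *	{
 *		if (threshold == 0 || ++(*cnt) <= threshold)
 *			return (0);
 *		return (*cnt > 2 * threshold ? 2 : 1);
 *	}
 */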
4163
4164 /*
4165 * TCP gets a new ACK, update the notsack'ed list to delete those
4166 * blocks that are covered by this ACK.
4167 */
4168 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
4169 tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack,
4170 &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list));
4171 }
4172
4173 /*
4824 TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) {
4825 if (flags & TH_REXMIT_NEEDED) {
4826 uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna;
4827
4828 TCPS_BUMP_MIB(tcps, tcpOutFastRetrans);
4829 if (snd_size > mss)
4830 snd_size = mss;
4831 if (snd_size > tcp->tcp_swnd)
4832 snd_size = tcp->tcp_swnd;
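/*
 * Worked example (illustrative numbers): with snxt - suna = 4000,
 * mss = 1460 and swnd = 1000, snd_size is clamped first to 1460 and
 * then to 1000, so one 1000-byte segment is retransmitted from suna.
 */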
4833 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size,
4834 NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size,
4835 B_TRUE);
4836
4837 if (mp1 != NULL) {
4838 tcp->tcp_xmit_head->b_prev =
4839 (mblk_t *)(intptr_t)gethrtime();
4840 tcp->tcp_csuna = tcp->tcp_snxt;
4841 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
4842 TCPS_UPDATE_MIB(tcps, tcpRetransBytes,
4843 snd_size);
4844 tcp->tcp_cs.tcp_out_retrans_segs++;
4845 tcp->tcp_cs.tcp_out_retrans_bytes += snd_size;
4846 tcp_send_data(tcp, mp1);
4847 }
4848 }
4849 if (flags & TH_NEED_SACK_REXMIT) {
4850 tcp_sack_rexmit(tcp, &flags);
4851 }
4852 /*
4853 * For TH_LIMIT_XMIT, tcp_wput_data() is called to send
4854 * out a new segment. Note that tcp_rexmit should not be
4855 * set; otherwise TH_LIMIT_XMIT would not have been set.
4856 */
4857 if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) {
4858 if (!tcp->tcp_rexmit) {
4859 tcp_wput_data(tcp, NULL, B_FALSE);
4860 } else {
4861 tcp_ss_rexmit(tcp);
4862 }
4863 }
4864 /*
4865 * Adjust tcp_cwnd back to normal value after sending
4901 tcp_setcred_data(mp1, ira);
4902
4903 putnext(connp->conn_rq, mp1);
4904 #ifdef DEBUG
4905 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
4906 "tcp_rput: sending zero-length %s %s",
4907 ((mp1->b_flag & MSGMARKNEXT) ? "MSGMARKNEXT" :
4908 "MSGNOTMARKNEXT"),
4909 tcp_display(tcp, NULL, DISP_PORT_ONLY));
4910 #endif /* DEBUG */
4911 flags &= ~TH_SEND_URP_MARK;
4912 }
4913 if (flags & TH_ACK_NEEDED) {
4914 /*
4915 * Time to send an ACK for some reason.
4916 */
4917 mp1 = tcp_ack_mp(tcp);
4918
4919 if (mp1 != NULL) {
4920 tcp_send_data(tcp, mp1);
4921 TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
4922 TCPS_BUMP_MIB(tcps, tcpOutAck);
4923 }
4924 if (tcp->tcp_ack_tid != 0) {
4925 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
4926 tcp->tcp_ack_tid = 0;
4927 }
4928 }
4929 if (flags & TH_ACK_TIMER_NEEDED) {
4930 /*
4931 * Arrange for deferred ACK or push wait timeout.
4932 * Start timer if it is not already running.
4933 */
4934 if (tcp->tcp_ack_tid == 0) {
4935 tcp->tcp_ack_tid = TCP_TIMER(tcp, tcp_ack_timer,
4936 tcp->tcp_localnet ?
4937 tcps->tcps_local_dack_interval :
4938 tcps->tcps_deferred_ack_interval);
4939 }
4940 }
4941 if (flags & TH_ORDREL_NEEDED) {
5217 /*
5218 * Set RTO for this connection based on a new round-trip time measurement.
5219 * The formula is from Jacobson and Karels' "Congestion Avoidance and Control"
5220 * in SIGCOMM '88. The variable names are the same as those in Appendix A.2
5221 * of that paper.
5222 *
5223 * m = new measurement
5224 * sa = smoothed RTT average, maintained as 8 times the estimate.
5225 * sv = smoothed mean deviation (mdev) of RTT, maintained as 4 times
5226 *	the estimate.
5226 */
5227 static void
5228 tcp_set_rto(tcp_t *tcp, hrtime_t rtt)
5229 {
5230 hrtime_t m = rtt;
5231 hrtime_t sa = tcp->tcp_rtt_sa;
5232 hrtime_t sv = tcp->tcp_rtt_sd;
5233 tcp_stack_t *tcps = tcp->tcp_tcps;
5234
5235 TCPS_BUMP_MIB(tcps, tcpRttUpdate);
5236 tcp->tcp_rtt_update++;
5237 tcp->tcp_rtt_sum += m;
5238 tcp->tcp_rtt_cnt++;
5239
5240 /* A nonzero tcp_rtt_sa means this is not the first sample. */
5241 if (sa != 0) {
5242 /*
5243 * Update average estimator (see section 2.3 of RFC6298):
5244 * SRTT = 7/8 SRTT + 1/8 rtt
5245 *
5246 * We maintain tcp_rtt_sa as 8 * SRTT, so this reduces to:
5247 * tcp_rtt_sa = 7 * SRTT + rtt
5248 * tcp_rtt_sa = 7 * (tcp_rtt_sa / 8) + rtt
5249 * tcp_rtt_sa = tcp_rtt_sa - (tcp_rtt_sa / 8) + rtt
5250 * tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 8))
5251 * tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 2^3))
5252 * tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa >> 3))
5253 *
5254 * (rtt - tcp_rtt_sa / 8) is simply the difference
5255 * between the new rtt measurement and the existing smoothed
5256 * RTT average. This is referred to as "Error" in subsequent
5257 * calculations.
5258 */
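
The function body continues past this excerpt, so as a self-contained illustration of the update the comment derives, here is a minimal sketch using the same scaling (sa holds 8 * SRTT, sv holds 4 * mdev). The first-sample seeding and the final RTO expression follow RFC 6298; the helper itself is an assumption for illustration, not this file's code.

/*
 * Minimal sketch of the Jacobson/Karels update with BSD-style
 * scaling; m, sa and sv share one time unit (e.g. nanoseconds).
 */
static long long
rto_update(long long *sa, long long *sv, long long m)
{
	if (*sa != 0) {
		long long err = m - (*sa >> 3);	/* Error = m - SRTT */

		*sa += err;			/* SRTT += Error / 8, scaled */
		if (err < 0)
			err = -err;
		*sv += err - (*sv >> 2);	/* mdev += (|Error| - mdev) / 4, scaled */
	} else {
		*sa = m << 3;			/* first sample: SRTT = m */
		*sv = m << 1;			/* mdev = m / 2, i.e. sv = 4 * (m / 2) */
	}
	/* RFC 6298: RTO = SRTT + 4 * mdev = (sa >> 3) + sv. */
	return ((*sa >> 3) + *sv);
}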