Print this page
11553 Want pluggable TCP congestion control algorithms
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Robert Mustacchi <robert.mustacchi@joyent.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/inet/tcp/tcp_input.c
          +++ new/usr/src/uts/common/inet/tcp/tcp_input.c
↓ open down ↓ 162 lines elided ↑ open up ↑
 163  163                      ip_recv_attr_t *);
 164  164  static void     tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *);
 165  165  static void     tcp_process_options(tcp_t *, tcpha_t *);
 166  166  static mblk_t   *tcp_reass(tcp_t *, mblk_t *, uint32_t);
 167  167  static void     tcp_reass_elim_overlap(tcp_t *, mblk_t *);
 168  168  static void     tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
 169  169  static void     tcp_set_rto(tcp_t *, hrtime_t);
 170  170  static void     tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
 171  171  
 172  172  /*
      173 + * CC wrapper hook functions
      174 + */
      175 +static void /* common per-ACK bookkeeping, then the CC algorithm's ack_received hook */
      176 +cc_ack_received(tcp_t *tcp, uint32_t seg_ack, int32_t bytes_acked,
      177 +    uint16_t type)
      178 +{
      179 +        uint32_t old_cwnd = tcp->tcp_cwnd; /* cwnd on entry; reported by the probe below */
      180 +
      181 +        tcp->tcp_ccv.bytes_this_ack = bytes_acked; /* publish this ACK's byte count in the cc variables */
      182 +        if (tcp->tcp_cwnd <= tcp->tcp_swnd) /* cwnd does not exceed the send window */
      183 +                tcp->tcp_ccv.flags |= CCF_CWND_LIMITED;
      184 +        else
      185 +                tcp->tcp_ccv.flags &= ~CCF_CWND_LIMITED;
      186 +
      187 +        if (type == CC_ACK) { /* byte accounting is done only for CC_ACK, not CC_DUPACK */
      188 +                if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
      189 +                        if (tcp->tcp_ccv.flags & CCF_RTO) /* drop the RTO marker once cwnd is above ssthresh */
      190 +                                tcp->tcp_ccv.flags &= ~CCF_RTO;
      191 +
      192 +                        tcp->tcp_ccv.t_bytes_acked +=
      193 +                            min(tcp->tcp_ccv.bytes_this_ack,
      194 +                            tcp->tcp_tcps->tcps_abc_l_var * tcp->tcp_mss); /* credit per ACK is capped at tcps_abc_l_var * MSS */
      195 +                        if (tcp->tcp_ccv.t_bytes_acked >= tcp->tcp_cwnd) { /* a full cwnd of bytes has been counted */
      196 +                                tcp->tcp_ccv.t_bytes_acked -= tcp->tcp_cwnd; /* keep the remainder for the next round */
      197 +                                tcp->tcp_ccv.flags |= CCF_ABC_SENTAWND;
      198 +                        }
      199 +                } else { /* cwnd <= ssthresh: clear the flag and restart the count */
      200 +                        tcp->tcp_ccv.flags &= ~CCF_ABC_SENTAWND;
      201 +                        tcp->tcp_ccv.t_bytes_acked = 0;
      202 +                }
      203 +        }
      204 +
      205 +        if (CC_ALGO(tcp)->ack_received != NULL) { /* the hook is optional */
      206 +                /*
      207 +                 * The FreeBSD code where this originated had a comment "Find
      208 +                 * a way to live without this" in several places where curack
      209 +                 * got set.  If they eventually dump curack from the cc
      210 +                 * variables, we'll need to adapt our code.
      211 +                 */
      212 +                tcp->tcp_ccv.curack = seg_ack;
      213 +                CC_ALGO(tcp)->ack_received(&tcp->tcp_ccv, type);
      214 +        }
      215 +
      216 +        DTRACE_PROBE3(cwnd__cc__ack__received, tcp_t *, tcp, uint32_t, old_cwnd,
      217 +            uint32_t, tcp->tcp_cwnd); /* args: tcp, cwnd on entry, cwnd now */
      218 +}
      219 +
      220 +void /* common handling of a congestion signal, then the CC algorithm's cong_signal hook */
      221 +cc_cong_signal(tcp_t *tcp, uint32_t seg_ack, uint32_t type)
      222 +{
      223 +        uint32_t old_cwnd = tcp->tcp_cwnd; /* entry values, reported by the probe below */
      224 +        uint32_t old_cwnd_ssthresh = tcp->tcp_cwnd_ssthresh;
      225 +        switch (type) {
      226 +        case CC_NDUPACK:
      227 +                if (!IN_FASTRECOVERY(tcp->tcp_ccv.flags)) { /* only when not already in fast recovery */
      228 +                        tcp->tcp_rexmit_max = tcp->tcp_snxt; /* remember snxt at the time of the signal */
      229 +                        if (tcp->tcp_ecn_ok) { /* ECN connection: set tcp_cwr and mark the window end at snxt */
      230 +                                tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
      231 +                                tcp->tcp_cwr = B_TRUE;
      232 +                                tcp->tcp_ecn_cwr_sent = B_FALSE;
      233 +                        }
      234 +                }
      235 +                break;
      236 +        case CC_ECN:
      237 +                if (!IN_CONGRECOVERY(tcp->tcp_ccv.flags)) { /* same updates as CC_NDUPACK, gated on congestion recovery */
      238 +                        tcp->tcp_rexmit_max = tcp->tcp_snxt;
      239 +                        if (tcp->tcp_ecn_ok) {
      240 +                                tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
      241 +                                tcp->tcp_cwr = B_TRUE;
      242 +                                tcp->tcp_ecn_cwr_sent = B_FALSE;
      243 +                        }
      244 +                }
      245 +                break;
      246 +        case CC_RTO:
      247 +                tcp->tcp_ccv.flags |= CCF_RTO; /* cleared again in cc_ack_received() once cwnd > ssthresh */
      248 +                tcp->tcp_dupack_cnt = 0;
      249 +                tcp->tcp_ccv.t_bytes_acked = 0;
      250 +                /*
      251 +                 * Give up on fast recovery and congestion recovery if we were
      252 +                 * attempting either.
      253 +                 */
      254 +                EXIT_RECOVERY(tcp->tcp_ccv.flags);
      255 +                if (CC_ALGO(tcp)->cong_signal == NULL) { /* no hook: apply the fallback below */
      256 +                        /*
      257 +                         * RFC5681 Section 3.1
      258 +                         * ssthresh = max (FlightSize / 2, 2*SMSS) eq (4)
      259 +                         */
      260 +                        tcp->tcp_cwnd_ssthresh = max(
      261 +                            (tcp->tcp_snxt - tcp->tcp_suna) / 2 / tcp->tcp_mss,
      262 +                            2) * tcp->tcp_mss; /* computed in whole segments, then scaled by MSS */
      263 +                        tcp->tcp_cwnd = tcp->tcp_mss; /* restart from one segment */
      264 +                }
      265 +
      266 +                if (tcp->tcp_ecn_ok) { /* unlike the cases above, not gated on recovery state */
      267 +                        tcp->tcp_cwr = B_TRUE;
      268 +                        tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
      269 +                        tcp->tcp_ecn_cwr_sent = B_FALSE;
      270 +                }
      271 +                break;
      272 +        } /* no default: any other type goes straight to the hook below */
      273 +
      274 +        if (CC_ALGO(tcp)->cong_signal != NULL) { /* the hook is optional */
      275 +                tcp->tcp_ccv.curack = seg_ack;
      276 +                CC_ALGO(tcp)->cong_signal(&tcp->tcp_ccv, type);
      277 +        }
      278 +
      279 +        DTRACE_PROBE6(cwnd__cc__cong__signal, tcp_t *, tcp, uint32_t, old_cwnd,
      280 +            uint32_t, tcp->tcp_cwnd, uint32_t, old_cwnd_ssthresh,
      281 +            uint32_t, tcp->tcp_cwnd_ssthresh, uint32_t, type); /* args: tcp, old/new cwnd, old/new ssthresh, type */
      282 +}
      283 +
      284 +static void /* runs the CC algorithm's post_recovery hook and resets the acked-byte counter */
      285 +cc_post_recovery(tcp_t *tcp, uint32_t seg_ack)
      286 +{
      287 +        uint32_t old_cwnd = tcp->tcp_cwnd; /* cwnd on entry; reported by the probe below */
      288 +
      289 +        if (CC_ALGO(tcp)->post_recovery != NULL) { /* the hook is optional */
      290 +                tcp->tcp_ccv.curack = seg_ack; /* set curack before invoking the hook */
      291 +                CC_ALGO(tcp)->post_recovery(&tcp->tcp_ccv);
      292 +        }
      293 +        tcp->tcp_ccv.t_bytes_acked = 0; /* reset whether or not a hook ran */
      294 +
      295 +        DTRACE_PROBE3(cwnd__cc__post__recovery, tcp_t *, tcp,
      296 +            uint32_t, old_cwnd, uint32_t, tcp->tcp_cwnd); /* args: tcp, cwnd on entry, cwnd now */
      297 +}
      298 +
      299 +/*
 173  300   * Set the MSS associated with a particular tcp based on its current value,
 174  301   * and a new one passed in. Observe minimums and maximums, and reset other
 175  302   * state variables that we want to view as multiples of MSS.
 176  303   *
 177  304   * The value of MSS could be either increased or decreased.
 178  305   */
 179  306  void
 180  307  tcp_mss_set(tcp_t *tcp, uint32_t mss)
 181  308  {
 182  309          uint32_t        mss_max;
↓ open down ↓ 358 lines elided ↑ open up ↑
 541  668           * didn't want to do window scale, tcp_rwnd_set() will take
 542  669           * care of that.
 543  670           */
 544  671          tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss));
 545  672  
 546  673          /*
 547  674           * Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been
 548  675           * updated properly.
 549  676           */
 550  677          TCP_SET_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial);
      678 +
      679 +        if (tcp->tcp_cc_algo->conn_init != NULL)
      680 +                tcp->tcp_cc_algo->conn_init(&tcp->tcp_ccv);
 551  681  }
 552  682  
 553  683  /*
 554  684   * Add a new piece to the tcp reassembly queue.  If the gap at the beginning
 555  685   * is filled, return as much as we can.  The message passed in may be
 556  686   * multi-part, chained using b_cont.  "start" is the starting sequence
 557  687   * number for this piece.
 558  688   */
 559  689  static mblk_t *
 560  690  tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start)
↓ open down ↓ 837 lines elided ↑ open up ↑
1398 1528          mutex_exit(&listener->tcp_eager_lock);
1399 1529  
1400 1530          /*
1401 1531           * IP sets ira_sqp to either the senders conn_sqp (for loopback)
1402 1532           * or based on the ring (for packets from GLD). Otherwise it is
1403 1533           * set based on lbolt i.e., a somewhat random number.
1404 1534           */
1405 1535          ASSERT(ira->ira_sqp != NULL);
1406 1536          new_sqp = ira->ira_sqp;
1407 1537  
1408      -        econnp = (conn_t *)tcp_get_conn(arg2, tcps);
     1538 +        econnp = tcp_get_conn(arg2, tcps);
1409 1539          if (econnp == NULL)
1410 1540                  goto error2;
1411 1541  
1412 1542          ASSERT(econnp->conn_netstack == lconnp->conn_netstack);
1413 1543          econnp->conn_sqp = new_sqp;
1414 1544          econnp->conn_initial_sqp = new_sqp;
1415 1545          econnp->conn_ixa->ixa_sqp = new_sqp;
1416 1546  
1417 1547          econnp->conn_fport = tcpha->tha_lport;
1418 1548          econnp->conn_lport = tcpha->tha_fport;
↓ open down ↓ 898 lines elided ↑ open up ↑
2317 2447          uint32_t        seg_ack;
2318 2448          int             seg_len;
2319 2449          uint_t          ip_hdr_len;
2320 2450          uint32_t        seg_seq;
2321 2451          tcpha_t         *tcpha;
2322 2452          int             urp;
2323 2453          tcp_opt_t       tcpopt;
2324 2454          ip_pkt_t        ipp;
2325 2455          boolean_t       ofo_seg = B_FALSE; /* Out of order segment */
2326 2456          uint32_t        cwnd;
2327      -        uint32_t        add;
2328      -        int             npkt;
2329 2457          int             mss;
2330 2458          conn_t          *connp = (conn_t *)arg;
2331 2459          squeue_t        *sqp = (squeue_t *)arg2;
2332 2460          tcp_t           *tcp = connp->conn_tcp;
2333 2461          tcp_stack_t     *tcps = tcp->tcp_tcps;
2334 2462          sock_upcalls_t  *sockupcalls;
2335 2463  
2336 2464          /*
2337 2465           * RST from fused tcp loopback peer should trigger an unfuse.
2338 2466           */
↓ open down ↓ 255 lines elided ↑ open up ↑
2594 2722                                  tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
2595 2723                                  tcp->tcp_rexmit_max = tcp->tcp_snxt;
2596 2724                                  tcp->tcp_ms_we_have_waited = 0;
2597 2725  
2598 2726                                  /*
2599 2727                                   * Set tcp_cwnd back to 1 MSS, per
2600 2728                                   * recommendation from
2601 2729                                   * draft-floyd-incr-init-win-01.txt,
2602 2730                                   * Increasing TCP's Initial Window.
2603 2731                                   */
     2732 +                                DTRACE_PROBE3(cwnd__retransmitted__syn,
     2733 +                                    tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
     2734 +                                    uint32_t, tcp->tcp_mss);
2604 2735                                  tcp->tcp_cwnd = tcp->tcp_mss;
2605 2736                          }
2606 2737  
2607 2738                          tcp->tcp_swl1 = seg_seq;
2608 2739                          tcp->tcp_swl2 = seg_ack;
2609 2740  
2610 2741                          new_swnd = ntohs(tcpha->tha_win);
2611 2742                          tcp->tcp_swnd = new_swnd;
2612 2743                          if (new_swnd > tcp->tcp_max_swnd)
2613 2744                                  tcp->tcp_max_swnd = new_swnd;
↓ open down ↓ 1202 lines elided ↑ open up ↑
3816 3947                  /*
3817 3948                   * If SYN was retransmitted, need to reset all
3818 3949                   * retransmission info as this segment will be
3819 3950                   * treated as a dup ACK.
3820 3951                   */
3821 3952                  if (tcp->tcp_rexmit) {
3822 3953                          tcp->tcp_rexmit = B_FALSE;
3823 3954                          tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
3824 3955                          tcp->tcp_rexmit_max = tcp->tcp_snxt;
3825 3956                          tcp->tcp_ms_we_have_waited = 0;
     3957 +                        DTRACE_PROBE3(cwnd__retransmitted__syn,
     3958 +                            tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
     3959 +                            uint32_t, tcp->tcp_mss);
3826 3960                          tcp->tcp_cwnd = mss;
3827 3961                  }
3828 3962  
3829 3963                  /*
3830 3964                   * We set the send window to zero here.
3831 3965                   * This is needed if there is data to be
3832 3966                   * processed already on the queue.
3833 3967                   * Later (at swnd_update label), the
3834 3968                   * "new_swnd > tcp_swnd" condition is satisfied
3835 3969                   * the XMIT_NEEDED flag is set in the current
↓ open down ↓ 23 lines elided ↑ open up ↑
3859 3993          if (bytes_acked < 0)
3860 3994                  goto est;
3861 3995  
3862 3996          /*
3863 3997           * If TCP is ECN capable and the congestion experience bit is
3864 3998           * set, reduce tcp_cwnd and tcp_ssthresh.  But this should only be
3865 3999           * done once per window (or more loosely, per RTT).
3866 4000           */
3867 4001          if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max))
3868 4002                  tcp->tcp_cwr = B_FALSE;
3869      -        if (tcp->tcp_ecn_ok && (flags & TH_ECE)) {
3870      -                if (!tcp->tcp_cwr) {
3871      -                        npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / mss;
3872      -                        tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss;
3873      -                        tcp->tcp_cwnd = npkt * mss;
3874      -                        /*
3875      -                         * If the cwnd is 0, use the timer to clock out
3876      -                         * new segments.  This is required by the ECN spec.
3877      -                         */
3878      -                        if (npkt == 0) {
3879      -                                TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
3880      -                                /*
3881      -                                 * This makes sure that when the ACK comes
3882      -                                 * back, we will increase tcp_cwnd by 1 MSS.
3883      -                                 */
3884      -                                tcp->tcp_cwnd_cnt = 0;
3885      -                        }
3886      -                        tcp->tcp_cwr = B_TRUE;
3887      -                        /*
3888      -                         * This marks the end of the current window of in
3889      -                         * flight data.  That is why we don't use
3890      -                         * tcp_suna + tcp_swnd.  Only data in flight can
3891      -                         * provide ECN info.
3892      -                         */
3893      -                        tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
3894      -                        tcp->tcp_ecn_cwr_sent = B_FALSE;
3895      -                }
     4003 +        if (tcp->tcp_ecn_ok && (flags & TH_ECE) && !tcp->tcp_cwr) {
     4004 +                cc_cong_signal(tcp, seg_ack, CC_ECN);
     4005 +                /*
     4006 +                 * If the cwnd is 0, use the timer to clock out
     4007 +                 * new segments.  This is required by the ECN spec.
     4008 +                 */
     4009 +                if (tcp->tcp_cwnd == 0)
     4010 +                        TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
     4011 +                tcp->tcp_cwr = B_TRUE;
     4012 +                /*
     4013 +                 * This marks the end of the current window of in
     4014 +                 * flight data.  That is why we don't use
     4015 +                 * tcp_suna + tcp_swnd.  Only data in flight can
     4016 +                 * provide ECN info.
     4017 +                 */
     4018 +                tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
3896 4019          }
3897 4020  
3898 4021          mp1 = tcp->tcp_xmit_head;
3899 4022          if (bytes_acked == 0) {
3900 4023                  if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) {
3901 4024                          int dupack_cnt;
3902 4025  
3903 4026                          TCPS_BUMP_MIB(tcps, tcpInDupAck);
3904 4027                          /*
3905 4028                           * Fast retransmit.  When we have seen exactly three
↓ open down ↓ 1 lines elided ↑ open up ↑
3907 4030                           * outstanding we take it as a hint that our peer
3908 4031                           * dropped something.
3909 4032                           *
3910 4033                           * If TCP is retransmitting, don't do fast retransmit.
3911 4034                           */
3912 4035                          if (mp1 && tcp->tcp_suna != tcp->tcp_snxt &&
3913 4036                              ! tcp->tcp_rexmit) {
3914 4037                                  /* Do Limited Transmit */
3915 4038                                  if ((dupack_cnt = ++tcp->tcp_dupack_cnt) <
3916 4039                                      tcps->tcps_dupack_fast_retransmit) {
     4040 +                                        cc_ack_received(tcp, seg_ack,
     4041 +                                            bytes_acked, CC_DUPACK);
3917 4042                                          /*
3918 4043                                           * RFC 3042
3919 4044                                           *
3920 4045                                           * What we need to do is temporarily
3921 4046                                           * increase tcp_cwnd so that new
3922 4047                                           * data can be sent if it is allowed
3923 4048                                           * by the receive window (tcp_rwnd).
3924 4049                                           * tcp_wput_data() will take care of
3925 4050                                           * the rest.
3926 4051                                           *
↓ open down ↓ 26 lines elided ↑ open up ↑
3953 4078                                   * away.  After one window of data, tcp_cwr
3954 4079                                   * should then be cleared.  Note that
3955 4080                                   * for non ECN capable connection, tcp_cwr
3956 4081                                   * should always be false.
3957 4082                                   *
3958 4083                                   * Adjust cwnd since the duplicate
3959 4084                                   * ack indicates that a packet was
3960 4085                                   * dropped (due to congestion.)
3961 4086                                   */
3962 4087                                  if (!tcp->tcp_cwr) {
3963      -                                        npkt = ((tcp->tcp_snxt -
3964      -                                            tcp->tcp_suna) >> 1) / mss;
3965      -                                        tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
3966      -                                            mss;
3967      -                                        tcp->tcp_cwnd = (npkt +
3968      -                                            tcp->tcp_dupack_cnt) * mss;
     4088 +                                        cc_cong_signal(tcp, seg_ack,
     4089 +                                            CC_NDUPACK);
     4090 +                                        cc_ack_received(tcp, seg_ack,
     4091 +                                            bytes_acked, CC_DUPACK);
3969 4092                                  }
3970 4093                                  if (tcp->tcp_ecn_ok) {
3971 4094                                          tcp->tcp_cwr = B_TRUE;
3972 4095                                          tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
3973 4096                                          tcp->tcp_ecn_cwr_sent = B_FALSE;
3974 4097                                  }
3975 4098  
3976 4099                                  /*
3977 4100                                   * We do Hoe's algorithm.  Refer to her
3978 4101                                   * paper "Improving the Start-up Behavior
↓ open down ↓ 41 lines elided ↑ open up ↑
4020 4143                                                   * funny things will happen.
4021 4144                                                   */
4022 4145                                                  tcp->tcp_pipe =
4023 4146                                                      tcp->tcp_cwnd_ssthresh;
4024 4147                                          }
4025 4148                                  } else {
4026 4149                                          flags |= TH_REXMIT_NEEDED;
4027 4150                                  } /* tcp_snd_sack_ok */
4028 4151  
4029 4152                                  } else {
     4153 +                                        cc_ack_received(tcp, seg_ack,
     4154 +                                            bytes_acked, CC_DUPACK);
4030 4155                                          /*
4031 4156                                           * Here we perform congestion
4032 4157                                           * avoidance, but NOT slow start.
4033 4158                                           * This is known as the Fast
4034 4159                                           * Recovery Algorithm.
4035 4160                                           */
4036 4161                                          if (tcp->tcp_snd_sack_ok &&
4037 4162                                              tcp->tcp_notsack_list != NULL) {
4038 4163                                                  flags |= TH_NEED_SACK_REXMIT;
4039 4164                                                  tcp->tcp_pipe -= mss;
↓ open down ↓ 1 lines elided ↑ open up ↑
4041 4166                                                          tcp->tcp_pipe = 0;
4042 4167                                          } else {
4043 4168                                          /*
4044 4169                                           * We know that one more packet has
4045 4170                                           * left the pipe thus we can update
4046 4171                                           * cwnd.
4047 4172                                           */
4048 4173                                          cwnd = tcp->tcp_cwnd + mss;
4049 4174                                          if (cwnd > tcp->tcp_cwnd_max)
4050 4175                                                  cwnd = tcp->tcp_cwnd_max;
     4176 +                                        DTRACE_PROBE3(cwnd__fast__recovery,
     4177 +                                            tcp_t *, tcp,
     4178 +                                            uint32_t, tcp->tcp_cwnd,
     4179 +                                            uint32_t, cwnd);
4051 4180                                          tcp->tcp_cwnd = cwnd;
4052 4181                                          if (tcp->tcp_unsent > 0)
4053 4182                                                  flags |= TH_XMIT_NEEDED;
4054 4183                                          }
4055 4184                                  }
4056 4185                          }
4057 4186                  } else if (tcp->tcp_zero_win_probe) {
4058 4187                          /*
4059 4188                           * If the window has opened, need to arrange
4060 4189                           * to send additional data.
↓ open down ↓ 112 lines elided ↑ open up ↑
4173 4302          /*
4174 4303           * If we got an ACK after fast retransmit, check to see
4175 4304           * if it is a partial ACK.  If it is not and the congestion
4176 4305           * window was inflated to account for the other side's
4177 4306           * cached packets, retract it.  If it is, do Hoe's algorithm.
4178 4307           */
4179 4308          if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) {
4180 4309                  ASSERT(tcp->tcp_rexmit == B_FALSE);
4181 4310                  if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) {
4182 4311                          tcp->tcp_dupack_cnt = 0;
4183      -                        /*
4184      -                         * Restore the orig tcp_cwnd_ssthresh after
4185      -                         * fast retransmit phase.
4186      -                         */
4187      -                        if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
4188      -                                tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh;
4189      -                        }
     4312 +
     4313 +                        cc_post_recovery(tcp, seg_ack);
     4314 +
4190 4315                          tcp->tcp_rexmit_max = seg_ack;
4191      -                        tcp->tcp_cwnd_cnt = 0;
4192 4316  
4193 4317                          /*
4194 4318                           * Remove all notsack info to avoid confusion with
4195 4319                           * the next fast retransmit/recovery phase.
4196 4320                           */
4197 4321                          if (tcp->tcp_snd_sack_ok) {
4198 4322                                  TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list,
4199 4323                                      tcp);
4200 4324                          }
4201 4325                  } else {
↓ open down ↓ 8 lines elided ↑ open up ↑
4210 4334                                   * Hoe's algorithm:
4211 4335                                   *
4212 4336                                   * Retransmit the unack'ed segment and
4213 4337                                   * restart fast recovery.  Note that we
4214 4338                                   * need to scale back tcp_cwnd to the
4215 4339                                   * original value when we started fast
4216 4340                                   * recovery.  This is to prevent overly
4217 4341                                   * aggressive behaviour in sending new
4218 4342                                   * segments.
4219 4343                                   */
4220      -                                tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh +
     4344 +                                cwnd = tcp->tcp_cwnd_ssthresh +
4221 4345                                      tcps->tcps_dupack_fast_retransmit * mss;
     4346 +                                DTRACE_PROBE3(cwnd__fast__retransmit__part__ack,
     4347 +                                    tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
     4348 +                                    uint32_t, cwnd);
     4349 +                                tcp->tcp_cwnd = cwnd;
4222 4350                                  tcp->tcp_cwnd_cnt = tcp->tcp_cwnd;
4223 4351                                  flags |= TH_REXMIT_NEEDED;
4224 4352                          }
4225 4353                  }
4226 4354          } else {
4227 4355                  tcp->tcp_dupack_cnt = 0;
4228 4356                  if (tcp->tcp_rexmit) {
4229 4357                          /*
4230 4358                           * TCP is retransmitting.  If the ACK ack's all
4231 4359                           * outstanding data, update tcp_rexmit_max and
↓ open down ↓ 40 lines elided ↑ open up ↑
4272 4400          }
4273 4401  
4274 4402          /*
4275 4403           * Update the congestion window.
4276 4404           *
4277 4405           * If TCP is not ECN capable or TCP is ECN capable but the
4278 4406           * congestion experience bit is not set, increase the tcp_cwnd as
4279 4407           * usual.
4280 4408           */
4281 4409          if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) {
4282      -                cwnd = tcp->tcp_cwnd;
4283      -                add = mss;
4284      -
4285      -                if (cwnd >= tcp->tcp_cwnd_ssthresh) {
4286      -                        /*
4287      -                         * This is to prevent an increase of less than 1 MSS of
4288      -                         * tcp_cwnd.  With partial increase, tcp_wput_data()
4289      -                         * may send out tinygrams in order to preserve mblk
4290      -                         * boundaries.
4291      -                         *
4292      -                         * By initializing tcp_cwnd_cnt to new tcp_cwnd and
4293      -                         * decrementing it by 1 MSS for every ACKs, tcp_cwnd is
4294      -                         * increased by 1 MSS for every RTTs.
4295      -                         */
4296      -                        if (tcp->tcp_cwnd_cnt <= 0) {
4297      -                                tcp->tcp_cwnd_cnt = cwnd + add;
4298      -                        } else {
4299      -                                tcp->tcp_cwnd_cnt -= add;
4300      -                                add = 0;
4301      -                        }
     4410 +                if (IN_RECOVERY(tcp->tcp_ccv.flags)) {
     4411 +                        EXIT_RECOVERY(tcp->tcp_ccv.flags);
4302 4412                  }
4303      -                tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max);
     4413 +                cc_ack_received(tcp, seg_ack, bytes_acked, CC_ACK);
4304 4414          }
4305 4415  
4306 4416          /* See if the latest urgent data has been acknowledged */
4307 4417          if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
4308 4418              SEQ_GT(seg_ack, tcp->tcp_urg))
4309 4419                  tcp->tcp_valid_bits &= ~TCP_URG_VALID;
4310 4420  
4311 4421          /*
4312 4422           * Update the RTT estimates. Note that we don't use the TCP
4313 4423           * timestamp option to calculate RTT even if one is present. This is
↓ open down ↓ 1313 lines elided ↑ open up ↑
5627 5737                  if (tcp_icmp_source_quench) {
5628 5738                          /*
5629 5739                           * Reduce the sending rate as if we got a
5630 5740                           * retransmit timeout
5631 5741                           */
5632 5742                          uint32_t npkt;
5633 5743  
5634 5744                          npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) /
5635 5745                              tcp->tcp_mss;
5636 5746                          tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss;
     5747 +
     5748 +                        DTRACE_PROBE3(cwnd__source__quench, tcp_t *, tcp,
     5749 +                            uint32_t, tcp->tcp_cwnd,
     5750 +                            uint32_t, tcp->tcp_mss);
5637 5751                          tcp->tcp_cwnd = tcp->tcp_mss;
5638 5752                          tcp->tcp_cwnd_cnt = 0;
5639 5753                  }
5640 5754                  break;
5641 5755          }
5642 5756          }
5643 5757          freemsg(mp);
5644 5758  }
5645 5759  
5646 5760  /*
↓ open down ↓ 149 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX