Print this page
11553 Want pluggable TCP congestion control algorithms
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Robert Mustacchi <robert.mustacchi@joyent.com>


   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
  25  * Copyright 2019 Joyent, Inc.
  26  */
  27 
  28 /* This file contains all TCP output processing functions. */
  29 
  30 #include <sys/types.h>
  31 #include <sys/stream.h>
  32 #include <sys/strsun.h>
  33 #include <sys/strsubr.h>
  34 #include <sys/stropts.h>
  35 #include <sys/strlog.h>
  36 #define _SUN_TPI_VERSION 2
  37 #include <sys/tihdr.h>
  38 #include <sys/suntpi.h>
  39 #include <sys/xti_inet.h>
  40 #include <sys/timod.h>
  41 #include <sys/pattr.h>
  42 #include <sys/squeue_impl.h>
  43 #include <sys/squeue.h>
  44 #include <sys/sockio.h>


  64                     int, ip_recv_attr_t *, ip_stack_t *, conn_t *);
  65 static boolean_t        tcp_send_rst_chk(tcp_stack_t *);
  66 static void     tcp_process_shrunk_swnd(tcp_t *, uint32_t);
  67 static void     tcp_fill_header(tcp_t *, uchar_t *, int);
  68 
  69 /*
  70  * Functions called directly via squeue having a prototype of edesc_t.
  71  */
  72 static void     tcp_wput_nondata(void *, mblk_t *, void *, ip_recv_attr_t *);
  73 static void     tcp_wput_ioctl(void *, mblk_t *, void *, ip_recv_attr_t *);
  74 static void     tcp_wput_proto(void *, mblk_t *, void *, ip_recv_attr_t *);
  75 
  76 /*
  77  * This controls how tiny a write must be before we try to copy it
  78  * into the mblk on the tail of the transmit queue.  Not much
  79  * speedup is observed for values larger than sixteen.  Zero will
  80  * disable the optimisation.
  81  */
  82 static int tcp_tx_pull_len = 16;
  83 












  84 int
  85 tcp_wput(queue_t *q, mblk_t *mp)
  86 {
  87         conn_t  *connp = Q_TO_CONN(q);
  88         tcp_t   *tcp;
  89         void (*output_proc)();
  90         t_scalar_t type;
  91         uchar_t *rptr;
  92         struct iocblk   *iocp;
  93         size_t size;
  94 
  95         ASSERT(connp->conn_ref >= 2);
  96 
  97         switch (DB_TYPE(mp)) {
  98         case M_DATA:
  99                 tcp = connp->conn_tcp;
 100                 ASSERT(tcp != NULL);
 101 
 102                 size = msgdsize(mp);
 103 


 202 /*
 203  * The TCP normal data output path.
 204  * NOTE: the logic of the fast path is duplicated from this function.
 205  */
 206 void
 207 tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent)
 208 {
 209         int             len;
 210         mblk_t          *local_time;
 211         mblk_t          *mp1;
 212         uint32_t        snxt;
 213         int             tail_unsent;
 214         int             tcpstate;
 215         int             usable = 0;
 216         mblk_t          *xmit_tail;
 217         int32_t         mss;
 218         int32_t         num_sack_blk = 0;
 219         int32_t         total_hdr_len;
 220         int32_t         tcp_hdr_len;
 221         int             rc;
 222         tcp_stack_t     *tcps = tcp->tcp_tcps;
 223         conn_t          *connp = tcp->tcp_connp;
 224         clock_t         now = LBOLT_FASTPATH;
 225 
 226         tcpstate = tcp->tcp_state;
 227         if (mp == NULL) {
 228                 /*
 229                  * tcp_wput_data() with NULL mp should only be called when
 230                  * there is unsent data.
 231                  */
 232                 ASSERT(tcp->tcp_unsent > 0);
 233                 /* Really tacky... but we need this for detached closes. */
 234                 len = tcp->tcp_unsent;
 235                 goto data_null;
 236         }
 237 
 238         ASSERT(mp->b_datap->db_type == M_DATA);
 239         /*
 240          * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ,
 241          * or before a connection attempt has begun.
 242          */


 357          * includes SACK options.
 358          */
 359         if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
 360                 int32_t opt_len;
 361 
 362                 num_sack_blk = MIN(tcp->tcp_max_sack_blk,
 363                     tcp->tcp_num_sack_blk);
 364                 opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN *
 365                     2 + TCPOPT_HEADER_LEN;
 366                 mss = tcp->tcp_mss - opt_len;
 367                 total_hdr_len = connp->conn_ht_iphc_len + opt_len;
 368                 tcp_hdr_len = connp->conn_ht_ulp_len + opt_len;
 369         } else {
 370                 mss = tcp->tcp_mss;
 371                 total_hdr_len = connp->conn_ht_iphc_len;
 372                 tcp_hdr_len = connp->conn_ht_ulp_len;
 373         }
 374 
 375         if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
 376             (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
 377                 TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
 378         }
 379         if (tcpstate == TCPS_SYN_RCVD) {
 380                 /*
 381                  * The three-way connection establishment handshake is not
 382                  * complete yet. We want to queue the data for transmission
 383                  * after entering ESTABLISHED state (RFC793). A jump to
 384                  * "done" label effectively leaves data on the queue.
 385                  */
 386                 goto done;
 387         } else {
 388                 int usable_r;
 389 
 390                 /*
 391                  * In the special case when cwnd is zero, which can only
 392                  * happen if the connection is ECN capable, return now.
 393                  * New segments are sent using tcp_timer().  The timer
 394                  * is set in tcp_input_data().
 395                  */
 396                 if (tcp->tcp_cwnd == 0) {
 397                         /*


1178          *  |--------------|-----------------|
1179          *  tcp_suna       tcp_snxt       tcp_suna+tcp_swnd
1180          */
1181         /* END CSTYLED */
1182 
1183         /* start sending from tcp_snxt */
1184         snxt = tcp->tcp_snxt;
1185 
1186         /*
1187          * Check to see if this connection has been idle for some time and no
1188          * ACK is expected. If so, then the congestion window size is no longer
1189          * meaningfully tied to current network conditions.
1190          *
1191          * We reinitialize tcp_cwnd, and slow start again to get back the
1192          * connection's "self-clock" as described in Van Jacobson's 1988 paper
1193          * "Congestion avoidance and control".
1194          */
1195         now = LBOLT_FASTPATH;
1196         if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
1197             (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
1198                 TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
1199         }
1200 
1201         usable = tcp->tcp_swnd;              /* tcp window size */
1202         if (usable > tcp->tcp_cwnd)
1203                 usable = tcp->tcp_cwnd;      /* congestion window smaller */
1204         usable -= snxt;         /* subtract stuff already sent */
1205         suna = tcp->tcp_suna;
1206         usable += suna;
1207         /* usable can be < 0 if the congestion window is smaller */
1208         if (len > usable) {
1209                 /* Can't send complete M_DATA in one shot */
1210                 goto slow;
1211         }
1212 
1213         mutex_enter(&tcp->tcp_non_sq_lock);
1214         if (tcp->tcp_flow_stopped &&
1215             TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
1216                 tcp_clrqfull(tcp);
1217         }
1218         mutex_exit(&tcp->tcp_non_sq_lock);




   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
  25  * Copyright 2019 Joyent, Inc.
  26  */
  27 
  28 /* This file contains all TCP output processing functions. */
  29 
  30 #include <sys/types.h>
  31 #include <sys/stream.h>
  32 #include <sys/strsun.h>
  33 #include <sys/strsubr.h>
  34 #include <sys/stropts.h>
  35 #include <sys/strlog.h>
  36 #define _SUN_TPI_VERSION 2
  37 #include <sys/tihdr.h>
  38 #include <sys/suntpi.h>
  39 #include <sys/xti_inet.h>
  40 #include <sys/timod.h>
  41 #include <sys/pattr.h>
  42 #include <sys/squeue_impl.h>
  43 #include <sys/squeue.h>
  44 #include <sys/sockio.h>


  64                     int, ip_recv_attr_t *, ip_stack_t *, conn_t *);
  65 static boolean_t        tcp_send_rst_chk(tcp_stack_t *);
  66 static void     tcp_process_shrunk_swnd(tcp_t *, uint32_t);
  67 static void     tcp_fill_header(tcp_t *, uchar_t *, int);
  68 
  69 /*
  70  * Functions called directly via squeue having a prototype of edesc_t.
  71  */
  72 static void     tcp_wput_nondata(void *, mblk_t *, void *, ip_recv_attr_t *);
  73 static void     tcp_wput_ioctl(void *, mblk_t *, void *, ip_recv_attr_t *);
  74 static void     tcp_wput_proto(void *, mblk_t *, void *, ip_recv_attr_t *);
  75 
  76 /*
  77  * This controls how tiny a write must be before we try to copy it
  78  * into the mblk on the tail of the transmit queue.  Not much
  79  * speedup is observed for values larger than sixteen.  Zero will
  80  * disable the optimisation.
  81  */
  82 static int tcp_tx_pull_len = 16;
  83 
  84 static void
  85 cc_after_idle(tcp_t *tcp)
  86 {
  87         uint32_t old_cwnd = tcp->tcp_cwnd;
  88 
  89         if (CC_ALGO(tcp)->after_idle != NULL)
  90                 CC_ALGO(tcp)->after_idle(&tcp->tcp_ccv);
  91 
  92         DTRACE_PROBE3(cwnd__cc__after__idle, tcp_t *, tcp, uint32_t, old_cwnd,
  93             uint32_t, tcp->tcp_cwnd);
  94 }
  95 
  96 int
  97 tcp_wput(queue_t *q, mblk_t *mp)
  98 {
  99         conn_t  *connp = Q_TO_CONN(q);
 100         tcp_t   *tcp;
 101         void (*output_proc)();
 102         t_scalar_t type;
 103         uchar_t *rptr;
 104         struct iocblk   *iocp;
 105         size_t size;
 106 
 107         ASSERT(connp->conn_ref >= 2);
 108 
 109         switch (DB_TYPE(mp)) {
 110         case M_DATA:
 111                 tcp = connp->conn_tcp;
 112                 ASSERT(tcp != NULL);
 113 
 114                 size = msgdsize(mp);
 115 


 214 /*
 215  * The TCP normal data output path.
 216  * NOTE: the logic of the fast path is duplicated from this function.
 217  */
 218 void
 219 tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent)
 220 {
 221         int             len;
 222         mblk_t          *local_time;
 223         mblk_t          *mp1;
 224         uint32_t        snxt;
 225         int             tail_unsent;
 226         int             tcpstate;
 227         int             usable = 0;
 228         mblk_t          *xmit_tail;
 229         int32_t         mss;
 230         int32_t         num_sack_blk = 0;
 231         int32_t         total_hdr_len;
 232         int32_t         tcp_hdr_len;
 233         int             rc;

 234         conn_t          *connp = tcp->tcp_connp;
 235         clock_t         now = LBOLT_FASTPATH;
 236 
 237         tcpstate = tcp->tcp_state;
 238         if (mp == NULL) {
 239                 /*
 240                  * tcp_wput_data() with NULL mp should only be called when
 241                  * there is unsent data.
 242                  */
 243                 ASSERT(tcp->tcp_unsent > 0);
 244                 /* Really tacky... but we need this for detached closes. */
 245                 len = tcp->tcp_unsent;
 246                 goto data_null;
 247         }
 248 
 249         ASSERT(mp->b_datap->db_type == M_DATA);
 250         /*
 251          * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ,
 252          * or before a connection attempt has begun.
 253          */


 368          * includes SACK options.
 369          */
 370         if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
 371                 int32_t opt_len;
 372 
 373                 num_sack_blk = MIN(tcp->tcp_max_sack_blk,
 374                     tcp->tcp_num_sack_blk);
 375                 opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN *
 376                     2 + TCPOPT_HEADER_LEN;
 377                 mss = tcp->tcp_mss - opt_len;
 378                 total_hdr_len = connp->conn_ht_iphc_len + opt_len;
 379                 tcp_hdr_len = connp->conn_ht_ulp_len + opt_len;
 380         } else {
 381                 mss = tcp->tcp_mss;
 382                 total_hdr_len = connp->conn_ht_iphc_len;
 383                 tcp_hdr_len = connp->conn_ht_ulp_len;
 384         }
 385 
 386         if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
 387             (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
 388                 cc_after_idle(tcp);
 389         }
 390         if (tcpstate == TCPS_SYN_RCVD) {
 391                 /*
 392                  * The three-way connection establishment handshake is not
 393                  * complete yet. We want to queue the data for transmission
 394                  * after entering ESTABLISHED state (RFC793). A jump to
 395                  * "done" label effectively leaves data on the queue.
 396                  */
 397                 goto done;
 398         } else {
 399                 int usable_r;
 400 
 401                 /*
 402                  * In the special case when cwnd is zero, which can only
 403                  * happen if the connection is ECN capable, return now.
 404                  * New segments are sent using tcp_timer().  The timer
 405                  * is set in tcp_input_data().
 406                  */
 407                 if (tcp->tcp_cwnd == 0) {
 408                         /*


1189          *  |--------------|-----------------|
1190          *  tcp_suna       tcp_snxt       tcp_suna+tcp_swnd
1191          */
1192         /* END CSTYLED */
1193 
1194         /* start sending from tcp_snxt */
1195         snxt = tcp->tcp_snxt;
1196 
1197         /*
1198          * Check to see if this connection has been idle for some time and no
1199          * ACK is expected. If so, then the congestion window size is no longer
1200          * meaningfully tied to current network conditions.
1201          *
1202          * We reinitialize tcp_cwnd, and slow start again to get back the
1203          * connection's "self-clock" as described in Van Jacobson's 1988 paper
1204          * "Congestion avoidance and control".
1205          */
1206         now = LBOLT_FASTPATH;
1207         if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
1208             (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
1209                 cc_after_idle(tcp);
1210         }
1211 
1212         usable = tcp->tcp_swnd;              /* tcp window size */
1213         if (usable > tcp->tcp_cwnd)
1214                 usable = tcp->tcp_cwnd;      /* congestion window smaller */
1215         usable -= snxt;         /* subtract stuff already sent */
1216         suna = tcp->tcp_suna;
1217         usable += suna;
1218         /* usable can be < 0 if the congestion window is smaller */
1219         if (len > usable) {
1220                 /* Can't send complete M_DATA in one shot */
1221                 goto slow;
1222         }
1223 
1224         mutex_enter(&tcp->tcp_non_sq_lock);
1225         if (tcp->tcp_flow_stopped &&
1226             TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
1227                 tcp_clrqfull(tcp);
1228         }
1229         mutex_exit(&tcp->tcp_non_sq_lock);