4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
25 * Copyright 2019 Joyent, Inc.
26 */
27
28 /* This file contains all TCP output processing functions. */
29
30 #include <sys/types.h>
31 #include <sys/stream.h>
32 #include <sys/strsun.h>
33 #include <sys/strsubr.h>
34 #include <sys/stropts.h>
35 #include <sys/strlog.h>
36 #define _SUN_TPI_VERSION 2
37 #include <sys/tihdr.h>
38 #include <sys/suntpi.h>
39 #include <sys/xti_inet.h>
40 #include <sys/timod.h>
41 #include <sys/pattr.h>
42 #include <sys/squeue_impl.h>
43 #include <sys/squeue.h>
44 #include <sys/sockio.h>
64 int, ip_recv_attr_t *, ip_stack_t *, conn_t *);
65 static boolean_t tcp_send_rst_chk(tcp_stack_t *);
66 static void tcp_process_shrunk_swnd(tcp_t *, uint32_t);
67 static void tcp_fill_header(tcp_t *, uchar_t *, int);
68
69 /*
70 * Functions called directly via squeue having a prototype of edesc_t.
71 */
72 static void tcp_wput_nondata(void *, mblk_t *, void *, ip_recv_attr_t *);
73 static void tcp_wput_ioctl(void *, mblk_t *, void *, ip_recv_attr_t *);
74 static void tcp_wput_proto(void *, mblk_t *, void *, ip_recv_attr_t *);
75
76 /*
77 * This controls how tiny a write must be before we try to copy it
78 * into the mblk on the tail of the transmit queue. Not much
79 * speedup is observed for values larger than sixteen. Zero will
80 * disable the optimisation.
81 */
82 static int tcp_tx_pull_len = 16;
83
84 int
85 tcp_wput(queue_t *q, mblk_t *mp)
86 {
87 conn_t *connp = Q_TO_CONN(q);
88 tcp_t *tcp;
89 void (*output_proc)();
90 t_scalar_t type;
91 uchar_t *rptr;
92 struct iocblk *iocp;
93 size_t size;
94
95 ASSERT(connp->conn_ref >= 2);
96
97 switch (DB_TYPE(mp)) {
98 case M_DATA:
99 tcp = connp->conn_tcp;
100 ASSERT(tcp != NULL);
101
102 size = msgdsize(mp);
103
202 /*
203 * The TCP normal data output path.
204 * NOTE: the logic of the fast path is duplicated from this function.
205 */
206 void
207 tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent)
208 {
209 int len;
210 mblk_t *local_time;
211 mblk_t *mp1;
212 uint32_t snxt;
213 int tail_unsent;
214 int tcpstate;
215 int usable = 0;
216 mblk_t *xmit_tail;
217 int32_t mss;
218 int32_t num_sack_blk = 0;
219 int32_t total_hdr_len;
220 int32_t tcp_hdr_len;
221 int rc;
222 tcp_stack_t *tcps = tcp->tcp_tcps;
223 conn_t *connp = tcp->tcp_connp;
224 clock_t now = LBOLT_FASTPATH;
225
226 tcpstate = tcp->tcp_state;
227 if (mp == NULL) {
228 /*
229 * tcp_wput_data() with NULL mp should only be called when
230 * there is unsent data.
231 */
232 ASSERT(tcp->tcp_unsent > 0);
233 /* Really tacky... but we need this for detached closes. */
234 len = tcp->tcp_unsent;
235 goto data_null;
236 }
237
238 ASSERT(mp->b_datap->db_type == M_DATA);
239 /*
240 * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ,
241 * or before a connection attempt has begun.
242 */
357 * includes SACK options.
358 */
359 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
360 int32_t opt_len;
361
362 num_sack_blk = MIN(tcp->tcp_max_sack_blk,
363 tcp->tcp_num_sack_blk);
364 opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN *
365 2 + TCPOPT_HEADER_LEN;
366 mss = tcp->tcp_mss - opt_len;
367 total_hdr_len = connp->conn_ht_iphc_len + opt_len;
368 tcp_hdr_len = connp->conn_ht_ulp_len + opt_len;
369 } else {
370 mss = tcp->tcp_mss;
371 total_hdr_len = connp->conn_ht_iphc_len;
372 tcp_hdr_len = connp->conn_ht_ulp_len;
373 }
374
375 if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
376 (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
377 TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
378 }
379 if (tcpstate == TCPS_SYN_RCVD) {
380 /*
381 * The three-way connection establishment handshake is not
382 * complete yet. We want to queue the data for transmission
383 * after entering ESTABLISHED state (RFC793). A jump to
384 * "done" label effectively leaves data on the queue.
385 */
386 goto done;
387 } else {
388 int usable_r;
389
390 /*
391 * In the special case when cwnd is zero, which can only
392 * happen if the connection is ECN capable, return now.
393 * New segments are sent using tcp_timer(). The timer
394 * is set in tcp_input_data().
395 */
396 if (tcp->tcp_cwnd == 0) {
397 /*
1178 * |--------------|-----------------|
1179 * tcp_suna tcp_snxt tcp_suna+tcp_swnd
1180 */
1181 /* END CSTYLED */
1182
1183 /* start sending from tcp_snxt */
1184 snxt = tcp->tcp_snxt;
1185
1186 /*
1187 * Check to see if this connection has been idle for some time and no
1188 * ACK is expected. If so, then the congestion window size is no longer
1189 * meaningfully tied to current network conditions.
1190 *
1191 * We reinitialize tcp_cwnd, and slow start again to get back the
1192 * connection's "self-clock" as described in Van Jacobson's 1988 paper
1193 * "Congestion avoidance and control".
1194 */
1195 now = LBOLT_FASTPATH;
1196 if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
1197 (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
1198 TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
1199 }
1200
1201 usable = tcp->tcp_swnd; /* tcp window size */
1202 if (usable > tcp->tcp_cwnd)
1203 usable = tcp->tcp_cwnd; /* congestion window smaller */
1204 usable -= snxt; /* subtract stuff already sent */
1205 suna = tcp->tcp_suna;
1206 usable += suna;
1207 /* usable can be < 0 if the congestion window is smaller */
1208 if (len > usable) {
1209 /* Can't send complete M_DATA in one shot */
1210 goto slow;
1211 }
1212
1213 mutex_enter(&tcp->tcp_non_sq_lock);
1214 if (tcp->tcp_flow_stopped &&
1215 TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
1216 tcp_clrqfull(tcp);
1217 }
1218 mutex_exit(&tcp->tcp_non_sq_lock);
|
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
25 * Copyright 2019 Joyent, Inc.
26 */
27
28 /* This file contains all TCP output processing functions. */
29
30 #include <sys/types.h>
31 #include <sys/stream.h>
32 #include <sys/strsun.h>
33 #include <sys/strsubr.h>
34 #include <sys/stropts.h>
35 #include <sys/strlog.h>
36 #define _SUN_TPI_VERSION 2
37 #include <sys/tihdr.h>
38 #include <sys/suntpi.h>
39 #include <sys/xti_inet.h>
40 #include <sys/timod.h>
41 #include <sys/pattr.h>
42 #include <sys/squeue_impl.h>
43 #include <sys/squeue.h>
44 #include <sys/sockio.h>
64 int, ip_recv_attr_t *, ip_stack_t *, conn_t *);
65 static boolean_t tcp_send_rst_chk(tcp_stack_t *);
66 static void tcp_process_shrunk_swnd(tcp_t *, uint32_t);
67 static void tcp_fill_header(tcp_t *, uchar_t *, int);
68
69 /*
70 * Functions called directly via squeue having a prototype of edesc_t.
71 */
72 static void tcp_wput_nondata(void *, mblk_t *, void *, ip_recv_attr_t *);
73 static void tcp_wput_ioctl(void *, mblk_t *, void *, ip_recv_attr_t *);
74 static void tcp_wput_proto(void *, mblk_t *, void *, ip_recv_attr_t *);
75
76 /*
77 * This controls how tiny a write must be before we try to copy it
78 * into the mblk on the tail of the transmit queue. Not much
79 * speedup is observed for values larger than sixteen. Zero will
80 * disable the optimisation.
81 */
82 static int tcp_tx_pull_len = 16;
83
84 static void
85 cc_after_idle(tcp_t *tcp)
86 {
87 uint32_t old_cwnd = tcp->tcp_cwnd;
88
89 if (CC_ALGO(tcp)->after_idle != NULL)
90 CC_ALGO(tcp)->after_idle(&tcp->tcp_ccv);
91
92 DTRACE_PROBE3(cwnd__cc__after__idle, tcp_t *, tcp, uint32_t, old_cwnd,
93 uint32_t, tcp->tcp_cwnd);
94 }
95
96 int
97 tcp_wput(queue_t *q, mblk_t *mp)
98 {
99 conn_t *connp = Q_TO_CONN(q);
100 tcp_t *tcp;
101 void (*output_proc)();
102 t_scalar_t type;
103 uchar_t *rptr;
104 struct iocblk *iocp;
105 size_t size;
106
107 ASSERT(connp->conn_ref >= 2);
108
109 switch (DB_TYPE(mp)) {
110 case M_DATA:
111 tcp = connp->conn_tcp;
112 ASSERT(tcp != NULL);
113
114 size = msgdsize(mp);
115
214 /*
215 * The TCP normal data output path.
216 * NOTE: the logic of the fast path is duplicated from this function.
217 */
218 void
219 tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent)
220 {
221 int len;
222 mblk_t *local_time;
223 mblk_t *mp1;
224 uint32_t snxt;
225 int tail_unsent;
226 int tcpstate;
227 int usable = 0;
228 mblk_t *xmit_tail;
229 int32_t mss;
230 int32_t num_sack_blk = 0;
231 int32_t total_hdr_len;
232 int32_t tcp_hdr_len;
233 int rc;
234 conn_t *connp = tcp->tcp_connp;
235 clock_t now = LBOLT_FASTPATH;
236
237 tcpstate = tcp->tcp_state;
238 if (mp == NULL) {
239 /*
240 * tcp_wput_data() with NULL mp should only be called when
241 * there is unsent data.
242 */
243 ASSERT(tcp->tcp_unsent > 0);
244 /* Really tacky... but we need this for detached closes. */
245 len = tcp->tcp_unsent;
246 goto data_null;
247 }
248
249 ASSERT(mp->b_datap->db_type == M_DATA);
250 /*
251 * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ,
252 * or before a connection attempt has begun.
253 */
368 * includes SACK options.
369 */
370 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
371 int32_t opt_len;
372
373 num_sack_blk = MIN(tcp->tcp_max_sack_blk,
374 tcp->tcp_num_sack_blk);
375 opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN *
376 2 + TCPOPT_HEADER_LEN;
377 mss = tcp->tcp_mss - opt_len;
378 total_hdr_len = connp->conn_ht_iphc_len + opt_len;
379 tcp_hdr_len = connp->conn_ht_ulp_len + opt_len;
380 } else {
381 mss = tcp->tcp_mss;
382 total_hdr_len = connp->conn_ht_iphc_len;
383 tcp_hdr_len = connp->conn_ht_ulp_len;
384 }
385
386 if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
387 (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
388 cc_after_idle(tcp);
389 }
390 if (tcpstate == TCPS_SYN_RCVD) {
391 /*
392 * The three-way connection establishment handshake is not
393 * complete yet. We want to queue the data for transmission
394 * after entering ESTABLISHED state (RFC793). A jump to
395 * "done" label effectively leaves data on the queue.
396 */
397 goto done;
398 } else {
399 int usable_r;
400
401 /*
402 * In the special case when cwnd is zero, which can only
403 * happen if the connection is ECN capable, return now.
404 * New segments are sent using tcp_timer(). The timer
405 * is set in tcp_input_data().
406 */
407 if (tcp->tcp_cwnd == 0) {
408 /*
1189 * |--------------|-----------------|
1190 * tcp_suna tcp_snxt tcp_suna+tcp_swnd
1191 */
1192 /* END CSTYLED */
1193
1194 /* start sending from tcp_snxt */
1195 snxt = tcp->tcp_snxt;
1196
1197 /*
1198 * Check to see if this connection has been idle for some time and no
1199 * ACK is expected. If so, then the congestion window size is no longer
1200 * meaningfully tied to current network conditions.
1201 *
1202 * We reinitialize tcp_cwnd, and slow start again to get back the
1203 * connection's "self-clock" as described in Van Jacobson's 1988 paper
1204 * "Congestion avoidance and control".
1205 */
1206 now = LBOLT_FASTPATH;
1207 if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
1208 (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
1209 cc_after_idle(tcp);
1210 }
1211
1212 usable = tcp->tcp_swnd; /* tcp window size */
1213 if (usable > tcp->tcp_cwnd)
1214 usable = tcp->tcp_cwnd; /* congestion window smaller */
1215 usable -= snxt; /* subtract stuff already sent */
1216 suna = tcp->tcp_suna;
1217 usable += suna;
1218 /* usable can be < 0 if the congestion window is smaller */
1219 if (len > usable) {
1220 /* Can't send complete M_DATA in one shot */
1221 goto slow;
1222 }
1223
1224 mutex_enter(&tcp->tcp_non_sq_lock);
1225 if (tcp->tcp_flow_stopped &&
1226 TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
1227 tcp_clrqfull(tcp);
1228 }
1229 mutex_exit(&tcp->tcp_non_sq_lock);
|