4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2014 by Delphix. All rights reserved.
25 */
26
27 /* This file contains all TCP output processing functions. */
28
29 #include <sys/types.h>
30 #include <sys/stream.h>
31 #include <sys/strsun.h>
32 #include <sys/strsubr.h>
33 #include <sys/stropts.h>
34 #include <sys/strlog.h>
35 #define _SUN_TPI_VERSION 2
36 #include <sys/tihdr.h>
37 #include <sys/suntpi.h>
38 #include <sys/xti_inet.h>
39 #include <sys/timod.h>
40 #include <sys/pattr.h>
41 #include <sys/squeue_impl.h>
42 #include <sys/squeue.h>
43 #include <sys/sockio.h>
44 #include <sys/tsol/tnet.h>
45
46 #include <inet/common.h>
47 #include <inet/ip.h>
48 #include <inet/tcp.h>
49 #include <inet/tcp_impl.h>
50 #include <inet/snmpcom.h>
51 #include <inet/proto_set.h>
52 #include <inet/ipsec_impl.h>
53 #include <inet/ip_ndp.h>
54
55 static mblk_t *tcp_get_seg_mp(tcp_t *, uint32_t, int32_t *);
56 static void tcp_wput_cmdblk(queue_t *, mblk_t *);
57 static void tcp_wput_flush(tcp_t *, mblk_t *);
58 static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
59 static int tcp_xmit_end(tcp_t *);
60 static int tcp_send(tcp_t *, const int, const int, const int,
61 const int, int *, uint_t *, int *, mblk_t **, mblk_t *);
62 static void tcp_xmit_early_reset(char *, mblk_t *, uint32_t, uint32_t,
63 int, ip_recv_attr_t *, ip_stack_t *, conn_t *);
64 static boolean_t tcp_send_rst_chk(tcp_stack_t *);
65 static void tcp_process_shrunk_swnd(tcp_t *, uint32_t);
66 static void tcp_fill_header(tcp_t *, uchar_t *, clock_t, int);
67
68 /*
69 * Functions called directly via squeue having a prototype of edesc_t.
70 */
71 static void tcp_wput_nondata(void *, mblk_t *, void *, ip_recv_attr_t *);
72 static void tcp_wput_ioctl(void *, mblk_t *, void *, ip_recv_attr_t *);
73 static void tcp_wput_proto(void *, mblk_t *, void *, ip_recv_attr_t *);
74
75 /*
76 * This controls how tiny a write must be before we try to copy it
77 * into the mblk on the tail of the transmit queue. Not much
78 * speedup is observed for values larger than sixteen. Zero will
79 * disable the optimisation.
80 */
81 static int tcp_tx_pull_len = 16;
82
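/*
 * Illustrative sketch, not part of this file: roughly how a threshold
 * like tcp_tx_pull_len is applied.  A write shorter than the threshold
 * is copied into the spare space at the end of the last mblk already on
 * the transmit queue instead of being linked on as a new mblk, keeping
 * the chain short.  The helper name tcp_try_tail_copy() is hypothetical;
 * the mblk fields (b_wptr, b_datap) are standard STREAMS fields.
 */
static boolean_t
tcp_try_tail_copy(tcp_t *tcp, const uchar_t *buf, size_t len)
{
	mblk_t *tail = tcp->tcp_xmit_last;

	if (len == 0 || len > (size_t)tcp_tx_pull_len || tail == NULL)
		return (B_FALSE);

	/* Never scribble on a dblk that someone else also references. */
	if (tail->b_datap->db_ref != 1)
		return (B_FALSE);

	/* Only copy if the tail mblk has enough unused space left. */
	if ((size_t)(tail->b_datap->db_lim - tail->b_wptr) < len)
		return (B_FALSE);

	bcopy(buf, tail->b_wptr, len);
	tail->b_wptr += len;
	return (B_TRUE);
}
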
83 int
84 tcp_wput(queue_t *q, mblk_t *mp)
85 {
86 conn_t *connp = Q_TO_CONN(q);
437 return;
438 }
439
440 /* usable = MIN(swnd, cwnd) - unacked_bytes */
441 if (tcp->tcp_swnd > tcp->tcp_cwnd)
442 usable_r -= tcp->tcp_swnd - tcp->tcp_cwnd;
443
444 /* usable = MIN(usable, unsent) */
445 if (usable_r > len)
446 usable_r = len;
447
448 /* usable = MAX(usable, {1 for urgent, 0 for data}) */
449 if (usable_r > 0) {
450 usable = usable_r;
451 } else {
452 /* Bypass all other unnecessary processing. */
453 goto done;
454 }
455 }
456
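/*
 * Illustrative sketch, not part of this file: the send-window arithmetic
 * from the comments above collected in one place.  At most MIN(swnd,
 * cwnd) bytes may be outstanding; data already in flight (snxt - suna)
 * is subtracted, and the result is clipped to the amount of unsent data.
 * Field names mirror the tcp_t members used above; the helper name is
 * hypothetical.
 */
static int
tcp_usable_send_window(const tcp_t *tcp, int unsent)
{
	int64_t window = tcp->tcp_swnd;
	int64_t usable;

	if (window > tcp->tcp_cwnd)
		window = tcp->tcp_cwnd;	/* cwnd is the tighter limit */

	/* Subtract data already sent but not yet acknowledged. */
	usable = window - (int64_t)(tcp->tcp_snxt - tcp->tcp_suna);

	if (usable > unsent)
		usable = unsent;	/* cannot send more than is queued */
	if (usable < 0)
		usable = 0;	/* a window may shrink below what is in flight */
	return ((int)usable);
}
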
457 local_time = (mblk_t *)now;
458
459 /*
460 * "Our" Nagle Algorithm. This is not the same as in the old
461 * BSD. This is more in line with the true intent of Nagle.
462 *
463 * The conditions are:
464 * 1. The amount of unsent data (or amount of data which can be
465 * sent, whichever is smaller) is less than Nagle limit.
466 * 2. The last sent size is also less than Nagle limit.
467 * 3. There is unack'ed data.
468 * 4. Urgent pointer is not set. Send urgent data ignoring the
469 * Nagle algorithm. This reduces the probability that urgent
470 * bytes get "merged" together.
471 * 5. The app has not closed the connection. This eliminates the
472 * wait time of the receiving side waiting for the last piece of
473 * (small) data.
474 *
475 * If all are satisfied, exit without sending anything. Note
476 * that Nagle limit can be smaller than 1 MSS. Nagle limit is
477 * the smaller of 1 MSS and global tcp_naglim_def (default to be
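
/*
 * Illustrative sketch, not part of this file: the five Nagle conditions
 * listed above expressed as a single predicate.  When it returns B_TRUE
 * the small segment is held back until the outstanding data has been
 * acknowledged.  The helper name is hypothetical; tcp_naglim is assumed
 * to already hold the smaller of 1 MSS and tcp_naglim_def, and the
 * urgent/app_closed flags stand in for the corresponding state checks.
 */
static boolean_t
tcp_nagle_defer(const tcp_t *tcp, int sendable, boolean_t urgent,
    boolean_t app_closed)
{
	if (sendable >= (int)tcp->tcp_naglim)		/* condition 1 */
		return (B_FALSE);
	if (tcp->tcp_last_sent_len >= tcp->tcp_naglim)	/* condition 2 */
		return (B_FALSE);
	if (tcp->tcp_suna == tcp->tcp_snxt)	/* condition 3: nothing unacked */
		return (B_FALSE);
	if (urgent)					/* condition 4 */
		return (B_FALSE);
	if (app_closed)					/* condition 5 */
		return (B_FALSE);
	return (B_TRUE);				/* defer this small send */
}
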
1166 tcp->tcp_xmit_head = mp;
1167 } else {
1168 tcp->tcp_xmit_last->b_cont = mp;
1169 }
1170 tcp->tcp_xmit_last = mp;
1171 tcp->tcp_xmit_tail = mp;
1172
1173 /* find out how much we can send */
1174 /* BEGIN CSTYLED */
1175 /*
1176 * un-acked usable
1177 * |--------------|-----------------|
1178 * tcp_suna tcp_snxt tcp_suna+tcp_swnd
1179 */
1180 /* END CSTYLED */
1181
1182 /* start sending from tcp_snxt */
1183 snxt = tcp->tcp_snxt;
1184
1185 /*
1186 * Check to see if this connection has been idle for some
1187 * time and no ACK is expected. If so, we need to slow
1188 * start again to get back the connection's "self-clock" as
1189 * described in VJ's paper.
1190 *
1191 * Reinitialize tcp_cwnd after idle.
1192 */
1193 now = LBOLT_FASTPATH;
1194 if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
1195 (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
1196 TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
1197 }
1198
1199 usable = tcp->tcp_swnd; /* tcp window size */
1200 if (usable > tcp->tcp_cwnd)
1201 usable = tcp->tcp_cwnd; /* congestion window smaller */
1202 usable -= snxt; /* subtract stuff already sent */
1203 suna = tcp->tcp_suna;
1204 usable += suna;
1205 /* usable can be < 0 if the congestion window is smaller */
1206 if (len > usable) {
1207 /* Can't send complete M_DATA in one shot */
1208 goto slow;
1209 }
1210
1211 mutex_enter(&tcp->tcp_non_sq_lock);
1239 return;
1240 }
1241
1242 /*
1243 * len <= tcp->tcp_mss && len == unsent so no sender silly window. Can
1244 * send now.
1245 */
1246
1247 if (snxt == suna) {
1248 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
1249 }
1250
1251 /* we have always sent something */
1252 tcp->tcp_rack_cnt = 0;
1253
1254 tcp->tcp_snxt = snxt + len;
1255 tcp->tcp_rack = tcp->tcp_rnxt;
1256
1257 if ((mp1 = dupb(mp)) == 0)
1258 goto no_memory;
1259 mp->b_prev = (mblk_t *)(uintptr_t)now;
1260 mp->b_next = (mblk_t *)(uintptr_t)snxt;
1261
1262 /* adjust tcp header information */
1263 tcpha = tcp->tcp_tcpha;
1264 tcpha->tha_flags = (TH_ACK|TH_PUSH);
1265
1266 sum = len + connp->conn_ht_ulp_len + connp->conn_sum;
1267 sum = (sum >> 16) + (sum & 0xFFFF);
1268 tcpha->tha_sum = htons(sum);
1269
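/*
 * Illustrative sketch, not part of this file: the checksum lines above
 * seed tha_sum with a partial one's-complement sum (the payload length
 * plus the precomputed pseudo-header/ULP sum in conn_sum) and fold the
 * carry out of the high 16 bits back in once.  The value stored here is
 * a partial sum that later checksum processing (software or hardware
 * offload) completes; the fully general fold used when finishing a
 * checksum loops until no carry remains, as in this hypothetical helper.
 */
static uint16_t
csum_fold(uint32_t sum)
{
	/* Fold a 32-bit one's-complement accumulator down to 16 bits. */
	while ((sum >> 16) != 0)
		sum = (sum & 0xFFFF) + (sum >> 16);
	return ((uint16_t)sum);
}
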
1270 tcpha->tha_seq = htonl(snxt);
1271
1272 TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
1273 TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
1274 BUMP_LOCAL(tcp->tcp_obsegs);
1275
1276 /* Update the latest receive window size in TCP header. */
1277 tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
1278
1279 tcp->tcp_last_sent_len = (ushort_t)len;
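
/*
 * Illustrative sketch, not part of this file: what the tha_win update
 * above encodes.  The TCP header's window field is only 16 bits wide,
 * so with the window scale option (RFC 7323) the advertised value is
 * the receive window shifted right by the negotiated shift count
 * (tcp_rcv_ws), and the peer shifts it back left.  The clamp below is a
 * defensive addition in the sketch; the line above does not clamp.
 */
static uint16_t
tcp_advertised_window(uint32_t rwnd, uint32_t rcv_ws)
{
	uint32_t win = rwnd >> rcv_ws;

	if (win > 65535)
		win = 65535;	/* the 16-bit field can hold no more */
	return ((uint16_t)win);
}
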
1294 rptr = mp1->b_rptr - hdrlen;
1295 db = mp1->b_datap;
1296 if ((db->db_ref != 2) || rptr < db->db_base ||
1297 (!OK_32PTR(rptr))) {
1298 /* NOTE: we assume allocb returns an OK_32PTR */
1299 mp = allocb(hdrlen + tcps->tcps_wroff_xtra, BPRI_MED);
1300 if (!mp) {
1301 freemsg(mp1);
1302 goto no_memory;
1303 }
1304 mp->b_cont = mp1;
1305 mp1 = mp;
1306 /* Leave room for Link Level header */
1307 rptr = &mp1->b_rptr[tcps->tcps_wroff_xtra];
1308 mp1->b_wptr = &rptr[hdrlen];
1309 }
1310 mp1->b_rptr = rptr;
1311
1312 /* Fill in the timestamp option. */
1313 if (tcp->tcp_snd_ts_ok) {
1314 uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
1315
1316 U32_TO_BE32(llbolt,
1317 (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
1318 U32_TO_BE32(tcp->tcp_ts_recent,
1319 (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
1320 } else {
1321 ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
1322 }
1323
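/*
 * Illustrative sketch, not part of this file: the option bytes the two
 * U32_TO_BE32() calls above are patching.  Assuming the common layout
 * of two NOPs followed by the timestamp option (kind 8, length 10)
 * directly after the 20-byte fixed header, TSval lands at offset
 * TCP_MIN_HEADER_LENGTH + 4 and TSecr at TCP_MIN_HEADER_LENGTH + 8,
 * which is exactly where the code writes.  The struct is descriptive
 * only.
 */
typedef struct tcp_ts_option {
	uint8_t		ts_nop1;	/* 0x01, padding for alignment */
	uint8_t		ts_nop2;	/* 0x01 */
	uint8_t		ts_kind;	/* 8: timestamp option */
	uint8_t		ts_len;		/* 10 */
	uint32_t	ts_val;		/* sender's timestamp clock, network order */
	uint32_t	ts_ecr;		/* echo of the peer's TSval, network order */
} tcp_ts_option_t;
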
1324 /* copy header into outgoing packet */
1325 dst = (ipaddr_t *)rptr;
1326 src = (ipaddr_t *)connp->conn_ht_iphc;
1327 dst[0] = src[0];
1328 dst[1] = src[1];
1329 dst[2] = src[2];
1330 dst[3] = src[3];
1331 dst[4] = src[4];
1332 dst[5] = src[5];
1333 dst[6] = src[6];
1334 dst[7] = src[7];
1335 dst[8] = src[8];
1336 dst[9] = src[9];
1337 if (hdrlen -= 40) {
1338 hdrlen >>= 2;
1339 dst += 10;
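
/*
 * Illustrative sketch, not part of this file: the unrolled assignments
 * above copy the first 40 bytes of the template header (a plain IPv4
 * plus TCP header) as ten aligned 32-bit stores, and the "hdrlen -= 40"
 * tail then copies whatever remains (TCP options, or the larger IPv6
 * header) one word at a time.  A generic equivalent of the whole copy
 * is the hypothetical helper below; both rely on the 32-bit alignment
 * that the OK_32PTR() checks above guarantee.
 */
static void
tcp_copy_header_words(uint32_t *dst, const uint32_t *src, int hdrlen)
{
	int words = hdrlen >> 2;	/* header lengths are multiples of 4 */
	int i;

	for (i = 0; i < words; i++)
		dst[i] = src[i];
}
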
1754 /*
1755 * In the off-chance that the eager received and responded to
1756 * some other packet while the SYN|ACK was queued, we recalculate
1757 * the ixa_pktlen. It would be better to fix the SYN/accept
1758 * multithreading scheme to avoid this complexity.
1759 */
1760 ixa->ixa_pktlen = msgdsize(mp);
1761 (void) conn_ip_output(mp, ixa);
1762 }
1763
1764 /*
1765 * tcp_send() is called by tcp_wput_data() and returns one of the following:
1766 *
1767 * -1 = failed allocation.
1768 * 0 = We've either successfully sent data, or our usable send window is too
1769 * small and we'd rather wait until later before sending again.
1770 */
1771 static int
1772 tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len,
1773 const int tcp_hdr_len, const int num_sack_blk, int *usable,
1774 uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time)
1775 {
1776 int num_lso_seg = 1;
1777 uint_t lso_usable;
1778 boolean_t do_lso_send = B_FALSE;
1779 tcp_stack_t *tcps = tcp->tcp_tcps;
1780 conn_t *connp = tcp->tcp_connp;
1781 ip_xmit_attr_t *ixa = connp->conn_ixa;
1782
1783 /*
1784 * Check LSO possibility. The value of tcp->tcp_lso indicates whether
1785 * the underlying connection is LSO capable. Whether there is enough
1786 * available data to initiate an LSO transmission is checked in the
1787 * for(){} loops below.
1788 */
1789 if (tcp->tcp_lso && (tcp->tcp_valid_bits & ~TCP_FSS_VALID) == 0)
1790 do_lso_send = B_TRUE;
1791
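/*
 * Illustrative sketch, not part of this file: the general idea behind
 * do_lso_send and num_lso_seg declared above.  When the connection is
 * LSO capable and more than one MSS of usable window is available, a
 * multiple of the MSS can be handed to the NIC as one large send and
 * segmented in hardware.  The lso_max cap is an assumption standing in
 * for whatever per-interface limit applies; the helper is hypothetical
 * and only shows the shape of the calculation done in the loop below.
 */
static int
tcp_lso_segments(boolean_t do_lso_send, int usable, int mss, int lso_max)
{
	int nseg;

	if (!do_lso_send || usable <= mss)
		return (1);		/* ordinary single-segment send */

	nseg = usable / mss;		/* full segments the window allows */
	if (nseg * mss > lso_max)	/* respect the device's LSO limit */
		nseg = lso_max / mss;
	if (nseg < 1)
		nseg = 1;
	return (nseg);
}
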
1792 for (;;) {
1793 struct datab *db;
1794 tcpha_t *tcpha;
2049
2050 must_alloc:;
2051 mp1 = allocb(connp->conn_ht_iphc_allocated +
2052 tcps->tcps_wroff_xtra, BPRI_MED);
2053 if (mp1 == NULL) {
2054 freemsg(mp);
2055 return (-1); /* out_of_mem */
2056 }
2057 mp1->b_cont = mp;
2058 mp = mp1;
2059 /* Leave room for Link Level header */
2060 len = total_hdr_len;
2061 rptr = &mp->b_rptr[tcps->tcps_wroff_xtra];
2062 mp->b_wptr = &rptr[len];
2063 }
2064
2065 /*
2066 * Fill in the header using the template header, and add
2067 * options such as time-stamp, ECN and/or SACK, as needed.
2068 */
2069 tcp_fill_header(tcp, rptr, (clock_t)local_time, num_sack_blk);
2070
2071 mp->b_rptr = rptr;
2072
2073 if (*tail_unsent) {
2074 int spill = *tail_unsent;
2075
2076 mp1 = mp->b_cont;
2077 if (mp1 == NULL)
2078 mp1 = mp;
2079
2080 /*
2081 * If we're a little short, tack on more mblks until
2082 * there is no more spillover.
2083 */
2084 while (spill < 0) {
2085 mblk_t *nmp;
2086 int nmpsz;
2087
2088 nmp = (*xmit_tail)->b_cont;
2089 nmpsz = MBLKL(nmp);
2267 * so we have to check that and unset it first.
2268 */
2269 if (tcp->tcp_cork)
2270 tcp->tcp_cork = B_FALSE;
2271 tcp_wput_data(tcp, NULL, B_FALSE);
2272 }
2273
2274 /*
2275 * If TCP does not get enough samples of RTT or tcp_rtt_updates
2276 * is 0, don't update the cache.
2277 */
2278 if (tcps->tcps_rtt_updates == 0 ||
2279 tcp->tcp_rtt_update < tcps->tcps_rtt_updates)
2280 return (0);
2281
2282 /*
2283 * We do not have a good algorithm to update ssthresh at this time.
2284 * So don't do any update.
2285 */
2286 bzero(&uinfo, sizeof (uinfo));
2287 uinfo.iulp_rtt = tcp->tcp_rtt_sa;
2288 uinfo.iulp_rtt_sd = tcp->tcp_rtt_sd;
2289
2290 /*
2291 * Note that uinfo is kept for conn_faddr in the DCE. We could update it
2292 * even if the connection is source routed, but we don't.
2293 */
2294 if (connp->conn_ipversion == IPV4_VERSION) {
2295 if (connp->conn_faddr_v4 != tcp->tcp_ipha->ipha_dst) {
2296 return (0);
2297 }
2298 (void) dce_update_uinfo_v4(connp->conn_faddr_v4, &uinfo, ipst);
2299 } else {
2300 uint_t ifindex;
2301
2302 if (!(IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
2303 &tcp->tcp_ip6h->ip6_dst))) {
2304 return (0);
2305 }
2306 ifindex = 0;
2307 if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_faddr_v6)) {
2308 ip_xmit_attr_t *ixa = connp->conn_ixa;
3372 ASSERT(snxt_mp != NULL);
3373 /* This should not happen. Defensive coding again... */
3374 if (snxt_mp == NULL) {
3375 return;
3376 }
3377
3378 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, seg_len, &off,
3379 &tmp_mp, begin, B_TRUE, &seg_len, B_TRUE);
3380 if (xmit_mp == NULL)
3381 return;
3382
3383 usable_swnd -= seg_len;
3384 tcp->tcp_pipe += seg_len;
3385 tcp->tcp_sack_snxt = begin + seg_len;
3386
3387 tcp_send_data(tcp, xmit_mp);
3388
3389 /*
3390 * Update the send timestamp to avoid false retransmission.
3391 */
3392 snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
3393
3394 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3395 TCPS_UPDATE_MIB(tcps, tcpRetransBytes, seg_len);
3396 TCPS_BUMP_MIB(tcps, tcpOutSackRetransSegs);
3397 /*
3398 * Update tcp_rexmit_max to extend this SACK recovery phase.
3399 * This happens when new data sent during fast recovery is
3400 * also lost. If TCP retransmits that new data, it needs
3401 * to extend the SACK recovery phase to avoid starting another
3402 * fast retransmit/recovery unnecessarily.
3403 */
3404 if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) {
3405 tcp->tcp_rexmit_max = tcp->tcp_sack_snxt;
3406 }
3407 }
3408 }
3409
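/*
 * Illustrative sketch, not part of this file: SEQ_GT() and SEQ_LT(),
 * used throughout the retransmission code here, compare 32-bit sequence
 * numbers modulo 2^32 so the comparison stays correct across sequence
 * wraparound.  The conventional definition is the signed-difference
 * test below (shown with EXAMPLE_ names so as not to restate the real
 * macros verbatim).
 */
#define	EXAMPLE_SEQ_GT(a, b)	((int32_t)((a) - (b)) > 0)
#define	EXAMPLE_SEQ_LT(a, b)	((int32_t)((a) - (b)) < 0)
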
3410 /*
3411 * tcp_ss_rexmit() is called to do slow start retransmission after a timeout
3412 * or ICMP errors.
3444
3445 if (win < cnt) {
3446 cnt = win;
3447 }
3448 if (SEQ_GT(snxt + cnt, smax)) {
3449 cnt = smax - snxt;
3450 }
3451 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off,
3452 &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE);
3453 if (xmit_mp == NULL)
3454 return;
3455
3456 tcp_send_data(tcp, xmit_mp);
3457
3458 snxt += cnt;
3459 win -= cnt;
3460 /*
3461 * Update the send timestamp to avoid false
3462 * retransmission.
3463 */
3464 old_snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
3465 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3466 TCPS_UPDATE_MIB(tcps, tcpRetransBytes, cnt);
3467
3468 tcp->tcp_rexmit_nxt = snxt;
3469 }
3470 /*
3471 * If we have transmitted all we have at the time
3472 * we started the retransmission, we can leave
3473 * the rest of the job to tcp_wput_data(). But we
3474 * need to check the send window first. If the
3475 * win is not 0, go on with tcp_wput_data().
3476 */
3477 if (SEQ_LT(snxt, smax) || win == 0) {
3478 return;
3479 }
3480 }
3481 /* Only call tcp_wput_data() if there is data to be sent. */
3482 if (tcp->tcp_unsent) {
3483 tcp_wput_data(tcp, NULL, B_FALSE);
3484 }
3604 /*
3605 * If the SACK option is set, delete the entire list of
3606 * notsack'ed blocks.
3607 */
3608 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp);
3609
3610 if (tcp->tcp_suna == tcp->tcp_snxt && tcp->tcp_swnd == 0)
3611 /*
3612 * Make sure the timer is running so that we will probe a zero
3613 * window.
3614 */
3615 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
3616 }
3617
3618 /*
3619 * tcp_fill_header is called by tcp_send() to fill the outgoing TCP header
3620 * with the template header, as well as other options such as time-stamp,
3621 * ECN and/or SACK.
3622 */
3623 static void
3624 tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk)
3625 {
3626 tcpha_t *tcp_tmpl, *tcpha;
3627 uint32_t *dst, *src;
3628 int hdrlen;
3629 conn_t *connp = tcp->tcp_connp;
3630
3631 ASSERT(OK_32PTR(rptr));
3632
3633 /* Template header */
3634 tcp_tmpl = tcp->tcp_tcpha;
3635
3636 /* Header of outgoing packet */
3637 tcpha = (tcpha_t *)(rptr + connp->conn_ixa->ixa_ip_hdr_length);
3638
3639 /* dst and src are opaque 32-bit fields, used for copying */
3640 dst = (uint32_t *)rptr;
3641 src = (uint32_t *)connp->conn_ht_iphc;
3642 hdrlen = connp->conn_ht_iphc_len;
3643
3644 /* Fill time-stamp option if needed */
3645 if (tcp->tcp_snd_ts_ok) {
3646 U32_TO_BE32((uint32_t)now,
3647 (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4);
3648 U32_TO_BE32(tcp->tcp_ts_recent,
3649 (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8);
3650 } else {
3651 ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
3652 }
3653
3654 /*
3655 * Copy the template header; is this really more efficient than
3656 * calling bcopy()? For simple IPv4/TCP, it may be the case,
3657 * but perhaps not for other scenarios.
3658 */
3659 dst[0] = src[0];
3660 dst[1] = src[1];
3661 dst[2] = src[2];
3662 dst[3] = src[3];
3663 dst[4] = src[4];
3664 dst[5] = src[5];
3665 dst[6] = src[6];
3666 dst[7] = src[7];
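
/*
 * Illustrative sketch, not part of this file: the SACK option that
 * tcp_fill_header() appends when num_sack_blk is non-zero.  Its
 * on-the-wire layout is standard (RFC 2018): padding NOPs for
 * alignment, kind 5, a length of 2 + 8 * nblocks, then begin/end
 * sequence-number pairs describing data this endpoint has already
 * received out of order, so the peer need not retransmit it.  The
 * struct is descriptive only.
 */
typedef struct tcp_sack_option {
	uint8_t		sk_nop1;	/* 0x01, padding */
	uint8_t		sk_nop2;	/* 0x01, padding */
	uint8_t		sk_kind;	/* 5: SACK option */
	uint8_t		sk_len;		/* 2 + 8 * number of blocks */
	struct {
		uint32_t	sk_begin;	/* first sequence number covered */
		uint32_t	sk_end;		/* sequence number after the block */
	} sk_blk[4];		/* at most 3 blocks fit alongside timestamps */
} tcp_sack_option_t;
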
|
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
25 * Copyright 2019 Joyent, Inc.
26 */
27
28 /* This file contains all TCP output processing functions. */
29
30 #include <sys/types.h>
31 #include <sys/stream.h>
32 #include <sys/strsun.h>
33 #include <sys/strsubr.h>
34 #include <sys/stropts.h>
35 #include <sys/strlog.h>
36 #define _SUN_TPI_VERSION 2
37 #include <sys/tihdr.h>
38 #include <sys/suntpi.h>
39 #include <sys/xti_inet.h>
40 #include <sys/timod.h>
41 #include <sys/pattr.h>
42 #include <sys/squeue_impl.h>
43 #include <sys/squeue.h>
44 #include <sys/sockio.h>
45 #include <sys/tsol/tnet.h>
46
47 #include <inet/common.h>
48 #include <inet/ip.h>
49 #include <inet/tcp.h>
50 #include <inet/tcp_impl.h>
51 #include <inet/snmpcom.h>
52 #include <inet/proto_set.h>
53 #include <inet/ipsec_impl.h>
54 #include <inet/ip_ndp.h>
55
56 static mblk_t *tcp_get_seg_mp(tcp_t *, uint32_t, int32_t *);
57 static void tcp_wput_cmdblk(queue_t *, mblk_t *);
58 static void tcp_wput_flush(tcp_t *, mblk_t *);
59 static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
60 static int tcp_xmit_end(tcp_t *);
61 static int tcp_send(tcp_t *, const int, const int, const int,
62 const int, int *, uint32_t *, int *, mblk_t **, mblk_t *);
63 static void tcp_xmit_early_reset(char *, mblk_t *, uint32_t, uint32_t,
64 int, ip_recv_attr_t *, ip_stack_t *, conn_t *);
65 static boolean_t tcp_send_rst_chk(tcp_stack_t *);
66 static void tcp_process_shrunk_swnd(tcp_t *, uint32_t);
67 static void tcp_fill_header(tcp_t *, uchar_t *, int);
68
69 /*
70 * Functions called directly via squeue having a prototype of edesc_t.
71 */
72 static void tcp_wput_nondata(void *, mblk_t *, void *, ip_recv_attr_t *);
73 static void tcp_wput_ioctl(void *, mblk_t *, void *, ip_recv_attr_t *);
74 static void tcp_wput_proto(void *, mblk_t *, void *, ip_recv_attr_t *);
75
76 /*
77 * This controls how tiny a write must be before we try to copy it
78 * into the mblk on the tail of the transmit queue. Not much
79 * speedup is observed for values larger than sixteen. Zero will
80 * disable the optimisation.
81 */
82 static int tcp_tx_pull_len = 16;
83
84 int
85 tcp_wput(queue_t *q, mblk_t *mp)
86 {
87 conn_t *connp = Q_TO_CONN(q);
438 return;
439 }
440
441 /* usable = MIN(swnd, cwnd) - unacked_bytes */
442 if (tcp->tcp_swnd > tcp->tcp_cwnd)
443 usable_r -= tcp->tcp_swnd - tcp->tcp_cwnd;
444
445 /* usable = MIN(usable, unsent) */
446 if (usable_r > len)
447 usable_r = len;
448
449 /* usable = MAX(usable, {1 for urgent, 0 for data}) */
450 if (usable_r > 0) {
451 usable = usable_r;
452 } else {
453 /* Bypass all other unnecessary processing. */
454 goto done;
455 }
456 }
457
458 local_time = (mblk_t *)(intptr_t)gethrtime();
459
460 /*
461 * "Our" Nagle Algorithm. This is not the same as in the old
462 * BSD. This is more in line with the true intent of Nagle.
463 *
464 * The conditions are:
465 * 1. The amount of unsent data (or amount of data which can be
466 * sent, whichever is smaller) is less than Nagle limit.
467 * 2. The last sent size is also less than Nagle limit.
468 * 3. There is unack'ed data.
469 * 4. Urgent pointer is not set. Send urgent data ignoring the
470 * Nagle algorithm. This reduces the probability that urgent
471 * bytes get "merged" together.
472 * 5. The app has not closed the connection. This eliminates the
473 * wait time of the receiving side waiting for the last piece of
474 * (small) data.
475 *
476 * If all are satisfied, exit without sending anything. Note
477 * that Nagle limit can be smaller than 1 MSS. Nagle limit is
478 * the smaller of 1 MSS and global tcp_naglim_def (default to be
1167 tcp->tcp_xmit_head = mp;
1168 } else {
1169 tcp->tcp_xmit_last->b_cont = mp;
1170 }
1171 tcp->tcp_xmit_last = mp;
1172 tcp->tcp_xmit_tail = mp;
1173
1174 /* find out how much we can send */
1175 /* BEGIN CSTYLED */
1176 /*
1177 * un-acked usable
1178 * |--------------|-----------------|
1179 * tcp_suna tcp_snxt tcp_suna+tcp_swnd
1180 */
1181 /* END CSTYLED */
1182
1183 /* start sending from tcp_snxt */
1184 snxt = tcp->tcp_snxt;
1185
1186 /*
1187 * Check to see if this connection has been idle for some time and no
1188 * ACK is expected. If so, then the congestion window size is no longer
1189 * meaningfully tied to current network conditions.
1190 *
1191 * We reinitialize tcp_cwnd, and slow start again to get back the
1192 * connection's "self-clock" as described in Van Jacobson's 1988 paper
1193 * "Congestion avoidance and control".
1194 */
1195 now = LBOLT_FASTPATH;
1196 if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
1197 (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
1198 TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
1199 }
1200
1201 usable = tcp->tcp_swnd; /* tcp window size */
1202 if (usable > tcp->tcp_cwnd)
1203 usable = tcp->tcp_cwnd; /* congestion window smaller */
1204 usable -= snxt; /* subtract stuff already sent */
1205 suna = tcp->tcp_suna;
1206 usable += suna;
1207 /* usable can be < 0 if the congestion window is smaller */
1208 if (len > usable) {
1209 /* Can't send complete M_DATA in one shot */
1210 goto slow;
1211 }
1212
1213 mutex_enter(&tcp->tcp_non_sq_lock);
1241 return;
1242 }
1243
1244 /*
1245 * len <= tcp->tcp_mss && len == unsent so no sender silly window. Can
1246 * send now.
1247 */
1248
1249 if (snxt == suna) {
1250 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
1251 }
1252
1253 /* we have always sent something */
1254 tcp->tcp_rack_cnt = 0;
1255
1256 tcp->tcp_snxt = snxt + len;
1257 tcp->tcp_rack = tcp->tcp_rnxt;
1258
1259 if ((mp1 = dupb(mp)) == 0)
1260 goto no_memory;
1261 mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
1262 mp->b_next = (mblk_t *)(uintptr_t)snxt;
1263
1264 /* adjust tcp header information */
1265 tcpha = tcp->tcp_tcpha;
1266 tcpha->tha_flags = (TH_ACK|TH_PUSH);
1267
1268 sum = len + connp->conn_ht_ulp_len + connp->conn_sum;
1269 sum = (sum >> 16) + (sum & 0xFFFF);
1270 tcpha->tha_sum = htons(sum);
1271
1272 tcpha->tha_seq = htonl(snxt);
1273
1274 TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
1275 TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
1276 BUMP_LOCAL(tcp->tcp_obsegs);
1277
1278 /* Update the latest receive window size in TCP header. */
1279 tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
1280
1281 tcp->tcp_last_sent_len = (ushort_t)len;
1296 rptr = mp1->b_rptr - hdrlen;
1297 db = mp1->b_datap;
1298 if ((db->db_ref != 2) || rptr < db->db_base ||
1299 (!OK_32PTR(rptr))) {
1300 /* NOTE: we assume allocb returns an OK_32PTR */
1301 mp = allocb(hdrlen + tcps->tcps_wroff_xtra, BPRI_MED);
1302 if (!mp) {
1303 freemsg(mp1);
1304 goto no_memory;
1305 }
1306 mp->b_cont = mp1;
1307 mp1 = mp;
1308 /* Leave room for Link Level header */
1309 rptr = &mp1->b_rptr[tcps->tcps_wroff_xtra];
1310 mp1->b_wptr = &rptr[hdrlen];
1311 }
1312 mp1->b_rptr = rptr;
1313
1314 /* Fill in the timestamp option. */
1315 if (tcp->tcp_snd_ts_ok) {
1316 U32_TO_BE32(now,
1317 (char *)tcpha + TCP_MIN_HEADER_LENGTH + 4);
1318 U32_TO_BE32(tcp->tcp_ts_recent,
1319 (char *)tcpha + TCP_MIN_HEADER_LENGTH + 8);
1320 } else {
1321 ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
1322 }
1323
1324 /* copy header into outgoing packet */
1325 dst = (ipaddr_t *)rptr;
1326 src = (ipaddr_t *)connp->conn_ht_iphc;
1327 dst[0] = src[0];
1328 dst[1] = src[1];
1329 dst[2] = src[2];
1330 dst[3] = src[3];
1331 dst[4] = src[4];
1332 dst[5] = src[5];
1333 dst[6] = src[6];
1334 dst[7] = src[7];
1335 dst[8] = src[8];
1336 dst[9] = src[9];
1337 if (hdrlen -= 40) {
1338 hdrlen >>= 2;
1339 dst += 10;
1754 /*
1755 * In the off-chance that the eager received and responded to
1756 * some other packet while the SYN|ACK was queued, we recalculate
1757 * the ixa_pktlen. It would be better to fix the SYN/accept
1758 * multithreading scheme to avoid this complexity.
1759 */
1760 ixa->ixa_pktlen = msgdsize(mp);
1761 (void) conn_ip_output(mp, ixa);
1762 }
1763
1764 /*
1765 * tcp_send() is called by tcp_wput_data() and returns one of the following:
1766 *
1767 * -1 = failed allocation.
1768 * 0 = We've either successfully sent data, or our usable send window is too
1769 * small and we'd rather wait until later before sending again.
1770 */
1771 static int
1772 tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len,
1773 const int tcp_hdr_len, const int num_sack_blk, int *usable,
1774 uint32_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time)
1775 {
1776 int num_lso_seg = 1;
1777 uint_t lso_usable;
1778 boolean_t do_lso_send = B_FALSE;
1779 tcp_stack_t *tcps = tcp->tcp_tcps;
1780 conn_t *connp = tcp->tcp_connp;
1781 ip_xmit_attr_t *ixa = connp->conn_ixa;
1782
1783 /*
1784 * Check LSO possibility. The value of tcp->tcp_lso indicates whether
1785 * the underlying connection is LSO capable. Whether there is enough
1786 * available data to initiate an LSO transmission is checked in the
1787 * for(){} loops below.
1788 */
1789 if (tcp->tcp_lso && (tcp->tcp_valid_bits & ~TCP_FSS_VALID) == 0)
1790 do_lso_send = B_TRUE;
1791
1792 for (;;) {
1793 struct datab *db;
1794 tcpha_t *tcpha;
2049
2050 must_alloc:;
2051 mp1 = allocb(connp->conn_ht_iphc_allocated +
2052 tcps->tcps_wroff_xtra, BPRI_MED);
2053 if (mp1 == NULL) {
2054 freemsg(mp);
2055 return (-1); /* out_of_mem */
2056 }
2057 mp1->b_cont = mp;
2058 mp = mp1;
2059 /* Leave room for Link Level header */
2060 len = total_hdr_len;
2061 rptr = &mp->b_rptr[tcps->tcps_wroff_xtra];
2062 mp->b_wptr = &rptr[len];
2063 }
2064
2065 /*
2066 * Fill in the header using the template header, and add
2067 * options such as time-stamp, ECN and/or SACK, as needed.
2068 */
2069 tcp_fill_header(tcp, rptr, num_sack_blk);
2070
2071 mp->b_rptr = rptr;
2072
2073 if (*tail_unsent) {
2074 int spill = *tail_unsent;
2075
2076 mp1 = mp->b_cont;
2077 if (mp1 == NULL)
2078 mp1 = mp;
2079
2080 /*
2081 * If we're a little short, tack on more mblks until
2082 * there is no more spillover.
2083 */
2084 while (spill < 0) {
2085 mblk_t *nmp;
2086 int nmpsz;
2087
2088 nmp = (*xmit_tail)->b_cont;
2089 nmpsz = MBLKL(nmp);
2267 * so we have to check that and unset it first.
2268 */
2269 if (tcp->tcp_cork)
2270 tcp->tcp_cork = B_FALSE;
2271 tcp_wput_data(tcp, NULL, B_FALSE);
2272 }
2273
2274 /*
2275 * If TCP does not get enough samples of RTT or tcp_rtt_updates
2276 * is 0, don't update the cache.
2277 */
2278 if (tcps->tcps_rtt_updates == 0 ||
2279 tcp->tcp_rtt_update < tcps->tcps_rtt_updates)
2280 return (0);
2281
2282 /*
2283 * We do not have a good algorithm to update ssthresh at this time.
2284 * So don't do any update.
2285 */
2286 bzero(&uinfo, sizeof (uinfo));
2287 uinfo.iulp_rtt = NSEC2MSEC(tcp->tcp_rtt_sa);
2288 uinfo.iulp_rtt_sd = NSEC2MSEC(tcp->tcp_rtt_sd);
2289
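/*
 * Illustrative note, not part of this file: in this version of the code
 * the send timestamps come from gethrtime(), and the smoothed RTT
 * (tcp_rtt_sa) and its deviation (tcp_rtt_sd) are evidently kept in
 * nanoseconds, while the destination-cache metrics expect milliseconds,
 * hence the NSEC2MSEC() conversions above.  The conversion itself is
 * plain integer division, along the lines of this hypothetical helper.
 */
static uint64_t
example_nsec_to_msec(uint64_t nsec)
{
	return (nsec / 1000000ULL);	/* 1 millisecond is 1,000,000 nanoseconds */
}
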
2290 /*
2291 * Note that uinfo is kept for conn_faddr in the DCE. We could update it
2292 * even if the connection is source routed, but we don't.
2293 */
2294 if (connp->conn_ipversion == IPV4_VERSION) {
2295 if (connp->conn_faddr_v4 != tcp->tcp_ipha->ipha_dst) {
2296 return (0);
2297 }
2298 (void) dce_update_uinfo_v4(connp->conn_faddr_v4, &uinfo, ipst);
2299 } else {
2300 uint_t ifindex;
2301
2302 if (!(IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
2303 &tcp->tcp_ip6h->ip6_dst))) {
2304 return (0);
2305 }
2306 ifindex = 0;
2307 if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_faddr_v6)) {
2308 ip_xmit_attr_t *ixa = connp->conn_ixa;
3372 ASSERT(snxt_mp != NULL);
3373 /* This should not happen. Defensive coding again... */
3374 if (snxt_mp == NULL) {
3375 return;
3376 }
3377
3378 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, seg_len, &off,
3379 &tmp_mp, begin, B_TRUE, &seg_len, B_TRUE);
3380 if (xmit_mp == NULL)
3381 return;
3382
3383 usable_swnd -= seg_len;
3384 tcp->tcp_pipe += seg_len;
3385 tcp->tcp_sack_snxt = begin + seg_len;
3386
3387 tcp_send_data(tcp, xmit_mp);
3388
3389 /*
3390 * Update the send timestamp to avoid false retransmission.
3391 */
3392 snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
3393
3394 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3395 TCPS_UPDATE_MIB(tcps, tcpRetransBytes, seg_len);
3396 TCPS_BUMP_MIB(tcps, tcpOutSackRetransSegs);
3397 /*
3398 * Update tcp_rexmit_max to extend this SACK recovery phase.
3399 * This happens when new data sent during fast recovery is
3400 * also lost. If TCP retransmits that new data, it needs
3401 * to extend the SACK recovery phase to avoid starting another
3402 * fast retransmit/recovery unnecessarily.
3403 */
3404 if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) {
3405 tcp->tcp_rexmit_max = tcp->tcp_sack_snxt;
3406 }
3407 }
3408 }
3409
3410 /*
3411 * tcp_ss_rexmit() is called to do slow start retransmission after a timeout
3412 * or ICMP errors.
3444
3445 if (win < cnt) {
3446 cnt = win;
3447 }
3448 if (SEQ_GT(snxt + cnt, smax)) {
3449 cnt = smax - snxt;
3450 }
3451 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off,
3452 &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE);
3453 if (xmit_mp == NULL)
3454 return;
3455
3456 tcp_send_data(tcp, xmit_mp);
3457
3458 snxt += cnt;
3459 win -= cnt;
3460 /*
3461 * Update the send timestamp to avoid false
3462 * retransmission.
3463 */
3464 old_snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
3465 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3466 TCPS_UPDATE_MIB(tcps, tcpRetransBytes, cnt);
3467
3468 tcp->tcp_rexmit_nxt = snxt;
3469 }
3470 /*
3471 * If we have transmitted all we have at the time
3472 * we started the retransmission, we can leave
3473 * the rest of the job to tcp_wput_data(). But we
3474 * need to check the send window first. If the
3475 * win is not 0, go on with tcp_wput_data().
3476 */
3477 if (SEQ_LT(snxt, smax) || win == 0) {
3478 return;
3479 }
3480 }
3481 /* Only call tcp_wput_data() if there is data to be sent. */
3482 if (tcp->tcp_unsent) {
3483 tcp_wput_data(tcp, NULL, B_FALSE);
3484 }
3604 /*
3605 * If the SACK option is set, delete the entire list of
3606 * notsack'ed blocks.
3607 */
3608 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp);
3609
3610 if (tcp->tcp_suna == tcp->tcp_snxt && tcp->tcp_swnd == 0)
3611 /*
3612 * Make sure the timer is running so that we will probe a zero
3613 * window.
3614 */
3615 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
3616 }
3617
3618 /*
3619 * tcp_fill_header is called by tcp_send() to fill the outgoing TCP header
3620 * with the template header, as well as other options such as time-stamp,
3621 * ECN and/or SACK.
3622 */
3623 static void
3624 tcp_fill_header(tcp_t *tcp, uchar_t *rptr, int num_sack_blk)
3625 {
3626 tcpha_t *tcp_tmpl, *tcpha;
3627 uint32_t *dst, *src;
3628 int hdrlen;
3629 conn_t *connp = tcp->tcp_connp;
3630
3631 ASSERT(OK_32PTR(rptr));
3632
3633 /* Template header */
3634 tcp_tmpl = tcp->tcp_tcpha;
3635
3636 /* Header of outgoing packet */
3637 tcpha = (tcpha_t *)(rptr + connp->conn_ixa->ixa_ip_hdr_length);
3638
3639 /* dst and src are opaque 32-bit fields, used for copying */
3640 dst = (uint32_t *)rptr;
3641 src = (uint32_t *)connp->conn_ht_iphc;
3642 hdrlen = connp->conn_ht_iphc_len;
3643
3644 /* Fill time-stamp option if needed */
3645 if (tcp->tcp_snd_ts_ok) {
3646 U32_TO_BE32(LBOLT_FASTPATH,
3647 (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4);
3648 U32_TO_BE32(tcp->tcp_ts_recent,
3649 (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8);
3650 } else {
3651 ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
3652 }
3653
3654 /*
3655 * Copy the template header; is this really more efficient than
3656 * calling bcopy()? For simple IPv4/TCP, it may be the case,
3657 * but perhaps not for other scenarios.
3658 */
3659 dst[0] = src[0];
3660 dst[1] = src[1];
3661 dst[2] = src[2];
3662 dst[3] = src[3];
3663 dst[4] = src[4];
3664 dst[5] = src[5];
3665 dst[6] = src[6];
3666 dst[7] = src[7];
|