1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
24 * Copyright 2019 Joyent, Inc.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 */
27
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #define _SUN_TPI_VERSION 2
31 #include <sys/tihdr.h>
32 #include <sys/socket.h>
33 #include <sys/xti_xtiopt.h>
34 #include <sys/xti_inet.h>
35 #include <sys/policy.h>
36
37 #include <inet/cc.h>
38 #include <inet/common.h>
39 #include <netinet/ip6.h>
40 #include <inet/ip.h>
41
42 #include <netinet/in.h>
43 #include <netinet/tcp.h>
44 #include <inet/optcom.h>
45 #include <inet/proto_set.h>
46 #include <inet/tcp_impl.h>
47
/*
 * Forward declaration: tcp_opt_default() is defined later in this file but
 * is referenced by the tcp_opt_obj initializer that precedes its definition.
 */
static int tcp_opt_default(queue_t *, int, int, uchar_t *);
49
/*
 * Table of all known options handled on a TCP protocol stack.
 *
 * Note: This table contains options processed by both TCP and IP levels
 * and is the superset of options that can be performed on a TCP over IP
 * stack.
 *
 * Each entry lists, in order: the option name, its level, two access
 * masks (OA_R / OA_RW), a privilege selector (OP_NP, OP_CONFIG,
 * OP_PRIVPORT, OP_RAW), property flags (OP_DEF_FN, OP_VARLEN,
 * OP_NODEFAULT), the option size in bytes, and a default value.
 * NOTE(review): the exact meaning of each column is defined by opdes_t
 * in <inet/optcom.h>, which is not visible here -- confirm against it.
 *
 * The entries flagged OP_DEF_FN are exactly the options for which
 * tcp_opt_default() in this file supplies a value.
 */
opdes_t tcp_opt_arr[] = {

/* SOL_SOCKET level options */
{ SO_LINGER, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
	sizeof (struct linger), 0 },

{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_KEEPALIVE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},
{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
	sizeof (struct timeval), 0 },
{ SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
	sizeof (struct timeval), 0 },
{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},
{ SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },
{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },
{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },
{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
	0 },
{ SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },

{ SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },

/* IPPROTO_TCP level options */
{ TCP_NODELAY, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},
{ TCP_MAXSEG, IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t),
	536 },

{ TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },

{ TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0,
	sizeof (int), 0 },

{ TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},

{ TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0,
	sizeof (int), 0 },

{ TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },

{ TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },

{ TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },

{ TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },

{ TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },

{ TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

/* Variable length: the size field is the maximum name length */
{ TCP_CONGESTION, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_VARLEN, CC_ALGO_NAME_MAX, 0 },

/* IPPROTO_IP and IPPROTO_IPV6 level options */
{ IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT),
	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
{ T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT),
	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },

{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
	sizeof (int), -1 /* not initialized */ },

{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
	sizeof (ipsec_req_t), -1 /* not initialized */ },

{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 /* no ifindex */ },

{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
	sizeof (int), 0 },

{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
	sizeof (int), -1 /* not initialized */ },

{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 /* no ifindex */ },

{ IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
	sizeof (in_addr_t), -1 /* not initialized */ },

{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
	sizeof (int), 0 },

{ IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_NODEFAULT|OP_VARLEN),
	sizeof (struct in6_pktinfo), -1 /* not initialized */ },
{ IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	OP_NODEFAULT,
	sizeof (sin6_t), -1 /* not initialized */ },
{ IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	OP_NODEFAULT,
	sizeof (int), -1 /* not initialized */ },
{ IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	OP_NODEFAULT,
	sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },

/* Enable receipt of ancillary data */
{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },

{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
	sizeof (ipsec_req_t), -1 /* not initialized */ },
{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
};
237
/*
 * Table of all supported levels
 * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have
 * any supported options so we need this info separately.
 *
 * This is needed only for topmost tpi providers and is used only by
 * XTI interfaces.
 *
 * Of the levels listed here, XTI_GENERIC is the only one that has no
 * entry in tcp_opt_arr[].
 */
optlevel_t tcp_valid_levels_arr[] = {
	XTI_GENERIC,
	SOL_SOCKET,
	IPPROTO_TCP,
	IPPROTO_IP,
	IPPROTO_IPV6
};
253
254
/*
 * Entry counts of tcp_opt_arr[] and tcp_valid_levels_arr[]; both are
 * stored in tcp_opt_obj alongside the arrays themselves.
 */
#define TCP_OPT_ARR_CNT A_CNT(tcp_opt_arr)
#define TCP_VALID_LEVELS_CNT A_CNT(tcp_valid_levels_arr)

/*
 * Not assigned anywhere in this file; the value is filled in by code
 * elsewhere in the TCP module.
 */
uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */
259
/*
 * Initialize option database object for TCP
 *
 * This object represents database of options to search passed to
 * {sock,tpi}optcom_req() interface routine to take care of option
 * management and associated methods.
 *
 * It bundles the default-value/get/set entry points with the option
 * table and the valid-level table, each paired with its entry count.
 * The get and set entry points named here (tcp_tpi_opt_get and
 * tcp_tpi_opt_set) are defined outside this file.
 */

optdb_obj_t tcp_opt_obj = {
	tcp_opt_default, /* TCP default value function pointer */
	tcp_tpi_opt_get, /* TCP get function pointer */
	tcp_tpi_opt_set, /* TCP set function pointer */
	TCP_OPT_ARR_CNT, /* TCP option database count of entries */
	tcp_opt_arr, /* TCP option database */
	TCP_VALID_LEVELS_CNT, /* TCP valid level count of entries */
	tcp_valid_levels_arr /* TCP valid level array */
};
277
/*
 * Largest TCP_INIT_CWND value tcp_opt_set() will accept.  The limit is
 * enforced only for requests that exceed the RFC 3390 window, after the
 * caller has passed the secpolicy_ip_config() check.
 */
static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND;
279
280 /*
281 * Some TCP options can be "set" by requesting them in the option
282 * buffer. This is needed for XTI feature test though we do not
283 * allow it in general. We interpret that this mechanism is more
284 * applicable to OSI protocols and need not be allowed in general.
285 * This routine filters out options for which it is not allowed (most)
286 * and lets through those (few) for which it is. [ The XTI interface
287 * test suite specifics will imply that any XTI_GENERIC level XTI_* if
288 * ever implemented will have to be allowed here ].
289 */
290 static boolean_t
291 tcp_allow_connopt_set(int level, int name)
292 {
293
294 switch (level) {
295 case IPPROTO_TCP:
296 switch (name) {
297 case TCP_NODELAY:
298 return (B_TRUE);
299 default:
300 return (B_FALSE);
301 }
302 /*NOTREACHED*/
303 default:
304 return (B_FALSE);
305 }
306 /*NOTREACHED*/
307 }
308
309 /*
310 * This routine gets default values of certain options whose default
311 * values are maintained by protocol specific code
312 */
313 /* ARGSUSED */
314 static int
315 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
316 {
317 int32_t *i1 = (int32_t *)ptr;
318 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps;
319
320 switch (level) {
321 case IPPROTO_TCP:
322 switch (name) {
323 case TCP_NOTIFY_THRESHOLD:
324 *i1 = tcps->tcps_ip_notify_interval;
325 break;
326 case TCP_ABORT_THRESHOLD:
327 *i1 = tcps->tcps_ip_abort_interval;
328 break;
329 case TCP_CONN_NOTIFY_THRESHOLD:
330 *i1 = tcps->tcps_ip_notify_cinterval;
331 break;
332 case TCP_CONN_ABORT_THRESHOLD:
333 *i1 = tcps->tcps_ip_abort_cinterval;
334 break;
335 default:
336 return (-1);
337 }
338 break;
339 case IPPROTO_IP:
340 switch (name) {
341 case IP_TTL:
342 *i1 = tcps->tcps_ipv4_ttl;
343 break;
344 default:
345 return (-1);
346 }
347 break;
348 case IPPROTO_IPV6:
349 switch (name) {
350 case IPV6_UNICAST_HOPS:
351 *i1 = tcps->tcps_ipv6_hoplimit;
352 break;
353 default:
354 return (-1);
355 }
356 break;
357 default:
358 return (-1);
359 }
360 return (sizeof (int));
361 }
362
363 /*
364 * TCP routine to get the values of options.
365 */
366 int
367 tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
368 {
369 int *i1 = (int *)ptr;
370 tcp_t *tcp = connp->conn_tcp;
371 conn_opt_arg_t coas;
372 int retval;
373
374 coas.coa_connp = connp;
375 coas.coa_ixa = connp->conn_ixa;
376 coas.coa_ipp = &connp->conn_xmit_ipp;
377 coas.coa_ancillary = B_FALSE;
378 coas.coa_changed = 0;
379
380 switch (level) {
381 case SOL_SOCKET:
382 switch (name) {
383 case SO_SND_COPYAVOID:
384 *i1 = tcp->tcp_snd_zcopy_on ?
385 SO_SND_COPYAVOID : 0;
386 return (sizeof (int));
387 case SO_ACCEPTCONN:
388 *i1 = (tcp->tcp_state == TCPS_LISTEN);
389 return (sizeof (int));
390 }
391 break;
392 case IPPROTO_TCP:
393 switch (name) {
394 case TCP_NODELAY:
395 *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
396 return (sizeof (int));
397 case TCP_MAXSEG:
398 *i1 = tcp->tcp_mss;
399 return (sizeof (int));
400 case TCP_NOTIFY_THRESHOLD:
401 *i1 = (int)tcp->tcp_first_timer_threshold;
402 return (sizeof (int));
403 case TCP_ABORT_THRESHOLD:
404 *i1 = tcp->tcp_second_timer_threshold;
405 return (sizeof (int));
406 case TCP_CONN_NOTIFY_THRESHOLD:
407 *i1 = tcp->tcp_first_ctimer_threshold;
408 return (sizeof (int));
409 case TCP_CONN_ABORT_THRESHOLD:
410 *i1 = tcp->tcp_second_ctimer_threshold;
411 return (sizeof (int));
412 case TCP_INIT_CWND:
413 *i1 = tcp->tcp_init_cwnd;
414 return (sizeof (int));
415 case TCP_KEEPALIVE_THRESHOLD:
416 *i1 = tcp->tcp_ka_interval;
417 return (sizeof (int));
418
419 /*
420 * TCP_KEEPIDLE expects value in seconds, but
421 * tcp_ka_interval is in milliseconds.
422 */
423 case TCP_KEEPIDLE:
424 *i1 = tcp->tcp_ka_interval / 1000;
425 return (sizeof (int));
426 case TCP_KEEPCNT:
427 *i1 = tcp->tcp_ka_cnt;
428 return (sizeof (int));
429
430 /*
431 * TCP_KEEPINTVL expects value in seconds, but
432 * tcp_ka_rinterval is in milliseconds.
433 */
434 case TCP_KEEPINTVL:
435 *i1 = tcp->tcp_ka_rinterval / 1000;
436 return (sizeof (int));
437 case TCP_KEEPALIVE_ABORT_THRESHOLD:
438 *i1 = tcp->tcp_ka_abort_thres;
439 return (sizeof (int));
440 case TCP_CONGESTION: {
441 size_t len = strlcpy((char *)ptr, CC_ALGO(tcp)->name,
442 CC_ALGO_NAME_MAX);
443 if (len >= CC_ALGO_NAME_MAX)
444 return (-1);
445 return (len + 1);
446 }
447 case TCP_CORK:
448 *i1 = tcp->tcp_cork;
449 return (sizeof (int));
450 case TCP_RTO_INITIAL:
451 *i1 = tcp->tcp_rto_initial;
452 return (sizeof (uint32_t));
453 case TCP_RTO_MIN:
454 *i1 = tcp->tcp_rto_min;
455 return (sizeof (uint32_t));
456 case TCP_RTO_MAX:
457 *i1 = tcp->tcp_rto_max;
458 return (sizeof (uint32_t));
459 case TCP_LINGER2:
460 *i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS;
461 return (sizeof (int));
462 }
463 break;
464 case IPPROTO_IP:
465 if (connp->conn_family != AF_INET)
466 return (-1);
467 switch (name) {
468 case IP_OPTIONS:
469 case T_IP_OPTIONS:
470 /* Caller ensures enough space */
471 return (ip_opt_get_user(connp, ptr));
472 default:
473 break;
474 }
475 break;
476
477 case IPPROTO_IPV6:
478 /*
479 * IPPROTO_IPV6 options are only supported for sockets
480 * that are using IPv6 on the wire.
481 */
482 if (connp->conn_ipversion != IPV6_VERSION) {
483 return (-1);
484 }
485 switch (name) {
486 case IPV6_PATHMTU:
487 if (tcp->tcp_state < TCPS_ESTABLISHED)
488 return (-1);
489 break;
490 }
491 break;
492 }
493 mutex_enter(&connp->conn_lock);
494 retval = conn_opt_get(&coas, level, name, ptr);
495 mutex_exit(&connp->conn_lock);
496 return (retval);
497 }
498
499 /*
500 * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
501 * Parameters are assumed to be verified by the caller.
502 */
503 /* ARGSUSED */
504 int
505 tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
506 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
507 void *thisdg_attrs, cred_t *cr)
508 {
509 tcp_t *tcp = connp->conn_tcp;
510 int *i1 = (int *)invalp;
511 boolean_t onoff = (*i1 == 0) ? 0 : 1;
512 boolean_t checkonly;
513 int reterr;
514 tcp_stack_t *tcps = tcp->tcp_tcps;
515 conn_opt_arg_t coas;
516 uint32_t val = *((uint32_t *)invalp);
517
518 coas.coa_connp = connp;
519 coas.coa_ixa = connp->conn_ixa;
520 coas.coa_ipp = &connp->conn_xmit_ipp;
521 coas.coa_ancillary = B_FALSE;
522 coas.coa_changed = 0;
523
524 switch (optset_context) {
525 case SETFN_OPTCOM_CHECKONLY:
526 checkonly = B_TRUE;
527 /*
528 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
529 * inlen != 0 implies value supplied and
530 * we have to "pretend" to set it.
531 * inlen == 0 implies that there is no
532 * value part in T_CHECK request and just validation
533 * done elsewhere should be enough, we just return here.
534 */
535 if (inlen == 0) {
536 *outlenp = 0;
537 return (0);
538 }
539 break;
540 case SETFN_OPTCOM_NEGOTIATE:
541 checkonly = B_FALSE;
542 break;
543 case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
544 case SETFN_CONN_NEGOTIATE:
545 checkonly = B_FALSE;
546 /*
547 * Negotiating local and "association-related" options
548 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
549 * primitives is allowed by XTI, but we choose
550 * to not implement this style negotiation for Internet
551 * protocols (We interpret it is a must for OSI world but
552 * optional for Internet protocols) for all options.
553 * [ Will do only for the few options that enable test
554 * suites that our XTI implementation of this feature
555 * works for transports that do allow it ]
556 */
557 if (!tcp_allow_connopt_set(level, name)) {
558 *outlenp = 0;
559 return (EINVAL);
560 }
561 break;
562 default:
563 /*
564 * We should never get here
565 */
566 *outlenp = 0;
567 return (EINVAL);
568 }
569
570 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
571 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
572
573 /*
574 * For TCP, we should have no ancillary data sent down
575 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
576 * has to be zero.
577 */
578 ASSERT(thisdg_attrs == NULL);
579
580 /*
581 * For fixed length options, no sanity check
582 * of passed in length is done. It is assumed *_optcom_req()
583 * routines do the right thing.
584 */
585 switch (level) {
586 case SOL_SOCKET:
587 switch (name) {
588 case SO_KEEPALIVE:
589 if (checkonly) {
590 /* check only case */
591 break;
592 }
593
594 if (!onoff) {
595 if (connp->conn_keepalive) {
596 if (tcp->tcp_ka_tid != 0) {
597 (void) TCP_TIMER_CANCEL(tcp,
598 tcp->tcp_ka_tid);
599 tcp->tcp_ka_tid = 0;
600 }
601 connp->conn_keepalive = 0;
602 }
603 break;
604 }
605 if (!connp->conn_keepalive) {
606 /* Crank up the keepalive timer */
607 tcp->tcp_ka_last_intrvl = 0;
608 tcp->tcp_ka_tid = TCP_TIMER(tcp,
609 tcp_keepalive_timer, tcp->tcp_ka_interval);
610 connp->conn_keepalive = 1;
611 }
612 break;
613 case SO_SNDBUF: {
614 if (*i1 > tcps->tcps_max_buf) {
615 *outlenp = 0;
616 return (ENOBUFS);
617 }
618 if (checkonly)
619 break;
620
621 connp->conn_sndbuf = *i1;
622 if (tcps->tcps_snd_lowat_fraction != 0) {
623 connp->conn_sndlowat = connp->conn_sndbuf /
624 tcps->tcps_snd_lowat_fraction;
625 }
626 (void) tcp_maxpsz_set(tcp, B_TRUE);
627 /*
628 * If we are flow-controlled, recheck the condition.
629 * There are apps that increase SO_SNDBUF size when
630 * flow-controlled (EWOULDBLOCK), and expect the flow
631 * control condition to be lifted right away.
632 */
633 mutex_enter(&tcp->tcp_non_sq_lock);
634 if (tcp->tcp_flow_stopped &&
635 TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) {
636 tcp_clrqfull(tcp);
637 }
638 mutex_exit(&tcp->tcp_non_sq_lock);
639 *outlenp = inlen;
640 return (0);
641 }
642 case SO_RCVBUF:
643 if (*i1 > tcps->tcps_max_buf) {
644 *outlenp = 0;
645 return (ENOBUFS);
646 }
647 /* Silently ignore zero */
648 if (!checkonly && *i1 != 0) {
649 *i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
650 (void) tcp_rwnd_set(tcp, *i1);
651 }
652 /*
653 * XXX should we return the rwnd here
654 * and tcp_opt_get ?
655 */
656 *outlenp = inlen;
657 return (0);
658 case SO_SND_COPYAVOID:
659 if (!checkonly) {
660 if (tcp->tcp_loopback ||
661 (onoff != 1) || !tcp_zcopy_check(tcp)) {
662 *outlenp = 0;
663 return (EOPNOTSUPP);
664 }
665 tcp->tcp_snd_zcopy_aware = 1;
666 }
667 *outlenp = inlen;
668 return (0);
669 }
670 break;
671 case IPPROTO_TCP:
672 switch (name) {
673 case TCP_NODELAY:
674 if (!checkonly)
675 tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
676 break;
677 case TCP_NOTIFY_THRESHOLD:
678 if (!checkonly)
679 tcp->tcp_first_timer_threshold = *i1;
680 break;
681 case TCP_ABORT_THRESHOLD:
682 if (!checkonly)
683 tcp->tcp_second_timer_threshold = *i1;
684 break;
685 case TCP_CONN_NOTIFY_THRESHOLD:
686 if (!checkonly)
687 tcp->tcp_first_ctimer_threshold = *i1;
688 break;
689 case TCP_CONN_ABORT_THRESHOLD:
690 if (!checkonly)
691 tcp->tcp_second_ctimer_threshold = *i1;
692 break;
693 case TCP_RECVDSTADDR:
694 if (tcp->tcp_state > TCPS_LISTEN) {
695 *outlenp = 0;
696 return (EOPNOTSUPP);
697 }
698 /* Setting done in conn_opt_set */
699 break;
700 case TCP_INIT_CWND:
701 if (checkonly)
702 break;
703
704 /*
705 * Only allow socket with network configuration
706 * privilege to set the initial cwnd to be larger
707 * than allowed by RFC 3390.
708 */
709 if (val > MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
710 if ((reterr = secpolicy_ip_config(cr, B_TRUE))
711 != 0) {
712 *outlenp = 0;
713 return (reterr);
714 }
715 if (val > tcp_max_init_cwnd) {
716 *outlenp = 0;
717 return (EINVAL);
718 }
719 }
720
721 tcp->tcp_init_cwnd = val;
722
723 /*
724 * If the socket is connected, AND no outbound data
725 * has been sent, reset the actual cwnd values.
726 */
727 if (tcp->tcp_state == TCPS_ESTABLISHED &&
728 tcp->tcp_iss == tcp->tcp_snxt - 1) {
729 tcp->tcp_cwnd =
730 MIN(tcp->tcp_rwnd, val * tcp->tcp_mss);
731 }
732 break;
733
734 /*
735 * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD
736 * is in milliseconds. TCP_KEEPIDLE is introduced for
737 * compatibility with other Unix flavors.
738 * We can fall through TCP_KEEPALIVE_THRESHOLD logic after
739 * converting the input to milliseconds.
740 */
741 case TCP_KEEPIDLE:
742 *i1 *= 1000;
743 /* FALLTHRU */
744
745 case TCP_KEEPALIVE_THRESHOLD:
746 if (checkonly)
747 break;
748
749 if (*i1 < tcps->tcps_keepalive_interval_low ||
750 *i1 > tcps->tcps_keepalive_interval_high) {
751 *outlenp = 0;
752 return (EINVAL);
753 }
754 if (*i1 != tcp->tcp_ka_interval) {
755 tcp->tcp_ka_interval = *i1;
756 /*
757 * Check if we need to restart the
758 * keepalive timer.
759 */
760 if (tcp->tcp_ka_tid != 0) {
761 ASSERT(connp->conn_keepalive);
762 (void) TCP_TIMER_CANCEL(tcp,
763 tcp->tcp_ka_tid);
764 tcp->tcp_ka_last_intrvl = 0;
765 tcp->tcp_ka_tid = TCP_TIMER(tcp,
766 tcp_keepalive_timer,
767 tcp->tcp_ka_interval);
768 }
769 }
770 break;
771
772 /*
773 * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt.
774 * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the
775 * three members - tcp_ka_abort_thres, tcp_ka_rinterval and
776 * tcp_ka_cnt.
777 */
778 case TCP_KEEPCNT:
779 if (checkonly)
780 break;
781
782 if (*i1 == 0) {
783 return (EINVAL);
784 } else if (tcp->tcp_ka_rinterval == 0) {
785 /*
786 * When TCP_KEEPCNT is specified without first
787 * specifying a TCP_KEEPINTVL, we infer an
788 * interval based on a tunable specific to our
789 * stack: the tcp_keepalive_abort_interval.
790 * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in
791 * the unlikely event that that has been set.)
792 * Given the abort interval's default value of
793 * 480 seconds, low TCP_KEEPCNT values can
794 * result in intervals that exceed the default
795 * maximum RTO of 60 seconds. Rather than
796 * fail in these cases, we (implicitly) clamp
797 * the interval at the maximum RTO; if the
798 * TCP_KEEPCNT is shortly followed by a
799 * TCP_KEEPINTVL (as we expect), the abort
800 * threshold will be recalculated correctly --
801 * and if a TCP_KEEPINTVL is not forthcoming,
802 * keep-alive will at least operate reasonably
803 * given the underconfigured state.
804 */
805 uint32_t interval;
806
807 interval = tcp->tcp_ka_abort_thres / *i1;
808
809 if (interval < tcp->tcp_rto_min)
810 interval = tcp->tcp_rto_min;
811
812 if (interval > tcp->tcp_rto_max)
813 interval = tcp->tcp_rto_max;
814
815 tcp->tcp_ka_rinterval = interval;
816 } else {
817 if ((*i1 * tcp->tcp_ka_rinterval) <
818 tcps->tcps_keepalive_abort_interval_low ||
819 (*i1 * tcp->tcp_ka_rinterval) >
820 tcps->tcps_keepalive_abort_interval_high)
821 return (EINVAL);
822 tcp->tcp_ka_abort_thres =
823 (*i1 * tcp->tcp_ka_rinterval);
824 }
825 tcp->tcp_ka_cnt = *i1;
826 break;
827 case TCP_KEEPINTVL:
828 /*
829 * TCP_KEEPINTVL is specified in seconds, but
830 * tcp_ka_rinterval is in milliseconds.
831 */
832
833 if (checkonly)
834 break;
835
836 if ((*i1 * 1000) < tcp->tcp_rto_min ||
837 (*i1 * 1000) > tcp->tcp_rto_max)
838 return (EINVAL);
839
840 if (tcp->tcp_ka_cnt == 0) {
841 tcp->tcp_ka_cnt =
842 tcp->tcp_ka_abort_thres / (*i1 * 1000);
843 } else {
844 if ((*i1 * tcp->tcp_ka_cnt * 1000) <
845 tcps->tcps_keepalive_abort_interval_low ||
846 (*i1 * tcp->tcp_ka_cnt * 1000) >
847 tcps->tcps_keepalive_abort_interval_high)
848 return (EINVAL);
849 tcp->tcp_ka_abort_thres =
850 (*i1 * tcp->tcp_ka_cnt * 1000);
851 }
852 tcp->tcp_ka_rinterval = *i1 * 1000;
853 break;
854 case TCP_KEEPALIVE_ABORT_THRESHOLD:
855 if (!checkonly) {
856 if (*i1 <
857 tcps->tcps_keepalive_abort_interval_low ||
858 *i1 >
859 tcps->tcps_keepalive_abort_interval_high) {
860 *outlenp = 0;
861 return (EINVAL);
862 }
863 tcp->tcp_ka_abort_thres = *i1;
864 tcp->tcp_ka_cnt = 0;
865 tcp->tcp_ka_rinterval = 0;
866 }
867 break;
868 case TCP_CONGESTION: {
869 struct cc_algo *algo;
870
871 if (checkonly) {
872 break;
873 }
874
875 /*
876 * Make sure the string is NUL-terminated. Some
877 * consumers pass only the number of characters
878 * in the string, and don't include the NUL
879 * terminator, so we set it for them.
880 */
881 if (inlen < CC_ALGO_NAME_MAX) {
882 invalp[inlen] = '\0';
883 }
884 invalp[CC_ALGO_NAME_MAX - 1] = '\0';
885
886 if ((algo = cc_load_algo((char *)invalp)) == NULL) {
887 return (ENOENT);
888 }
889
890 if (CC_ALGO(tcp)->cb_destroy != NULL) {
891 CC_ALGO(tcp)->cb_destroy(&tcp->tcp_ccv);
892 }
893
894 CC_DATA(tcp) = NULL;
895 CC_ALGO(tcp) = algo;
896
897 if (CC_ALGO(tcp)->cb_init != NULL) {
898 VERIFY0(CC_ALGO(tcp)->cb_init(&tcp->tcp_ccv));
899 }
900
901 break;
902 }
903 case TCP_CORK:
904 if (!checkonly) {
905 /*
906 * if tcp->tcp_cork was set and is now
907 * being unset, we have to make sure that
908 * the remaining data gets sent out. Also
909 * unset tcp->tcp_cork so that tcp_wput_data()
910 * can send data even if it is less than mss
911 */
912 if (tcp->tcp_cork && onoff == 0 &&
913 tcp->tcp_unsent > 0) {
914 tcp->tcp_cork = B_FALSE;
915 tcp_wput_data(tcp, NULL, B_FALSE);
916 }
917 tcp->tcp_cork = onoff;
918 }
919 break;
920 case TCP_RTO_INITIAL:
921 if (checkonly || val == 0)
922 break;
923
924 /*
925 * Sanity checks
926 *
927 * The initial RTO should be bounded by the minimum
928 * and maximum RTO. And it should also be smaller
929 * than the connect attempt abort timeout. Otherwise,
930 * the connection won't be aborted in a period
931 * reasonably close to that timeout.
932 */
933 if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max ||
934 val > tcp->tcp_second_ctimer_threshold ||
935 val < tcps->tcps_rexmit_interval_initial_low ||
936 val > tcps->tcps_rexmit_interval_initial_high) {
937 *outlenp = 0;
938 return (EINVAL);
939 }
940 tcp->tcp_rto_initial = val;
941
942 /*
943 * If TCP has not sent anything, need to re-calculate
944 * tcp_rto. Otherwise, this option change does not
945 * really affect anything.
946 */
947 if (tcp->tcp_state >= TCPS_SYN_SENT)
948 break;
949
950 tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2;
951 tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1;
952 tcp->tcp_rto = tcp_calculate_rto(tcp, tcps,
953 tcps->tcps_conn_grace_period);
954 break;
955 case TCP_RTO_MIN:
956 if (checkonly || val == 0)
957 break;
958
959 if (val < tcps->tcps_rexmit_interval_min_low ||
960 val > tcps->tcps_rexmit_interval_min_high ||
961 val > tcp->tcp_rto_max) {
962 *outlenp = 0;
963 return (EINVAL);
964 }
965 tcp->tcp_rto_min = val;
966 if (tcp->tcp_rto < val)
967 tcp->tcp_rto = val;
968 break;
969 case TCP_RTO_MAX:
970 if (checkonly || val == 0)
971 break;
972
973 /*
974 * Sanity checks
975 *
976 * The maximum RTO should not be larger than the
977 * connection abort timeout. Otherwise, the
978 * connection won't be aborted in a period reasonably
979 * close to that timeout.
980 */
981 if (val < tcps->tcps_rexmit_interval_max_low ||
982 val > tcps->tcps_rexmit_interval_max_high ||
983 val < tcp->tcp_rto_min ||
984 val > tcp->tcp_second_timer_threshold) {
985 *outlenp = 0;
986 return (EINVAL);
987 }
988 tcp->tcp_rto_max = val;
989 if (tcp->tcp_rto > val)
990 tcp->tcp_rto = val;
991 break;
992 case TCP_LINGER2:
993 if (checkonly || *i1 == 0)
994 break;
995
996 /*
997 * Note that the option value's unit is second. And
998 * the value should be bigger than the private
999 * parameter tcp_fin_wait_2_flush_interval's lower
1000 * bound and smaller than the current value of that
1001 * parameter. It should be smaller than the current
1002 * value to avoid an app setting TCP_LINGER2 to a big
1003 * value, causing resource to be held up too long in
1004 * FIN-WAIT-2 state.
1005 */
1006 if (*i1 < 0 ||
1007 tcps->tcps_fin_wait_2_flush_interval_low/SECONDS >
1008 *i1 ||
1009 tcps->tcps_fin_wait_2_flush_interval/SECONDS <
1010 *i1) {
1011 *outlenp = 0;
1012 return (EINVAL);
1013 }
1014 tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS;
1015 break;
1016 default:
1017 break;
1018 }
1019 break;
1020 case IPPROTO_IP:
1021 if (connp->conn_family != AF_INET) {
1022 *outlenp = 0;
1023 return (EINVAL);
1024 }
1025 switch (name) {
1026 case IP_SEC_OPT:
1027 /*
1028 * We should not allow policy setting after
1029 * we start listening for connections.
1030 */
1031 if (tcp->tcp_state == TCPS_LISTEN) {
1032 return (EINVAL);
1033 }
1034 break;
1035 }
1036 break;
1037 case IPPROTO_IPV6:
1038 /*
1039 * IPPROTO_IPV6 options are only supported for sockets
1040 * that are using IPv6 on the wire.
1041 */
1042 if (connp->conn_ipversion != IPV6_VERSION) {
1043 *outlenp = 0;
1044 return (EINVAL);
1045 }
1046
1047 switch (name) {
1048 case IPV6_RECVPKTINFO:
1049 if (!checkonly) {
1050 /* Force it to be sent up with the next msg */
1051 tcp->tcp_recvifindex = 0;
1052 }
1053 break;
1054 case IPV6_RECVTCLASS:
1055 if (!checkonly) {
1056 /* Force it to be sent up with the next msg */
1057 tcp->tcp_recvtclass = 0xffffffffU;
1058 }
1059 break;
1060 case IPV6_RECVHOPLIMIT:
1061 if (!checkonly) {
1062 /* Force it to be sent up with the next msg */
1063 tcp->tcp_recvhops = 0xffffffffU;
1064 }
1065 break;
1066 case IPV6_PKTINFO:
1067 /* This is an extra check for TCP */
1068 if (inlen == sizeof (struct in6_pktinfo)) {
1069 struct in6_pktinfo *pkti;
1070
1071 pkti = (struct in6_pktinfo *)invalp;
1072 /*
1073 * RFC 3542 states that ipi6_addr must be
1074 * the unspecified address when setting the
1075 * IPV6_PKTINFO sticky socket option on a
1076 * TCP socket.
1077 */
1078 if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
1079 return (EINVAL);
1080 }
1081 break;
1082 case IPV6_SEC_OPT:
1083 /*
1084 * We should not allow policy setting after
1085 * we start listening for connections.
1086 */
1087 if (tcp->tcp_state == TCPS_LISTEN) {
1088 return (EINVAL);
1089 }
1090 break;
1091 }
1092 break;
1093 }
1094 reterr = conn_opt_set(&coas, level, name, inlen, invalp,
1095 checkonly, cr);
1096 if (reterr != 0) {
1097 *outlenp = 0;
1098 return (reterr);
1099 }
1100
1101 /*
1102 * Common case of OK return with outval same as inval
1103 */
1104 if (invalp != outvalp) {
1105 /* don't trust bcopy for identical src/dst */
1106 (void) bcopy(invalp, outvalp, inlen);
1107 }
1108 *outlenp = inlen;
1109
1110 if (coas.coa_changed & COA_HEADER_CHANGED) {
1111 /* If we are connected we rebuilt the headers */
1112 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1113 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1114 reterr = tcp_build_hdrs(tcp);
1115 if (reterr != 0)
1116 return (reterr);
1117 }
1118 }
1119 if (coas.coa_changed & COA_ROUTE_CHANGED) {
1120 in6_addr_t nexthop;
1121
1122 /*
1123 * If we are connected we re-cache the information.
1124 * We ignore errors to preserve BSD behavior.
1125 * Note that we don't redo IPsec policy lookup here
1126 * since the final destination (or source) didn't change.
1127 */
1128 ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
1129 &connp->conn_faddr_v6, &nexthop);
1130
1131 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1132 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1133 (void) ip_attr_connect(connp, connp->conn_ixa,
1134 &connp->conn_laddr_v6, &connp->conn_faddr_v6,
1135 &nexthop, connp->conn_fport, NULL, NULL,
1136 IPDF_VERIFY_DST);
1137 }
1138 }
1139 if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
1140 connp->conn_wq->q_hiwat = connp->conn_sndbuf;
1141 }
1142 if (coas.coa_changed & COA_WROFF_CHANGED) {
1143 connp->conn_wroff = connp->conn_ht_iphc_allocated +
1144 tcps->tcps_wroff_xtra;
1145 (void) proto_set_tx_wroff(connp->conn_rq, connp,
1146 connp->conn_wroff);
1147 }
1148 if (coas.coa_changed & COA_OOBINLINE_CHANGED) {
1149 if (IPCL_IS_NONSTR(connp))
1150 proto_set_rx_oob_opt(connp, onoff);
1151 }
1152 return (0);
1153 }