1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
24 * Copyright 2019 Joyent, Inc.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
27 */
28
29 #include <sys/types.h>
30 #include <sys/stream.h>
31 #define _SUN_TPI_VERSION 2
32 #include <sys/tihdr.h>
33 #include <sys/socket.h>
34 #include <sys/xti_xtiopt.h>
35 #include <sys/xti_inet.h>
36 #include <sys/policy.h>
37
38 #include <inet/cc.h>
39 #include <inet/common.h>
40 #include <netinet/ip6.h>
41 #include <inet/ip.h>
42
43 #include <netinet/in.h>
44 #include <netinet/tcp.h>
45 #include <inet/optcom.h>
46 #include <inet/proto_set.h>
47 #include <inet/tcp_impl.h>
48
/*
 * Forward declaration: tcp_opt_default() is named in the tcp_opt_obj
 * initializer before its definition further down in this file.
 */
static int tcp_opt_default(queue_t *, int, int, uchar_t *);
50
/*
 * Table of all known options handled on a TCP protocol stack.
 *
 * Note: This table contains options processed by both TCP and IP levels
 * and is the superset of options that can be performed on a TCP over IP
 * stack.
 *
 * Every initializer supplies eight positional opdes_t members.  Judging
 * from the values used below they appear to be: option name, option level,
 * two access masks (OA_R / OA_RW), a privilege class (OP_NP, OP_CONFIG,
 * OP_PRIVPORT, OP_RAW), property flags (OP_DEF_FN, OP_VARLEN,
 * OP_NODEFAULT), the value size in bytes and a default value.
 * NOTE(review): confirm the member order and meaning against the opdes_t
 * definition, which is not visible from this file.
 *
 * Every entry flagged OP_DEF_FN (the four TCP_*_THRESHOLD timer options,
 * IP_TTL and IPV6_UNICAST_HOPS) carries -1 in its last slot and has a
 * matching case in tcp_opt_default().
 */
opdes_t tcp_opt_arr[] = {

/* SOL_SOCKET level options */
{ SO_LINGER, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
	sizeof (struct linger), 0 },

{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_KEEPALIVE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},
{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
	sizeof (struct timeval), 0 },
{ SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
	sizeof (struct timeval), 0 },
{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},
{ SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },
{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },
{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },
{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
	0 },
{ SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },

{ SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },

/* IPPROTO_TCP level options */
{ TCP_NODELAY, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},
{ TCP_MAXSEG, IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t),
	536 },

/* Defaults for the four timer thresholds come from tcp_opt_default(). */
{ TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },

{ TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0,
	sizeof (int), 0 },

{ TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},

{ TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0,
	sizeof (int), 0 },

{ TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },

{ TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },

{ TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },

{ TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },

{ TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },

{ TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

/* Variable-length string value, at most CC_ALGO_NAME_MAX bytes. */
{ TCP_CONGESTION, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_VARLEN, CC_ALGO_NAME_MAX, 0 },

/* IPPROTO_IP and IPPROTO_IPV6 level options (the two are interleaved) */
{ IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT),
	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
{ T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT),
	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },

{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
	sizeof (int), -1 /* not initialized */ },
{ IP_RECVTOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
	sizeof (ipsec_req_t), -1 /* not initialized */ },

{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 /* no ifindex */ },

{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
	sizeof (int), 0 },

{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
	sizeof (int), -1 /* not initialized */ },

{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 /* no ifindex */ },

{ IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
	sizeof (in_addr_t), -1 /* not initialized */ },

{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
	sizeof (int), 0 },

{ IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_NODEFAULT|OP_VARLEN),
	sizeof (struct in6_pktinfo), -1 /* not initialized */ },
{ IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	OP_NODEFAULT,
	sizeof (sin6_t), -1 /* not initialized */ },
{ IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	OP_NODEFAULT,
	sizeof (int), -1 /* not initialized */ },
{ IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	OP_NODEFAULT,
	sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },

/* Enable receipt of ancillary data */
{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },

{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
	sizeof (ipsec_req_t), -1 /* not initialized */ },
{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
};
239
240 /*
241 * Table of all supported levels
242 * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have
243 * any supported options so we need this info separately.
244 *
245 * This is needed only for topmost tpi providers and is used only by
246 * XTI interfaces.
247 */
248 optlevel_t tcp_valid_levels_arr[] = {
249 XTI_GENERIC,
250 SOL_SOCKET,
251 IPPROTO_TCP,
252 IPPROTO_IP,
253 IPPROTO_IPV6
254 };
255
256
/*
 * Entry counts of tcp_opt_arr and tcp_valid_levels_arr, stored in
 * tcp_opt_obj.  A_CNT is defined outside this file; presumably it yields
 * the number of elements of an array -- TODO confirm.
 */
#define TCP_OPT_ARR_CNT A_CNT(tcp_opt_arr)
#define TCP_VALID_LEVELS_CNT A_CNT(tcp_valid_levels_arr)

/* Not assigned anywhere in this file; set elsewhere at driver load time. */
uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */
261
/*
 * Initialize option database object for TCP
 *
 * This object represents database of options to search passed to
 * {sock,tpi}optcom_req() interface routine to take care of option
 * management and associated methods.
 *
 * The initializer is positional, so the order of the values below must
 * match the member order of optdb_obj_t.  tcp_tpi_opt_get and
 * tcp_tpi_opt_set are not defined in this file.
 */

optdb_obj_t tcp_opt_obj = {
	tcp_opt_default, /* TCP default value function pointer */
	tcp_tpi_opt_get, /* TCP get function pointer */
	tcp_tpi_opt_set, /* TCP set function pointer */
	TCP_OPT_ARR_CNT, /* TCP option database count of entries */
	tcp_opt_arr, /* TCP option database */
	TCP_VALID_LEVELS_CNT, /* TCP valid level count of entries */
	tcp_valid_levels_arr /* TCP valid level array */
};
279
/*
 * Ceiling for the TCP_INIT_CWND option: once a request exceeds the
 * unprivileged limit and has passed the privilege check, tcp_opt_set()
 * still rejects it with EINVAL if it is larger than this value.
 */
static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND;
281
282 /*
283 * Some TCP options can be "set" by requesting them in the option
284 * buffer. This is needed for XTI feature test though we do not
285 * allow it in general. We interpret that this mechanism is more
286 * applicable to OSI protocols and need not be allowed in general.
287 * This routine filters out options for which it is not allowed (most)
288 * and lets through those (few) for which it is. [ The XTI interface
289 * test suite specifics will imply that any XTI_GENERIC level XTI_* if
290 * ever implemented will have to be allowed here ].
291 */
292 static boolean_t
293 tcp_allow_connopt_set(int level, int name)
294 {
295
296 switch (level) {
297 case IPPROTO_TCP:
298 switch (name) {
299 case TCP_NODELAY:
300 return (B_TRUE);
301 default:
302 return (B_FALSE);
303 }
304 /*NOTREACHED*/
305 default:
306 return (B_FALSE);
307 }
308 /*NOTREACHED*/
309 }
310
311 /*
312 * This routine gets default values of certain options whose default
313 * values are maintained by protocol specific code
314 */
315 /* ARGSUSED */
316 static int
317 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
318 {
319 int32_t *i1 = (int32_t *)ptr;
320 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps;
321
322 switch (level) {
323 case IPPROTO_TCP:
324 switch (name) {
325 case TCP_NOTIFY_THRESHOLD:
326 *i1 = tcps->tcps_ip_notify_interval;
327 break;
328 case TCP_ABORT_THRESHOLD:
329 *i1 = tcps->tcps_ip_abort_interval;
330 break;
331 case TCP_CONN_NOTIFY_THRESHOLD:
332 *i1 = tcps->tcps_ip_notify_cinterval;
333 break;
334 case TCP_CONN_ABORT_THRESHOLD:
335 *i1 = tcps->tcps_ip_abort_cinterval;
336 break;
337 default:
338 return (-1);
339 }
340 break;
341 case IPPROTO_IP:
342 switch (name) {
343 case IP_TTL:
344 *i1 = tcps->tcps_ipv4_ttl;
345 break;
346 default:
347 return (-1);
348 }
349 break;
350 case IPPROTO_IPV6:
351 switch (name) {
352 case IPV6_UNICAST_HOPS:
353 *i1 = tcps->tcps_ipv6_hoplimit;
354 break;
355 default:
356 return (-1);
357 }
358 break;
359 default:
360 return (-1);
361 }
362 return (sizeof (int));
363 }
364
365 /*
366 * TCP routine to get the values of options.
367 */
368 int
369 tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
370 {
371 int *i1 = (int *)ptr;
372 tcp_t *tcp = connp->conn_tcp;
373 conn_opt_arg_t coas;
374 int retval;
375
376 coas.coa_connp = connp;
377 coas.coa_ixa = connp->conn_ixa;
378 coas.coa_ipp = &connp->conn_xmit_ipp;
379 coas.coa_ancillary = B_FALSE;
380 coas.coa_changed = 0;
381
382 switch (level) {
383 case SOL_SOCKET:
384 switch (name) {
385 case SO_SND_COPYAVOID:
386 *i1 = tcp->tcp_snd_zcopy_on ?
387 SO_SND_COPYAVOID : 0;
388 return (sizeof (int));
389 case SO_ACCEPTCONN:
390 *i1 = (tcp->tcp_state == TCPS_LISTEN);
391 return (sizeof (int));
392 }
393 break;
394 case IPPROTO_TCP:
395 switch (name) {
396 case TCP_NODELAY:
397 *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
398 return (sizeof (int));
399 case TCP_MAXSEG:
400 *i1 = tcp->tcp_mss;
401 return (sizeof (int));
402 case TCP_NOTIFY_THRESHOLD:
403 *i1 = (int)tcp->tcp_first_timer_threshold;
404 return (sizeof (int));
405 case TCP_ABORT_THRESHOLD:
406 *i1 = tcp->tcp_second_timer_threshold;
407 return (sizeof (int));
408 case TCP_CONN_NOTIFY_THRESHOLD:
409 *i1 = tcp->tcp_first_ctimer_threshold;
410 return (sizeof (int));
411 case TCP_CONN_ABORT_THRESHOLD:
412 *i1 = tcp->tcp_second_ctimer_threshold;
413 return (sizeof (int));
414 case TCP_INIT_CWND:
415 *i1 = tcp->tcp_init_cwnd;
416 return (sizeof (int));
417 case TCP_KEEPALIVE_THRESHOLD:
418 *i1 = tcp->tcp_ka_interval;
419 return (sizeof (int));
420
421 /*
422 * TCP_KEEPIDLE expects value in seconds, but
423 * tcp_ka_interval is in milliseconds.
424 */
425 case TCP_KEEPIDLE:
426 *i1 = tcp->tcp_ka_interval / 1000;
427 return (sizeof (int));
428 case TCP_KEEPCNT:
429 *i1 = tcp->tcp_ka_cnt;
430 return (sizeof (int));
431
432 /*
433 * TCP_KEEPINTVL expects value in seconds, but
434 * tcp_ka_rinterval is in milliseconds.
435 */
436 case TCP_KEEPINTVL:
437 *i1 = tcp->tcp_ka_rinterval / 1000;
438 return (sizeof (int));
439 case TCP_KEEPALIVE_ABORT_THRESHOLD:
440 *i1 = tcp->tcp_ka_abort_thres;
441 return (sizeof (int));
442 case TCP_CONGESTION: {
443 size_t len = strlcpy((char *)ptr, CC_ALGO(tcp)->name,
444 CC_ALGO_NAME_MAX);
445 if (len >= CC_ALGO_NAME_MAX)
446 return (-1);
447 return (len + 1);
448 }
449 case TCP_CORK:
450 *i1 = tcp->tcp_cork;
451 return (sizeof (int));
452 case TCP_RTO_INITIAL:
453 *i1 = tcp->tcp_rto_initial;
454 return (sizeof (uint32_t));
455 case TCP_RTO_MIN:
456 *i1 = tcp->tcp_rto_min;
457 return (sizeof (uint32_t));
458 case TCP_RTO_MAX:
459 *i1 = tcp->tcp_rto_max;
460 return (sizeof (uint32_t));
461 case TCP_LINGER2:
462 *i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS;
463 return (sizeof (int));
464 }
465 break;
466 case IPPROTO_IP:
467 if (connp->conn_family != AF_INET)
468 return (-1);
469 switch (name) {
470 case IP_OPTIONS:
471 case T_IP_OPTIONS:
472 /* Caller ensures enough space */
473 return (ip_opt_get_user(connp, ptr));
474 default:
475 break;
476 }
477 break;
478
479 case IPPROTO_IPV6:
480 /*
481 * IPPROTO_IPV6 options are only supported for sockets
482 * that are using IPv6 on the wire.
483 */
484 if (connp->conn_ipversion != IPV6_VERSION) {
485 return (-1);
486 }
487 switch (name) {
488 case IPV6_PATHMTU:
489 if (tcp->tcp_state < TCPS_ESTABLISHED)
490 return (-1);
491 break;
492 }
493 break;
494 }
495 mutex_enter(&connp->conn_lock);
496 retval = conn_opt_get(&coas, level, name, ptr);
497 mutex_exit(&connp->conn_lock);
498 return (retval);
499 }
500
501 /*
502 * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
503 * Parameters are assumed to be verified by the caller.
504 */
505 /* ARGSUSED */
506 int
507 tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
508 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
509 void *thisdg_attrs, cred_t *cr)
510 {
511 tcp_t *tcp = connp->conn_tcp;
512 int *i1 = (int *)invalp;
513 boolean_t onoff = (*i1 == 0) ? 0 : 1;
514 boolean_t checkonly;
515 int reterr;
516 tcp_stack_t *tcps = tcp->tcp_tcps;
517 conn_opt_arg_t coas;
518 uint32_t val = *((uint32_t *)invalp);
519
520 coas.coa_connp = connp;
521 coas.coa_ixa = connp->conn_ixa;
522 coas.coa_ipp = &connp->conn_xmit_ipp;
523 coas.coa_ancillary = B_FALSE;
524 coas.coa_changed = 0;
525
526 switch (optset_context) {
527 case SETFN_OPTCOM_CHECKONLY:
528 checkonly = B_TRUE;
529 /*
530 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
531 * inlen != 0 implies value supplied and
532 * we have to "pretend" to set it.
533 * inlen == 0 implies that there is no
534 * value part in T_CHECK request and just validation
535 * done elsewhere should be enough, we just return here.
536 */
537 if (inlen == 0) {
538 *outlenp = 0;
539 return (0);
540 }
541 break;
542 case SETFN_OPTCOM_NEGOTIATE:
543 checkonly = B_FALSE;
544 break;
545 case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
546 case SETFN_CONN_NEGOTIATE:
547 checkonly = B_FALSE;
548 /*
549 * Negotiating local and "association-related" options
550 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
551 * primitives is allowed by XTI, but we choose
552 * to not implement this style negotiation for Internet
553 * protocols (We interpret it is a must for OSI world but
554 * optional for Internet protocols) for all options.
555 * [ Will do only for the few options that enable test
556 * suites that our XTI implementation of this feature
557 * works for transports that do allow it ]
558 */
559 if (!tcp_allow_connopt_set(level, name)) {
560 *outlenp = 0;
561 return (EINVAL);
562 }
563 break;
564 default:
565 /*
566 * We should never get here
567 */
568 *outlenp = 0;
569 return (EINVAL);
570 }
571
572 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
573 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
574
575 /*
576 * For TCP, we should have no ancillary data sent down
577 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
578 * has to be zero.
579 */
580 ASSERT(thisdg_attrs == NULL);
581
582 /*
583 * For fixed length options, no sanity check
584 * of passed in length is done. It is assumed *_optcom_req()
585 * routines do the right thing.
586 */
587 switch (level) {
588 case SOL_SOCKET:
589 switch (name) {
590 case SO_KEEPALIVE:
591 if (checkonly) {
592 /* check only case */
593 break;
594 }
595
596 if (!onoff) {
597 if (connp->conn_keepalive) {
598 if (tcp->tcp_ka_tid != 0) {
599 (void) TCP_TIMER_CANCEL(tcp,
600 tcp->tcp_ka_tid);
601 tcp->tcp_ka_tid = 0;
602 }
603 connp->conn_keepalive = 0;
604 }
605 break;
606 }
607 if (!connp->conn_keepalive) {
608 /* Crank up the keepalive timer */
609 tcp->tcp_ka_last_intrvl = 0;
610 tcp->tcp_ka_tid = TCP_TIMER(tcp,
611 tcp_keepalive_timer, tcp->tcp_ka_interval);
612 connp->conn_keepalive = 1;
613 }
614 break;
615 case SO_SNDBUF: {
616 if (*i1 > tcps->tcps_max_buf) {
617 *outlenp = 0;
618 return (ENOBUFS);
619 }
620 if (checkonly)
621 break;
622
623 connp->conn_sndbuf = *i1;
624 if (tcps->tcps_snd_lowat_fraction != 0) {
625 connp->conn_sndlowat = connp->conn_sndbuf /
626 tcps->tcps_snd_lowat_fraction;
627 }
628 (void) tcp_maxpsz_set(tcp, B_TRUE);
629 /*
630 * If we are flow-controlled, recheck the condition.
631 * There are apps that increase SO_SNDBUF size when
632 * flow-controlled (EWOULDBLOCK), and expect the flow
633 * control condition to be lifted right away.
634 */
635 mutex_enter(&tcp->tcp_non_sq_lock);
636 if (tcp->tcp_flow_stopped &&
637 TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) {
638 tcp_clrqfull(tcp);
639 }
640 mutex_exit(&tcp->tcp_non_sq_lock);
641 *outlenp = inlen;
642 return (0);
643 }
644 case SO_RCVBUF:
645 if (*i1 > tcps->tcps_max_buf) {
646 *outlenp = 0;
647 return (ENOBUFS);
648 }
649 /* Silently ignore zero */
650 if (!checkonly && *i1 != 0) {
651 *i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
652 (void) tcp_rwnd_set(tcp, *i1);
653 }
654 /*
655 * XXX should we return the rwnd here
656 * and tcp_opt_get ?
657 */
658 *outlenp = inlen;
659 return (0);
660 case SO_SND_COPYAVOID:
661 if (!checkonly) {
662 if (tcp->tcp_loopback ||
663 (onoff != 1) || !tcp_zcopy_check(tcp)) {
664 *outlenp = 0;
665 return (EOPNOTSUPP);
666 }
667 tcp->tcp_snd_zcopy_aware = 1;
668 }
669 *outlenp = inlen;
670 return (0);
671 }
672 break;
673 case IPPROTO_TCP:
674 switch (name) {
675 case TCP_NODELAY:
676 if (!checkonly)
677 tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
678 break;
679 case TCP_NOTIFY_THRESHOLD:
680 if (!checkonly)
681 tcp->tcp_first_timer_threshold = *i1;
682 break;
683 case TCP_ABORT_THRESHOLD:
684 if (!checkonly)
685 tcp->tcp_second_timer_threshold = *i1;
686 break;
687 case TCP_CONN_NOTIFY_THRESHOLD:
688 if (!checkonly)
689 tcp->tcp_first_ctimer_threshold = *i1;
690 break;
691 case TCP_CONN_ABORT_THRESHOLD:
692 if (!checkonly)
693 tcp->tcp_second_ctimer_threshold = *i1;
694 break;
695 case TCP_RECVDSTADDR:
696 if (tcp->tcp_state > TCPS_LISTEN) {
697 *outlenp = 0;
698 return (EOPNOTSUPP);
699 }
700 /* Setting done in conn_opt_set */
701 break;
702 case TCP_INIT_CWND:
703 if (checkonly)
704 break;
705
706 /*
707 * Only allow socket with network configuration
708 * privilege to set the initial cwnd to be larger
709 * than allowed by RFC 3390.
710 */
711 if (val > MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
712 if ((reterr = secpolicy_ip_config(cr, B_TRUE))
713 != 0) {
714 *outlenp = 0;
715 return (reterr);
716 }
717 if (val > tcp_max_init_cwnd) {
718 *outlenp = 0;
719 return (EINVAL);
720 }
721 }
722
723 tcp->tcp_init_cwnd = val;
724
725 /*
726 * If the socket is connected, AND no outbound data
727 * has been sent, reset the actual cwnd values.
728 */
729 if (tcp->tcp_state == TCPS_ESTABLISHED &&
730 tcp->tcp_iss == tcp->tcp_snxt - 1) {
731 tcp->tcp_cwnd =
732 MIN(tcp->tcp_rwnd, val * tcp->tcp_mss);
733 }
734 break;
735
736 /*
737 * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD
738 * is in milliseconds. TCP_KEEPIDLE is introduced for
739 * compatibility with other Unix flavors.
740 * We can fall through TCP_KEEPALIVE_THRESHOLD logic after
741 * converting the input to milliseconds.
742 */
743 case TCP_KEEPIDLE:
744 *i1 *= 1000;
745 /* FALLTHRU */
746
747 case TCP_KEEPALIVE_THRESHOLD:
748 if (checkonly)
749 break;
750
751 if (*i1 < tcps->tcps_keepalive_interval_low ||
752 *i1 > tcps->tcps_keepalive_interval_high) {
753 *outlenp = 0;
754 return (EINVAL);
755 }
756 if (*i1 != tcp->tcp_ka_interval) {
757 tcp->tcp_ka_interval = *i1;
758 /*
759 * Check if we need to restart the
760 * keepalive timer.
761 */
762 if (tcp->tcp_ka_tid != 0) {
763 ASSERT(connp->conn_keepalive);
764 (void) TCP_TIMER_CANCEL(tcp,
765 tcp->tcp_ka_tid);
766 tcp->tcp_ka_last_intrvl = 0;
767 tcp->tcp_ka_tid = TCP_TIMER(tcp,
768 tcp_keepalive_timer,
769 tcp->tcp_ka_interval);
770 }
771 }
772 break;
773
774 /*
775 * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt.
776 * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the
777 * three members - tcp_ka_abort_thres, tcp_ka_rinterval and
778 * tcp_ka_cnt.
779 */
780 case TCP_KEEPCNT:
781 if (checkonly)
782 break;
783
784 if (*i1 == 0) {
785 return (EINVAL);
786 } else if (tcp->tcp_ka_rinterval == 0) {
787 /*
788 * When TCP_KEEPCNT is specified without first
789 * specifying a TCP_KEEPINTVL, we infer an
790 * interval based on a tunable specific to our
791 * stack: the tcp_keepalive_abort_interval.
792 * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in
793 * the unlikely event that that has been set.)
794 * Given the abort interval's default value of
795 * 480 seconds, low TCP_KEEPCNT values can
796 * result in intervals that exceed the default
797 * maximum RTO of 60 seconds. Rather than
798 * fail in these cases, we (implicitly) clamp
799 * the interval at the maximum RTO; if the
800 * TCP_KEEPCNT is shortly followed by a
801 * TCP_KEEPINTVL (as we expect), the abort
802 * threshold will be recalculated correctly --
803 * and if a TCP_KEEPINTVL is not forthcoming,
804 * keep-alive will at least operate reasonably
805 * given the underconfigured state.
806 */
807 uint32_t interval;
808
809 interval = tcp->tcp_ka_abort_thres / *i1;
810
811 if (interval < tcp->tcp_rto_min)
812 interval = tcp->tcp_rto_min;
813
814 if (interval > tcp->tcp_rto_max)
815 interval = tcp->tcp_rto_max;
816
817 tcp->tcp_ka_rinterval = interval;
818 } else {
819 if ((*i1 * tcp->tcp_ka_rinterval) <
820 tcps->tcps_keepalive_abort_interval_low ||
821 (*i1 * tcp->tcp_ka_rinterval) >
822 tcps->tcps_keepalive_abort_interval_high)
823 return (EINVAL);
824 tcp->tcp_ka_abort_thres =
825 (*i1 * tcp->tcp_ka_rinterval);
826 }
827 tcp->tcp_ka_cnt = *i1;
828 break;
829 case TCP_KEEPINTVL:
830 /*
831 * TCP_KEEPINTVL is specified in seconds, but
832 * tcp_ka_rinterval is in milliseconds.
833 */
834
835 if (checkonly)
836 break;
837
838 if ((*i1 * 1000) < tcp->tcp_rto_min ||
839 (*i1 * 1000) > tcp->tcp_rto_max)
840 return (EINVAL);
841
842 if (tcp->tcp_ka_cnt == 0) {
843 tcp->tcp_ka_cnt =
844 tcp->tcp_ka_abort_thres / (*i1 * 1000);
845 } else {
846 if ((*i1 * tcp->tcp_ka_cnt * 1000) <
847 tcps->tcps_keepalive_abort_interval_low ||
848 (*i1 * tcp->tcp_ka_cnt * 1000) >
849 tcps->tcps_keepalive_abort_interval_high)
850 return (EINVAL);
851 tcp->tcp_ka_abort_thres =
852 (*i1 * tcp->tcp_ka_cnt * 1000);
853 }
854 tcp->tcp_ka_rinterval = *i1 * 1000;
855 break;
856 case TCP_KEEPALIVE_ABORT_THRESHOLD:
857 if (!checkonly) {
858 if (*i1 <
859 tcps->tcps_keepalive_abort_interval_low ||
860 *i1 >
861 tcps->tcps_keepalive_abort_interval_high) {
862 *outlenp = 0;
863 return (EINVAL);
864 }
865 tcp->tcp_ka_abort_thres = *i1;
866 tcp->tcp_ka_cnt = 0;
867 tcp->tcp_ka_rinterval = 0;
868 }
869 break;
870 case TCP_CONGESTION: {
871 struct cc_algo *algo;
872
873 if (checkonly) {
874 break;
875 }
876
877 /*
878 * Make sure the string is NUL-terminated. Some
879 * consumers pass only the number of characters
880 * in the string, and don't include the NUL
881 * terminator, so we set it for them.
882 */
883 if (inlen < CC_ALGO_NAME_MAX) {
884 invalp[inlen] = '\0';
885 }
886 invalp[CC_ALGO_NAME_MAX - 1] = '\0';
887
888 if ((algo = cc_load_algo((char *)invalp)) == NULL) {
889 return (ENOENT);
890 }
891
892 if (CC_ALGO(tcp)->cb_destroy != NULL) {
893 CC_ALGO(tcp)->cb_destroy(&tcp->tcp_ccv);
894 }
895
896 CC_DATA(tcp) = NULL;
897 CC_ALGO(tcp) = algo;
898
899 if (CC_ALGO(tcp)->cb_init != NULL) {
900 VERIFY0(CC_ALGO(tcp)->cb_init(&tcp->tcp_ccv));
901 }
902
903 break;
904 }
905 case TCP_CORK:
906 if (!checkonly) {
907 /*
908 * if tcp->tcp_cork was set and is now
909 * being unset, we have to make sure that
910 * the remaining data gets sent out. Also
911 * unset tcp->tcp_cork so that tcp_wput_data()
912 * can send data even if it is less than mss
913 */
914 if (tcp->tcp_cork && onoff == 0 &&
915 tcp->tcp_unsent > 0) {
916 tcp->tcp_cork = B_FALSE;
917 tcp_wput_data(tcp, NULL, B_FALSE);
918 }
919 tcp->tcp_cork = onoff;
920 }
921 break;
922 case TCP_RTO_INITIAL:
923 if (checkonly || val == 0)
924 break;
925
926 /*
927 * Sanity checks
928 *
929 * The initial RTO should be bounded by the minimum
930 * and maximum RTO. And it should also be smaller
931 * than the connect attempt abort timeout. Otherwise,
932 * the connection won't be aborted in a period
933 * reasonably close to that timeout.
934 */
935 if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max ||
936 val > tcp->tcp_second_ctimer_threshold ||
937 val < tcps->tcps_rexmit_interval_initial_low ||
938 val > tcps->tcps_rexmit_interval_initial_high) {
939 *outlenp = 0;
940 return (EINVAL);
941 }
942 tcp->tcp_rto_initial = val;
943
944 /*
945 * If TCP has not sent anything, need to re-calculate
946 * tcp_rto. Otherwise, this option change does not
947 * really affect anything.
948 */
949 if (tcp->tcp_state >= TCPS_SYN_SENT)
950 break;
951
952 tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2;
953 tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1;
954 tcp->tcp_rto = tcp_calculate_rto(tcp, tcps,
955 tcps->tcps_conn_grace_period);
956 break;
957 case TCP_RTO_MIN:
958 if (checkonly || val == 0)
959 break;
960
961 if (val < tcps->tcps_rexmit_interval_min_low ||
962 val > tcps->tcps_rexmit_interval_min_high ||
963 val > tcp->tcp_rto_max) {
964 *outlenp = 0;
965 return (EINVAL);
966 }
967 tcp->tcp_rto_min = val;
968 if (tcp->tcp_rto < val)
969 tcp->tcp_rto = val;
970 break;
971 case TCP_RTO_MAX:
972 if (checkonly || val == 0)
973 break;
974
975 /*
976 * Sanity checks
977 *
978 * The maximum RTO should not be larger than the
979 * connection abort timeout. Otherwise, the
980 * connection won't be aborted in a period reasonably
981 * close to that timeout.
982 */
983 if (val < tcps->tcps_rexmit_interval_max_low ||
984 val > tcps->tcps_rexmit_interval_max_high ||
985 val < tcp->tcp_rto_min ||
986 val > tcp->tcp_second_timer_threshold) {
987 *outlenp = 0;
988 return (EINVAL);
989 }
990 tcp->tcp_rto_max = val;
991 if (tcp->tcp_rto > val)
992 tcp->tcp_rto = val;
993 break;
994 case TCP_LINGER2:
995 if (checkonly || *i1 == 0)
996 break;
997
998 /*
999 * Note that the option value's unit is second. And
1000 * the value should be bigger than the private
1001 * parameter tcp_fin_wait_2_flush_interval's lower
1002 * bound and smaller than the current value of that
1003 * parameter. It should be smaller than the current
1004 * value to avoid an app setting TCP_LINGER2 to a big
1005 * value, causing resource to be held up too long in
1006 * FIN-WAIT-2 state.
1007 */
1008 if (*i1 < 0 ||
1009 tcps->tcps_fin_wait_2_flush_interval_low/SECONDS >
1010 *i1 ||
1011 tcps->tcps_fin_wait_2_flush_interval/SECONDS <
1012 *i1) {
1013 *outlenp = 0;
1014 return (EINVAL);
1015 }
1016 tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS;
1017 break;
1018 default:
1019 break;
1020 }
1021 break;
1022 case IPPROTO_IP:
1023 if (connp->conn_family != AF_INET) {
1024 *outlenp = 0;
1025 return (EINVAL);
1026 }
1027 switch (name) {
1028 case IP_SEC_OPT:
1029 /*
1030 * We should not allow policy setting after
1031 * we start listening for connections.
1032 */
1033 if (tcp->tcp_state == TCPS_LISTEN) {
1034 return (EINVAL);
1035 }
1036 break;
1037 case IP_RECVTOS:
1038 if (!checkonly) {
1039 /*
1040 * Force it to be sent up with the next msg
1041 * by setting it to a value which cannot
1042 * appear in a packet (TOS is only 8-bits)
1043 */
1044 tcp->tcp_recvtos = 0xffffffffU;
1045 }
1046 break;
1047 }
1048 break;
1049 case IPPROTO_IPV6:
1050 /*
1051 * IPPROTO_IPV6 options are only supported for sockets
1052 * that are using IPv6 on the wire.
1053 */
1054 if (connp->conn_ipversion != IPV6_VERSION) {
1055 *outlenp = 0;
1056 return (EINVAL);
1057 }
1058
1059 switch (name) {
1060 case IPV6_RECVPKTINFO:
1061 if (!checkonly) {
1062 /* Force it to be sent up with the next msg */
1063 tcp->tcp_recvifindex = 0;
1064 }
1065 break;
1066 case IPV6_RECVTCLASS:
1067 if (!checkonly) {
1068 /* Force it to be sent up with the next msg */
1069 tcp->tcp_recvtclass = 0xffffffffU;
1070 }
1071 break;
1072 case IPV6_RECVHOPLIMIT:
1073 if (!checkonly) {
1074 /* Force it to be sent up with the next msg */
1075 tcp->tcp_recvhops = 0xffffffffU;
1076 }
1077 break;
1078 case IPV6_PKTINFO:
1079 /* This is an extra check for TCP */
1080 if (inlen == sizeof (struct in6_pktinfo)) {
1081 struct in6_pktinfo *pkti;
1082
1083 pkti = (struct in6_pktinfo *)invalp;
1084 /*
1085 * RFC 3542 states that ipi6_addr must be
1086 * the unspecified address when setting the
1087 * IPV6_PKTINFO sticky socket option on a
1088 * TCP socket.
1089 */
1090 if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
1091 return (EINVAL);
1092 }
1093 break;
1094 case IPV6_SEC_OPT:
1095 /*
1096 * We should not allow policy setting after
1097 * we start listening for connections.
1098 */
1099 if (tcp->tcp_state == TCPS_LISTEN) {
1100 return (EINVAL);
1101 }
1102 break;
1103 }
1104 break;
1105 }
1106 reterr = conn_opt_set(&coas, level, name, inlen, invalp,
1107 checkonly, cr);
1108 if (reterr != 0) {
1109 *outlenp = 0;
1110 return (reterr);
1111 }
1112
1113 /*
1114 * Common case of OK return with outval same as inval
1115 */
1116 if (invalp != outvalp) {
1117 /* don't trust bcopy for identical src/dst */
1118 (void) bcopy(invalp, outvalp, inlen);
1119 }
1120 *outlenp = inlen;
1121
1122 if (coas.coa_changed & COA_HEADER_CHANGED) {
1123 /* If we are connected we rebuilt the headers */
1124 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1125 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1126 reterr = tcp_build_hdrs(tcp);
1127 if (reterr != 0)
1128 return (reterr);
1129 }
1130 }
1131 if (coas.coa_changed & COA_ROUTE_CHANGED) {
1132 in6_addr_t nexthop;
1133
1134 /*
1135 * If we are connected we re-cache the information.
1136 * We ignore errors to preserve BSD behavior.
1137 * Note that we don't redo IPsec policy lookup here
1138 * since the final destination (or source) didn't change.
1139 */
1140 ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
1141 &connp->conn_faddr_v6, &nexthop);
1142
1143 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1144 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1145 (void) ip_attr_connect(connp, connp->conn_ixa,
1146 &connp->conn_laddr_v6, &connp->conn_faddr_v6,
1147 &nexthop, connp->conn_fport, NULL, NULL,
1148 IPDF_VERIFY_DST);
1149 }
1150 }
1151 if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
1152 connp->conn_wq->q_hiwat = connp->conn_sndbuf;
1153 }
1154 if (coas.coa_changed & COA_WROFF_CHANGED) {
1155 connp->conn_wroff = connp->conn_ht_iphc_allocated +
1156 tcps->tcps_wroff_xtra;
1157 (void) proto_set_tx_wroff(connp->conn_rq, connp,
1158 connp->conn_wroff);
1159 }
1160 if (coas.coa_changed & COA_OOBINLINE_CHANGED) {
1161 if (IPCL_IS_NONSTR(connp))
1162 proto_set_rx_oob_opt(connp, onoff);
1163 }
1164 return (0);
1165 }