1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
24 * Copyright 2016 Joyent, Inc.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 */
27
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #define _SUN_TPI_VERSION 2
31 #include <sys/tihdr.h>
32 #include <sys/socket.h>
33 #include <sys/xti_xtiopt.h>
34 #include <sys/xti_inet.h>
35 #include <sys/policy.h>
36
37 #include <inet/common.h>
38 #include <netinet/ip6.h>
39 #include <inet/ip.h>
40
41 #include <netinet/in.h>
42 #include <netinet/tcp.h>
43 #include <inet/optcom.h>
44 #include <inet/proto_set.h>
45 #include <inet/tcp_impl.h>
46
47 static int tcp_opt_default(queue_t *, int, int, uchar_t *);
48
49 /*
50 * Table of all known options handled on a TCP protocol stack.
51 *
52 * Note: This table contains options processed by both TCP and IP levels
53 * and is the superset of options that can be performed on a TCP over IP
54 * stack.
55 */
56 opdes_t tcp_opt_arr[] = {
57
58 { SO_LINGER, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
59 sizeof (struct linger), 0 },
60
61 { SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
62 { SO_KEEPALIVE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
63 { SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
64 { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
65 },
66 { SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
67 { SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
68 { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
69 { SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
70 { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
71 { SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
72 { SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
73 sizeof (struct timeval), 0 },
74 { SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
75 sizeof (struct timeval), 0 },
76 { SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
77 },
78 { SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
79 { SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
80 0 },
81 { SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
82 0 },
83 { SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
84 0 },
85 { SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
86 0 },
87 { SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
88
89 { SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
90
91 { SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
92
93 { TCP_NODELAY, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
94 },
95 { TCP_MAXSEG, IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t),
96 536 },
97
98 { TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
99 OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
100
101 { TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
102 OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
103
104 { TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
105 OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
106
107 { TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
108 OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
109
110 { TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
111 0 },
112
113 { TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0,
114 sizeof (int), 0 },
115
116 { TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
117 },
118
119 { TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0,
120 sizeof (int), 0 },
121
122 { TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
123 sizeof (int), 0 },
124
125 { TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
126
127 { TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
128
129 { TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
130
131 { TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
132 sizeof (int), 0 },
133
134 { TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
135
136 { TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
137
138 { TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
139
140 { TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
141
142 { TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
143
144 { IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
145 (OP_VARLEN|OP_NODEFAULT),
146 IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
147 { T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
148 (OP_VARLEN|OP_NODEFAULT),
149 IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
150
151 { IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
152 { T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
153 { IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
154 sizeof (int), -1 /* not initialized */ },
155
156 { IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
157 sizeof (ipsec_req_t), -1 /* not initialized */ },
158
159 { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
160 sizeof (int), 0 /* no ifindex */ },
161
162 { IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
163 sizeof (int), 0 },
164
165 { IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
166 sizeof (int), -1 /* not initialized */ },
167
168 { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
169 sizeof (int), 0 /* no ifindex */ },
170
171 { IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
172
173 { IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
174 sizeof (in_addr_t), -1 /* not initialized */ },
175
176 { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
177 sizeof (int), 0 },
178
179 { IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
180 (OP_NODEFAULT|OP_VARLEN),
181 sizeof (struct in6_pktinfo), -1 /* not initialized */ },
182 { IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
183 OP_NODEFAULT,
184 sizeof (sin6_t), -1 /* not initialized */ },
185 { IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
186 (OP_VARLEN|OP_NODEFAULT), 255*8,
187 -1 /* not initialized */ },
188 { IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
189 (OP_VARLEN|OP_NODEFAULT), 255*8,
190 -1 /* not initialized */ },
191 { IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
192 (OP_VARLEN|OP_NODEFAULT), 255*8,
193 -1 /* not initialized */ },
194 { IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
195 (OP_VARLEN|OP_NODEFAULT), 255*8,
196 -1 /* not initialized */ },
197 { IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
198 OP_NODEFAULT,
199 sizeof (int), -1 /* not initialized */ },
200 { IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
201 OP_NODEFAULT,
202 sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
203 { IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
204 sizeof (int), 0 },
205 { IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
206 sizeof (int), 0 },
207 { IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
208 sizeof (int), 0 },
209
210 /* Enable receipt of ancillary data */
211 { IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
212 sizeof (int), 0 },
213 { IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
214 sizeof (int), 0 },
215 { IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
216 sizeof (int), 0 },
217 { _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
218 sizeof (int), 0 },
219 { IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
220 sizeof (int), 0 },
221 { IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
222 sizeof (int), 0 },
223 { IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
224 sizeof (int), 0 },
225 { IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
226 sizeof (int), 0 },
227
228 { IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
229 sizeof (ipsec_req_t), -1 /* not initialized */ },
230 { IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
231 sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
232 };
233
234 /*
235 * Table of all supported levels
236 * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have
237 * any supported options so we need this info separately.
238 *
239 * This is needed only for topmost tpi providers and is used only by
240 * XTI interfaces.
241 */
242 optlevel_t tcp_valid_levels_arr[] = {
243 XTI_GENERIC,
244 SOL_SOCKET,
245 IPPROTO_TCP,
246 IPPROTO_IP,
247 IPPROTO_IPV6
248 };
249
250
251 #define TCP_OPT_ARR_CNT A_CNT(tcp_opt_arr)
252 #define TCP_VALID_LEVELS_CNT A_CNT(tcp_valid_levels_arr)
253
254 uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */
255
256 /*
257 * Initialize option database object for TCP
258 *
259 * This object represents database of options to search passed to
260 * {sock,tpi}optcom_req() interface routine to take care of option
261 * management and associated methods.
262 */
263
264 optdb_obj_t tcp_opt_obj = {
265 tcp_opt_default, /* TCP default value function pointer */
266 tcp_tpi_opt_get, /* TCP get function pointer */
267 tcp_tpi_opt_set, /* TCP set function pointer */
268 TCP_OPT_ARR_CNT, /* TCP option database count of entries */
269 tcp_opt_arr, /* TCP option database */
270 TCP_VALID_LEVELS_CNT, /* TCP valid level count of entries */
271 tcp_valid_levels_arr /* TCP valid level array */
272 };
273
274 static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND;
275
276 /*
277 * Some TCP options can be "set" by requesting them in the option
278 * buffer. This is needed for XTI feature test though we do not
279 * allow it in general. We interpret that this mechanism is more
280 * applicable to OSI protocols and need not be allowed in general.
281 * This routine filters out options for which it is not allowed (most)
282 * and lets through those (few) for which it is. [ The XTI interface
283 * test suite specifics will imply that any XTI_GENERIC level XTI_* if
284 * ever implemented will have to be allowed here ].
285 */
286 static boolean_t
287 tcp_allow_connopt_set(int level, int name)
288 {
289
290 switch (level) {
291 case IPPROTO_TCP:
292 switch (name) {
293 case TCP_NODELAY:
294 return (B_TRUE);
295 default:
296 return (B_FALSE);
297 }
298 /*NOTREACHED*/
299 default:
300 return (B_FALSE);
301 }
302 /*NOTREACHED*/
303 }
304
305 /*
306 * This routine gets default values of certain options whose default
307 * values are maintained by protocol specific code
308 */
309 /* ARGSUSED */
310 static int
311 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
312 {
313 int32_t *i1 = (int32_t *)ptr;
314 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps;
315
316 switch (level) {
317 case IPPROTO_TCP:
318 switch (name) {
319 case TCP_NOTIFY_THRESHOLD:
320 *i1 = tcps->tcps_ip_notify_interval;
321 break;
322 case TCP_ABORT_THRESHOLD:
323 *i1 = tcps->tcps_ip_abort_interval;
324 break;
325 case TCP_CONN_NOTIFY_THRESHOLD:
326 *i1 = tcps->tcps_ip_notify_cinterval;
327 break;
328 case TCP_CONN_ABORT_THRESHOLD:
329 *i1 = tcps->tcps_ip_abort_cinterval;
330 break;
331 default:
332 return (-1);
333 }
334 break;
335 case IPPROTO_IP:
336 switch (name) {
337 case IP_TTL:
338 *i1 = tcps->tcps_ipv4_ttl;
339 break;
340 default:
341 return (-1);
342 }
343 break;
344 case IPPROTO_IPV6:
345 switch (name) {
346 case IPV6_UNICAST_HOPS:
347 *i1 = tcps->tcps_ipv6_hoplimit;
348 break;
349 default:
350 return (-1);
351 }
352 break;
353 default:
354 return (-1);
355 }
356 return (sizeof (int));
357 }
358
359 /*
360 * TCP routine to get the values of options.
361 */
362 int
363 tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
364 {
365 int *i1 = (int *)ptr;
366 tcp_t *tcp = connp->conn_tcp;
367 conn_opt_arg_t coas;
368 int retval;
369
370 coas.coa_connp = connp;
371 coas.coa_ixa = connp->conn_ixa;
372 coas.coa_ipp = &connp->conn_xmit_ipp;
373 coas.coa_ancillary = B_FALSE;
374 coas.coa_changed = 0;
375
376 switch (level) {
377 case SOL_SOCKET:
378 switch (name) {
379 case SO_SND_COPYAVOID:
380 *i1 = tcp->tcp_snd_zcopy_on ?
381 SO_SND_COPYAVOID : 0;
382 return (sizeof (int));
383 case SO_ACCEPTCONN:
384 *i1 = (tcp->tcp_state == TCPS_LISTEN);
385 return (sizeof (int));
386 }
387 break;
388 case IPPROTO_TCP:
389 switch (name) {
390 case TCP_NODELAY:
391 *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
392 return (sizeof (int));
393 case TCP_MAXSEG:
394 *i1 = tcp->tcp_mss;
395 return (sizeof (int));
396 case TCP_NOTIFY_THRESHOLD:
397 *i1 = (int)tcp->tcp_first_timer_threshold;
398 return (sizeof (int));
399 case TCP_ABORT_THRESHOLD:
400 *i1 = tcp->tcp_second_timer_threshold;
401 return (sizeof (int));
402 case TCP_CONN_NOTIFY_THRESHOLD:
403 *i1 = tcp->tcp_first_ctimer_threshold;
404 return (sizeof (int));
405 case TCP_CONN_ABORT_THRESHOLD:
406 *i1 = tcp->tcp_second_ctimer_threshold;
407 return (sizeof (int));
408 case TCP_INIT_CWND:
409 *i1 = tcp->tcp_init_cwnd;
410 return (sizeof (int));
411 case TCP_KEEPALIVE_THRESHOLD:
412 *i1 = tcp->tcp_ka_interval;
413 return (sizeof (int));
414
415 /*
416 * TCP_KEEPIDLE expects value in seconds, but
417 * tcp_ka_interval is in milliseconds.
418 */
419 case TCP_KEEPIDLE:
420 *i1 = tcp->tcp_ka_interval / 1000;
421 return (sizeof (int));
422 case TCP_KEEPCNT:
423 *i1 = tcp->tcp_ka_cnt;
424 return (sizeof (int));
425
426 /*
427 * TCP_KEEPINTVL expects value in seconds, but
428 * tcp_ka_rinterval is in milliseconds.
429 */
430 case TCP_KEEPINTVL:
431 *i1 = tcp->tcp_ka_rinterval / 1000;
432 return (sizeof (int));
433 case TCP_KEEPALIVE_ABORT_THRESHOLD:
434 *i1 = tcp->tcp_ka_abort_thres;
435 return (sizeof (int));
436 case TCP_CORK:
437 *i1 = tcp->tcp_cork;
438 return (sizeof (int));
439 case TCP_RTO_INITIAL:
440 *i1 = tcp->tcp_rto_initial;
441 return (sizeof (uint32_t));
442 case TCP_RTO_MIN:
443 *i1 = tcp->tcp_rto_min;
444 return (sizeof (uint32_t));
445 case TCP_RTO_MAX:
446 *i1 = tcp->tcp_rto_max;
447 return (sizeof (uint32_t));
448 case TCP_LINGER2:
449 *i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS;
450 return (sizeof (int));
451 }
452 break;
453 case IPPROTO_IP:
454 if (connp->conn_family != AF_INET)
455 return (-1);
456 switch (name) {
457 case IP_OPTIONS:
458 case T_IP_OPTIONS:
459 /* Caller ensures enough space */
460 return (ip_opt_get_user(connp, ptr));
461 default:
462 break;
463 }
464 break;
465
466 case IPPROTO_IPV6:
467 /*
468 * IPPROTO_IPV6 options are only supported for sockets
469 * that are using IPv6 on the wire.
470 */
471 if (connp->conn_ipversion != IPV6_VERSION) {
472 return (-1);
473 }
474 switch (name) {
475 case IPV6_PATHMTU:
476 if (tcp->tcp_state < TCPS_ESTABLISHED)
477 return (-1);
478 break;
479 }
480 break;
481 }
482 mutex_enter(&connp->conn_lock);
483 retval = conn_opt_get(&coas, level, name, ptr);
484 mutex_exit(&connp->conn_lock);
485 return (retval);
486 }
487
488 /*
489 * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
490 * Parameters are assumed to be verified by the caller.
491 */
492 /* ARGSUSED */
493 int
494 tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
495 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
496 void *thisdg_attrs, cred_t *cr)
497 {
498 tcp_t *tcp = connp->conn_tcp;
499 int *i1 = (int *)invalp;
500 boolean_t onoff = (*i1 == 0) ? 0 : 1;
501 boolean_t checkonly;
502 int reterr;
503 tcp_stack_t *tcps = tcp->tcp_tcps;
504 conn_opt_arg_t coas;
505 uint32_t val = *((uint32_t *)invalp);
506
507 coas.coa_connp = connp;
508 coas.coa_ixa = connp->conn_ixa;
509 coas.coa_ipp = &connp->conn_xmit_ipp;
510 coas.coa_ancillary = B_FALSE;
511 coas.coa_changed = 0;
512
513 switch (optset_context) {
514 case SETFN_OPTCOM_CHECKONLY:
515 checkonly = B_TRUE;
516 /*
517 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
518 * inlen != 0 implies value supplied and
519 * we have to "pretend" to set it.
520 * inlen == 0 implies that there is no
521 * value part in T_CHECK request and just validation
522 * done elsewhere should be enough, we just return here.
523 */
524 if (inlen == 0) {
525 *outlenp = 0;
526 return (0);
527 }
528 break;
529 case SETFN_OPTCOM_NEGOTIATE:
530 checkonly = B_FALSE;
531 break;
532 case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
533 case SETFN_CONN_NEGOTIATE:
534 checkonly = B_FALSE;
535 /*
536 * Negotiating local and "association-related" options
537 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
538 * primitives is allowed by XTI, but we choose
539 * to not implement this style negotiation for Internet
540 * protocols (We interpret it is a must for OSI world but
541 * optional for Internet protocols) for all options.
542 * [ Will do only for the few options that enable test
543 * suites that our XTI implementation of this feature
544 * works for transports that do allow it ]
545 */
546 if (!tcp_allow_connopt_set(level, name)) {
547 *outlenp = 0;
548 return (EINVAL);
549 }
550 break;
551 default:
552 /*
553 * We should never get here
554 */
555 *outlenp = 0;
556 return (EINVAL);
557 }
558
559 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
560 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
561
562 /*
563 * For TCP, we should have no ancillary data sent down
564 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
565 * has to be zero.
566 */
567 ASSERT(thisdg_attrs == NULL);
568
569 /*
570 * For fixed length options, no sanity check
571 * of passed in length is done. It is assumed *_optcom_req()
572 * routines do the right thing.
573 */
574 switch (level) {
575 case SOL_SOCKET:
576 switch (name) {
577 case SO_KEEPALIVE:
578 if (checkonly) {
579 /* check only case */
580 break;
581 }
582
583 if (!onoff) {
584 if (connp->conn_keepalive) {
585 if (tcp->tcp_ka_tid != 0) {
586 (void) TCP_TIMER_CANCEL(tcp,
587 tcp->tcp_ka_tid);
588 tcp->tcp_ka_tid = 0;
589 }
590 connp->conn_keepalive = 0;
591 }
592 break;
593 }
594 if (!connp->conn_keepalive) {
595 /* Crank up the keepalive timer */
596 tcp->tcp_ka_last_intrvl = 0;
597 tcp->tcp_ka_tid = TCP_TIMER(tcp,
598 tcp_keepalive_timer, tcp->tcp_ka_interval);
599 connp->conn_keepalive = 1;
600 }
601 break;
602 case SO_SNDBUF: {
603 if (*i1 > tcps->tcps_max_buf) {
604 *outlenp = 0;
605 return (ENOBUFS);
606 }
607 if (checkonly)
608 break;
609
610 connp->conn_sndbuf = *i1;
611 if (tcps->tcps_snd_lowat_fraction != 0) {
612 connp->conn_sndlowat = connp->conn_sndbuf /
613 tcps->tcps_snd_lowat_fraction;
614 }
615 (void) tcp_maxpsz_set(tcp, B_TRUE);
616 /*
617 * If we are flow-controlled, recheck the condition.
618 * There are apps that increase SO_SNDBUF size when
619 * flow-controlled (EWOULDBLOCK), and expect the flow
620 * control condition to be lifted right away.
621 */
622 mutex_enter(&tcp->tcp_non_sq_lock);
623 if (tcp->tcp_flow_stopped &&
624 TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) {
625 tcp_clrqfull(tcp);
626 }
627 mutex_exit(&tcp->tcp_non_sq_lock);
628 *outlenp = inlen;
629 return (0);
630 }
631 case SO_RCVBUF:
632 if (*i1 > tcps->tcps_max_buf) {
633 *outlenp = 0;
634 return (ENOBUFS);
635 }
636 /* Silently ignore zero */
637 if (!checkonly && *i1 != 0) {
638 *i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
639 (void) tcp_rwnd_set(tcp, *i1);
640 }
641 /*
642 * XXX should we return the rwnd here
643 * and tcp_opt_get ?
644 */
645 *outlenp = inlen;
646 return (0);
647 case SO_SND_COPYAVOID:
648 if (!checkonly) {
649 if (tcp->tcp_loopback ||
650 (onoff != 1) || !tcp_zcopy_check(tcp)) {
651 *outlenp = 0;
652 return (EOPNOTSUPP);
653 }
654 tcp->tcp_snd_zcopy_aware = 1;
655 }
656 *outlenp = inlen;
657 return (0);
658 }
659 break;
660 case IPPROTO_TCP:
661 switch (name) {
662 case TCP_NODELAY:
663 if (!checkonly)
664 tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
665 break;
666 case TCP_NOTIFY_THRESHOLD:
667 if (!checkonly)
668 tcp->tcp_first_timer_threshold = *i1;
669 break;
670 case TCP_ABORT_THRESHOLD:
671 if (!checkonly)
672 tcp->tcp_second_timer_threshold = *i1;
673 break;
674 case TCP_CONN_NOTIFY_THRESHOLD:
675 if (!checkonly)
676 tcp->tcp_first_ctimer_threshold = *i1;
677 break;
678 case TCP_CONN_ABORT_THRESHOLD:
679 if (!checkonly)
680 tcp->tcp_second_ctimer_threshold = *i1;
681 break;
682 case TCP_RECVDSTADDR:
683 if (tcp->tcp_state > TCPS_LISTEN) {
684 *outlenp = 0;
685 return (EOPNOTSUPP);
686 }
687 /* Setting done in conn_opt_set */
688 break;
689 case TCP_INIT_CWND:
690 if (checkonly)
691 break;
692
693 /*
694 * Only allow socket with network configuration
695 * privilege to set the initial cwnd to be larger
696 * than allowed by RFC 3390.
697 */
698 if (val > MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
699 if ((reterr = secpolicy_ip_config(cr, B_TRUE))
700 != 0) {
701 *outlenp = 0;
702 return (reterr);
703 }
704 if (val > tcp_max_init_cwnd) {
705 *outlenp = 0;
706 return (EINVAL);
707 }
708 }
709
710 tcp->tcp_init_cwnd = val;
711
712 /*
713 * If the socket is connected, AND no outbound data
714 * has been sent, reset the actual cwnd values.
715 */
716 if (tcp->tcp_state == TCPS_ESTABLISHED &&
717 tcp->tcp_iss == tcp->tcp_snxt - 1) {
718 tcp->tcp_cwnd =
719 MIN(tcp->tcp_rwnd, val * tcp->tcp_mss);
720 }
721 break;
722
723 /*
724 * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD
725 * is in milliseconds. TCP_KEEPIDLE is introduced for
726 * compatibility with other Unix flavors.
727 * We can fall through TCP_KEEPALIVE_THRESHOLD logic after
728 * converting the input to milliseconds.
729 */
730 case TCP_KEEPIDLE:
731 *i1 *= 1000;
732 /* FALLTHRU */
733
734 case TCP_KEEPALIVE_THRESHOLD:
735 if (checkonly)
736 break;
737
738 if (*i1 < tcps->tcps_keepalive_interval_low ||
739 *i1 > tcps->tcps_keepalive_interval_high) {
740 *outlenp = 0;
741 return (EINVAL);
742 }
743 if (*i1 != tcp->tcp_ka_interval) {
744 tcp->tcp_ka_interval = *i1;
745 /*
746 * Check if we need to restart the
747 * keepalive timer.
748 */
749 if (tcp->tcp_ka_tid != 0) {
750 ASSERT(connp->conn_keepalive);
751 (void) TCP_TIMER_CANCEL(tcp,
752 tcp->tcp_ka_tid);
753 tcp->tcp_ka_last_intrvl = 0;
754 tcp->tcp_ka_tid = TCP_TIMER(tcp,
755 tcp_keepalive_timer,
756 tcp->tcp_ka_interval);
757 }
758 }
759 break;
760
761 /*
762 * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt.
763 * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the
764 * three members - tcp_ka_abort_thres, tcp_ka_rinterval and
765 * tcp_ka_cnt.
766 */
767 case TCP_KEEPCNT:
768 if (checkonly)
769 break;
770
771 if (*i1 == 0) {
772 return (EINVAL);
773 } else if (tcp->tcp_ka_rinterval == 0) {
774 /*
775 * When TCP_KEEPCNT is specified without first
776 * specifying a TCP_KEEPINTVL, we infer an
777 * interval based on a tunable specific to our
778 * stack: the tcp_keepalive_abort_interval.
779 * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in
780 * the unlikely event that that has been set.)
781 * Given the abort interval's default value of
782 * 480 seconds, low TCP_KEEPCNT values can
783 * result in intervals that exceed the default
784 * maximum RTO of 60 seconds. Rather than
785 * fail in these cases, we (implicitly) clamp
786 * the interval at the maximum RTO; if the
787 * TCP_KEEPCNT is shortly followed by a
788 * TCP_KEEPINTVL (as we expect), the abort
789 * threshold will be recalculated correctly --
790 * and if a TCP_KEEPINTVL is not forthcoming,
791 * keep-alive will at least operate reasonably
792 * given the underconfigured state.
793 */
794 uint32_t interval;
795
796 interval = tcp->tcp_ka_abort_thres / *i1;
797
798 if (interval < tcp->tcp_rto_min)
799 interval = tcp->tcp_rto_min;
800
801 if (interval > tcp->tcp_rto_max)
802 interval = tcp->tcp_rto_max;
803
804 tcp->tcp_ka_rinterval = interval;
805 } else {
806 if ((*i1 * tcp->tcp_ka_rinterval) <
807 tcps->tcps_keepalive_abort_interval_low ||
808 (*i1 * tcp->tcp_ka_rinterval) >
809 tcps->tcps_keepalive_abort_interval_high)
810 return (EINVAL);
811 tcp->tcp_ka_abort_thres =
812 (*i1 * tcp->tcp_ka_rinterval);
813 }
814 tcp->tcp_ka_cnt = *i1;
815 break;
816 case TCP_KEEPINTVL:
817 /*
818 * TCP_KEEPINTVL is specified in seconds, but
819 * tcp_ka_rinterval is in milliseconds.
820 */
821
822 if (checkonly)
823 break;
824
825 if ((*i1 * 1000) < tcp->tcp_rto_min ||
826 (*i1 * 1000) > tcp->tcp_rto_max)
827 return (EINVAL);
828
829 if (tcp->tcp_ka_cnt == 0) {
830 tcp->tcp_ka_cnt =
831 tcp->tcp_ka_abort_thres / (*i1 * 1000);
832 } else {
833 if ((*i1 * tcp->tcp_ka_cnt * 1000) <
834 tcps->tcps_keepalive_abort_interval_low ||
835 (*i1 * tcp->tcp_ka_cnt * 1000) >
836 tcps->tcps_keepalive_abort_interval_high)
837 return (EINVAL);
838 tcp->tcp_ka_abort_thres =
839 (*i1 * tcp->tcp_ka_cnt * 1000);
840 }
841 tcp->tcp_ka_rinterval = *i1 * 1000;
842 break;
843 case TCP_KEEPALIVE_ABORT_THRESHOLD:
844 if (!checkonly) {
845 if (*i1 <
846 tcps->tcps_keepalive_abort_interval_low ||
847 *i1 >
848 tcps->tcps_keepalive_abort_interval_high) {
849 *outlenp = 0;
850 return (EINVAL);
851 }
852 tcp->tcp_ka_abort_thres = *i1;
853 tcp->tcp_ka_cnt = 0;
854 tcp->tcp_ka_rinterval = 0;
855 }
856 break;
857 case TCP_CORK:
858 if (!checkonly) {
859 /*
860 * if tcp->tcp_cork was set and is now
861 * being unset, we have to make sure that
862 * the remaining data gets sent out. Also
863 * unset tcp->tcp_cork so that tcp_wput_data()
864 * can send data even if it is less than mss
865 */
866 if (tcp->tcp_cork && onoff == 0 &&
867 tcp->tcp_unsent > 0) {
868 tcp->tcp_cork = B_FALSE;
869 tcp_wput_data(tcp, NULL, B_FALSE);
870 }
871 tcp->tcp_cork = onoff;
872 }
873 break;
874 case TCP_RTO_INITIAL:
875 if (checkonly || val == 0)
876 break;
877
878 /*
879 * Sanity checks
880 *
881 * The initial RTO should be bounded by the minimum
882 * and maximum RTO. And it should also be smaller
883 * than the connect attempt abort timeout. Otherwise,
884 * the connection won't be aborted in a period
885 * reasonably close to that timeout.
886 */
887 if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max ||
888 val > tcp->tcp_second_ctimer_threshold ||
889 val < tcps->tcps_rexmit_interval_initial_low ||
890 val > tcps->tcps_rexmit_interval_initial_high) {
891 *outlenp = 0;
892 return (EINVAL);
893 }
894 tcp->tcp_rto_initial = val;
895
896 /*
897 * If TCP has not sent anything, need to re-calculate
898 * tcp_rto. Otherwise, this option change does not
899 * really affect anything.
900 */
901 if (tcp->tcp_state >= TCPS_SYN_SENT)
902 break;
903
904 tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2;
905 tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1;
906 tcp->tcp_rto = tcp_calculate_rto(tcp, tcps,
907 tcps->tcps_conn_grace_period);
908 break;
909 case TCP_RTO_MIN:
910 if (checkonly || val == 0)
911 break;
912
913 if (val < tcps->tcps_rexmit_interval_min_low ||
914 val > tcps->tcps_rexmit_interval_min_high ||
915 val > tcp->tcp_rto_max) {
916 *outlenp = 0;
917 return (EINVAL);
918 }
919 tcp->tcp_rto_min = val;
920 if (tcp->tcp_rto < val)
921 tcp->tcp_rto = val;
922 break;
923 case TCP_RTO_MAX:
924 if (checkonly || val == 0)
925 break;
926
927 /*
928 * Sanity checks
929 *
930 * The maximum RTO should not be larger than the
931 * connection abort timeout. Otherwise, the
932 * connection won't be aborted in a period reasonably
933 * close to that timeout.
934 */
935 if (val < tcps->tcps_rexmit_interval_max_low ||
936 val > tcps->tcps_rexmit_interval_max_high ||
937 val < tcp->tcp_rto_min ||
938 val > tcp->tcp_second_timer_threshold) {
939 *outlenp = 0;
940 return (EINVAL);
941 }
942 tcp->tcp_rto_max = val;
943 if (tcp->tcp_rto > val)
944 tcp->tcp_rto = val;
945 break;
946 case TCP_LINGER2:
947 if (checkonly || *i1 == 0)
948 break;
949
950 /*
951 * Note that the option value's unit is second. And
952 * the value should be bigger than the private
953 * parameter tcp_fin_wait_2_flush_interval's lower
954 * bound and smaller than the current value of that
955 * parameter. It should be smaller than the current
956 * value to avoid an app setting TCP_LINGER2 to a big
957 * value, causing resource to be held up too long in
958 * FIN-WAIT-2 state.
959 */
960 if (*i1 < 0 ||
961 tcps->tcps_fin_wait_2_flush_interval_low/SECONDS >
962 *i1 ||
963 tcps->tcps_fin_wait_2_flush_interval/SECONDS <
964 *i1) {
965 *outlenp = 0;
966 return (EINVAL);
967 }
968 tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS;
969 break;
970 default:
971 break;
972 }
973 break;
974 case IPPROTO_IP:
975 if (connp->conn_family != AF_INET) {
976 *outlenp = 0;
977 return (EINVAL);
978 }
979 switch (name) {
980 case IP_SEC_OPT:
981 /*
982 * We should not allow policy setting after
983 * we start listening for connections.
984 */
985 if (tcp->tcp_state == TCPS_LISTEN) {
986 return (EINVAL);
987 }
988 break;
989 }
990 break;
991 case IPPROTO_IPV6:
992 /*
993 * IPPROTO_IPV6 options are only supported for sockets
994 * that are using IPv6 on the wire.
995 */
996 if (connp->conn_ipversion != IPV6_VERSION) {
997 *outlenp = 0;
998 return (EINVAL);
999 }
1000
1001 switch (name) {
1002 case IPV6_RECVPKTINFO:
1003 if (!checkonly) {
1004 /* Force it to be sent up with the next msg */
1005 tcp->tcp_recvifindex = 0;
1006 }
1007 break;
1008 case IPV6_RECVTCLASS:
1009 if (!checkonly) {
1010 /* Force it to be sent up with the next msg */
1011 tcp->tcp_recvtclass = 0xffffffffU;
1012 }
1013 break;
1014 case IPV6_RECVHOPLIMIT:
1015 if (!checkonly) {
1016 /* Force it to be sent up with the next msg */
1017 tcp->tcp_recvhops = 0xffffffffU;
1018 }
1019 break;
1020 case IPV6_PKTINFO:
1021 /* This is an extra check for TCP */
1022 if (inlen == sizeof (struct in6_pktinfo)) {
1023 struct in6_pktinfo *pkti;
1024
1025 pkti = (struct in6_pktinfo *)invalp;
1026 /*
1027 * RFC 3542 states that ipi6_addr must be
1028 * the unspecified address when setting the
1029 * IPV6_PKTINFO sticky socket option on a
1030 * TCP socket.
1031 */
1032 if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
1033 return (EINVAL);
1034 }
1035 break;
1036 case IPV6_SEC_OPT:
1037 /*
1038 * We should not allow policy setting after
1039 * we start listening for connections.
1040 */
1041 if (tcp->tcp_state == TCPS_LISTEN) {
1042 return (EINVAL);
1043 }
1044 break;
1045 }
1046 break;
1047 }
1048 reterr = conn_opt_set(&coas, level, name, inlen, invalp,
1049 checkonly, cr);
1050 if (reterr != 0) {
1051 *outlenp = 0;
1052 return (reterr);
1053 }
1054
1055 /*
1056 * Common case of OK return with outval same as inval
1057 */
1058 if (invalp != outvalp) {
1059 /* don't trust bcopy for identical src/dst */
1060 (void) bcopy(invalp, outvalp, inlen);
1061 }
1062 *outlenp = inlen;
1063
1064 if (coas.coa_changed & COA_HEADER_CHANGED) {
1065 /* If we are connected we rebuilt the headers */
1066 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1067 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1068 reterr = tcp_build_hdrs(tcp);
1069 if (reterr != 0)
1070 return (reterr);
1071 }
1072 }
1073 if (coas.coa_changed & COA_ROUTE_CHANGED) {
1074 in6_addr_t nexthop;
1075
1076 /*
1077 * If we are connected we re-cache the information.
1078 * We ignore errors to preserve BSD behavior.
1079 * Note that we don't redo IPsec policy lookup here
1080 * since the final destination (or source) didn't change.
1081 */
1082 ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
1083 &connp->conn_faddr_v6, &nexthop);
1084
1085 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1086 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1087 (void) ip_attr_connect(connp, connp->conn_ixa,
1088 &connp->conn_laddr_v6, &connp->conn_faddr_v6,
1089 &nexthop, connp->conn_fport, NULL, NULL,
1090 IPDF_VERIFY_DST);
1091 }
1092 }
1093 if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
1094 connp->conn_wq->q_hiwat = connp->conn_sndbuf;
1095 }
1096 if (coas.coa_changed & COA_WROFF_CHANGED) {
1097 connp->conn_wroff = connp->conn_ht_iphc_allocated +
1098 tcps->tcps_wroff_xtra;
1099 (void) proto_set_tx_wroff(connp->conn_rq, connp,
1100 connp->conn_wroff);
1101 }
1102 if (coas.coa_changed & COA_OOBINLINE_CHANGED) {
1103 if (IPCL_IS_NONSTR(connp))
1104 proto_set_rx_oob_opt(connp, onoff);
1105 }
1106 return (0);
1107 }