1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
24 */
25
26 #include <sys/types.h>
27 #include <sys/stream.h>
28 #define _SUN_TPI_VERSION 2
29 #include <sys/tihdr.h>
30 #include <sys/socket.h>
31 #include <sys/xti_xtiopt.h>
32 #include <sys/xti_inet.h>
33 #include <sys/policy.h>
34
35 #include <inet/common.h>
36 #include <netinet/ip6.h>
37 #include <inet/ip.h>
38
39 #include <netinet/in.h>
40 #include <netinet/tcp.h>
41 #include <inet/optcom.h>
42 #include <inet/proto_set.h>
43 #include <inet/tcp_impl.h>
44
/*
 * Forward declaration: tcp_opt_default() is referenced by the tcp_opt_obj
 * initializer in this file before its definition appears.
 */
static int	tcp_opt_default(queue_t *, int, int, uchar_t *);
46
/*
 * Table of all known options handled on a TCP protocol stack.
 *
 * Note: This table contains options processed by both TCP and IP levels
 * and is the superset of options that can be performed on a TCP over IP
 * stack.
 *
 * Each entry is a positional opdes_t initializer.  As used below, the
 * first field is the option name, the second is the option level, the
 * seventh is the size of the option value and the eighth is a default
 * value, with -1 marking entries whose default is not initialized here.
 * The third through sixth fields are presumably the access allowed
 * without privilege, the access allowed with privilege, the privilege
 * required, and the property flags (OP_VARLEN, OP_NODEFAULT, OP_DEF_FN);
 * TODO confirm against the opdes_t definition in <inet/optcom.h>.
 *
 * Every entry flagged OP_DEF_FN below has a matching case in
 * tcp_opt_default().
 */
opdes_t	tcp_opt_arr[] = {

/* SOL_SOCKET level options */
{ SO_LINGER,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
	sizeof (struct linger), 0 },

{ SO_DEBUG,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_KEEPALIVE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_DONTROUTE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},
{ SO_BROADCAST,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_TYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
{ SO_SNDBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_RCVBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_SNDTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
	sizeof (struct timeval), 0 },
{ SO_RCVTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
	sizeof (struct timeval), 0 },
{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},
{ SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },
{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },
{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },
{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
	0 },
{ SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ SO_DOMAIN,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },

{ SO_PROTOTYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },

/* IPPROTO_TCP level options */
{ TCP_NODELAY,	IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},
{ TCP_MAXSEG,	IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t),
	536 },

{ TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },

{ TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0,
	sizeof (int), 0 },

{ TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},

{ TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0,
	sizeof (int), 0 },

{ TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },

{ TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },

{ TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },

{ TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },

{ TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },

{ TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

/* IPPROTO_IP and IPPROTO_IPV6 level options */
{ IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT),
	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
{ T_IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT),
	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },

{ IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ T_IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ IP_TTL,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
	sizeof (int), -1 /* not initialized */ },

{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
	sizeof (ipsec_req_t), -1 /* not initialized */ },

{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int),	0 /* no ifindex */ },

{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
	sizeof (int), 0 },

{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
	sizeof (int), -1 /* not initialized */ },

{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int),	0 /* no ifindex */ },

{ IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
	sizeof (in_addr_t),	-1 /* not initialized  */ },

{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
	sizeof (int), 0 },

{ IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_NODEFAULT|OP_VARLEN),
	sizeof (struct in6_pktinfo), -1 /* not initialized */ },
{ IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	OP_NODEFAULT,
	sizeof (sin6_t), -1 /* not initialized */ },
{ IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	OP_NODEFAULT,
	sizeof (int), -1 /* not initialized */ },
{ IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	OP_NODEFAULT,
	sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },

/* Enable receipt of ancillary data */
{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },

{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
	sizeof (ipsec_req_t), -1 /* not initialized */ },
{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
};
231
/*
 * Table of all supported levels
 *	Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have
 *	any supported options so we need this info separately.
 *
 *	This is needed only for topmost tpi providers and is used only by
 *	XTI interfaces.
 *
 *	The table is handed to the option framework through tcp_opt_obj,
 *	together with its entry count (TCP_VALID_LEVELS_CNT).
 */
optlevel_t	tcp_valid_levels_arr[] = {
	XTI_GENERIC,
	SOL_SOCKET,
	IPPROTO_TCP,
	IPPROTO_IP,
	IPPROTO_IPV6
};
247
248
/*
 * Entry counts of tcp_opt_arr and tcp_valid_levels_arr, computed with
 * A_CNT() so they track the tables automatically.
 */
#define	TCP_OPT_ARR_CNT		A_CNT(tcp_opt_arr)
#define	TCP_VALID_LEVELS_CNT	A_CNT(tcp_valid_levels_arr)

/* Not assigned anywhere in this file. */
uint_t	tcp_max_optsize; /* initialized when TCP driver is loaded */
253
/*
 * Initialize option database object for TCP
 *
 * This object represents database of options to search passed to
 * {sock,tpi}optcom_req() interface routine to take care of option
 * management and associated methods.
 *
 * The initializer is positional; it bundles the default/get/set handlers
 * with the option table and the valid-level table, each table being
 * accompanied by its entry count.
 */

optdb_obj_t tcp_opt_obj = {
	tcp_opt_default,	/* TCP default value function pointer */
	tcp_tpi_opt_get,	/* TCP get function pointer */
	tcp_tpi_opt_set,	/* TCP set function pointer */
	TCP_OPT_ARR_CNT,	/* TCP option database count of entries */
	tcp_opt_arr,		/* TCP option database */
	TCP_VALID_LEVELS_CNT,	/* TCP valid level count of entries */
	tcp_valid_levels_arr	/* TCP valid level array */
};
271
/*
 * Upper bound that tcp_opt_set() enforces on a TCP_INIT_CWND value once the
 * caller has passed the secpolicy_ip_config() check; larger requests fail
 * with EINVAL.
 */
static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND;
273
274 /*
275 * Some TCP options can be "set" by requesting them in the option
276 * buffer. This is needed for XTI feature test though we do not
277 * allow it in general. We interpret that this mechanism is more
278 * applicable to OSI protocols and need not be allowed in general.
279 * This routine filters out options for which it is not allowed (most)
280 * and lets through those (few) for which it is. [ The XTI interface
281 * test suite specifics will imply that any XTI_GENERIC level XTI_* if
282 * ever implemented will have to be allowed here ].
283 */
284 static boolean_t
285 tcp_allow_connopt_set(int level, int name)
286 {
287
288 switch (level) {
289 case IPPROTO_TCP:
290 switch (name) {
291 case TCP_NODELAY:
292 return (B_TRUE);
293 default:
294 return (B_FALSE);
295 }
296 /*NOTREACHED*/
297 default:
298 return (B_FALSE);
299 }
300 /*NOTREACHED*/
301 }
302
303 /*
304 * This routine gets default values of certain options whose default
305 * values are maintained by protocol specific code
306 */
307 /* ARGSUSED */
308 static int
309 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
310 {
311 int32_t *i1 = (int32_t *)ptr;
312 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps;
313
314 switch (level) {
315 case IPPROTO_TCP:
316 switch (name) {
317 case TCP_NOTIFY_THRESHOLD:
318 *i1 = tcps->tcps_ip_notify_interval;
319 break;
320 case TCP_ABORT_THRESHOLD:
321 *i1 = tcps->tcps_ip_abort_interval;
322 break;
323 case TCP_CONN_NOTIFY_THRESHOLD:
324 *i1 = tcps->tcps_ip_notify_cinterval;
325 break;
326 case TCP_CONN_ABORT_THRESHOLD:
327 *i1 = tcps->tcps_ip_abort_cinterval;
328 break;
329 default:
330 return (-1);
331 }
332 break;
333 case IPPROTO_IP:
334 switch (name) {
335 case IP_TTL:
336 *i1 = tcps->tcps_ipv4_ttl;
337 break;
338 default:
339 return (-1);
340 }
341 break;
342 case IPPROTO_IPV6:
343 switch (name) {
344 case IPV6_UNICAST_HOPS:
345 *i1 = tcps->tcps_ipv6_hoplimit;
346 break;
347 default:
348 return (-1);
349 }
350 break;
351 default:
352 return (-1);
353 }
354 return (sizeof (int));
355 }
356
357 /*
358 * TCP routine to get the values of options.
359 */
360 int
361 tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
362 {
363 int *i1 = (int *)ptr;
364 tcp_t *tcp = connp->conn_tcp;
365 conn_opt_arg_t coas;
366 int retval;
367
368 coas.coa_connp = connp;
369 coas.coa_ixa = connp->conn_ixa;
370 coas.coa_ipp = &connp->conn_xmit_ipp;
371 coas.coa_ancillary = B_FALSE;
372 coas.coa_changed = 0;
373
374 switch (level) {
375 case SOL_SOCKET:
376 switch (name) {
377 case SO_SND_COPYAVOID:
378 *i1 = tcp->tcp_snd_zcopy_on ?
379 SO_SND_COPYAVOID : 0;
380 return (sizeof (int));
381 case SO_ACCEPTCONN:
382 *i1 = (tcp->tcp_state == TCPS_LISTEN);
383 return (sizeof (int));
384 }
385 break;
386 case IPPROTO_TCP:
387 switch (name) {
388 case TCP_NODELAY:
389 *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
390 return (sizeof (int));
391 case TCP_MAXSEG:
392 *i1 = tcp->tcp_mss;
393 return (sizeof (int));
394 case TCP_NOTIFY_THRESHOLD:
395 *i1 = (int)tcp->tcp_first_timer_threshold;
396 return (sizeof (int));
397 case TCP_ABORT_THRESHOLD:
398 *i1 = tcp->tcp_second_timer_threshold;
399 return (sizeof (int));
400 case TCP_CONN_NOTIFY_THRESHOLD:
401 *i1 = tcp->tcp_first_ctimer_threshold;
402 return (sizeof (int));
403 case TCP_CONN_ABORT_THRESHOLD:
404 *i1 = tcp->tcp_second_ctimer_threshold;
405 return (sizeof (int));
406 case TCP_INIT_CWND:
407 *i1 = tcp->tcp_init_cwnd;
408 return (sizeof (int));
409 case TCP_KEEPALIVE_THRESHOLD:
410 *i1 = tcp->tcp_ka_interval;
411 return (sizeof (int));
412
413 /*
414 * TCP_KEEPIDLE expects value in seconds, but
415 * tcp_ka_interval is in milliseconds.
416 */
417 case TCP_KEEPIDLE:
418 *i1 = tcp->tcp_ka_interval / 1000;
419 return (sizeof (int));
420 case TCP_KEEPCNT:
421 *i1 = tcp->tcp_ka_cnt;
422 return (sizeof (int));
423
424 /*
425 * TCP_KEEPINTVL expects value in seconds, but
426 * tcp_ka_rinterval is in milliseconds.
427 */
428 case TCP_KEEPINTVL:
429 *i1 = tcp->tcp_ka_rinterval / 1000;
430 return (sizeof (int));
431 case TCP_KEEPALIVE_ABORT_THRESHOLD:
432 *i1 = tcp->tcp_ka_abort_thres;
433 return (sizeof (int));
434 case TCP_CORK:
435 *i1 = tcp->tcp_cork;
436 return (sizeof (int));
437 case TCP_RTO_INITIAL:
438 *i1 = tcp->tcp_rto_initial;
439 return (sizeof (uint32_t));
440 case TCP_RTO_MIN:
441 *i1 = tcp->tcp_rto_min;
442 return (sizeof (uint32_t));
443 case TCP_RTO_MAX:
444 *i1 = tcp->tcp_rto_max;
445 return (sizeof (uint32_t));
446 case TCP_LINGER2:
447 *i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS;
448 return (sizeof (int));
449 }
450 break;
451 case IPPROTO_IP:
452 if (connp->conn_family != AF_INET)
453 return (-1);
454 switch (name) {
455 case IP_OPTIONS:
456 case T_IP_OPTIONS:
457 /* Caller ensures enough space */
458 return (ip_opt_get_user(connp, ptr));
459 default:
460 break;
461 }
462 break;
463
464 case IPPROTO_IPV6:
465 /*
466 * IPPROTO_IPV6 options are only supported for sockets
467 * that are using IPv6 on the wire.
468 */
469 if (connp->conn_ipversion != IPV6_VERSION) {
470 return (-1);
471 }
472 switch (name) {
473 case IPV6_PATHMTU:
474 if (tcp->tcp_state < TCPS_ESTABLISHED)
475 return (-1);
476 break;
477 }
478 break;
479 }
480 mutex_enter(&connp->conn_lock);
481 retval = conn_opt_get(&coas, level, name, ptr);
482 mutex_exit(&connp->conn_lock);
483 return (retval);
484 }
485
486 /*
487 * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
488 * Parameters are assumed to be verified by the caller.
489 */
490 /* ARGSUSED */
491 int
492 tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
493 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
494 void *thisdg_attrs, cred_t *cr)
495 {
496 tcp_t *tcp = connp->conn_tcp;
497 int *i1 = (int *)invalp;
498 boolean_t onoff = (*i1 == 0) ? 0 : 1;
499 boolean_t checkonly;
500 int reterr;
501 tcp_stack_t *tcps = tcp->tcp_tcps;
502 conn_opt_arg_t coas;
503 uint32_t val = *((uint32_t *)invalp);
504
505 coas.coa_connp = connp;
506 coas.coa_ixa = connp->conn_ixa;
507 coas.coa_ipp = &connp->conn_xmit_ipp;
508 coas.coa_ancillary = B_FALSE;
509 coas.coa_changed = 0;
510
511 switch (optset_context) {
512 case SETFN_OPTCOM_CHECKONLY:
513 checkonly = B_TRUE;
514 /*
515 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
516 * inlen != 0 implies value supplied and
517 * we have to "pretend" to set it.
518 * inlen == 0 implies that there is no
519 * value part in T_CHECK request and just validation
520 * done elsewhere should be enough, we just return here.
521 */
522 if (inlen == 0) {
523 *outlenp = 0;
524 return (0);
525 }
526 break;
527 case SETFN_OPTCOM_NEGOTIATE:
528 checkonly = B_FALSE;
529 break;
530 case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
531 case SETFN_CONN_NEGOTIATE:
532 checkonly = B_FALSE;
533 /*
534 * Negotiating local and "association-related" options
535 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
536 * primitives is allowed by XTI, but we choose
537 * to not implement this style negotiation for Internet
538 * protocols (We interpret it is a must for OSI world but
539 * optional for Internet protocols) for all options.
540 * [ Will do only for the few options that enable test
541 * suites that our XTI implementation of this feature
542 * works for transports that do allow it ]
543 */
544 if (!tcp_allow_connopt_set(level, name)) {
545 *outlenp = 0;
546 return (EINVAL);
547 }
548 break;
549 default:
550 /*
551 * We should never get here
552 */
553 *outlenp = 0;
554 return (EINVAL);
555 }
556
557 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
558 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
559
560 /*
561 * For TCP, we should have no ancillary data sent down
562 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
563 * has to be zero.
564 */
565 ASSERT(thisdg_attrs == NULL);
566
567 /*
568 * For fixed length options, no sanity check
569 * of passed in length is done. It is assumed *_optcom_req()
570 * routines do the right thing.
571 */
572 switch (level) {
573 case SOL_SOCKET:
574 switch (name) {
575 case SO_KEEPALIVE:
576 if (checkonly) {
577 /* check only case */
578 break;
579 }
580
581 if (!onoff) {
582 if (connp->conn_keepalive) {
583 if (tcp->tcp_ka_tid != 0) {
584 (void) TCP_TIMER_CANCEL(tcp,
585 tcp->tcp_ka_tid);
586 tcp->tcp_ka_tid = 0;
587 }
588 connp->conn_keepalive = 0;
589 }
590 break;
591 }
592 if (!connp->conn_keepalive) {
593 /* Crank up the keepalive timer */
594 tcp->tcp_ka_last_intrvl = 0;
595 tcp->tcp_ka_tid = TCP_TIMER(tcp,
596 tcp_keepalive_timer, tcp->tcp_ka_interval);
597 connp->conn_keepalive = 1;
598 }
599 break;
600 case SO_SNDBUF: {
601 if (*i1 > tcps->tcps_max_buf) {
602 *outlenp = 0;
603 return (ENOBUFS);
604 }
605 if (checkonly)
606 break;
607
608 connp->conn_sndbuf = *i1;
609 if (tcps->tcps_snd_lowat_fraction != 0) {
610 connp->conn_sndlowat = connp->conn_sndbuf /
611 tcps->tcps_snd_lowat_fraction;
612 }
613 (void) tcp_maxpsz_set(tcp, B_TRUE);
614 /*
615 * If we are flow-controlled, recheck the condition.
616 * There are apps that increase SO_SNDBUF size when
617 * flow-controlled (EWOULDBLOCK), and expect the flow
618 * control condition to be lifted right away.
619 */
620 mutex_enter(&tcp->tcp_non_sq_lock);
621 if (tcp->tcp_flow_stopped &&
622 TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) {
623 tcp_clrqfull(tcp);
624 }
625 mutex_exit(&tcp->tcp_non_sq_lock);
626 *outlenp = inlen;
627 return (0);
628 }
629 case SO_RCVBUF:
630 if (*i1 > tcps->tcps_max_buf) {
631 *outlenp = 0;
632 return (ENOBUFS);
633 }
634 /* Silently ignore zero */
635 if (!checkonly && *i1 != 0) {
636 *i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
637 (void) tcp_rwnd_set(tcp, *i1);
638 }
639 /*
640 * XXX should we return the rwnd here
641 * and tcp_opt_get ?
642 */
643 *outlenp = inlen;
644 return (0);
645 case SO_SND_COPYAVOID:
646 if (!checkonly) {
647 if (tcp->tcp_loopback ||
648 (onoff != 1) || !tcp_zcopy_check(tcp)) {
649 *outlenp = 0;
650 return (EOPNOTSUPP);
651 }
652 tcp->tcp_snd_zcopy_aware = 1;
653 }
654 *outlenp = inlen;
655 return (0);
656 }
657 break;
658 case IPPROTO_TCP:
659 switch (name) {
660 case TCP_NODELAY:
661 if (!checkonly)
662 tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
663 break;
664 case TCP_NOTIFY_THRESHOLD:
665 if (!checkonly)
666 tcp->tcp_first_timer_threshold = *i1;
667 break;
668 case TCP_ABORT_THRESHOLD:
669 if (!checkonly)
670 tcp->tcp_second_timer_threshold = *i1;
671 break;
672 case TCP_CONN_NOTIFY_THRESHOLD:
673 if (!checkonly)
674 tcp->tcp_first_ctimer_threshold = *i1;
675 break;
676 case TCP_CONN_ABORT_THRESHOLD:
677 if (!checkonly)
678 tcp->tcp_second_ctimer_threshold = *i1;
679 break;
680 case TCP_RECVDSTADDR:
681 if (tcp->tcp_state > TCPS_LISTEN) {
682 *outlenp = 0;
683 return (EOPNOTSUPP);
684 }
685 /* Setting done in conn_opt_set */
686 break;
687 case TCP_INIT_CWND:
688 if (checkonly)
689 break;
690
691 /*
692 * Only allow socket with network configuration
693 * privilege to set the initial cwnd to be larger
694 * than allowed by RFC 3390.
695 */
696 if (val > MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
697 if ((reterr = secpolicy_ip_config(cr, B_TRUE))
698 != 0) {
699 *outlenp = 0;
700 return (reterr);
701 }
702 if (val > tcp_max_init_cwnd) {
703 *outlenp = 0;
704 return (EINVAL);
705 }
706 }
707
708 tcp->tcp_init_cwnd = val;
709
710 /*
711 * If the socket is connected, AND no outbound data
712 * has been sent, reset the actual cwnd values.
713 */
714 if (tcp->tcp_state == TCPS_ESTABLISHED &&
715 tcp->tcp_iss == tcp->tcp_snxt - 1) {
716 tcp->tcp_cwnd =
717 MIN(tcp->tcp_rwnd, val * tcp->tcp_mss);
718 }
719 break;
720
721 /*
722 * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD
723 * is in milliseconds. TCP_KEEPIDLE is introduced for
724 * compatibility with other Unix flavors.
725 * We can fall through TCP_KEEPALIVE_THRESHOLD logic after
726 * converting the input to milliseconds.
727 */
728 case TCP_KEEPIDLE:
729 *i1 *= 1000;
730 /* FALLTHRU */
731
732 case TCP_KEEPALIVE_THRESHOLD:
733 if (checkonly)
734 break;
735
736 if (*i1 < tcps->tcps_keepalive_interval_low ||
737 *i1 > tcps->tcps_keepalive_interval_high) {
738 *outlenp = 0;
739 return (EINVAL);
740 }
741 if (*i1 != tcp->tcp_ka_interval) {
742 tcp->tcp_ka_interval = *i1;
743 /*
744 * Check if we need to restart the
745 * keepalive timer.
746 */
747 if (tcp->tcp_ka_tid != 0) {
748 ASSERT(connp->conn_keepalive);
749 (void) TCP_TIMER_CANCEL(tcp,
750 tcp->tcp_ka_tid);
751 tcp->tcp_ka_last_intrvl = 0;
752 tcp->tcp_ka_tid = TCP_TIMER(tcp,
753 tcp_keepalive_timer,
754 tcp->tcp_ka_interval);
755 }
756 }
757 break;
758
759 /*
760 * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt.
761 * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the
762 * three members - tcp_ka_abort_thres, tcp_ka_rinterval and
763 * tcp_ka_cnt.
764 */
765 case TCP_KEEPCNT:
766 if (checkonly)
767 break;
768
769 if (*i1 == 0) {
770 return (EINVAL);
771 } else if (tcp->tcp_ka_rinterval == 0) {
772 /*
773 * When TCP_KEEPCNT is specified without first
774 * specifying a TCP_KEEPINTVL, we infer an
775 * interval based on a tunable specific to our
776 * stack: the tcp_keepalive_abort_interval.
777 * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in
778 * the unlikely event that that has been set.)
779 * Given the abort interval's default value of
780 * 480 seconds, low TCP_KEEPCNT values can
781 * result in intervals that exceed the default
782 * maximum RTO of 60 seconds. Rather than
783 * fail in these cases, we (implicitly) clamp
784 * the interval at the maximum RTO; if the
785 * TCP_KEEPCNT is shortly followed by a
786 * TCP_KEEPINTVL (as we expect), the abort
787 * threshold will be recalculated correctly --
788 * and if a TCP_KEEPINTVL is not forthcoming,
789 * keep-alive will at least operate reasonably
790 * given the underconfigured state.
791 */
792 uint32_t interval;
793
794 interval = tcp->tcp_ka_abort_thres / *i1;
795
796 if (interval < tcp->tcp_rto_min)
797 interval = tcp->tcp_rto_min;
798
799 if (interval > tcp->tcp_rto_max)
800 interval = tcp->tcp_rto_max;
801
802 tcp->tcp_ka_rinterval = interval;
803 } else {
804 if ((*i1 * tcp->tcp_ka_rinterval) <
805 tcps->tcps_keepalive_abort_interval_low ||
806 (*i1 * tcp->tcp_ka_rinterval) >
807 tcps->tcps_keepalive_abort_interval_high)
808 return (EINVAL);
809 tcp->tcp_ka_abort_thres =
810 (*i1 * tcp->tcp_ka_rinterval);
811 }
812 tcp->tcp_ka_cnt = *i1;
813 break;
814 case TCP_KEEPINTVL:
815 /*
816 * TCP_KEEPINTVL is specified in seconds, but
817 * tcp_ka_rinterval is in milliseconds.
818 */
819
820 if (checkonly)
821 break;
822
823 if ((*i1 * 1000) < tcp->tcp_rto_min ||
824 (*i1 * 1000) > tcp->tcp_rto_max)
825 return (EINVAL);
826
827 if (tcp->tcp_ka_cnt == 0) {
828 tcp->tcp_ka_cnt =
829 tcp->tcp_ka_abort_thres / (*i1 * 1000);
830 } else {
831 if ((*i1 * tcp->tcp_ka_cnt * 1000) <
832 tcps->tcps_keepalive_abort_interval_low ||
833 (*i1 * tcp->tcp_ka_cnt * 1000) >
834 tcps->tcps_keepalive_abort_interval_high)
835 return (EINVAL);
836 tcp->tcp_ka_abort_thres =
837 (*i1 * tcp->tcp_ka_cnt * 1000);
838 }
839 tcp->tcp_ka_rinterval = *i1 * 1000;
840 break;
841 case TCP_KEEPALIVE_ABORT_THRESHOLD:
842 if (!checkonly) {
843 if (*i1 <
844 tcps->tcps_keepalive_abort_interval_low ||
845 *i1 >
846 tcps->tcps_keepalive_abort_interval_high) {
847 *outlenp = 0;
848 return (EINVAL);
849 }
850 tcp->tcp_ka_abort_thres = *i1;
851 tcp->tcp_ka_cnt = 0;
852 tcp->tcp_ka_rinterval = 0;
853 }
854 break;
855 case TCP_CORK:
856 if (!checkonly) {
857 /*
858 * if tcp->tcp_cork was set and is now
859 * being unset, we have to make sure that
860 * the remaining data gets sent out. Also
861 * unset tcp->tcp_cork so that tcp_wput_data()
862 * can send data even if it is less than mss
863 */
864 if (tcp->tcp_cork && onoff == 0 &&
865 tcp->tcp_unsent > 0) {
866 tcp->tcp_cork = B_FALSE;
867 tcp_wput_data(tcp, NULL, B_FALSE);
868 }
869 tcp->tcp_cork = onoff;
870 }
871 break;
872 case TCP_RTO_INITIAL: {
873 clock_t rto;
874
875 if (checkonly || val == 0)
876 break;
877
878 /*
879 * Sanity checks
880 *
881 * The initial RTO should be bounded by the minimum
882 * and maximum RTO. And it should also be smaller
883 * than the connect attempt abort timeout. Otherwise,
884 * the connection won't be aborted in a period
885 * reasonably close to that timeout.
886 */
887 if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max ||
888 val > tcp->tcp_second_ctimer_threshold ||
889 val < tcps->tcps_rexmit_interval_initial_low ||
890 val > tcps->tcps_rexmit_interval_initial_high) {
891 *outlenp = 0;
892 return (EINVAL);
893 }
894 tcp->tcp_rto_initial = val;
895
896 /*
897 * If TCP has not sent anything, need to re-calculate
898 * tcp_rto. Otherwise, this option change does not
899 * really affect anything.
900 */
901 if (tcp->tcp_state >= TCPS_SYN_SENT)
902 break;
903
904 tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2;
905 tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1;
906 rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
907 tcps->tcps_rexmit_interval_extra +
908 (tcp->tcp_rtt_sa >> 5) +
909 tcps->tcps_conn_grace_period;
910 TCP_SET_RTO(tcp, rto);
911 break;
912 }
913 case TCP_RTO_MIN:
914 if (checkonly || val == 0)
915 break;
916
917 if (val < tcps->tcps_rexmit_interval_min_low ||
918 val > tcps->tcps_rexmit_interval_min_high ||
919 val > tcp->tcp_rto_max) {
920 *outlenp = 0;
921 return (EINVAL);
922 }
923 tcp->tcp_rto_min = val;
924 if (tcp->tcp_rto < val)
925 tcp->tcp_rto = val;
926 break;
927 case TCP_RTO_MAX:
928 if (checkonly || val == 0)
929 break;
930
931 /*
932 * Sanity checks
933 *
934 * The maximum RTO should not be larger than the
935 * connection abort timeout. Otherwise, the
936 * connection won't be aborted in a period reasonably
937 * close to that timeout.
938 */
939 if (val < tcps->tcps_rexmit_interval_max_low ||
940 val > tcps->tcps_rexmit_interval_max_high ||
941 val < tcp->tcp_rto_min ||
942 val > tcp->tcp_second_timer_threshold) {
943 *outlenp = 0;
944 return (EINVAL);
945 }
946 tcp->tcp_rto_max = val;
947 if (tcp->tcp_rto > val)
948 tcp->tcp_rto = val;
949 break;
950 case TCP_LINGER2:
951 if (checkonly || *i1 == 0)
952 break;
953
954 /*
955 * Note that the option value's unit is second. And
956 * the value should be bigger than the private
957 * parameter tcp_fin_wait_2_flush_interval's lower
958 * bound and smaller than the current value of that
959 * parameter. It should be smaller than the current
960 * value to avoid an app setting TCP_LINGER2 to a big
961 * value, causing resource to be held up too long in
962 * FIN-WAIT-2 state.
963 */
964 if (*i1 < 0 ||
965 tcps->tcps_fin_wait_2_flush_interval_low/SECONDS >
966 *i1 ||
967 tcps->tcps_fin_wait_2_flush_interval/SECONDS <
968 *i1) {
969 *outlenp = 0;
970 return (EINVAL);
971 }
972 tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS;
973 break;
974 default:
975 break;
976 }
977 break;
978 case IPPROTO_IP:
979 if (connp->conn_family != AF_INET) {
980 *outlenp = 0;
981 return (EINVAL);
982 }
983 switch (name) {
984 case IP_SEC_OPT:
985 /*
986 * We should not allow policy setting after
987 * we start listening for connections.
988 */
989 if (tcp->tcp_state == TCPS_LISTEN) {
990 return (EINVAL);
991 }
992 break;
993 }
994 break;
995 case IPPROTO_IPV6:
996 /*
997 * IPPROTO_IPV6 options are only supported for sockets
998 * that are using IPv6 on the wire.
999 */
1000 if (connp->conn_ipversion != IPV6_VERSION) {
1001 *outlenp = 0;
1002 return (EINVAL);
1003 }
1004
1005 switch (name) {
1006 case IPV6_RECVPKTINFO:
1007 if (!checkonly) {
1008 /* Force it to be sent up with the next msg */
1009 tcp->tcp_recvifindex = 0;
1010 }
1011 break;
1012 case IPV6_RECVTCLASS:
1013 if (!checkonly) {
1014 /* Force it to be sent up with the next msg */
1015 tcp->tcp_recvtclass = 0xffffffffU;
1016 }
1017 break;
1018 case IPV6_RECVHOPLIMIT:
1019 if (!checkonly) {
1020 /* Force it to be sent up with the next msg */
1021 tcp->tcp_recvhops = 0xffffffffU;
1022 }
1023 break;
1024 case IPV6_PKTINFO:
1025 /* This is an extra check for TCP */
1026 if (inlen == sizeof (struct in6_pktinfo)) {
1027 struct in6_pktinfo *pkti;
1028
1029 pkti = (struct in6_pktinfo *)invalp;
1030 /*
1031 * RFC 3542 states that ipi6_addr must be
1032 * the unspecified address when setting the
1033 * IPV6_PKTINFO sticky socket option on a
1034 * TCP socket.
1035 */
1036 if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
1037 return (EINVAL);
1038 }
1039 break;
1040 case IPV6_SEC_OPT:
1041 /*
1042 * We should not allow policy setting after
1043 * we start listening for connections.
1044 */
1045 if (tcp->tcp_state == TCPS_LISTEN) {
1046 return (EINVAL);
1047 }
1048 break;
1049 }
1050 break;
1051 }
1052 reterr = conn_opt_set(&coas, level, name, inlen, invalp,
1053 checkonly, cr);
1054 if (reterr != 0) {
1055 *outlenp = 0;
1056 return (reterr);
1057 }
1058
1059 /*
1060 * Common case of OK return with outval same as inval
1061 */
1062 if (invalp != outvalp) {
1063 /* don't trust bcopy for identical src/dst */
1064 (void) bcopy(invalp, outvalp, inlen);
1065 }
1066 *outlenp = inlen;
1067
1068 if (coas.coa_changed & COA_HEADER_CHANGED) {
1069 /* If we are connected we rebuilt the headers */
1070 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1071 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1072 reterr = tcp_build_hdrs(tcp);
1073 if (reterr != 0)
1074 return (reterr);
1075 }
1076 }
1077 if (coas.coa_changed & COA_ROUTE_CHANGED) {
1078 in6_addr_t nexthop;
1079
1080 /*
1081 * If we are connected we re-cache the information.
1082 * We ignore errors to preserve BSD behavior.
1083 * Note that we don't redo IPsec policy lookup here
1084 * since the final destination (or source) didn't change.
1085 */
1086 ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
1087 &connp->conn_faddr_v6, &nexthop);
1088
1089 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1090 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1091 (void) ip_attr_connect(connp, connp->conn_ixa,
1092 &connp->conn_laddr_v6, &connp->conn_faddr_v6,
1093 &nexthop, connp->conn_fport, NULL, NULL,
1094 IPDF_VERIFY_DST);
1095 }
1096 }
1097 if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
1098 connp->conn_wq->q_hiwat = connp->conn_sndbuf;
1099 }
1100 if (coas.coa_changed & COA_WROFF_CHANGED) {
1101 connp->conn_wroff = connp->conn_ht_iphc_allocated +
1102 tcps->tcps_wroff_xtra;
1103 (void) proto_set_tx_wroff(connp->conn_rq, connp,
1104 connp->conn_wroff);
1105 }
1106 if (coas.coa_changed & COA_OOBINLINE_CHANGED) {
1107 if (IPCL_IS_NONSTR(connp))
1108 proto_set_rx_oob_opt(connp, onoff);
1109 }
1110 return (0);
1111 }