1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
24 */
25
26 #include <sys/types.h>
27 #include <sys/stream.h>
28 #define _SUN_TPI_VERSION 2
29 #include <sys/tihdr.h>
30 #include <sys/socket.h>
31 #include <sys/xti_xtiopt.h>
32 #include <sys/xti_inet.h>
33 #include <sys/policy.h>
34
35 #include <inet/common.h>
36 #include <netinet/ip6.h>
37 #include <inet/ip.h>
38
39 #include <netinet/in.h>
40 #include <netinet/tcp.h>
41 #include <inet/optcom.h>
42 #include <inet/proto_set.h>
43 #include <inet/tcp_impl.h>
44
/*
 * Table of all known options handled on a TCP protocol stack.
 *
 * Note: This table contains options processed by both TCP and IP levels
 * and is the superset of options that can be performed on a TCP over IP
 * stack.
 *
 * Every initializer begins with the option name followed by the level it
 * belongs to.  The remaining fields follow the opdes_t layout, which is
 * declared outside this file (presumably access bits, required privilege,
 * property flags, option size and default value, in that order -- TODO
 * confirm against the opdes_t definition).  Entries flagged OP_DEF_FN are
 * the ones tcp_opt_default() knows how to answer.
 */
opdes_t	tcp_opt_arr[] = {

/* SOL_SOCKET level options */
{ SO_LINGER, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
	sizeof (struct linger), 0 },

{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_KEEPALIVE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},
{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
	sizeof (struct timeval), 0 },
{ SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
	sizeof (struct timeval), 0 },
{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},
{ SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },
{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },
{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },
{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
	0 },
{ SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },

{ SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },

/* IPPROTO_TCP level options */
{ TCP_NODELAY, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},
{ TCP_MAXSEG, IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t),
	536 },

{ TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },

{ TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0,
	sizeof (int), 0 },

{ TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},

{ TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0,
	sizeof (int), 0 },

{ TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },

{ TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },

{ TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },

{ TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },

{ TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },

{ TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

/* IPPROTO_IP and IPPROTO_IPV6 level options (the two are interleaved) */
{ IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT),
	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
{ T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT),
	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },

{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
	sizeof (int), -1 /* not initialized */ },

{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
	sizeof (ipsec_req_t), -1 /* not initialized */ },

{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 /* no ifindex */ },

{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
	sizeof (int), 0 },

{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
	sizeof (int), -1 /* not initialized */ },

{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 /* no ifindex */ },

{ IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
	sizeof (in_addr_t), -1 /* not initialized */ },

{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
	sizeof (int), 0 },

{ IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_NODEFAULT|OP_VARLEN),
	sizeof (struct in6_pktinfo), -1 /* not initialized */ },
{ IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	OP_NODEFAULT,
	sizeof (sin6_t), -1 /* not initialized */ },
{ IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	OP_NODEFAULT,
	sizeof (int), -1 /* not initialized */ },
{ IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	OP_NODEFAULT,
	sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },

/* Enable receipt of ancillary data */
{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },

{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
	sizeof (ipsec_req_t), -1 /* not initialized */ },
{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
};
229
/*
 * Table of all supported levels.
 *
 * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have
 * any supported options, so we need this info separately from
 * tcp_opt_arr (which indeed has no XTI_GENERIC entries).
 *
 * This is needed only for topmost TPI providers and is used only by
 * XTI interfaces.
 */
optlevel_t	tcp_valid_levels_arr[] = {
	XTI_GENERIC,
	SOL_SOCKET,
	IPPROTO_TCP,
	IPPROTO_IP,
	IPPROTO_IPV6
};
245
246
/*
 * Entry counts of the two tables above, as computed by A_CNT() (defined
 * outside this file; presumably the usual array-element-count macro).
 * Both are stored in tcp_opt_obj below.
 */
#define	TCP_OPT_ARR_CNT		A_CNT(tcp_opt_arr)
#define	TCP_VALID_LEVELS_CNT	A_CNT(tcp_valid_levels_arr)

uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */
251
/*
 * Initialize option database object for TCP.
 *
 * This object describes the database of options to search.  It is passed
 * to the {sock,tpi}optcom_req() interface routines, which take care of
 * option management and invoke the associated methods listed here.
 */

optdb_obj_t tcp_opt_obj = {
	tcp_opt_default,	/* TCP default value function pointer */
	tcp_tpi_opt_get,	/* TCP get function pointer */
	tcp_tpi_opt_set,	/* TCP set function pointer */
	TCP_OPT_ARR_CNT,	/* TCP option database count of entries */
	tcp_opt_arr,		/* TCP option database */
	TCP_VALID_LEVELS_CNT,	/* TCP valid level count of entries */
	tcp_valid_levels_arr	/* TCP valid level array */
};
269
/*
 * Maximum TCP initial congestion window (start/restart).
 *
 * tcp_opt_set() rejects a TCP_INIT_CWND value above tcp_max_init_cwnd
 * with EINVAL.
 */
#define	TCP_MAX_INIT_CWND	16

static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND;
274
275 /*
276 * Some TCP options can be "set" by requesting them in the option
277 * buffer. This is needed for XTI feature test though we do not
278 * allow it in general. We interpret that this mechanism is more
279 * applicable to OSI protocols and need not be allowed in general.
280 * This routine filters out options for which it is not allowed (most)
281 * and lets through those (few) for which it is. [ The XTI interface
282 * test suite specifics will imply that any XTI_GENERIC level XTI_* if
283 * ever implemented will have to be allowed here ].
284 */
285 static boolean_t
286 tcp_allow_connopt_set(int level, int name)
287 {
288
289 switch (level) {
290 case IPPROTO_TCP:
291 switch (name) {
292 case TCP_NODELAY:
293 return (B_TRUE);
294 default:
295 return (B_FALSE);
296 }
297 /*NOTREACHED*/
298 default:
299 return (B_FALSE);
300 }
301 /*NOTREACHED*/
302 }
303
304 /*
305 * This routine gets default values of certain options whose default
306 * values are maintained by protocol specific code
307 */
308 /* ARGSUSED */
309 int
310 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
311 {
312 int32_t *i1 = (int32_t *)ptr;
313 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps;
314
315 switch (level) {
316 case IPPROTO_TCP:
317 switch (name) {
318 case TCP_NOTIFY_THRESHOLD:
319 *i1 = tcps->tcps_ip_notify_interval;
320 break;
321 case TCP_ABORT_THRESHOLD:
322 *i1 = tcps->tcps_ip_abort_interval;
323 break;
324 case TCP_CONN_NOTIFY_THRESHOLD:
325 *i1 = tcps->tcps_ip_notify_cinterval;
326 break;
327 case TCP_CONN_ABORT_THRESHOLD:
328 *i1 = tcps->tcps_ip_abort_cinterval;
329 break;
330 default:
331 return (-1);
332 }
333 break;
334 case IPPROTO_IP:
335 switch (name) {
336 case IP_TTL:
337 *i1 = tcps->tcps_ipv4_ttl;
338 break;
339 default:
340 return (-1);
341 }
342 break;
343 case IPPROTO_IPV6:
344 switch (name) {
345 case IPV6_UNICAST_HOPS:
346 *i1 = tcps->tcps_ipv6_hoplimit;
347 break;
348 default:
349 return (-1);
350 }
351 break;
352 default:
353 return (-1);
354 }
355 return (sizeof (int));
356 }
357
358 /*
359 * TCP routine to get the values of options.
360 */
361 int
362 tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
363 {
364 int *i1 = (int *)ptr;
365 tcp_t *tcp = connp->conn_tcp;
366 conn_opt_arg_t coas;
367 int retval;
368
369 coas.coa_connp = connp;
370 coas.coa_ixa = connp->conn_ixa;
371 coas.coa_ipp = &connp->conn_xmit_ipp;
372 coas.coa_ancillary = B_FALSE;
373 coas.coa_changed = 0;
374
375 switch (level) {
376 case SOL_SOCKET:
377 switch (name) {
378 case SO_SND_COPYAVOID:
379 *i1 = tcp->tcp_snd_zcopy_on ?
380 SO_SND_COPYAVOID : 0;
381 return (sizeof (int));
382 case SO_ACCEPTCONN:
383 *i1 = (tcp->tcp_state == TCPS_LISTEN);
384 return (sizeof (int));
385 }
386 break;
387 case IPPROTO_TCP:
388 switch (name) {
389 case TCP_NODELAY:
390 *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
391 return (sizeof (int));
392 case TCP_MAXSEG:
393 *i1 = tcp->tcp_mss;
394 return (sizeof (int));
395 case TCP_NOTIFY_THRESHOLD:
396 *i1 = (int)tcp->tcp_first_timer_threshold;
397 return (sizeof (int));
398 case TCP_ABORT_THRESHOLD:
399 *i1 = tcp->tcp_second_timer_threshold;
400 return (sizeof (int));
401 case TCP_CONN_NOTIFY_THRESHOLD:
402 *i1 = tcp->tcp_first_ctimer_threshold;
403 return (sizeof (int));
404 case TCP_CONN_ABORT_THRESHOLD:
405 *i1 = tcp->tcp_second_ctimer_threshold;
406 return (sizeof (int));
407 case TCP_INIT_CWND:
408 *i1 = tcp->tcp_init_cwnd;
409 return (sizeof (int));
410 case TCP_KEEPALIVE_THRESHOLD:
411 *i1 = tcp->tcp_ka_interval;
412 return (sizeof (int));
413
414 /*
415 * TCP_KEEPIDLE expects value in seconds, but
416 * tcp_ka_interval is in milliseconds.
417 */
418 case TCP_KEEPIDLE:
419 *i1 = tcp->tcp_ka_interval / 1000;
420 return (sizeof (int));
421 case TCP_KEEPCNT:
422 *i1 = tcp->tcp_ka_cnt;
423 return (sizeof (int));
424
425 /*
426 * TCP_KEEPINTVL expects value in seconds, but
427 * tcp_ka_rinterval is in milliseconds.
428 */
429 case TCP_KEEPINTVL:
430 *i1 = tcp->tcp_ka_rinterval / 1000;
431 return (sizeof (int));
432 case TCP_KEEPALIVE_ABORT_THRESHOLD:
433 *i1 = tcp->tcp_ka_abort_thres;
434 return (sizeof (int));
435 case TCP_CORK:
436 *i1 = tcp->tcp_cork;
437 return (sizeof (int));
438 case TCP_RTO_INITIAL:
439 *i1 = tcp->tcp_rto_initial;
440 return (sizeof (uint32_t));
441 case TCP_RTO_MIN:
442 *i1 = tcp->tcp_rto_min;
443 return (sizeof (uint32_t));
444 case TCP_RTO_MAX:
445 *i1 = tcp->tcp_rto_max;
446 return (sizeof (uint32_t));
447 case TCP_LINGER2:
448 *i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS;
449 return (sizeof (int));
450 }
451 break;
452 case IPPROTO_IP:
453 if (connp->conn_family != AF_INET)
454 return (-1);
455 switch (name) {
456 case IP_OPTIONS:
457 case T_IP_OPTIONS:
458 /* Caller ensures enough space */
459 return (ip_opt_get_user(connp, ptr));
460 default:
461 break;
462 }
463 break;
464
465 case IPPROTO_IPV6:
466 /*
467 * IPPROTO_IPV6 options are only supported for sockets
468 * that are using IPv6 on the wire.
469 */
470 if (connp->conn_ipversion != IPV6_VERSION) {
471 return (-1);
472 }
473 switch (name) {
474 case IPV6_PATHMTU:
475 if (tcp->tcp_state < TCPS_ESTABLISHED)
476 return (-1);
477 break;
478 }
479 break;
480 }
481 mutex_enter(&connp->conn_lock);
482 retval = conn_opt_get(&coas, level, name, ptr);
483 mutex_exit(&connp->conn_lock);
484 return (retval);
485 }
486
487 /*
488 * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
489 * Parameters are assumed to be verified by the caller.
490 */
491 /* ARGSUSED */
492 int
493 tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
494 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
495 void *thisdg_attrs, cred_t *cr)
496 {
497 tcp_t *tcp = connp->conn_tcp;
498 int *i1 = (int *)invalp;
499 boolean_t onoff = (*i1 == 0) ? 0 : 1;
500 boolean_t checkonly;
501 int reterr;
502 tcp_stack_t *tcps = tcp->tcp_tcps;
503 conn_opt_arg_t coas;
504 uint32_t val = *((uint32_t *)invalp);
505
506 coas.coa_connp = connp;
507 coas.coa_ixa = connp->conn_ixa;
508 coas.coa_ipp = &connp->conn_xmit_ipp;
509 coas.coa_ancillary = B_FALSE;
510 coas.coa_changed = 0;
511
512 switch (optset_context) {
513 case SETFN_OPTCOM_CHECKONLY:
514 checkonly = B_TRUE;
515 /*
516 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
517 * inlen != 0 implies value supplied and
518 * we have to "pretend" to set it.
519 * inlen == 0 implies that there is no
520 * value part in T_CHECK request and just validation
521 * done elsewhere should be enough, we just return here.
522 */
523 if (inlen == 0) {
524 *outlenp = 0;
525 return (0);
526 }
527 break;
528 case SETFN_OPTCOM_NEGOTIATE:
529 checkonly = B_FALSE;
530 break;
531 case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
532 case SETFN_CONN_NEGOTIATE:
533 checkonly = B_FALSE;
534 /*
535 * Negotiating local and "association-related" options
536 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
537 * primitives is allowed by XTI, but we choose
538 * to not implement this style negotiation for Internet
539 * protocols (We interpret it is a must for OSI world but
540 * optional for Internet protocols) for all options.
541 * [ Will do only for the few options that enable test
542 * suites that our XTI implementation of this feature
543 * works for transports that do allow it ]
544 */
545 if (!tcp_allow_connopt_set(level, name)) {
546 *outlenp = 0;
547 return (EINVAL);
548 }
549 break;
550 default:
551 /*
552 * We should never get here
553 */
554 *outlenp = 0;
555 return (EINVAL);
556 }
557
558 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
559 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
560
561 /*
562 * For TCP, we should have no ancillary data sent down
563 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
564 * has to be zero.
565 */
566 ASSERT(thisdg_attrs == NULL);
567
568 /*
569 * For fixed length options, no sanity check
570 * of passed in length is done. It is assumed *_optcom_req()
571 * routines do the right thing.
572 */
573 switch (level) {
574 case SOL_SOCKET:
575 switch (name) {
576 case SO_KEEPALIVE:
577 if (checkonly) {
578 /* check only case */
579 break;
580 }
581
582 if (!onoff) {
583 if (connp->conn_keepalive) {
584 if (tcp->tcp_ka_tid != 0) {
585 (void) TCP_TIMER_CANCEL(tcp,
586 tcp->tcp_ka_tid);
587 tcp->tcp_ka_tid = 0;
588 }
589 connp->conn_keepalive = 0;
590 }
591 break;
592 }
593 if (!connp->conn_keepalive) {
594 /* Crank up the keepalive timer */
595 tcp->tcp_ka_last_intrvl = 0;
596 tcp->tcp_ka_tid = TCP_TIMER(tcp,
597 tcp_keepalive_timer, tcp->tcp_ka_interval);
598 connp->conn_keepalive = 1;
599 }
600 break;
601 case SO_SNDBUF: {
602 if (*i1 > tcps->tcps_max_buf) {
603 *outlenp = 0;
604 return (ENOBUFS);
605 }
606 if (checkonly)
607 break;
608
609 connp->conn_sndbuf = *i1;
610 if (tcps->tcps_snd_lowat_fraction != 0) {
611 connp->conn_sndlowat = connp->conn_sndbuf /
612 tcps->tcps_snd_lowat_fraction;
613 }
614 (void) tcp_maxpsz_set(tcp, B_TRUE);
615 /*
616 * If we are flow-controlled, recheck the condition.
617 * There are apps that increase SO_SNDBUF size when
618 * flow-controlled (EWOULDBLOCK), and expect the flow
619 * control condition to be lifted right away.
620 */
621 mutex_enter(&tcp->tcp_non_sq_lock);
622 if (tcp->tcp_flow_stopped &&
623 TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) {
624 tcp_clrqfull(tcp);
625 }
626 mutex_exit(&tcp->tcp_non_sq_lock);
627 *outlenp = inlen;
628 return (0);
629 }
630 case SO_RCVBUF:
631 if (*i1 > tcps->tcps_max_buf) {
632 *outlenp = 0;
633 return (ENOBUFS);
634 }
635 /* Silently ignore zero */
636 if (!checkonly && *i1 != 0) {
637 *i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
638 (void) tcp_rwnd_set(tcp, *i1);
639 }
640 /*
641 * XXX should we return the rwnd here
642 * and tcp_opt_get ?
643 */
644 *outlenp = inlen;
645 return (0);
646 case SO_SND_COPYAVOID:
647 if (!checkonly) {
648 if (tcp->tcp_loopback ||
649 (onoff != 1) || !tcp_zcopy_check(tcp)) {
650 *outlenp = 0;
651 return (EOPNOTSUPP);
652 }
653 tcp->tcp_snd_zcopy_aware = 1;
654 }
655 *outlenp = inlen;
656 return (0);
657 }
658 break;
659 case IPPROTO_TCP:
660 switch (name) {
661 case TCP_NODELAY:
662 if (!checkonly)
663 tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
664 break;
665 case TCP_NOTIFY_THRESHOLD:
666 if (!checkonly)
667 tcp->tcp_first_timer_threshold = *i1;
668 break;
669 case TCP_ABORT_THRESHOLD:
670 if (!checkonly)
671 tcp->tcp_second_timer_threshold = *i1;
672 break;
673 case TCP_CONN_NOTIFY_THRESHOLD:
674 if (!checkonly)
675 tcp->tcp_first_ctimer_threshold = *i1;
676 break;
677 case TCP_CONN_ABORT_THRESHOLD:
678 if (!checkonly)
679 tcp->tcp_second_ctimer_threshold = *i1;
680 break;
681 case TCP_RECVDSTADDR:
682 if (tcp->tcp_state > TCPS_LISTEN) {
683 *outlenp = 0;
684 return (EOPNOTSUPP);
685 }
686 /* Setting done in conn_opt_set */
687 break;
688 case TCP_INIT_CWND:
689 if (checkonly)
690 break;
691
692 /*
693 * Only allow socket with network configuration
694 * privilege to set the initial cwnd to be larger
695 * than allowed by RFC 3390.
696 */
697 if (val > MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
698 if ((reterr = secpolicy_ip_config(cr, B_TRUE))
699 != 0) {
700 *outlenp = 0;
701 return (reterr);
702 }
703 if (val > tcp_max_init_cwnd) {
704 *outlenp = 0;
705 return (EINVAL);
706 }
707 }
708
709 tcp->tcp_init_cwnd = val;
710
711 /*
712 * If the socket is connected, AND no outbound data
713 * has been sent, reset the actual cwnd values.
714 */
715 if (tcp->tcp_state == TCPS_ESTABLISHED &&
716 tcp->tcp_iss == tcp->tcp_snxt - 1) {
717 tcp->tcp_cwnd =
718 MIN(tcp->tcp_rwnd, val * tcp->tcp_mss);
719 }
720 break;
721
722 /*
723 * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD
724 * is in milliseconds. TCP_KEEPIDLE is introduced for
725 * compatibility with other Unix flavors.
726 * We can fall through TCP_KEEPALIVE_THRESHOLD logic after
727 * converting the input to milliseconds.
728 */
729 case TCP_KEEPIDLE:
730 *i1 *= 1000;
731 /* FALLTHRU */
732
733 case TCP_KEEPALIVE_THRESHOLD:
734 if (checkonly)
735 break;
736
737 if (*i1 < tcps->tcps_keepalive_interval_low ||
738 *i1 > tcps->tcps_keepalive_interval_high) {
739 *outlenp = 0;
740 return (EINVAL);
741 }
742 if (*i1 != tcp->tcp_ka_interval) {
743 tcp->tcp_ka_interval = *i1;
744 /*
745 * Check if we need to restart the
746 * keepalive timer.
747 */
748 if (tcp->tcp_ka_tid != 0) {
749 ASSERT(connp->conn_keepalive);
750 (void) TCP_TIMER_CANCEL(tcp,
751 tcp->tcp_ka_tid);
752 tcp->tcp_ka_last_intrvl = 0;
753 tcp->tcp_ka_tid = TCP_TIMER(tcp,
754 tcp_keepalive_timer,
755 tcp->tcp_ka_interval);
756 }
757 }
758 break;
759
760 /*
761 * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt.
762 * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the
763 * three members - tcp_ka_abort_thres, tcp_ka_rinterval and
764 * tcp_ka_cnt.
765 */
766 case TCP_KEEPCNT:
767 if (checkonly)
768 break;
769
770 if (*i1 == 0) {
771 return (EINVAL);
772 } else if (tcp->tcp_ka_rinterval == 0) {
773 if ((tcp->tcp_ka_abort_thres / *i1) <
774 tcp->tcp_rto_min ||
775 (tcp->tcp_ka_abort_thres / *i1) >
776 tcp->tcp_rto_max)
777 return (EINVAL);
778
779 tcp->tcp_ka_rinterval =
780 tcp->tcp_ka_abort_thres / *i1;
781 } else {
782 if ((*i1 * tcp->tcp_ka_rinterval) <
783 tcps->tcps_keepalive_abort_interval_low ||
784 (*i1 * tcp->tcp_ka_rinterval) >
785 tcps->tcps_keepalive_abort_interval_high)
786 return (EINVAL);
787 tcp->tcp_ka_abort_thres =
788 (*i1 * tcp->tcp_ka_rinterval);
789 }
790 tcp->tcp_ka_cnt = *i1;
791 break;
792 case TCP_KEEPINTVL:
793 /*
794 * TCP_KEEPINTVL is specified in seconds, but
795 * tcp_ka_rinterval is in milliseconds.
796 */
797
798 if (checkonly)
799 break;
800
801 if ((*i1 * 1000) < tcp->tcp_rto_min ||
802 (*i1 * 1000) > tcp->tcp_rto_max)
803 return (EINVAL);
804
805 if (tcp->tcp_ka_cnt == 0) {
806 tcp->tcp_ka_cnt =
807 tcp->tcp_ka_abort_thres / (*i1 * 1000);
808 } else {
809 if ((*i1 * tcp->tcp_ka_cnt * 1000) <
810 tcps->tcps_keepalive_abort_interval_low ||
811 (*i1 * tcp->tcp_ka_cnt * 1000) >
812 tcps->tcps_keepalive_abort_interval_high)
813 return (EINVAL);
814 tcp->tcp_ka_abort_thres =
815 (*i1 * tcp->tcp_ka_cnt * 1000);
816 }
817 tcp->tcp_ka_rinterval = *i1 * 1000;
818 break;
819 case TCP_KEEPALIVE_ABORT_THRESHOLD:
820 if (!checkonly) {
821 if (*i1 <
822 tcps->tcps_keepalive_abort_interval_low ||
823 *i1 >
824 tcps->tcps_keepalive_abort_interval_high) {
825 *outlenp = 0;
826 return (EINVAL);
827 }
828 tcp->tcp_ka_abort_thres = *i1;
829 tcp->tcp_ka_cnt = 0;
830 tcp->tcp_ka_rinterval = 0;
831 }
832 break;
833 case TCP_CORK:
834 if (!checkonly) {
835 /*
836 * if tcp->tcp_cork was set and is now
837 * being unset, we have to make sure that
838 * the remaining data gets sent out. Also
839 * unset tcp->tcp_cork so that tcp_wput_data()
840 * can send data even if it is less than mss
841 */
842 if (tcp->tcp_cork && onoff == 0 &&
843 tcp->tcp_unsent > 0) {
844 tcp->tcp_cork = B_FALSE;
845 tcp_wput_data(tcp, NULL, B_FALSE);
846 }
847 tcp->tcp_cork = onoff;
848 }
849 break;
850 case TCP_RTO_INITIAL: {
851 clock_t rto;
852
853 if (checkonly || val == 0)
854 break;
855
856 /*
857 * Sanity checks
858 *
859 * The initial RTO should be bounded by the minimum
860 * and maximum RTO. And it should also be smaller
861 * than the connect attempt abort timeout. Otherwise,
862 * the connection won't be aborted in a period
863 * reasonably close to that timeout.
864 */
865 if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max ||
866 val > tcp->tcp_second_ctimer_threshold ||
867 val < tcps->tcps_rexmit_interval_initial_low ||
868 val > tcps->tcps_rexmit_interval_initial_high) {
869 *outlenp = 0;
870 return (EINVAL);
871 }
872 tcp->tcp_rto_initial = val;
873
874 /*
875 * If TCP has not sent anything, need to re-calculate
876 * tcp_rto. Otherwise, this option change does not
877 * really affect anything.
878 */
879 if (tcp->tcp_state >= TCPS_SYN_SENT)
880 break;
881
882 tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2;
883 tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1;
884 rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
885 tcps->tcps_rexmit_interval_extra +
886 (tcp->tcp_rtt_sa >> 5) +
887 tcps->tcps_conn_grace_period;
888 TCP_SET_RTO(tcp, rto);
889 break;
890 }
891 case TCP_RTO_MIN:
892 if (checkonly || val == 0)
893 break;
894
895 if (val < tcps->tcps_rexmit_interval_min_low ||
896 val > tcps->tcps_rexmit_interval_min_high ||
897 val > tcp->tcp_rto_max) {
898 *outlenp = 0;
899 return (EINVAL);
900 }
901 tcp->tcp_rto_min = val;
902 if (tcp->tcp_rto < val)
903 tcp->tcp_rto = val;
904 break;
905 case TCP_RTO_MAX:
906 if (checkonly || val == 0)
907 break;
908
909 /*
910 * Sanity checks
911 *
912 * The maximum RTO should not be larger than the
913 * connection abort timeout. Otherwise, the
914 * connection won't be aborted in a period reasonably
915 * close to that timeout.
916 */
917 if (val < tcps->tcps_rexmit_interval_max_low ||
918 val > tcps->tcps_rexmit_interval_max_high ||
919 val < tcp->tcp_rto_min ||
920 val > tcp->tcp_second_timer_threshold) {
921 *outlenp = 0;
922 return (EINVAL);
923 }
924 tcp->tcp_rto_max = val;
925 if (tcp->tcp_rto > val)
926 tcp->tcp_rto = val;
927 break;
928 case TCP_LINGER2:
929 if (checkonly || *i1 == 0)
930 break;
931
932 /*
933 * Note that the option value's unit is second. And
934 * the value should be bigger than the private
935 * parameter tcp_fin_wait_2_flush_interval's lower
936 * bound and smaller than the current value of that
937 * parameter. It should be smaller than the current
938 * value to avoid an app setting TCP_LINGER2 to a big
939 * value, causing resource to be held up too long in
940 * FIN-WAIT-2 state.
941 */
942 if (*i1 < 0 ||
943 tcps->tcps_fin_wait_2_flush_interval_low/SECONDS >
944 *i1 ||
945 tcps->tcps_fin_wait_2_flush_interval/SECONDS <
946 *i1) {
947 *outlenp = 0;
948 return (EINVAL);
949 }
950 tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS;
951 break;
952 default:
953 break;
954 }
955 break;
956 case IPPROTO_IP:
957 if (connp->conn_family != AF_INET) {
958 *outlenp = 0;
959 return (EINVAL);
960 }
961 switch (name) {
962 case IP_SEC_OPT:
963 /*
964 * We should not allow policy setting after
965 * we start listening for connections.
966 */
967 if (tcp->tcp_state == TCPS_LISTEN) {
968 return (EINVAL);
969 }
970 break;
971 }
972 break;
973 case IPPROTO_IPV6:
974 /*
975 * IPPROTO_IPV6 options are only supported for sockets
976 * that are using IPv6 on the wire.
977 */
978 if (connp->conn_ipversion != IPV6_VERSION) {
979 *outlenp = 0;
980 return (EINVAL);
981 }
982
983 switch (name) {
984 case IPV6_RECVPKTINFO:
985 if (!checkonly) {
986 /* Force it to be sent up with the next msg */
987 tcp->tcp_recvifindex = 0;
988 }
989 break;
990 case IPV6_RECVTCLASS:
991 if (!checkonly) {
992 /* Force it to be sent up with the next msg */
993 tcp->tcp_recvtclass = 0xffffffffU;
994 }
995 break;
996 case IPV6_RECVHOPLIMIT:
997 if (!checkonly) {
998 /* Force it to be sent up with the next msg */
999 tcp->tcp_recvhops = 0xffffffffU;
1000 }
1001 break;
1002 case IPV6_PKTINFO:
1003 /* This is an extra check for TCP */
1004 if (inlen == sizeof (struct in6_pktinfo)) {
1005 struct in6_pktinfo *pkti;
1006
1007 pkti = (struct in6_pktinfo *)invalp;
1008 /*
1009 * RFC 3542 states that ipi6_addr must be
1010 * the unspecified address when setting the
1011 * IPV6_PKTINFO sticky socket option on a
1012 * TCP socket.
1013 */
1014 if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
1015 return (EINVAL);
1016 }
1017 break;
1018 case IPV6_SEC_OPT:
1019 /*
1020 * We should not allow policy setting after
1021 * we start listening for connections.
1022 */
1023 if (tcp->tcp_state == TCPS_LISTEN) {
1024 return (EINVAL);
1025 }
1026 break;
1027 }
1028 break;
1029 }
1030 reterr = conn_opt_set(&coas, level, name, inlen, invalp,
1031 checkonly, cr);
1032 if (reterr != 0) {
1033 *outlenp = 0;
1034 return (reterr);
1035 }
1036
1037 /*
1038 * Common case of OK return with outval same as inval
1039 */
1040 if (invalp != outvalp) {
1041 /* don't trust bcopy for identical src/dst */
1042 (void) bcopy(invalp, outvalp, inlen);
1043 }
1044 *outlenp = inlen;
1045
1046 if (coas.coa_changed & COA_HEADER_CHANGED) {
1047 /* If we are connected we rebuilt the headers */
1048 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1049 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1050 reterr = tcp_build_hdrs(tcp);
1051 if (reterr != 0)
1052 return (reterr);
1053 }
1054 }
1055 if (coas.coa_changed & COA_ROUTE_CHANGED) {
1056 in6_addr_t nexthop;
1057
1058 /*
1059 * If we are connected we re-cache the information.
1060 * We ignore errors to preserve BSD behavior.
1061 * Note that we don't redo IPsec policy lookup here
1062 * since the final destination (or source) didn't change.
1063 */
1064 ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
1065 &connp->conn_faddr_v6, &nexthop);
1066
1067 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1068 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1069 (void) ip_attr_connect(connp, connp->conn_ixa,
1070 &connp->conn_laddr_v6, &connp->conn_faddr_v6,
1071 &nexthop, connp->conn_fport, NULL, NULL,
1072 IPDF_VERIFY_DST);
1073 }
1074 }
1075 if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
1076 connp->conn_wq->q_hiwat = connp->conn_sndbuf;
1077 }
1078 if (coas.coa_changed & COA_WROFF_CHANGED) {
1079 connp->conn_wroff = connp->conn_ht_iphc_allocated +
1080 tcps->tcps_wroff_xtra;
1081 (void) proto_set_tx_wroff(connp->conn_rq, connp,
1082 connp->conn_wroff);
1083 }
1084 if (coas.coa_changed & COA_OOBINLINE_CHANGED) {
1085 if (IPCL_IS_NONSTR(connp))
1086 proto_set_rx_oob_opt(connp, onoff);
1087 }
1088 return (0);
1089 }