1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/stream.h> 28 #define _SUN_TPI_VERSION 2 29 #include <sys/tihdr.h> 30 #include <sys/socket.h> 31 #include <sys/xti_xtiopt.h> 32 #include <sys/xti_inet.h> 33 #include <sys/policy.h> 34 35 #include <inet/common.h> 36 #include <netinet/ip6.h> 37 #include <inet/ip.h> 38 39 #include <netinet/in.h> 40 #include <netinet/tcp.h> 41 #include <inet/optcom.h> 42 #include <inet/proto_set.h> 43 #include <inet/tcp_impl.h> 44 45 static int tcp_opt_default(queue_t *, int, int, uchar_t *); 46 47 /* 48 * Table of all known options handled on a TCP protocol stack. 49 * 50 * Note: This table contains options processed by both TCP and IP levels 51 * and is the superset of options that can be performed on a TCP over IP 52 * stack. 53 */ 54 opdes_t tcp_opt_arr[] = { 55 56 { SO_LINGER, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, 57 sizeof (struct linger), 0 }, 58 59 { SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 60 { SO_KEEPALIVE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 61 { SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 62 { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 63 }, 64 { SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 65 { SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 66 { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 67 { SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, 68 { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 69 { SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 70 { SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, 71 sizeof (struct timeval), 0 }, 72 { SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, 73 sizeof (struct timeval), 0 }, 74 { SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 75 }, 76 { SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 77 { SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 78 0 }, 79 { SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 80 0 }, 81 { SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 82 0 }, 83 { SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 84 0 }, 85 { SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 86 87 { SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, 88 89 { SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, 90 91 { TCP_NODELAY, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 92 }, 93 { TCP_MAXSEG, IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t), 94 536 }, 95 96 { TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 97 OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, 98 99 { TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 100 OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, 101 102 { TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 103 OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, 104 105 { TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 106 OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, 107 108 { TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 109 0 }, 110 111 { TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0, 112 sizeof (int), 0 }, 113 114 { TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 115 }, 116 117 { TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0, 118 sizeof (int), 0 }, 119 120 { TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, 121 sizeof (int), 0 }, 122 123 { TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 124 125 { TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 126 127 { TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 128 129 { TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, 130 sizeof (int), 0 }, 131 132 { TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 133 134 { TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 }, 135 136 { TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 }, 137 138 { TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 }, 139 140 { TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 141 142 { IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 143 (OP_VARLEN|OP_NODEFAULT), 144 IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, 145 { T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 146 (OP_VARLEN|OP_NODEFAULT), 147 IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, 148 149 { IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 150 { T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 151 { IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN, 152 sizeof (int), -1 /* not initialized */ }, 153 154 { IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT, 155 sizeof (ipsec_req_t), -1 /* not initialized */ }, 156 157 { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, 158 sizeof (int), 0 /* no ifindex */ }, 159 160 { IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0, 161 sizeof (int), 0 }, 162 163 { IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN, 164 sizeof (int), -1 /* not initialized */ }, 165 166 { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 167 sizeof (int), 0 /* no ifindex */ }, 168 169 { IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 170 171 { IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0, 172 sizeof (in_addr_t), -1 /* not initialized */ }, 173 174 { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0, 175 sizeof (int), 0 }, 176 177 { IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 178 (OP_NODEFAULT|OP_VARLEN), 179 sizeof (struct in6_pktinfo), -1 /* not initialized */ }, 180 { IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 181 OP_NODEFAULT, 182 sizeof (sin6_t), -1 /* not initialized */ }, 183 { IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 184 (OP_VARLEN|OP_NODEFAULT), 255*8, 185 -1 /* not initialized */ }, 186 { IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 187 (OP_VARLEN|OP_NODEFAULT), 255*8, 188 -1 /* not initialized */ }, 189 { IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 190 (OP_VARLEN|OP_NODEFAULT), 255*8, 191 -1 /* not initialized */ }, 192 { IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 193 (OP_VARLEN|OP_NODEFAULT), 255*8, 194 -1 /* not initialized */ }, 195 { IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 196 OP_NODEFAULT, 197 sizeof (int), -1 /* not initialized */ }, 198 { IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 199 OP_NODEFAULT, 200 sizeof (struct ip6_mtuinfo), -1 /* not initialized */ }, 201 { IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 202 sizeof (int), 0 }, 203 { IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 204 sizeof (int), 0 }, 205 { IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 206 sizeof (int), 0 }, 207 208 /* Enable receipt of ancillary data */ 209 { IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 210 sizeof (int), 0 }, 211 { IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 212 sizeof (int), 0 }, 213 { IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 214 sizeof (int), 0 }, 215 { _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 216 sizeof (int), 0 }, 217 { IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 218 sizeof (int), 0 }, 219 { IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 220 sizeof (int), 0 }, 221 { IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 222 sizeof (int), 0 }, 223 { IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 224 sizeof (int), 0 }, 225 226 { IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT, 227 sizeof (ipsec_req_t), -1 /* not initialized */ }, 228 { IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 229 sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT }, 230 }; 231 232 /* 233 * Table of all supported levels 234 * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have 235 * any supported options so we need this info separately. 236 * 237 * This is needed only for topmost tpi providers and is used only by 238 * XTI interfaces. 239 */ 240 optlevel_t tcp_valid_levels_arr[] = { 241 XTI_GENERIC, 242 SOL_SOCKET, 243 IPPROTO_TCP, 244 IPPROTO_IP, 245 IPPROTO_IPV6 246 }; 247 248 249 #define TCP_OPT_ARR_CNT A_CNT(tcp_opt_arr) 250 #define TCP_VALID_LEVELS_CNT A_CNT(tcp_valid_levels_arr) 251 252 uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */ 253 254 /* 255 * Initialize option database object for TCP 256 * 257 * This object represents database of options to search passed to 258 * {sock,tpi}optcom_req() interface routine to take care of option 259 * management and associated methods. 260 */ 261 262 optdb_obj_t tcp_opt_obj = { 263 tcp_opt_default, /* TCP default value function pointer */ 264 tcp_tpi_opt_get, /* TCP get function pointer */ 265 tcp_tpi_opt_set, /* TCP set function pointer */ 266 TCP_OPT_ARR_CNT, /* TCP option database count of entries */ 267 tcp_opt_arr, /* TCP option database */ 268 TCP_VALID_LEVELS_CNT, /* TCP valid level count of entries */ 269 tcp_valid_levels_arr /* TCP valid level array */ 270 }; 271 272 static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND; 273 274 /* 275 * Some TCP options can be "set" by requesting them in the option 276 * buffer. This is needed for XTI feature test though we do not 277 * allow it in general. We interpret that this mechanism is more 278 * applicable to OSI protocols and need not be allowed in general. 279 * This routine filters out options for which it is not allowed (most) 280 * and lets through those (few) for which it is. [ The XTI interface 281 * test suite specifics will imply that any XTI_GENERIC level XTI_* if 282 * ever implemented will have to be allowed here ]. 283 */ 284 static boolean_t 285 tcp_allow_connopt_set(int level, int name) 286 { 287 288 switch (level) { 289 case IPPROTO_TCP: 290 switch (name) { 291 case TCP_NODELAY: 292 return (B_TRUE); 293 default: 294 return (B_FALSE); 295 } 296 /*NOTREACHED*/ 297 default: 298 return (B_FALSE); 299 } 300 /*NOTREACHED*/ 301 } 302 303 /* 304 * This routine gets default values of certain options whose default 305 * values are maintained by protocol specific code 306 */ 307 /* ARGSUSED */ 308 static int 309 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr) 310 { 311 int32_t *i1 = (int32_t *)ptr; 312 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 313 314 switch (level) { 315 case IPPROTO_TCP: 316 switch (name) { 317 case TCP_NOTIFY_THRESHOLD: 318 *i1 = tcps->tcps_ip_notify_interval; 319 break; 320 case TCP_ABORT_THRESHOLD: 321 *i1 = tcps->tcps_ip_abort_interval; 322 break; 323 case TCP_CONN_NOTIFY_THRESHOLD: 324 *i1 = tcps->tcps_ip_notify_cinterval; 325 break; 326 case TCP_CONN_ABORT_THRESHOLD: 327 *i1 = tcps->tcps_ip_abort_cinterval; 328 break; 329 default: 330 return (-1); 331 } 332 break; 333 case IPPROTO_IP: 334 switch (name) { 335 case IP_TTL: 336 *i1 = tcps->tcps_ipv4_ttl; 337 break; 338 default: 339 return (-1); 340 } 341 break; 342 case IPPROTO_IPV6: 343 switch (name) { 344 case IPV6_UNICAST_HOPS: 345 *i1 = tcps->tcps_ipv6_hoplimit; 346 break; 347 default: 348 return (-1); 349 } 350 break; 351 default: 352 return (-1); 353 } 354 return (sizeof (int)); 355 } 356 357 /* 358 * TCP routine to get the values of options. 359 */ 360 int 361 tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) 362 { 363 int *i1 = (int *)ptr; 364 tcp_t *tcp = connp->conn_tcp; 365 conn_opt_arg_t coas; 366 int retval; 367 368 coas.coa_connp = connp; 369 coas.coa_ixa = connp->conn_ixa; 370 coas.coa_ipp = &connp->conn_xmit_ipp; 371 coas.coa_ancillary = B_FALSE; 372 coas.coa_changed = 0; 373 374 switch (level) { 375 case SOL_SOCKET: 376 switch (name) { 377 case SO_SND_COPYAVOID: 378 *i1 = tcp->tcp_snd_zcopy_on ? 379 SO_SND_COPYAVOID : 0; 380 return (sizeof (int)); 381 case SO_ACCEPTCONN: 382 *i1 = (tcp->tcp_state == TCPS_LISTEN); 383 return (sizeof (int)); 384 } 385 break; 386 case IPPROTO_TCP: 387 switch (name) { 388 case TCP_NODELAY: 389 *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0; 390 return (sizeof (int)); 391 case TCP_MAXSEG: 392 *i1 = tcp->tcp_mss; 393 return (sizeof (int)); 394 case TCP_NOTIFY_THRESHOLD: 395 *i1 = (int)tcp->tcp_first_timer_threshold; 396 return (sizeof (int)); 397 case TCP_ABORT_THRESHOLD: 398 *i1 = tcp->tcp_second_timer_threshold; 399 return (sizeof (int)); 400 case TCP_CONN_NOTIFY_THRESHOLD: 401 *i1 = tcp->tcp_first_ctimer_threshold; 402 return (sizeof (int)); 403 case TCP_CONN_ABORT_THRESHOLD: 404 *i1 = tcp->tcp_second_ctimer_threshold; 405 return (sizeof (int)); 406 case TCP_INIT_CWND: 407 *i1 = tcp->tcp_init_cwnd; 408 return (sizeof (int)); 409 case TCP_KEEPALIVE_THRESHOLD: 410 *i1 = tcp->tcp_ka_interval; 411 return (sizeof (int)); 412 413 /* 414 * TCP_KEEPIDLE expects value in seconds, but 415 * tcp_ka_interval is in milliseconds. 416 */ 417 case TCP_KEEPIDLE: 418 *i1 = tcp->tcp_ka_interval / 1000; 419 return (sizeof (int)); 420 case TCP_KEEPCNT: 421 *i1 = tcp->tcp_ka_cnt; 422 return (sizeof (int)); 423 424 /* 425 * TCP_KEEPINTVL expects value in seconds, but 426 * tcp_ka_rinterval is in milliseconds. 427 */ 428 case TCP_KEEPINTVL: 429 *i1 = tcp->tcp_ka_rinterval / 1000; 430 return (sizeof (int)); 431 case TCP_KEEPALIVE_ABORT_THRESHOLD: 432 *i1 = tcp->tcp_ka_abort_thres; 433 return (sizeof (int)); 434 case TCP_CORK: 435 *i1 = tcp->tcp_cork; 436 return (sizeof (int)); 437 case TCP_RTO_INITIAL: 438 *i1 = tcp->tcp_rto_initial; 439 return (sizeof (uint32_t)); 440 case TCP_RTO_MIN: 441 *i1 = tcp->tcp_rto_min; 442 return (sizeof (uint32_t)); 443 case TCP_RTO_MAX: 444 *i1 = tcp->tcp_rto_max; 445 return (sizeof (uint32_t)); 446 case TCP_LINGER2: 447 *i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS; 448 return (sizeof (int)); 449 } 450 break; 451 case IPPROTO_IP: 452 if (connp->conn_family != AF_INET) 453 return (-1); 454 switch (name) { 455 case IP_OPTIONS: 456 case T_IP_OPTIONS: 457 /* Caller ensures enough space */ 458 return (ip_opt_get_user(connp, ptr)); 459 default: 460 break; 461 } 462 break; 463 464 case IPPROTO_IPV6: 465 /* 466 * IPPROTO_IPV6 options are only supported for sockets 467 * that are using IPv6 on the wire. 468 */ 469 if (connp->conn_ipversion != IPV6_VERSION) { 470 return (-1); 471 } 472 switch (name) { 473 case IPV6_PATHMTU: 474 if (tcp->tcp_state < TCPS_ESTABLISHED) 475 return (-1); 476 break; 477 } 478 break; 479 } 480 mutex_enter(&connp->conn_lock); 481 retval = conn_opt_get(&coas, level, name, ptr); 482 mutex_exit(&connp->conn_lock); 483 return (retval); 484 } 485 486 /* 487 * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements. 488 * Parameters are assumed to be verified by the caller. 489 */ 490 /* ARGSUSED */ 491 int 492 tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, 493 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 494 void *thisdg_attrs, cred_t *cr) 495 { 496 tcp_t *tcp = connp->conn_tcp; 497 int *i1 = (int *)invalp; 498 boolean_t onoff = (*i1 == 0) ? 0 : 1; 499 boolean_t checkonly; 500 int reterr; 501 tcp_stack_t *tcps = tcp->tcp_tcps; 502 conn_opt_arg_t coas; 503 uint32_t val = *((uint32_t *)invalp); 504 505 coas.coa_connp = connp; 506 coas.coa_ixa = connp->conn_ixa; 507 coas.coa_ipp = &connp->conn_xmit_ipp; 508 coas.coa_ancillary = B_FALSE; 509 coas.coa_changed = 0; 510 511 switch (optset_context) { 512 case SETFN_OPTCOM_CHECKONLY: 513 checkonly = B_TRUE; 514 /* 515 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ 516 * inlen != 0 implies value supplied and 517 * we have to "pretend" to set it. 518 * inlen == 0 implies that there is no 519 * value part in T_CHECK request and just validation 520 * done elsewhere should be enough, we just return here. 521 */ 522 if (inlen == 0) { 523 *outlenp = 0; 524 return (0); 525 } 526 break; 527 case SETFN_OPTCOM_NEGOTIATE: 528 checkonly = B_FALSE; 529 break; 530 case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */ 531 case SETFN_CONN_NEGOTIATE: 532 checkonly = B_FALSE; 533 /* 534 * Negotiating local and "association-related" options 535 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ) 536 * primitives is allowed by XTI, but we choose 537 * to not implement this style negotiation for Internet 538 * protocols (We interpret it is a must for OSI world but 539 * optional for Internet protocols) for all options. 540 * [ Will do only for the few options that enable test 541 * suites that our XTI implementation of this feature 542 * works for transports that do allow it ] 543 */ 544 if (!tcp_allow_connopt_set(level, name)) { 545 *outlenp = 0; 546 return (EINVAL); 547 } 548 break; 549 default: 550 /* 551 * We should never get here 552 */ 553 *outlenp = 0; 554 return (EINVAL); 555 } 556 557 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || 558 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); 559 560 /* 561 * For TCP, we should have no ancillary data sent down 562 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs 563 * has to be zero. 564 */ 565 ASSERT(thisdg_attrs == NULL); 566 567 /* 568 * For fixed length options, no sanity check 569 * of passed in length is done. It is assumed *_optcom_req() 570 * routines do the right thing. 571 */ 572 switch (level) { 573 case SOL_SOCKET: 574 switch (name) { 575 case SO_KEEPALIVE: 576 if (checkonly) { 577 /* check only case */ 578 break; 579 } 580 581 if (!onoff) { 582 if (connp->conn_keepalive) { 583 if (tcp->tcp_ka_tid != 0) { 584 (void) TCP_TIMER_CANCEL(tcp, 585 tcp->tcp_ka_tid); 586 tcp->tcp_ka_tid = 0; 587 } 588 connp->conn_keepalive = 0; 589 } 590 break; 591 } 592 if (!connp->conn_keepalive) { 593 /* Crank up the keepalive timer */ 594 tcp->tcp_ka_last_intrvl = 0; 595 tcp->tcp_ka_tid = TCP_TIMER(tcp, 596 tcp_keepalive_timer, tcp->tcp_ka_interval); 597 connp->conn_keepalive = 1; 598 } 599 break; 600 case SO_SNDBUF: { 601 if (*i1 > tcps->tcps_max_buf) { 602 *outlenp = 0; 603 return (ENOBUFS); 604 } 605 if (checkonly) 606 break; 607 608 connp->conn_sndbuf = *i1; 609 if (tcps->tcps_snd_lowat_fraction != 0) { 610 connp->conn_sndlowat = connp->conn_sndbuf / 611 tcps->tcps_snd_lowat_fraction; 612 } 613 (void) tcp_maxpsz_set(tcp, B_TRUE); 614 /* 615 * If we are flow-controlled, recheck the condition. 616 * There are apps that increase SO_SNDBUF size when 617 * flow-controlled (EWOULDBLOCK), and expect the flow 618 * control condition to be lifted right away. 619 */ 620 mutex_enter(&tcp->tcp_non_sq_lock); 621 if (tcp->tcp_flow_stopped && 622 TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) { 623 tcp_clrqfull(tcp); 624 } 625 mutex_exit(&tcp->tcp_non_sq_lock); 626 *outlenp = inlen; 627 return (0); 628 } 629 case SO_RCVBUF: 630 if (*i1 > tcps->tcps_max_buf) { 631 *outlenp = 0; 632 return (ENOBUFS); 633 } 634 /* Silently ignore zero */ 635 if (!checkonly && *i1 != 0) { 636 *i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss); 637 (void) tcp_rwnd_set(tcp, *i1); 638 } 639 /* 640 * XXX should we return the rwnd here 641 * and tcp_opt_get ? 642 */ 643 *outlenp = inlen; 644 return (0); 645 case SO_SND_COPYAVOID: 646 if (!checkonly) { 647 if (tcp->tcp_loopback || 648 (onoff != 1) || !tcp_zcopy_check(tcp)) { 649 *outlenp = 0; 650 return (EOPNOTSUPP); 651 } 652 tcp->tcp_snd_zcopy_aware = 1; 653 } 654 *outlenp = inlen; 655 return (0); 656 } 657 break; 658 case IPPROTO_TCP: 659 switch (name) { 660 case TCP_NODELAY: 661 if (!checkonly) 662 tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss; 663 break; 664 case TCP_NOTIFY_THRESHOLD: 665 if (!checkonly) 666 tcp->tcp_first_timer_threshold = *i1; 667 break; 668 case TCP_ABORT_THRESHOLD: 669 if (!checkonly) 670 tcp->tcp_second_timer_threshold = *i1; 671 break; 672 case TCP_CONN_NOTIFY_THRESHOLD: 673 if (!checkonly) 674 tcp->tcp_first_ctimer_threshold = *i1; 675 break; 676 case TCP_CONN_ABORT_THRESHOLD: 677 if (!checkonly) 678 tcp->tcp_second_ctimer_threshold = *i1; 679 break; 680 case TCP_RECVDSTADDR: 681 if (tcp->tcp_state > TCPS_LISTEN) { 682 *outlenp = 0; 683 return (EOPNOTSUPP); 684 } 685 /* Setting done in conn_opt_set */ 686 break; 687 case TCP_INIT_CWND: 688 if (checkonly) 689 break; 690 691 /* 692 * Only allow socket with network configuration 693 * privilege to set the initial cwnd to be larger 694 * than allowed by RFC 3390. 695 */ 696 if (val > MIN(4, MAX(2, 4380 / tcp->tcp_mss))) { 697 if ((reterr = secpolicy_ip_config(cr, B_TRUE)) 698 != 0) { 699 *outlenp = 0; 700 return (reterr); 701 } 702 if (val > tcp_max_init_cwnd) { 703 *outlenp = 0; 704 return (EINVAL); 705 } 706 } 707 708 tcp->tcp_init_cwnd = val; 709 710 /* 711 * If the socket is connected, AND no outbound data 712 * has been sent, reset the actual cwnd values. 713 */ 714 if (tcp->tcp_state == TCPS_ESTABLISHED && 715 tcp->tcp_iss == tcp->tcp_snxt - 1) { 716 tcp->tcp_cwnd = 717 MIN(tcp->tcp_rwnd, val * tcp->tcp_mss); 718 } 719 break; 720 721 /* 722 * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD 723 * is in milliseconds. TCP_KEEPIDLE is introduced for 724 * compatibility with other Unix flavors. 725 * We can fall through TCP_KEEPALIVE_THRESHOLD logic after 726 * converting the input to milliseconds. 727 */ 728 case TCP_KEEPIDLE: 729 *i1 *= 1000; 730 /* FALLTHRU */ 731 732 case TCP_KEEPALIVE_THRESHOLD: 733 if (checkonly) 734 break; 735 736 if (*i1 < tcps->tcps_keepalive_interval_low || 737 *i1 > tcps->tcps_keepalive_interval_high) { 738 *outlenp = 0; 739 return (EINVAL); 740 } 741 if (*i1 != tcp->tcp_ka_interval) { 742 tcp->tcp_ka_interval = *i1; 743 /* 744 * Check if we need to restart the 745 * keepalive timer. 746 */ 747 if (tcp->tcp_ka_tid != 0) { 748 ASSERT(connp->conn_keepalive); 749 (void) TCP_TIMER_CANCEL(tcp, 750 tcp->tcp_ka_tid); 751 tcp->tcp_ka_last_intrvl = 0; 752 tcp->tcp_ka_tid = TCP_TIMER(tcp, 753 tcp_keepalive_timer, 754 tcp->tcp_ka_interval); 755 } 756 } 757 break; 758 759 /* 760 * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt. 761 * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the 762 * three members - tcp_ka_abort_thres, tcp_ka_rinterval and 763 * tcp_ka_cnt. 764 */ 765 case TCP_KEEPCNT: 766 if (checkonly) 767 break; 768 769 if (*i1 == 0) { 770 return (EINVAL); 771 } else if (tcp->tcp_ka_rinterval == 0) { 772 /* 773 * When TCP_KEEPCNT is specified without first 774 * specifying a TCP_KEEPINTVL, we infer an 775 * interval based on a tunable specific to our 776 * stack: the tcp_keepalive_abort_interval. 777 * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in 778 * the unlikely event that that has been set.) 779 * Given the abort interval's default value of 780 * 480 seconds, low TCP_KEEPCNT values can 781 * result in intervals that exceed the default 782 * maximum RTO of 60 seconds. Rather than 783 * fail in these cases, we (implicitly) clamp 784 * the interval at the maximum RTO; if the 785 * TCP_KEEPCNT is shortly followed by a 786 * TCP_KEEPINTVL (as we expect), the abort 787 * threshold will be recalculated correctly -- 788 * and if a TCP_KEEPINTVL is not forthcoming, 789 * keep-alive will at least operate reasonably 790 * given the underconfigured state. 791 */ 792 uint32_t interval; 793 794 interval = tcp->tcp_ka_abort_thres / *i1; 795 796 if (interval < tcp->tcp_rto_min) 797 interval = tcp->tcp_rto_min; 798 799 if (interval > tcp->tcp_rto_max) 800 interval = tcp->tcp_rto_max; 801 802 tcp->tcp_ka_rinterval = interval; 803 } else { 804 if ((*i1 * tcp->tcp_ka_rinterval) < 805 tcps->tcps_keepalive_abort_interval_low || 806 (*i1 * tcp->tcp_ka_rinterval) > 807 tcps->tcps_keepalive_abort_interval_high) 808 return (EINVAL); 809 tcp->tcp_ka_abort_thres = 810 (*i1 * tcp->tcp_ka_rinterval); 811 } 812 tcp->tcp_ka_cnt = *i1; 813 break; 814 case TCP_KEEPINTVL: 815 /* 816 * TCP_KEEPINTVL is specified in seconds, but 817 * tcp_ka_rinterval is in milliseconds. 818 */ 819 820 if (checkonly) 821 break; 822 823 if ((*i1 * 1000) < tcp->tcp_rto_min || 824 (*i1 * 1000) > tcp->tcp_rto_max) 825 return (EINVAL); 826 827 if (tcp->tcp_ka_cnt == 0) { 828 tcp->tcp_ka_cnt = 829 tcp->tcp_ka_abort_thres / (*i1 * 1000); 830 } else { 831 if ((*i1 * tcp->tcp_ka_cnt * 1000) < 832 tcps->tcps_keepalive_abort_interval_low || 833 (*i1 * tcp->tcp_ka_cnt * 1000) > 834 tcps->tcps_keepalive_abort_interval_high) 835 return (EINVAL); 836 tcp->tcp_ka_abort_thres = 837 (*i1 * tcp->tcp_ka_cnt * 1000); 838 } 839 tcp->tcp_ka_rinterval = *i1 * 1000; 840 break; 841 case TCP_KEEPALIVE_ABORT_THRESHOLD: 842 if (!checkonly) { 843 if (*i1 < 844 tcps->tcps_keepalive_abort_interval_low || 845 *i1 > 846 tcps->tcps_keepalive_abort_interval_high) { 847 *outlenp = 0; 848 return (EINVAL); 849 } 850 tcp->tcp_ka_abort_thres = *i1; 851 tcp->tcp_ka_cnt = 0; 852 tcp->tcp_ka_rinterval = 0; 853 } 854 break; 855 case TCP_CORK: 856 if (!checkonly) { 857 /* 858 * if tcp->tcp_cork was set and is now 859 * being unset, we have to make sure that 860 * the remaining data gets sent out. Also 861 * unset tcp->tcp_cork so that tcp_wput_data() 862 * can send data even if it is less than mss 863 */ 864 if (tcp->tcp_cork && onoff == 0 && 865 tcp->tcp_unsent > 0) { 866 tcp->tcp_cork = B_FALSE; 867 tcp_wput_data(tcp, NULL, B_FALSE); 868 } 869 tcp->tcp_cork = onoff; 870 } 871 break; 872 case TCP_RTO_INITIAL: { 873 clock_t rto; 874 875 if (checkonly || val == 0) 876 break; 877 878 /* 879 * Sanity checks 880 * 881 * The initial RTO should be bounded by the minimum 882 * and maximum RTO. And it should also be smaller 883 * than the connect attempt abort timeout. Otherwise, 884 * the connection won't be aborted in a period 885 * reasonably close to that timeout. 886 */ 887 if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max || 888 val > tcp->tcp_second_ctimer_threshold || 889 val < tcps->tcps_rexmit_interval_initial_low || 890 val > tcps->tcps_rexmit_interval_initial_high) { 891 *outlenp = 0; 892 return (EINVAL); 893 } 894 tcp->tcp_rto_initial = val; 895 896 /* 897 * If TCP has not sent anything, need to re-calculate 898 * tcp_rto. Otherwise, this option change does not 899 * really affect anything. 900 */ 901 if (tcp->tcp_state >= TCPS_SYN_SENT) 902 break; 903 904 tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2; 905 tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1; 906 rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 907 tcps->tcps_rexmit_interval_extra + 908 (tcp->tcp_rtt_sa >> 5) + 909 tcps->tcps_conn_grace_period; 910 TCP_SET_RTO(tcp, rto); 911 break; 912 } 913 case TCP_RTO_MIN: 914 if (checkonly || val == 0) 915 break; 916 917 if (val < tcps->tcps_rexmit_interval_min_low || 918 val > tcps->tcps_rexmit_interval_min_high || 919 val > tcp->tcp_rto_max) { 920 *outlenp = 0; 921 return (EINVAL); 922 } 923 tcp->tcp_rto_min = val; 924 if (tcp->tcp_rto < val) 925 tcp->tcp_rto = val; 926 break; 927 case TCP_RTO_MAX: 928 if (checkonly || val == 0) 929 break; 930 931 /* 932 * Sanity checks 933 * 934 * The maximum RTO should not be larger than the 935 * connection abort timeout. Otherwise, the 936 * connection won't be aborted in a period reasonably 937 * close to that timeout. 938 */ 939 if (val < tcps->tcps_rexmit_interval_max_low || 940 val > tcps->tcps_rexmit_interval_max_high || 941 val < tcp->tcp_rto_min || 942 val > tcp->tcp_second_timer_threshold) { 943 *outlenp = 0; 944 return (EINVAL); 945 } 946 tcp->tcp_rto_max = val; 947 if (tcp->tcp_rto > val) 948 tcp->tcp_rto = val; 949 break; 950 case TCP_LINGER2: 951 if (checkonly || *i1 == 0) 952 break; 953 954 /* 955 * Note that the option value's unit is second. And 956 * the value should be bigger than the private 957 * parameter tcp_fin_wait_2_flush_interval's lower 958 * bound and smaller than the current value of that 959 * parameter. It should be smaller than the current 960 * value to avoid an app setting TCP_LINGER2 to a big 961 * value, causing resource to be held up too long in 962 * FIN-WAIT-2 state. 963 */ 964 if (*i1 < 0 || 965 tcps->tcps_fin_wait_2_flush_interval_low/SECONDS > 966 *i1 || 967 tcps->tcps_fin_wait_2_flush_interval/SECONDS < 968 *i1) { 969 *outlenp = 0; 970 return (EINVAL); 971 } 972 tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS; 973 break; 974 default: 975 break; 976 } 977 break; 978 case IPPROTO_IP: 979 if (connp->conn_family != AF_INET) { 980 *outlenp = 0; 981 return (EINVAL); 982 } 983 switch (name) { 984 case IP_SEC_OPT: 985 /* 986 * We should not allow policy setting after 987 * we start listening for connections. 988 */ 989 if (tcp->tcp_state == TCPS_LISTEN) { 990 return (EINVAL); 991 } 992 break; 993 } 994 break; 995 case IPPROTO_IPV6: 996 /* 997 * IPPROTO_IPV6 options are only supported for sockets 998 * that are using IPv6 on the wire. 999 */ 1000 if (connp->conn_ipversion != IPV6_VERSION) { 1001 *outlenp = 0; 1002 return (EINVAL); 1003 } 1004 1005 switch (name) { 1006 case IPV6_RECVPKTINFO: 1007 if (!checkonly) { 1008 /* Force it to be sent up with the next msg */ 1009 tcp->tcp_recvifindex = 0; 1010 } 1011 break; 1012 case IPV6_RECVTCLASS: 1013 if (!checkonly) { 1014 /* Force it to be sent up with the next msg */ 1015 tcp->tcp_recvtclass = 0xffffffffU; 1016 } 1017 break; 1018 case IPV6_RECVHOPLIMIT: 1019 if (!checkonly) { 1020 /* Force it to be sent up with the next msg */ 1021 tcp->tcp_recvhops = 0xffffffffU; 1022 } 1023 break; 1024 case IPV6_PKTINFO: 1025 /* This is an extra check for TCP */ 1026 if (inlen == sizeof (struct in6_pktinfo)) { 1027 struct in6_pktinfo *pkti; 1028 1029 pkti = (struct in6_pktinfo *)invalp; 1030 /* 1031 * RFC 3542 states that ipi6_addr must be 1032 * the unspecified address when setting the 1033 * IPV6_PKTINFO sticky socket option on a 1034 * TCP socket. 1035 */ 1036 if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr)) 1037 return (EINVAL); 1038 } 1039 break; 1040 case IPV6_SEC_OPT: 1041 /* 1042 * We should not allow policy setting after 1043 * we start listening for connections. 1044 */ 1045 if (tcp->tcp_state == TCPS_LISTEN) { 1046 return (EINVAL); 1047 } 1048 break; 1049 } 1050 break; 1051 } 1052 reterr = conn_opt_set(&coas, level, name, inlen, invalp, 1053 checkonly, cr); 1054 if (reterr != 0) { 1055 *outlenp = 0; 1056 return (reterr); 1057 } 1058 1059 /* 1060 * Common case of OK return with outval same as inval 1061 */ 1062 if (invalp != outvalp) { 1063 /* don't trust bcopy for identical src/dst */ 1064 (void) bcopy(invalp, outvalp, inlen); 1065 } 1066 *outlenp = inlen; 1067 1068 if (coas.coa_changed & COA_HEADER_CHANGED) { 1069 /* If we are connected we rebuilt the headers */ 1070 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && 1071 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 1072 reterr = tcp_build_hdrs(tcp); 1073 if (reterr != 0) 1074 return (reterr); 1075 } 1076 } 1077 if (coas.coa_changed & COA_ROUTE_CHANGED) { 1078 in6_addr_t nexthop; 1079 1080 /* 1081 * If we are connected we re-cache the information. 1082 * We ignore errors to preserve BSD behavior. 1083 * Note that we don't redo IPsec policy lookup here 1084 * since the final destination (or source) didn't change. 1085 */ 1086 ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa, 1087 &connp->conn_faddr_v6, &nexthop); 1088 1089 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && 1090 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 1091 (void) ip_attr_connect(connp, connp->conn_ixa, 1092 &connp->conn_laddr_v6, &connp->conn_faddr_v6, 1093 &nexthop, connp->conn_fport, NULL, NULL, 1094 IPDF_VERIFY_DST); 1095 } 1096 } 1097 if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) { 1098 connp->conn_wq->q_hiwat = connp->conn_sndbuf; 1099 } 1100 if (coas.coa_changed & COA_WROFF_CHANGED) { 1101 connp->conn_wroff = connp->conn_ht_iphc_allocated + 1102 tcps->tcps_wroff_xtra; 1103 (void) proto_set_tx_wroff(connp->conn_rq, connp, 1104 connp->conn_wroff); 1105 } 1106 if (coas.coa_changed & COA_OOBINLINE_CHANGED) { 1107 if (IPCL_IS_NONSTR(connp)) 1108 proto_set_rx_oob_opt(connp, onoff); 1109 } 1110 return (0); 1111 }