1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/stream.h> 28 #define _SUN_TPI_VERSION 2 29 #include <sys/tihdr.h> 30 #include <sys/socket.h> 31 #include <sys/xti_xtiopt.h> 32 #include <sys/xti_inet.h> 33 #include <sys/policy.h> 34 35 #include <inet/common.h> 36 #include <netinet/ip6.h> 37 #include <inet/ip.h> 38 39 #include <netinet/in.h> 40 #include <netinet/tcp.h> 41 #include <inet/optcom.h> 42 #include <inet/proto_set.h> 43 #include <inet/tcp_impl.h> 44 45 /* 46 * Table of all known options handled on a TCP protocol stack. 47 * 48 * Note: This table contains options processed by both TCP and IP levels 49 * and is the superset of options that can be performed on a TCP over IP 50 * stack. 51 */ 52 opdes_t tcp_opt_arr[] = { 53 54 { SO_LINGER, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, 55 sizeof (struct linger), 0 }, 56 57 { SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 58 { SO_KEEPALIVE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 59 { SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 60 { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 61 }, 62 { SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 63 { SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 64 { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 65 { SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, 66 { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 67 { SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 68 { SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, 69 sizeof (struct timeval), 0 }, 70 { SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, 71 sizeof (struct timeval), 0 }, 72 { SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 73 }, 74 { SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 75 { SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 76 0 }, 77 { SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 78 0 }, 79 { SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 80 0 }, 81 { SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 82 0 }, 83 { SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 84 85 { SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, 86 87 { SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, 88 89 { TCP_NODELAY, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 90 }, 91 { TCP_MAXSEG, IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t), 92 536 }, 93 94 { TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 95 OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, 96 97 { TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 98 OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, 99 100 { TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 101 OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, 102 103 { TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 104 OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, 105 106 { TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 107 0 }, 108 109 { TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0, 110 sizeof (int), 0 }, 111 112 { TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 113 }, 114 115 { TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0, 116 sizeof (int), 0 }, 117 118 { TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, 119 sizeof (int), 0 }, 120 121 { TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 122 123 { TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 124 125 { TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 126 127 { TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, 128 sizeof (int), 0 }, 129 130 { TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 131 132 { TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 }, 133 134 { TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 }, 135 136 { TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 }, 137 138 { TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 139 140 { IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 141 (OP_VARLEN|OP_NODEFAULT), 142 IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, 143 { T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 144 (OP_VARLEN|OP_NODEFAULT), 145 IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, 146 147 { IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 148 { T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 149 { IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN, 150 sizeof (int), -1 /* not initialized */ }, 151 152 { IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT, 153 sizeof (ipsec_req_t), -1 /* not initialized */ }, 154 155 { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, 156 sizeof (int), 0 /* no ifindex */ }, 157 158 { IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0, 159 sizeof (int), 0 }, 160 161 { IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN, 162 sizeof (int), -1 /* not initialized */ }, 163 164 { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 165 sizeof (int), 0 /* no ifindex */ }, 166 167 { IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 168 169 { IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0, 170 sizeof (in_addr_t), -1 /* not initialized */ }, 171 172 { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0, 173 sizeof (int), 0 }, 174 175 { IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 176 (OP_NODEFAULT|OP_VARLEN), 177 sizeof (struct in6_pktinfo), -1 /* not initialized */ }, 178 { IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 179 OP_NODEFAULT, 180 sizeof (sin6_t), -1 /* not initialized */ }, 181 { IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 182 (OP_VARLEN|OP_NODEFAULT), 255*8, 183 -1 /* not initialized */ }, 184 { IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 185 (OP_VARLEN|OP_NODEFAULT), 255*8, 186 -1 /* not initialized */ }, 187 { IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 188 (OP_VARLEN|OP_NODEFAULT), 255*8, 189 -1 /* not initialized */ }, 190 { IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 191 (OP_VARLEN|OP_NODEFAULT), 255*8, 192 -1 /* not initialized */ }, 193 { IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 194 OP_NODEFAULT, 195 sizeof (int), -1 /* not initialized */ }, 196 { IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 197 OP_NODEFAULT, 198 sizeof (struct ip6_mtuinfo), -1 /* not initialized */ }, 199 { IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 200 sizeof (int), 0 }, 201 { IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 202 sizeof (int), 0 }, 203 { IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 204 sizeof (int), 0 }, 205 206 /* Enable receipt of ancillary data */ 207 { IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 208 sizeof (int), 0 }, 209 { IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 210 sizeof (int), 0 }, 211 { IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 212 sizeof (int), 0 }, 213 { _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 214 sizeof (int), 0 }, 215 { IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 216 sizeof (int), 0 }, 217 { IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 218 sizeof (int), 0 }, 219 { IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 220 sizeof (int), 0 }, 221 { IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 222 sizeof (int), 0 }, 223 224 { IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT, 225 sizeof (ipsec_req_t), -1 /* not initialized */ }, 226 { IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 227 sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT }, 228 }; 229 230 /* 231 * Table of all supported levels 232 * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have 233 * any supported options so we need this info separately. 234 * 235 * This is needed only for topmost tpi providers and is used only by 236 * XTI interfaces. 237 */ 238 optlevel_t tcp_valid_levels_arr[] = { 239 XTI_GENERIC, 240 SOL_SOCKET, 241 IPPROTO_TCP, 242 IPPROTO_IP, 243 IPPROTO_IPV6 244 }; 245 246 247 #define TCP_OPT_ARR_CNT A_CNT(tcp_opt_arr) 248 #define TCP_VALID_LEVELS_CNT A_CNT(tcp_valid_levels_arr) 249 250 uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */ 251 252 /* 253 * Initialize option database object for TCP 254 * 255 * This object represents database of options to search passed to 256 * {sock,tpi}optcom_req() interface routine to take care of option 257 * management and associated methods. 258 */ 259 260 optdb_obj_t tcp_opt_obj = { 261 tcp_opt_default, /* TCP default value function pointer */ 262 tcp_tpi_opt_get, /* TCP get function pointer */ 263 tcp_tpi_opt_set, /* TCP set function pointer */ 264 TCP_OPT_ARR_CNT, /* TCP option database count of entries */ 265 tcp_opt_arr, /* TCP option database */ 266 TCP_VALID_LEVELS_CNT, /* TCP valid level count of entries */ 267 tcp_valid_levels_arr /* TCP valid level array */ 268 }; 269 270 /* Maximum TCP initial cwin (start/restart). */ 271 #define TCP_MAX_INIT_CWND 16 272 273 static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND; 274 275 /* 276 * Some TCP options can be "set" by requesting them in the option 277 * buffer. This is needed for XTI feature test though we do not 278 * allow it in general. We interpret that this mechanism is more 279 * applicable to OSI protocols and need not be allowed in general. 280 * This routine filters out options for which it is not allowed (most) 281 * and lets through those (few) for which it is. [ The XTI interface 282 * test suite specifics will imply that any XTI_GENERIC level XTI_* if 283 * ever implemented will have to be allowed here ]. 284 */ 285 static boolean_t 286 tcp_allow_connopt_set(int level, int name) 287 { 288 289 switch (level) { 290 case IPPROTO_TCP: 291 switch (name) { 292 case TCP_NODELAY: 293 return (B_TRUE); 294 default: 295 return (B_FALSE); 296 } 297 /*NOTREACHED*/ 298 default: 299 return (B_FALSE); 300 } 301 /*NOTREACHED*/ 302 } 303 304 /* 305 * This routine gets default values of certain options whose default 306 * values are maintained by protocol specific code 307 */ 308 /* ARGSUSED */ 309 int 310 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr) 311 { 312 int32_t *i1 = (int32_t *)ptr; 313 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 314 315 switch (level) { 316 case IPPROTO_TCP: 317 switch (name) { 318 case TCP_NOTIFY_THRESHOLD: 319 *i1 = tcps->tcps_ip_notify_interval; 320 break; 321 case TCP_ABORT_THRESHOLD: 322 *i1 = tcps->tcps_ip_abort_interval; 323 break; 324 case TCP_CONN_NOTIFY_THRESHOLD: 325 *i1 = tcps->tcps_ip_notify_cinterval; 326 break; 327 case TCP_CONN_ABORT_THRESHOLD: 328 *i1 = tcps->tcps_ip_abort_cinterval; 329 break; 330 default: 331 return (-1); 332 } 333 break; 334 case IPPROTO_IP: 335 switch (name) { 336 case IP_TTL: 337 *i1 = tcps->tcps_ipv4_ttl; 338 break; 339 default: 340 return (-1); 341 } 342 break; 343 case IPPROTO_IPV6: 344 switch (name) { 345 case IPV6_UNICAST_HOPS: 346 *i1 = tcps->tcps_ipv6_hoplimit; 347 break; 348 default: 349 return (-1); 350 } 351 break; 352 default: 353 return (-1); 354 } 355 return (sizeof (int)); 356 } 357 358 /* 359 * TCP routine to get the values of options. 360 */ 361 int 362 tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) 363 { 364 int *i1 = (int *)ptr; 365 tcp_t *tcp = connp->conn_tcp; 366 conn_opt_arg_t coas; 367 int retval; 368 369 coas.coa_connp = connp; 370 coas.coa_ixa = connp->conn_ixa; 371 coas.coa_ipp = &connp->conn_xmit_ipp; 372 coas.coa_ancillary = B_FALSE; 373 coas.coa_changed = 0; 374 375 switch (level) { 376 case SOL_SOCKET: 377 switch (name) { 378 case SO_SND_COPYAVOID: 379 *i1 = tcp->tcp_snd_zcopy_on ? 380 SO_SND_COPYAVOID : 0; 381 return (sizeof (int)); 382 case SO_ACCEPTCONN: 383 *i1 = (tcp->tcp_state == TCPS_LISTEN); 384 return (sizeof (int)); 385 } 386 break; 387 case IPPROTO_TCP: 388 switch (name) { 389 case TCP_NODELAY: 390 *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0; 391 return (sizeof (int)); 392 case TCP_MAXSEG: 393 *i1 = tcp->tcp_mss; 394 return (sizeof (int)); 395 case TCP_NOTIFY_THRESHOLD: 396 *i1 = (int)tcp->tcp_first_timer_threshold; 397 return (sizeof (int)); 398 case TCP_ABORT_THRESHOLD: 399 *i1 = tcp->tcp_second_timer_threshold; 400 return (sizeof (int)); 401 case TCP_CONN_NOTIFY_THRESHOLD: 402 *i1 = tcp->tcp_first_ctimer_threshold; 403 return (sizeof (int)); 404 case TCP_CONN_ABORT_THRESHOLD: 405 *i1 = tcp->tcp_second_ctimer_threshold; 406 return (sizeof (int)); 407 case TCP_INIT_CWND: 408 *i1 = tcp->tcp_init_cwnd; 409 return (sizeof (int)); 410 case TCP_KEEPALIVE_THRESHOLD: 411 *i1 = tcp->tcp_ka_interval; 412 return (sizeof (int)); 413 414 /* 415 * TCP_KEEPIDLE expects value in seconds, but 416 * tcp_ka_interval is in milliseconds. 417 */ 418 case TCP_KEEPIDLE: 419 *i1 = tcp->tcp_ka_interval / 1000; 420 return (sizeof (int)); 421 case TCP_KEEPCNT: 422 *i1 = tcp->tcp_ka_cnt; 423 return (sizeof (int)); 424 425 /* 426 * TCP_KEEPINTVL expects value in seconds, but 427 * tcp_ka_rinterval is in milliseconds. 428 */ 429 case TCP_KEEPINTVL: 430 *i1 = tcp->tcp_ka_rinterval / 1000; 431 return (sizeof (int)); 432 case TCP_KEEPALIVE_ABORT_THRESHOLD: 433 *i1 = tcp->tcp_ka_abort_thres; 434 return (sizeof (int)); 435 case TCP_CORK: 436 *i1 = tcp->tcp_cork; 437 return (sizeof (int)); 438 case TCP_RTO_INITIAL: 439 *i1 = tcp->tcp_rto_initial; 440 return (sizeof (uint32_t)); 441 case TCP_RTO_MIN: 442 *i1 = tcp->tcp_rto_min; 443 return (sizeof (uint32_t)); 444 case TCP_RTO_MAX: 445 *i1 = tcp->tcp_rto_max; 446 return (sizeof (uint32_t)); 447 case TCP_LINGER2: 448 *i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS; 449 return (sizeof (int)); 450 } 451 break; 452 case IPPROTO_IP: 453 if (connp->conn_family != AF_INET) 454 return (-1); 455 switch (name) { 456 case IP_OPTIONS: 457 case T_IP_OPTIONS: 458 /* Caller ensures enough space */ 459 return (ip_opt_get_user(connp, ptr)); 460 default: 461 break; 462 } 463 break; 464 465 case IPPROTO_IPV6: 466 /* 467 * IPPROTO_IPV6 options are only supported for sockets 468 * that are using IPv6 on the wire. 469 */ 470 if (connp->conn_ipversion != IPV6_VERSION) { 471 return (-1); 472 } 473 switch (name) { 474 case IPV6_PATHMTU: 475 if (tcp->tcp_state < TCPS_ESTABLISHED) 476 return (-1); 477 break; 478 } 479 break; 480 } 481 mutex_enter(&connp->conn_lock); 482 retval = conn_opt_get(&coas, level, name, ptr); 483 mutex_exit(&connp->conn_lock); 484 return (retval); 485 } 486 487 /* 488 * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements. 489 * Parameters are assumed to be verified by the caller. 490 */ 491 /* ARGSUSED */ 492 int 493 tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, 494 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 495 void *thisdg_attrs, cred_t *cr) 496 { 497 tcp_t *tcp = connp->conn_tcp; 498 int *i1 = (int *)invalp; 499 boolean_t onoff = (*i1 == 0) ? 0 : 1; 500 boolean_t checkonly; 501 int reterr; 502 tcp_stack_t *tcps = tcp->tcp_tcps; 503 conn_opt_arg_t coas; 504 uint32_t val = *((uint32_t *)invalp); 505 506 coas.coa_connp = connp; 507 coas.coa_ixa = connp->conn_ixa; 508 coas.coa_ipp = &connp->conn_xmit_ipp; 509 coas.coa_ancillary = B_FALSE; 510 coas.coa_changed = 0; 511 512 switch (optset_context) { 513 case SETFN_OPTCOM_CHECKONLY: 514 checkonly = B_TRUE; 515 /* 516 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ 517 * inlen != 0 implies value supplied and 518 * we have to "pretend" to set it. 519 * inlen == 0 implies that there is no 520 * value part in T_CHECK request and just validation 521 * done elsewhere should be enough, we just return here. 522 */ 523 if (inlen == 0) { 524 *outlenp = 0; 525 return (0); 526 } 527 break; 528 case SETFN_OPTCOM_NEGOTIATE: 529 checkonly = B_FALSE; 530 break; 531 case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */ 532 case SETFN_CONN_NEGOTIATE: 533 checkonly = B_FALSE; 534 /* 535 * Negotiating local and "association-related" options 536 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ) 537 * primitives is allowed by XTI, but we choose 538 * to not implement this style negotiation for Internet 539 * protocols (We interpret it is a must for OSI world but 540 * optional for Internet protocols) for all options. 541 * [ Will do only for the few options that enable test 542 * suites that our XTI implementation of this feature 543 * works for transports that do allow it ] 544 */ 545 if (!tcp_allow_connopt_set(level, name)) { 546 *outlenp = 0; 547 return (EINVAL); 548 } 549 break; 550 default: 551 /* 552 * We should never get here 553 */ 554 *outlenp = 0; 555 return (EINVAL); 556 } 557 558 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || 559 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); 560 561 /* 562 * For TCP, we should have no ancillary data sent down 563 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs 564 * has to be zero. 565 */ 566 ASSERT(thisdg_attrs == NULL); 567 568 /* 569 * For fixed length options, no sanity check 570 * of passed in length is done. It is assumed *_optcom_req() 571 * routines do the right thing. 572 */ 573 switch (level) { 574 case SOL_SOCKET: 575 switch (name) { 576 case SO_KEEPALIVE: 577 if (checkonly) { 578 /* check only case */ 579 break; 580 } 581 582 if (!onoff) { 583 if (connp->conn_keepalive) { 584 if (tcp->tcp_ka_tid != 0) { 585 (void) TCP_TIMER_CANCEL(tcp, 586 tcp->tcp_ka_tid); 587 tcp->tcp_ka_tid = 0; 588 } 589 connp->conn_keepalive = 0; 590 } 591 break; 592 } 593 if (!connp->conn_keepalive) { 594 /* Crank up the keepalive timer */ 595 tcp->tcp_ka_last_intrvl = 0; 596 tcp->tcp_ka_tid = TCP_TIMER(tcp, 597 tcp_keepalive_timer, tcp->tcp_ka_interval); 598 connp->conn_keepalive = 1; 599 } 600 break; 601 case SO_SNDBUF: { 602 if (*i1 > tcps->tcps_max_buf) { 603 *outlenp = 0; 604 return (ENOBUFS); 605 } 606 if (checkonly) 607 break; 608 609 connp->conn_sndbuf = *i1; 610 if (tcps->tcps_snd_lowat_fraction != 0) { 611 connp->conn_sndlowat = connp->conn_sndbuf / 612 tcps->tcps_snd_lowat_fraction; 613 } 614 (void) tcp_maxpsz_set(tcp, B_TRUE); 615 /* 616 * If we are flow-controlled, recheck the condition. 617 * There are apps that increase SO_SNDBUF size when 618 * flow-controlled (EWOULDBLOCK), and expect the flow 619 * control condition to be lifted right away. 620 */ 621 mutex_enter(&tcp->tcp_non_sq_lock); 622 if (tcp->tcp_flow_stopped && 623 TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) { 624 tcp_clrqfull(tcp); 625 } 626 mutex_exit(&tcp->tcp_non_sq_lock); 627 *outlenp = inlen; 628 return (0); 629 } 630 case SO_RCVBUF: 631 if (*i1 > tcps->tcps_max_buf) { 632 *outlenp = 0; 633 return (ENOBUFS); 634 } 635 /* Silently ignore zero */ 636 if (!checkonly && *i1 != 0) { 637 *i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss); 638 (void) tcp_rwnd_set(tcp, *i1); 639 } 640 /* 641 * XXX should we return the rwnd here 642 * and tcp_opt_get ? 643 */ 644 *outlenp = inlen; 645 return (0); 646 case SO_SND_COPYAVOID: 647 if (!checkonly) { 648 if (tcp->tcp_loopback || 649 (onoff != 1) || !tcp_zcopy_check(tcp)) { 650 *outlenp = 0; 651 return (EOPNOTSUPP); 652 } 653 tcp->tcp_snd_zcopy_aware = 1; 654 } 655 *outlenp = inlen; 656 return (0); 657 } 658 break; 659 case IPPROTO_TCP: 660 switch (name) { 661 case TCP_NODELAY: 662 if (!checkonly) 663 tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss; 664 break; 665 case TCP_NOTIFY_THRESHOLD: 666 if (!checkonly) 667 tcp->tcp_first_timer_threshold = *i1; 668 break; 669 case TCP_ABORT_THRESHOLD: 670 if (!checkonly) 671 tcp->tcp_second_timer_threshold = *i1; 672 break; 673 case TCP_CONN_NOTIFY_THRESHOLD: 674 if (!checkonly) 675 tcp->tcp_first_ctimer_threshold = *i1; 676 break; 677 case TCP_CONN_ABORT_THRESHOLD: 678 if (!checkonly) 679 tcp->tcp_second_ctimer_threshold = *i1; 680 break; 681 case TCP_RECVDSTADDR: 682 if (tcp->tcp_state > TCPS_LISTEN) { 683 *outlenp = 0; 684 return (EOPNOTSUPP); 685 } 686 /* Setting done in conn_opt_set */ 687 break; 688 case TCP_INIT_CWND: 689 if (checkonly) 690 break; 691 692 /* 693 * Only allow socket with network configuration 694 * privilege to set the initial cwnd to be larger 695 * than allowed by RFC 3390. 696 */ 697 if (val > MIN(4, MAX(2, 4380 / tcp->tcp_mss))) { 698 if ((reterr = secpolicy_ip_config(cr, B_TRUE)) 699 != 0) { 700 *outlenp = 0; 701 return (reterr); 702 } 703 if (val > tcp_max_init_cwnd) { 704 *outlenp = 0; 705 return (EINVAL); 706 } 707 } 708 709 tcp->tcp_init_cwnd = val; 710 711 /* 712 * If the socket is connected, AND no outbound data 713 * has been sent, reset the actual cwnd values. 714 */ 715 if (tcp->tcp_state == TCPS_ESTABLISHED && 716 tcp->tcp_iss == tcp->tcp_snxt - 1) { 717 tcp->tcp_cwnd = 718 MIN(tcp->tcp_rwnd, val * tcp->tcp_mss); 719 } 720 break; 721 722 /* 723 * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD 724 * is in milliseconds. TCP_KEEPIDLE is introduced for 725 * compatibility with other Unix flavors. 726 * We can fall through TCP_KEEPALIVE_THRESHOLD logic after 727 * converting the input to milliseconds. 728 */ 729 case TCP_KEEPIDLE: 730 *i1 *= 1000; 731 /* FALLTHRU */ 732 733 case TCP_KEEPALIVE_THRESHOLD: 734 if (checkonly) 735 break; 736 737 if (*i1 < tcps->tcps_keepalive_interval_low || 738 *i1 > tcps->tcps_keepalive_interval_high) { 739 *outlenp = 0; 740 return (EINVAL); 741 } 742 if (*i1 != tcp->tcp_ka_interval) { 743 tcp->tcp_ka_interval = *i1; 744 /* 745 * Check if we need to restart the 746 * keepalive timer. 747 */ 748 if (tcp->tcp_ka_tid != 0) { 749 ASSERT(connp->conn_keepalive); 750 (void) TCP_TIMER_CANCEL(tcp, 751 tcp->tcp_ka_tid); 752 tcp->tcp_ka_last_intrvl = 0; 753 tcp->tcp_ka_tid = TCP_TIMER(tcp, 754 tcp_keepalive_timer, 755 tcp->tcp_ka_interval); 756 } 757 } 758 break; 759 760 /* 761 * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt. 762 * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the 763 * three members - tcp_ka_abort_thres, tcp_ka_rinterval and 764 * tcp_ka_cnt. 765 */ 766 case TCP_KEEPCNT: 767 if (checkonly) 768 break; 769 770 if (*i1 == 0) { 771 return (EINVAL); 772 } else if (tcp->tcp_ka_rinterval == 0) { 773 if ((tcp->tcp_ka_abort_thres / *i1) < 774 tcp->tcp_rto_min || 775 (tcp->tcp_ka_abort_thres / *i1) > 776 tcp->tcp_rto_max) 777 return (EINVAL); 778 779 tcp->tcp_ka_rinterval = 780 tcp->tcp_ka_abort_thres / *i1; 781 } else { 782 if ((*i1 * tcp->tcp_ka_rinterval) < 783 tcps->tcps_keepalive_abort_interval_low || 784 (*i1 * tcp->tcp_ka_rinterval) > 785 tcps->tcps_keepalive_abort_interval_high) 786 return (EINVAL); 787 tcp->tcp_ka_abort_thres = 788 (*i1 * tcp->tcp_ka_rinterval); 789 } 790 tcp->tcp_ka_cnt = *i1; 791 break; 792 case TCP_KEEPINTVL: 793 /* 794 * TCP_KEEPINTVL is specified in seconds, but 795 * tcp_ka_rinterval is in milliseconds. 796 */ 797 798 if (checkonly) 799 break; 800 801 if ((*i1 * 1000) < tcp->tcp_rto_min || 802 (*i1 * 1000) > tcp->tcp_rto_max) 803 return (EINVAL); 804 805 if (tcp->tcp_ka_cnt == 0) { 806 tcp->tcp_ka_cnt = 807 tcp->tcp_ka_abort_thres / (*i1 * 1000); 808 } else { 809 if ((*i1 * tcp->tcp_ka_cnt * 1000) < 810 tcps->tcps_keepalive_abort_interval_low || 811 (*i1 * tcp->tcp_ka_cnt * 1000) > 812 tcps->tcps_keepalive_abort_interval_high) 813 return (EINVAL); 814 tcp->tcp_ka_abort_thres = 815 (*i1 * tcp->tcp_ka_cnt * 1000); 816 } 817 tcp->tcp_ka_rinterval = *i1 * 1000; 818 break; 819 case TCP_KEEPALIVE_ABORT_THRESHOLD: 820 if (!checkonly) { 821 if (*i1 < 822 tcps->tcps_keepalive_abort_interval_low || 823 *i1 > 824 tcps->tcps_keepalive_abort_interval_high) { 825 *outlenp = 0; 826 return (EINVAL); 827 } 828 tcp->tcp_ka_abort_thres = *i1; 829 tcp->tcp_ka_cnt = 0; 830 tcp->tcp_ka_rinterval = 0; 831 } 832 break; 833 case TCP_CORK: 834 if (!checkonly) { 835 /* 836 * if tcp->tcp_cork was set and is now 837 * being unset, we have to make sure that 838 * the remaining data gets sent out. Also 839 * unset tcp->tcp_cork so that tcp_wput_data() 840 * can send data even if it is less than mss 841 */ 842 if (tcp->tcp_cork && onoff == 0 && 843 tcp->tcp_unsent > 0) { 844 tcp->tcp_cork = B_FALSE; 845 tcp_wput_data(tcp, NULL, B_FALSE); 846 } 847 tcp->tcp_cork = onoff; 848 } 849 break; 850 case TCP_RTO_INITIAL: { 851 clock_t rto; 852 853 if (checkonly || val == 0) 854 break; 855 856 /* 857 * Sanity checks 858 * 859 * The initial RTO should be bounded by the minimum 860 * and maximum RTO. And it should also be smaller 861 * than the connect attempt abort timeout. Otherwise, 862 * the connection won't be aborted in a period 863 * reasonably close to that timeout. 864 */ 865 if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max || 866 val > tcp->tcp_second_ctimer_threshold || 867 val < tcps->tcps_rexmit_interval_initial_low || 868 val > tcps->tcps_rexmit_interval_initial_high) { 869 *outlenp = 0; 870 return (EINVAL); 871 } 872 tcp->tcp_rto_initial = val; 873 874 /* 875 * If TCP has not sent anything, need to re-calculate 876 * tcp_rto. Otherwise, this option change does not 877 * really affect anything. 878 */ 879 if (tcp->tcp_state >= TCPS_SYN_SENT) 880 break; 881 882 tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2; 883 tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1; 884 rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 885 tcps->tcps_rexmit_interval_extra + 886 (tcp->tcp_rtt_sa >> 5) + 887 tcps->tcps_conn_grace_period; 888 TCP_SET_RTO(tcp, rto); 889 break; 890 } 891 case TCP_RTO_MIN: 892 if (checkonly || val == 0) 893 break; 894 895 if (val < tcps->tcps_rexmit_interval_min_low || 896 val > tcps->tcps_rexmit_interval_min_high || 897 val > tcp->tcp_rto_max) { 898 *outlenp = 0; 899 return (EINVAL); 900 } 901 tcp->tcp_rto_min = val; 902 if (tcp->tcp_rto < val) 903 tcp->tcp_rto = val; 904 break; 905 case TCP_RTO_MAX: 906 if (checkonly || val == 0) 907 break; 908 909 /* 910 * Sanity checks 911 * 912 * The maximum RTO should not be larger than the 913 * connection abort timeout. Otherwise, the 914 * connection won't be aborted in a period reasonably 915 * close to that timeout. 916 */ 917 if (val < tcps->tcps_rexmit_interval_max_low || 918 val > tcps->tcps_rexmit_interval_max_high || 919 val < tcp->tcp_rto_min || 920 val > tcp->tcp_second_timer_threshold) { 921 *outlenp = 0; 922 return (EINVAL); 923 } 924 tcp->tcp_rto_max = val; 925 if (tcp->tcp_rto > val) 926 tcp->tcp_rto = val; 927 break; 928 case TCP_LINGER2: 929 if (checkonly || *i1 == 0) 930 break; 931 932 /* 933 * Note that the option value's unit is second. And 934 * the value should be bigger than the private 935 * parameter tcp_fin_wait_2_flush_interval's lower 936 * bound and smaller than the current value of that 937 * parameter. It should be smaller than the current 938 * value to avoid an app setting TCP_LINGER2 to a big 939 * value, causing resource to be held up too long in 940 * FIN-WAIT-2 state. 941 */ 942 if (*i1 < 0 || 943 tcps->tcps_fin_wait_2_flush_interval_low/SECONDS > 944 *i1 || 945 tcps->tcps_fin_wait_2_flush_interval/SECONDS < 946 *i1) { 947 *outlenp = 0; 948 return (EINVAL); 949 } 950 tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS; 951 break; 952 default: 953 break; 954 } 955 break; 956 case IPPROTO_IP: 957 if (connp->conn_family != AF_INET) { 958 *outlenp = 0; 959 return (EINVAL); 960 } 961 switch (name) { 962 case IP_SEC_OPT: 963 /* 964 * We should not allow policy setting after 965 * we start listening for connections. 966 */ 967 if (tcp->tcp_state == TCPS_LISTEN) { 968 return (EINVAL); 969 } 970 break; 971 } 972 break; 973 case IPPROTO_IPV6: 974 /* 975 * IPPROTO_IPV6 options are only supported for sockets 976 * that are using IPv6 on the wire. 977 */ 978 if (connp->conn_ipversion != IPV6_VERSION) { 979 *outlenp = 0; 980 return (EINVAL); 981 } 982 983 switch (name) { 984 case IPV6_RECVPKTINFO: 985 if (!checkonly) { 986 /* Force it to be sent up with the next msg */ 987 tcp->tcp_recvifindex = 0; 988 } 989 break; 990 case IPV6_RECVTCLASS: 991 if (!checkonly) { 992 /* Force it to be sent up with the next msg */ 993 tcp->tcp_recvtclass = 0xffffffffU; 994 } 995 break; 996 case IPV6_RECVHOPLIMIT: 997 if (!checkonly) { 998 /* Force it to be sent up with the next msg */ 999 tcp->tcp_recvhops = 0xffffffffU; 1000 } 1001 break; 1002 case IPV6_PKTINFO: 1003 /* This is an extra check for TCP */ 1004 if (inlen == sizeof (struct in6_pktinfo)) { 1005 struct in6_pktinfo *pkti; 1006 1007 pkti = (struct in6_pktinfo *)invalp; 1008 /* 1009 * RFC 3542 states that ipi6_addr must be 1010 * the unspecified address when setting the 1011 * IPV6_PKTINFO sticky socket option on a 1012 * TCP socket. 1013 */ 1014 if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr)) 1015 return (EINVAL); 1016 } 1017 break; 1018 case IPV6_SEC_OPT: 1019 /* 1020 * We should not allow policy setting after 1021 * we start listening for connections. 1022 */ 1023 if (tcp->tcp_state == TCPS_LISTEN) { 1024 return (EINVAL); 1025 } 1026 break; 1027 } 1028 break; 1029 } 1030 reterr = conn_opt_set(&coas, level, name, inlen, invalp, 1031 checkonly, cr); 1032 if (reterr != 0) { 1033 *outlenp = 0; 1034 return (reterr); 1035 } 1036 1037 /* 1038 * Common case of OK return with outval same as inval 1039 */ 1040 if (invalp != outvalp) { 1041 /* don't trust bcopy for identical src/dst */ 1042 (void) bcopy(invalp, outvalp, inlen); 1043 } 1044 *outlenp = inlen; 1045 1046 if (coas.coa_changed & COA_HEADER_CHANGED) { 1047 /* If we are connected we rebuilt the headers */ 1048 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && 1049 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 1050 reterr = tcp_build_hdrs(tcp); 1051 if (reterr != 0) 1052 return (reterr); 1053 } 1054 } 1055 if (coas.coa_changed & COA_ROUTE_CHANGED) { 1056 in6_addr_t nexthop; 1057 1058 /* 1059 * If we are connected we re-cache the information. 1060 * We ignore errors to preserve BSD behavior. 1061 * Note that we don't redo IPsec policy lookup here 1062 * since the final destination (or source) didn't change. 1063 */ 1064 ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa, 1065 &connp->conn_faddr_v6, &nexthop); 1066 1067 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && 1068 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 1069 (void) ip_attr_connect(connp, connp->conn_ixa, 1070 &connp->conn_laddr_v6, &connp->conn_faddr_v6, 1071 &nexthop, connp->conn_fport, NULL, NULL, 1072 IPDF_VERIFY_DST); 1073 } 1074 } 1075 if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) { 1076 connp->conn_wq->q_hiwat = connp->conn_sndbuf; 1077 } 1078 if (coas.coa_changed & COA_WROFF_CHANGED) { 1079 connp->conn_wroff = connp->conn_ht_iphc_allocated + 1080 tcps->tcps_wroff_xtra; 1081 (void) proto_set_tx_wroff(connp->conn_rq, connp, 1082 connp->conn_wroff); 1083 } 1084 if (coas.coa_changed & COA_OOBINLINE_CHANGED) { 1085 if (IPCL_IS_NONSTR(connp)) 1086 proto_set_rx_oob_opt(connp, onoff); 1087 } 1088 return (0); 1089 }