1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. 24 * Copyright 2016 Joyent, Inc. 25 * Copyright (c) 2016 by Delphix. All rights reserved. 26 */ 27 28 #include <sys/types.h> 29 #include <sys/stream.h> 30 #define _SUN_TPI_VERSION 2 31 #include <sys/tihdr.h> 32 #include <sys/socket.h> 33 #include <sys/xti_xtiopt.h> 34 #include <sys/xti_inet.h> 35 #include <sys/policy.h> 36 37 #include <inet/common.h> 38 #include <netinet/ip6.h> 39 #include <inet/ip.h> 40 41 #include <netinet/in.h> 42 #include <netinet/tcp.h> 43 #include <inet/optcom.h> 44 #include <inet/proto_set.h> 45 #include <inet/tcp_impl.h> 46 47 static int tcp_opt_default(queue_t *, int, int, uchar_t *); 48 49 /* 50 * Table of all known options handled on a TCP protocol stack. 51 * 52 * Note: This table contains options processed by both TCP and IP levels 53 * and is the superset of options that can be performed on a TCP over IP 54 * stack. 55 */ 56 opdes_t tcp_opt_arr[] = { 57 58 { SO_LINGER, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, 59 sizeof (struct linger), 0 }, 60 61 { SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 62 { SO_KEEPALIVE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 63 { SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 64 { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 65 }, 66 { SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 67 { SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 68 { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 69 { SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, 70 { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 71 { SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 72 { SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, 73 sizeof (struct timeval), 0 }, 74 { SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, 75 sizeof (struct timeval), 0 }, 76 { SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 77 }, 78 { SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 79 { SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 80 0 }, 81 { SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 82 0 }, 83 { SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 84 0 }, 85 { SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 86 0 }, 87 { SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 88 89 { SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, 90 91 { SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, 92 93 { TCP_NODELAY, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 94 }, 95 { TCP_MAXSEG, IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t), 96 536 }, 97 98 { TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 99 OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, 100 101 { TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 102 OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, 103 104 { TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 105 OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, 106 107 { TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 108 OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, 109 110 { TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 111 0 }, 112 113 { TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0, 114 sizeof (int), 0 }, 115 116 { TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 117 }, 118 119 { TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0, 120 sizeof (int), 0 }, 121 122 { TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, 123 sizeof (int), 0 }, 124 125 { TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 126 127 { TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 128 129 { TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 130 131 { TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, 132 sizeof (int), 0 }, 133 134 { TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 135 136 { TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 }, 137 138 { TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 }, 139 140 { TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 }, 141 142 { TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 143 144 { IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 145 (OP_VARLEN|OP_NODEFAULT), 146 IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, 147 { T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 148 (OP_VARLEN|OP_NODEFAULT), 149 IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, 150 151 { IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 152 { T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 153 { IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN, 154 sizeof (int), -1 /* not initialized */ }, 155 156 { IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT, 157 sizeof (ipsec_req_t), -1 /* not initialized */ }, 158 159 { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, 160 sizeof (int), 0 /* no ifindex */ }, 161 162 { IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0, 163 sizeof (int), 0 }, 164 165 { IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN, 166 sizeof (int), -1 /* not initialized */ }, 167 168 { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 169 sizeof (int), 0 /* no ifindex */ }, 170 171 { IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 172 173 { IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0, 174 sizeof (in_addr_t), -1 /* not initialized */ }, 175 176 { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0, 177 sizeof (int), 0 }, 178 179 { IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 180 (OP_NODEFAULT|OP_VARLEN), 181 sizeof (struct in6_pktinfo), -1 /* not initialized */ }, 182 { IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 183 OP_NODEFAULT, 184 sizeof (sin6_t), -1 /* not initialized */ }, 185 { IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 186 (OP_VARLEN|OP_NODEFAULT), 255*8, 187 -1 /* not initialized */ }, 188 { IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 189 (OP_VARLEN|OP_NODEFAULT), 255*8, 190 -1 /* not initialized */ }, 191 { IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 192 (OP_VARLEN|OP_NODEFAULT), 255*8, 193 -1 /* not initialized */ }, 194 { IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 195 (OP_VARLEN|OP_NODEFAULT), 255*8, 196 -1 /* not initialized */ }, 197 { IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 198 OP_NODEFAULT, 199 sizeof (int), -1 /* not initialized */ }, 200 { IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 201 OP_NODEFAULT, 202 sizeof (struct ip6_mtuinfo), -1 /* not initialized */ }, 203 { IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 204 sizeof (int), 0 }, 205 { IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 206 sizeof (int), 0 }, 207 { IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 208 sizeof (int), 0 }, 209 210 /* Enable receipt of ancillary data */ 211 { IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 212 sizeof (int), 0 }, 213 { IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 214 sizeof (int), 0 }, 215 { IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 216 sizeof (int), 0 }, 217 { _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 218 sizeof (int), 0 }, 219 { IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 220 sizeof (int), 0 }, 221 { IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 222 sizeof (int), 0 }, 223 { IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 224 sizeof (int), 0 }, 225 { IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 226 sizeof (int), 0 }, 227 228 { IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT, 229 sizeof (ipsec_req_t), -1 /* not initialized */ }, 230 { IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 231 sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT }, 232 }; 233 234 /* 235 * Table of all supported levels 236 * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have 237 * any supported options so we need this info separately. 238 * 239 * This is needed only for topmost tpi providers and is used only by 240 * XTI interfaces. 241 */ 242 optlevel_t tcp_valid_levels_arr[] = { 243 XTI_GENERIC, 244 SOL_SOCKET, 245 IPPROTO_TCP, 246 IPPROTO_IP, 247 IPPROTO_IPV6 248 }; 249 250 251 #define TCP_OPT_ARR_CNT A_CNT(tcp_opt_arr) 252 #define TCP_VALID_LEVELS_CNT A_CNT(tcp_valid_levels_arr) 253 254 uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */ 255 256 /* 257 * Initialize option database object for TCP 258 * 259 * This object represents database of options to search passed to 260 * {sock,tpi}optcom_req() interface routine to take care of option 261 * management and associated methods. 262 */ 263 264 optdb_obj_t tcp_opt_obj = { 265 tcp_opt_default, /* TCP default value function pointer */ 266 tcp_tpi_opt_get, /* TCP get function pointer */ 267 tcp_tpi_opt_set, /* TCP set function pointer */ 268 TCP_OPT_ARR_CNT, /* TCP option database count of entries */ 269 tcp_opt_arr, /* TCP option database */ 270 TCP_VALID_LEVELS_CNT, /* TCP valid level count of entries */ 271 tcp_valid_levels_arr /* TCP valid level array */ 272 }; 273 274 static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND; 275 276 /* 277 * Some TCP options can be "set" by requesting them in the option 278 * buffer. This is needed for XTI feature test though we do not 279 * allow it in general. We interpret that this mechanism is more 280 * applicable to OSI protocols and need not be allowed in general. 281 * This routine filters out options for which it is not allowed (most) 282 * and lets through those (few) for which it is. [ The XTI interface 283 * test suite specifics will imply that any XTI_GENERIC level XTI_* if 284 * ever implemented will have to be allowed here ]. 285 */ 286 static boolean_t 287 tcp_allow_connopt_set(int level, int name) 288 { 289 290 switch (level) { 291 case IPPROTO_TCP: 292 switch (name) { 293 case TCP_NODELAY: 294 return (B_TRUE); 295 default: 296 return (B_FALSE); 297 } 298 /*NOTREACHED*/ 299 default: 300 return (B_FALSE); 301 } 302 /*NOTREACHED*/ 303 } 304 305 /* 306 * This routine gets default values of certain options whose default 307 * values are maintained by protocol specific code 308 */ 309 /* ARGSUSED */ 310 static int 311 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr) 312 { 313 int32_t *i1 = (int32_t *)ptr; 314 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 315 316 switch (level) { 317 case IPPROTO_TCP: 318 switch (name) { 319 case TCP_NOTIFY_THRESHOLD: 320 *i1 = tcps->tcps_ip_notify_interval; 321 break; 322 case TCP_ABORT_THRESHOLD: 323 *i1 = tcps->tcps_ip_abort_interval; 324 break; 325 case TCP_CONN_NOTIFY_THRESHOLD: 326 *i1 = tcps->tcps_ip_notify_cinterval; 327 break; 328 case TCP_CONN_ABORT_THRESHOLD: 329 *i1 = tcps->tcps_ip_abort_cinterval; 330 break; 331 default: 332 return (-1); 333 } 334 break; 335 case IPPROTO_IP: 336 switch (name) { 337 case IP_TTL: 338 *i1 = tcps->tcps_ipv4_ttl; 339 break; 340 default: 341 return (-1); 342 } 343 break; 344 case IPPROTO_IPV6: 345 switch (name) { 346 case IPV6_UNICAST_HOPS: 347 *i1 = tcps->tcps_ipv6_hoplimit; 348 break; 349 default: 350 return (-1); 351 } 352 break; 353 default: 354 return (-1); 355 } 356 return (sizeof (int)); 357 } 358 359 /* 360 * TCP routine to get the values of options. 361 */ 362 int 363 tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) 364 { 365 int *i1 = (int *)ptr; 366 tcp_t *tcp = connp->conn_tcp; 367 conn_opt_arg_t coas; 368 int retval; 369 370 coas.coa_connp = connp; 371 coas.coa_ixa = connp->conn_ixa; 372 coas.coa_ipp = &connp->conn_xmit_ipp; 373 coas.coa_ancillary = B_FALSE; 374 coas.coa_changed = 0; 375 376 switch (level) { 377 case SOL_SOCKET: 378 switch (name) { 379 case SO_SND_COPYAVOID: 380 *i1 = tcp->tcp_snd_zcopy_on ? 381 SO_SND_COPYAVOID : 0; 382 return (sizeof (int)); 383 case SO_ACCEPTCONN: 384 *i1 = (tcp->tcp_state == TCPS_LISTEN); 385 return (sizeof (int)); 386 } 387 break; 388 case IPPROTO_TCP: 389 switch (name) { 390 case TCP_NODELAY: 391 *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0; 392 return (sizeof (int)); 393 case TCP_MAXSEG: 394 *i1 = tcp->tcp_mss; 395 return (sizeof (int)); 396 case TCP_NOTIFY_THRESHOLD: 397 *i1 = (int)tcp->tcp_first_timer_threshold; 398 return (sizeof (int)); 399 case TCP_ABORT_THRESHOLD: 400 *i1 = tcp->tcp_second_timer_threshold; 401 return (sizeof (int)); 402 case TCP_CONN_NOTIFY_THRESHOLD: 403 *i1 = tcp->tcp_first_ctimer_threshold; 404 return (sizeof (int)); 405 case TCP_CONN_ABORT_THRESHOLD: 406 *i1 = tcp->tcp_second_ctimer_threshold; 407 return (sizeof (int)); 408 case TCP_INIT_CWND: 409 *i1 = tcp->tcp_init_cwnd; 410 return (sizeof (int)); 411 case TCP_KEEPALIVE_THRESHOLD: 412 *i1 = tcp->tcp_ka_interval; 413 return (sizeof (int)); 414 415 /* 416 * TCP_KEEPIDLE expects value in seconds, but 417 * tcp_ka_interval is in milliseconds. 418 */ 419 case TCP_KEEPIDLE: 420 *i1 = tcp->tcp_ka_interval / 1000; 421 return (sizeof (int)); 422 case TCP_KEEPCNT: 423 *i1 = tcp->tcp_ka_cnt; 424 return (sizeof (int)); 425 426 /* 427 * TCP_KEEPINTVL expects value in seconds, but 428 * tcp_ka_rinterval is in milliseconds. 429 */ 430 case TCP_KEEPINTVL: 431 *i1 = tcp->tcp_ka_rinterval / 1000; 432 return (sizeof (int)); 433 case TCP_KEEPALIVE_ABORT_THRESHOLD: 434 *i1 = tcp->tcp_ka_abort_thres; 435 return (sizeof (int)); 436 case TCP_CORK: 437 *i1 = tcp->tcp_cork; 438 return (sizeof (int)); 439 case TCP_RTO_INITIAL: 440 *i1 = tcp->tcp_rto_initial; 441 return (sizeof (uint32_t)); 442 case TCP_RTO_MIN: 443 *i1 = tcp->tcp_rto_min; 444 return (sizeof (uint32_t)); 445 case TCP_RTO_MAX: 446 *i1 = tcp->tcp_rto_max; 447 return (sizeof (uint32_t)); 448 case TCP_LINGER2: 449 *i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS; 450 return (sizeof (int)); 451 } 452 break; 453 case IPPROTO_IP: 454 if (connp->conn_family != AF_INET) 455 return (-1); 456 switch (name) { 457 case IP_OPTIONS: 458 case T_IP_OPTIONS: 459 /* Caller ensures enough space */ 460 return (ip_opt_get_user(connp, ptr)); 461 default: 462 break; 463 } 464 break; 465 466 case IPPROTO_IPV6: 467 /* 468 * IPPROTO_IPV6 options are only supported for sockets 469 * that are using IPv6 on the wire. 470 */ 471 if (connp->conn_ipversion != IPV6_VERSION) { 472 return (-1); 473 } 474 switch (name) { 475 case IPV6_PATHMTU: 476 if (tcp->tcp_state < TCPS_ESTABLISHED) 477 return (-1); 478 break; 479 } 480 break; 481 } 482 mutex_enter(&connp->conn_lock); 483 retval = conn_opt_get(&coas, level, name, ptr); 484 mutex_exit(&connp->conn_lock); 485 return (retval); 486 } 487 488 /* 489 * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements. 490 * Parameters are assumed to be verified by the caller. 491 */ 492 /* ARGSUSED */ 493 int 494 tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, 495 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 496 void *thisdg_attrs, cred_t *cr) 497 { 498 tcp_t *tcp = connp->conn_tcp; 499 int *i1 = (int *)invalp; 500 boolean_t onoff = (*i1 == 0) ? 0 : 1; 501 boolean_t checkonly; 502 int reterr; 503 tcp_stack_t *tcps = tcp->tcp_tcps; 504 conn_opt_arg_t coas; 505 uint32_t val = *((uint32_t *)invalp); 506 507 coas.coa_connp = connp; 508 coas.coa_ixa = connp->conn_ixa; 509 coas.coa_ipp = &connp->conn_xmit_ipp; 510 coas.coa_ancillary = B_FALSE; 511 coas.coa_changed = 0; 512 513 switch (optset_context) { 514 case SETFN_OPTCOM_CHECKONLY: 515 checkonly = B_TRUE; 516 /* 517 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ 518 * inlen != 0 implies value supplied and 519 * we have to "pretend" to set it. 520 * inlen == 0 implies that there is no 521 * value part in T_CHECK request and just validation 522 * done elsewhere should be enough, we just return here. 523 */ 524 if (inlen == 0) { 525 *outlenp = 0; 526 return (0); 527 } 528 break; 529 case SETFN_OPTCOM_NEGOTIATE: 530 checkonly = B_FALSE; 531 break; 532 case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */ 533 case SETFN_CONN_NEGOTIATE: 534 checkonly = B_FALSE; 535 /* 536 * Negotiating local and "association-related" options 537 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ) 538 * primitives is allowed by XTI, but we choose 539 * to not implement this style negotiation for Internet 540 * protocols (We interpret it is a must for OSI world but 541 * optional for Internet protocols) for all options. 542 * [ Will do only for the few options that enable test 543 * suites that our XTI implementation of this feature 544 * works for transports that do allow it ] 545 */ 546 if (!tcp_allow_connopt_set(level, name)) { 547 *outlenp = 0; 548 return (EINVAL); 549 } 550 break; 551 default: 552 /* 553 * We should never get here 554 */ 555 *outlenp = 0; 556 return (EINVAL); 557 } 558 559 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || 560 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); 561 562 /* 563 * For TCP, we should have no ancillary data sent down 564 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs 565 * has to be zero. 566 */ 567 ASSERT(thisdg_attrs == NULL); 568 569 /* 570 * For fixed length options, no sanity check 571 * of passed in length is done. It is assumed *_optcom_req() 572 * routines do the right thing. 573 */ 574 switch (level) { 575 case SOL_SOCKET: 576 switch (name) { 577 case SO_KEEPALIVE: 578 if (checkonly) { 579 /* check only case */ 580 break; 581 } 582 583 if (!onoff) { 584 if (connp->conn_keepalive) { 585 if (tcp->tcp_ka_tid != 0) { 586 (void) TCP_TIMER_CANCEL(tcp, 587 tcp->tcp_ka_tid); 588 tcp->tcp_ka_tid = 0; 589 } 590 connp->conn_keepalive = 0; 591 } 592 break; 593 } 594 if (!connp->conn_keepalive) { 595 /* Crank up the keepalive timer */ 596 tcp->tcp_ka_last_intrvl = 0; 597 tcp->tcp_ka_tid = TCP_TIMER(tcp, 598 tcp_keepalive_timer, tcp->tcp_ka_interval); 599 connp->conn_keepalive = 1; 600 } 601 break; 602 case SO_SNDBUF: { 603 if (*i1 > tcps->tcps_max_buf) { 604 *outlenp = 0; 605 return (ENOBUFS); 606 } 607 if (checkonly) 608 break; 609 610 connp->conn_sndbuf = *i1; 611 if (tcps->tcps_snd_lowat_fraction != 0) { 612 connp->conn_sndlowat = connp->conn_sndbuf / 613 tcps->tcps_snd_lowat_fraction; 614 } 615 (void) tcp_maxpsz_set(tcp, B_TRUE); 616 /* 617 * If we are flow-controlled, recheck the condition. 618 * There are apps that increase SO_SNDBUF size when 619 * flow-controlled (EWOULDBLOCK), and expect the flow 620 * control condition to be lifted right away. 621 */ 622 mutex_enter(&tcp->tcp_non_sq_lock); 623 if (tcp->tcp_flow_stopped && 624 TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) { 625 tcp_clrqfull(tcp); 626 } 627 mutex_exit(&tcp->tcp_non_sq_lock); 628 *outlenp = inlen; 629 return (0); 630 } 631 case SO_RCVBUF: 632 if (*i1 > tcps->tcps_max_buf) { 633 *outlenp = 0; 634 return (ENOBUFS); 635 } 636 /* Silently ignore zero */ 637 if (!checkonly && *i1 != 0) { 638 *i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss); 639 (void) tcp_rwnd_set(tcp, *i1); 640 } 641 /* 642 * XXX should we return the rwnd here 643 * and tcp_opt_get ? 644 */ 645 *outlenp = inlen; 646 return (0); 647 case SO_SND_COPYAVOID: 648 if (!checkonly) { 649 if (tcp->tcp_loopback || 650 (onoff != 1) || !tcp_zcopy_check(tcp)) { 651 *outlenp = 0; 652 return (EOPNOTSUPP); 653 } 654 tcp->tcp_snd_zcopy_aware = 1; 655 } 656 *outlenp = inlen; 657 return (0); 658 } 659 break; 660 case IPPROTO_TCP: 661 switch (name) { 662 case TCP_NODELAY: 663 if (!checkonly) 664 tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss; 665 break; 666 case TCP_NOTIFY_THRESHOLD: 667 if (!checkonly) 668 tcp->tcp_first_timer_threshold = *i1; 669 break; 670 case TCP_ABORT_THRESHOLD: 671 if (!checkonly) 672 tcp->tcp_second_timer_threshold = *i1; 673 break; 674 case TCP_CONN_NOTIFY_THRESHOLD: 675 if (!checkonly) 676 tcp->tcp_first_ctimer_threshold = *i1; 677 break; 678 case TCP_CONN_ABORT_THRESHOLD: 679 if (!checkonly) 680 tcp->tcp_second_ctimer_threshold = *i1; 681 break; 682 case TCP_RECVDSTADDR: 683 if (tcp->tcp_state > TCPS_LISTEN) { 684 *outlenp = 0; 685 return (EOPNOTSUPP); 686 } 687 /* Setting done in conn_opt_set */ 688 break; 689 case TCP_INIT_CWND: 690 if (checkonly) 691 break; 692 693 /* 694 * Only allow socket with network configuration 695 * privilege to set the initial cwnd to be larger 696 * than allowed by RFC 3390. 697 */ 698 if (val > MIN(4, MAX(2, 4380 / tcp->tcp_mss))) { 699 if ((reterr = secpolicy_ip_config(cr, B_TRUE)) 700 != 0) { 701 *outlenp = 0; 702 return (reterr); 703 } 704 if (val > tcp_max_init_cwnd) { 705 *outlenp = 0; 706 return (EINVAL); 707 } 708 } 709 710 tcp->tcp_init_cwnd = val; 711 712 /* 713 * If the socket is connected, AND no outbound data 714 * has been sent, reset the actual cwnd values. 715 */ 716 if (tcp->tcp_state == TCPS_ESTABLISHED && 717 tcp->tcp_iss == tcp->tcp_snxt - 1) { 718 tcp->tcp_cwnd = 719 MIN(tcp->tcp_rwnd, val * tcp->tcp_mss); 720 } 721 break; 722 723 /* 724 * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD 725 * is in milliseconds. TCP_KEEPIDLE is introduced for 726 * compatibility with other Unix flavors. 727 * We can fall through TCP_KEEPALIVE_THRESHOLD logic after 728 * converting the input to milliseconds. 729 */ 730 case TCP_KEEPIDLE: 731 *i1 *= 1000; 732 /* FALLTHRU */ 733 734 case TCP_KEEPALIVE_THRESHOLD: 735 if (checkonly) 736 break; 737 738 if (*i1 < tcps->tcps_keepalive_interval_low || 739 *i1 > tcps->tcps_keepalive_interval_high) { 740 *outlenp = 0; 741 return (EINVAL); 742 } 743 if (*i1 != tcp->tcp_ka_interval) { 744 tcp->tcp_ka_interval = *i1; 745 /* 746 * Check if we need to restart the 747 * keepalive timer. 748 */ 749 if (tcp->tcp_ka_tid != 0) { 750 ASSERT(connp->conn_keepalive); 751 (void) TCP_TIMER_CANCEL(tcp, 752 tcp->tcp_ka_tid); 753 tcp->tcp_ka_last_intrvl = 0; 754 tcp->tcp_ka_tid = TCP_TIMER(tcp, 755 tcp_keepalive_timer, 756 tcp->tcp_ka_interval); 757 } 758 } 759 break; 760 761 /* 762 * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt. 763 * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the 764 * three members - tcp_ka_abort_thres, tcp_ka_rinterval and 765 * tcp_ka_cnt. 766 */ 767 case TCP_KEEPCNT: 768 if (checkonly) 769 break; 770 771 if (*i1 == 0) { 772 return (EINVAL); 773 } else if (tcp->tcp_ka_rinterval == 0) { 774 /* 775 * When TCP_KEEPCNT is specified without first 776 * specifying a TCP_KEEPINTVL, we infer an 777 * interval based on a tunable specific to our 778 * stack: the tcp_keepalive_abort_interval. 779 * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in 780 * the unlikely event that that has been set.) 781 * Given the abort interval's default value of 782 * 480 seconds, low TCP_KEEPCNT values can 783 * result in intervals that exceed the default 784 * maximum RTO of 60 seconds. Rather than 785 * fail in these cases, we (implicitly) clamp 786 * the interval at the maximum RTO; if the 787 * TCP_KEEPCNT is shortly followed by a 788 * TCP_KEEPINTVL (as we expect), the abort 789 * threshold will be recalculated correctly -- 790 * and if a TCP_KEEPINTVL is not forthcoming, 791 * keep-alive will at least operate reasonably 792 * given the underconfigured state. 793 */ 794 uint32_t interval; 795 796 interval = tcp->tcp_ka_abort_thres / *i1; 797 798 if (interval < tcp->tcp_rto_min) 799 interval = tcp->tcp_rto_min; 800 801 if (interval > tcp->tcp_rto_max) 802 interval = tcp->tcp_rto_max; 803 804 tcp->tcp_ka_rinterval = interval; 805 } else { 806 if ((*i1 * tcp->tcp_ka_rinterval) < 807 tcps->tcps_keepalive_abort_interval_low || 808 (*i1 * tcp->tcp_ka_rinterval) > 809 tcps->tcps_keepalive_abort_interval_high) 810 return (EINVAL); 811 tcp->tcp_ka_abort_thres = 812 (*i1 * tcp->tcp_ka_rinterval); 813 } 814 tcp->tcp_ka_cnt = *i1; 815 break; 816 case TCP_KEEPINTVL: 817 /* 818 * TCP_KEEPINTVL is specified in seconds, but 819 * tcp_ka_rinterval is in milliseconds. 820 */ 821 822 if (checkonly) 823 break; 824 825 if ((*i1 * 1000) < tcp->tcp_rto_min || 826 (*i1 * 1000) > tcp->tcp_rto_max) 827 return (EINVAL); 828 829 if (tcp->tcp_ka_cnt == 0) { 830 tcp->tcp_ka_cnt = 831 tcp->tcp_ka_abort_thres / (*i1 * 1000); 832 } else { 833 if ((*i1 * tcp->tcp_ka_cnt * 1000) < 834 tcps->tcps_keepalive_abort_interval_low || 835 (*i1 * tcp->tcp_ka_cnt * 1000) > 836 tcps->tcps_keepalive_abort_interval_high) 837 return (EINVAL); 838 tcp->tcp_ka_abort_thres = 839 (*i1 * tcp->tcp_ka_cnt * 1000); 840 } 841 tcp->tcp_ka_rinterval = *i1 * 1000; 842 break; 843 case TCP_KEEPALIVE_ABORT_THRESHOLD: 844 if (!checkonly) { 845 if (*i1 < 846 tcps->tcps_keepalive_abort_interval_low || 847 *i1 > 848 tcps->tcps_keepalive_abort_interval_high) { 849 *outlenp = 0; 850 return (EINVAL); 851 } 852 tcp->tcp_ka_abort_thres = *i1; 853 tcp->tcp_ka_cnt = 0; 854 tcp->tcp_ka_rinterval = 0; 855 } 856 break; 857 case TCP_CORK: 858 if (!checkonly) { 859 /* 860 * if tcp->tcp_cork was set and is now 861 * being unset, we have to make sure that 862 * the remaining data gets sent out. Also 863 * unset tcp->tcp_cork so that tcp_wput_data() 864 * can send data even if it is less than mss 865 */ 866 if (tcp->tcp_cork && onoff == 0 && 867 tcp->tcp_unsent > 0) { 868 tcp->tcp_cork = B_FALSE; 869 tcp_wput_data(tcp, NULL, B_FALSE); 870 } 871 tcp->tcp_cork = onoff; 872 } 873 break; 874 case TCP_RTO_INITIAL: 875 if (checkonly || val == 0) 876 break; 877 878 /* 879 * Sanity checks 880 * 881 * The initial RTO should be bounded by the minimum 882 * and maximum RTO. And it should also be smaller 883 * than the connect attempt abort timeout. Otherwise, 884 * the connection won't be aborted in a period 885 * reasonably close to that timeout. 886 */ 887 if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max || 888 val > tcp->tcp_second_ctimer_threshold || 889 val < tcps->tcps_rexmit_interval_initial_low || 890 val > tcps->tcps_rexmit_interval_initial_high) { 891 *outlenp = 0; 892 return (EINVAL); 893 } 894 tcp->tcp_rto_initial = val; 895 896 /* 897 * If TCP has not sent anything, need to re-calculate 898 * tcp_rto. Otherwise, this option change does not 899 * really affect anything. 900 */ 901 if (tcp->tcp_state >= TCPS_SYN_SENT) 902 break; 903 904 tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2; 905 tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1; 906 tcp->tcp_rto = tcp_calculate_rto(tcp, tcps, 907 tcps->tcps_conn_grace_period); 908 break; 909 case TCP_RTO_MIN: 910 if (checkonly || val == 0) 911 break; 912 913 if (val < tcps->tcps_rexmit_interval_min_low || 914 val > tcps->tcps_rexmit_interval_min_high || 915 val > tcp->tcp_rto_max) { 916 *outlenp = 0; 917 return (EINVAL); 918 } 919 tcp->tcp_rto_min = val; 920 if (tcp->tcp_rto < val) 921 tcp->tcp_rto = val; 922 break; 923 case TCP_RTO_MAX: 924 if (checkonly || val == 0) 925 break; 926 927 /* 928 * Sanity checks 929 * 930 * The maximum RTO should not be larger than the 931 * connection abort timeout. Otherwise, the 932 * connection won't be aborted in a period reasonably 933 * close to that timeout. 934 */ 935 if (val < tcps->tcps_rexmit_interval_max_low || 936 val > tcps->tcps_rexmit_interval_max_high || 937 val < tcp->tcp_rto_min || 938 val > tcp->tcp_second_timer_threshold) { 939 *outlenp = 0; 940 return (EINVAL); 941 } 942 tcp->tcp_rto_max = val; 943 if (tcp->tcp_rto > val) 944 tcp->tcp_rto = val; 945 break; 946 case TCP_LINGER2: 947 if (checkonly || *i1 == 0) 948 break; 949 950 /* 951 * Note that the option value's unit is second. And 952 * the value should be bigger than the private 953 * parameter tcp_fin_wait_2_flush_interval's lower 954 * bound and smaller than the current value of that 955 * parameter. It should be smaller than the current 956 * value to avoid an app setting TCP_LINGER2 to a big 957 * value, causing resource to be held up too long in 958 * FIN-WAIT-2 state. 959 */ 960 if (*i1 < 0 || 961 tcps->tcps_fin_wait_2_flush_interval_low/SECONDS > 962 *i1 || 963 tcps->tcps_fin_wait_2_flush_interval/SECONDS < 964 *i1) { 965 *outlenp = 0; 966 return (EINVAL); 967 } 968 tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS; 969 break; 970 default: 971 break; 972 } 973 break; 974 case IPPROTO_IP: 975 if (connp->conn_family != AF_INET) { 976 *outlenp = 0; 977 return (EINVAL); 978 } 979 switch (name) { 980 case IP_SEC_OPT: 981 /* 982 * We should not allow policy setting after 983 * we start listening for connections. 984 */ 985 if (tcp->tcp_state == TCPS_LISTEN) { 986 return (EINVAL); 987 } 988 break; 989 } 990 break; 991 case IPPROTO_IPV6: 992 /* 993 * IPPROTO_IPV6 options are only supported for sockets 994 * that are using IPv6 on the wire. 995 */ 996 if (connp->conn_ipversion != IPV6_VERSION) { 997 *outlenp = 0; 998 return (EINVAL); 999 } 1000 1001 switch (name) { 1002 case IPV6_RECVPKTINFO: 1003 if (!checkonly) { 1004 /* Force it to be sent up with the next msg */ 1005 tcp->tcp_recvifindex = 0; 1006 } 1007 break; 1008 case IPV6_RECVTCLASS: 1009 if (!checkonly) { 1010 /* Force it to be sent up with the next msg */ 1011 tcp->tcp_recvtclass = 0xffffffffU; 1012 } 1013 break; 1014 case IPV6_RECVHOPLIMIT: 1015 if (!checkonly) { 1016 /* Force it to be sent up with the next msg */ 1017 tcp->tcp_recvhops = 0xffffffffU; 1018 } 1019 break; 1020 case IPV6_PKTINFO: 1021 /* This is an extra check for TCP */ 1022 if (inlen == sizeof (struct in6_pktinfo)) { 1023 struct in6_pktinfo *pkti; 1024 1025 pkti = (struct in6_pktinfo *)invalp; 1026 /* 1027 * RFC 3542 states that ipi6_addr must be 1028 * the unspecified address when setting the 1029 * IPV6_PKTINFO sticky socket option on a 1030 * TCP socket. 1031 */ 1032 if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr)) 1033 return (EINVAL); 1034 } 1035 break; 1036 case IPV6_SEC_OPT: 1037 /* 1038 * We should not allow policy setting after 1039 * we start listening for connections. 1040 */ 1041 if (tcp->tcp_state == TCPS_LISTEN) { 1042 return (EINVAL); 1043 } 1044 break; 1045 } 1046 break; 1047 } 1048 reterr = conn_opt_set(&coas, level, name, inlen, invalp, 1049 checkonly, cr); 1050 if (reterr != 0) { 1051 *outlenp = 0; 1052 return (reterr); 1053 } 1054 1055 /* 1056 * Common case of OK return with outval same as inval 1057 */ 1058 if (invalp != outvalp) { 1059 /* don't trust bcopy for identical src/dst */ 1060 (void) bcopy(invalp, outvalp, inlen); 1061 } 1062 *outlenp = inlen; 1063 1064 if (coas.coa_changed & COA_HEADER_CHANGED) { 1065 /* If we are connected we rebuilt the headers */ 1066 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && 1067 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 1068 reterr = tcp_build_hdrs(tcp); 1069 if (reterr != 0) 1070 return (reterr); 1071 } 1072 } 1073 if (coas.coa_changed & COA_ROUTE_CHANGED) { 1074 in6_addr_t nexthop; 1075 1076 /* 1077 * If we are connected we re-cache the information. 1078 * We ignore errors to preserve BSD behavior. 1079 * Note that we don't redo IPsec policy lookup here 1080 * since the final destination (or source) didn't change. 1081 */ 1082 ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa, 1083 &connp->conn_faddr_v6, &nexthop); 1084 1085 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && 1086 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 1087 (void) ip_attr_connect(connp, connp->conn_ixa, 1088 &connp->conn_laddr_v6, &connp->conn_faddr_v6, 1089 &nexthop, connp->conn_fport, NULL, NULL, 1090 IPDF_VERIFY_DST); 1091 } 1092 } 1093 if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) { 1094 connp->conn_wq->q_hiwat = connp->conn_sndbuf; 1095 } 1096 if (coas.coa_changed & COA_WROFF_CHANGED) { 1097 connp->conn_wroff = connp->conn_ht_iphc_allocated + 1098 tcps->tcps_wroff_xtra; 1099 (void) proto_set_tx_wroff(connp->conn_rq, connp, 1100 connp->conn_wroff); 1101 } 1102 if (coas.coa_changed & COA_OOBINLINE_CHANGED) { 1103 if (IPCL_IS_NONSTR(connp)) 1104 proto_set_rx_oob_opt(connp, onoff); 1105 } 1106 return (0); 1107 }