1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  24  * Copyright 2019 Joyent, Inc.
  25  * Copyright (c) 2016 by Delphix. All rights reserved.
  26  * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
  27  */
  28 
  29 #include <sys/types.h>
  30 #include <sys/stream.h>
  31 #define _SUN_TPI_VERSION 2
  32 #include <sys/tihdr.h>
  33 #include <sys/socket.h>
  34 #include <sys/xti_xtiopt.h>
  35 #include <sys/xti_inet.h>
  36 #include <sys/policy.h>
  37 
  38 #include <inet/cc.h>
  39 #include <inet/common.h>
  40 #include <netinet/ip6.h>
  41 #include <inet/ip.h>
  42 
  43 #include <netinet/in.h>
  44 #include <netinet/tcp.h>
  45 #include <inet/optcom.h>
  46 #include <inet/proto_set.h>
  47 #include <inet/tcp_impl.h>
  48 
  49 static int      tcp_opt_default(queue_t *, int, int, uchar_t *);
     /*
      * Declared ahead of its definition because the tcp_opt_obj initializer
      * further down refers to tcp_opt_default before the function body appears.
      */
  50 
  51 /*
  52  * Table of all known options handled on a TCP protocol stack.
  53  *
  54  * Note: This table contains options processed by both TCP and IP levels
  55  *       and is the superset of options that can be performed on a TCP over IP
  56  *       stack.
  57  */
  58 opdes_t tcp_opt_arr[] = {
     /*
      * Every initializer below supplies eight values, in this order: option
      * name, option level, two OA_* values, one OP_* privilege-style code
      * (OP_NP, OP_CONFIG, OP_PRIVPORT or OP_RAW), property flags (0,
      * OP_DEF_FN, OP_NODEFAULT and/or OP_VARLEN), a size, and a default value.
      * NOTE(review): the meaning attached to each column is inferred from the
      * values used here -- confirm against the opdes_t definition.
      *
      * The entries flagged OP_DEF_FN are exactly the options for which
      * tcp_opt_default() in this file produces a value; their table default
      * is the placeholder -1.
      */
  59 
     /* SOL_SOCKET level options. */
  60 { SO_LINGER,    SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
  61         sizeof (struct linger), 0 },
  62 
  63 { SO_DEBUG,     SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  64 { SO_KEEPALIVE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  65 { SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  66 { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
  67         },
  68 { SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  69 { SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  70 { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  71 { SO_TYPE,      SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
  72 { SO_SNDBUF,    SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  73 { SO_RCVBUF,    SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  74 { SO_SNDTIMEO,  SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
  75         sizeof (struct timeval), 0 },
  76 { SO_RCVTIMEO,  SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
  77         sizeof (struct timeval), 0 },
  78 { SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
  79         },
  80 { SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  81 { SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
  82         0 },
  83 { SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
  84         0 },
  85 { SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
  86         0 },
  87 { SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
  88         0 },
  89 { SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  90 
  91 { SO_DOMAIN,    SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
  92 
  93 { SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
  94 
     /* IPPROTO_TCP level options. */
  95 { TCP_NODELAY,  IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
  96         },
  97 { TCP_MAXSEG,   IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t),
  98         536 },
  99 
 100 { TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
 101         OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
 102 
 103 { TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
 104         OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
 105 
 106 { TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
 107         OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
 108 
 109 { TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
 110         OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
 111 
 112 { TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
 113         0 },
 114 
 115 { TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0,
 116         sizeof (int), 0 },
 117 
 118 { TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
 119         },
 120 
 121 { TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0,
 122         sizeof (int), 0 },
 123 
 124 { TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
 125         sizeof (int), 0 },
 126 
 127 { TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 128 
 129 { TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 130 
 131 { TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 132 
 133 { TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
 134         sizeof (int), 0 },
 135 
 136 { TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 137 
 138 { TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
 139 
 140 { TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
 141 
 142 { TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
 143 
 144 { TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 145 
 146 { TCP_CONGESTION, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
 147         OP_VARLEN, CC_ALGO_NAME_MAX, 0 },
 148 
     /*
      * IPPROTO_IP level options.  A few IPPROTO_IPV6 entries
      * (IPV6_UNICAST_HOPS, IPV6_BOUND_IF, IPV6_UNSPEC_SRC) are interleaved
      * with them in this stretch of the table.
      */
 149 { IP_OPTIONS,   IPPROTO_IP, OA_RW, OA_RW, OP_NP,
 150         (OP_VARLEN|OP_NODEFAULT),
 151         IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
 152 { T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
 153         (OP_VARLEN|OP_NODEFAULT),
 154         IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
 155 
 156 { IP_TOS,       IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 157 { T_IP_TOS,     IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 158 { IP_TTL,       IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
 159         sizeof (int), -1 /* not initialized */ },
 160 { IP_RECVTOS,   IPPROTO_IP,  OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 161 
 162 { IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
 163         sizeof (ipsec_req_t), -1 /* not initialized */ },
 164 
 165 { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
 166         sizeof (int),   0 /* no ifindex */ },
 167 
 168 { IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
 169         sizeof (int), 0 },
 170 
 171 { IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
 172         sizeof (int), -1 /* not initialized */ },
 173 
 174 { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 175         sizeof (int),   0 /* no ifindex */ },
 176 
 177 { IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 178 
 179 { IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
 180         sizeof (in_addr_t),     -1 /* not initialized  */ },
 181 
 182 { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
 183         sizeof (int), 0 },
 184 
     /* From here to the end of the table every entry is IPPROTO_IPV6 level. */
 185 { IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 186         (OP_NODEFAULT|OP_VARLEN),
 187         sizeof (struct in6_pktinfo), -1 /* not initialized */ },
 188 { IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 189         OP_NODEFAULT,
 190         sizeof (sin6_t), -1 /* not initialized */ },
 191 { IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 192         (OP_VARLEN|OP_NODEFAULT), 255*8,
 193         -1 /* not initialized */ },
 194 { IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 195         (OP_VARLEN|OP_NODEFAULT), 255*8,
 196         -1 /* not initialized */ },
 197 { IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 198         (OP_VARLEN|OP_NODEFAULT), 255*8,
 199         -1 /* not initialized */ },
 200 { IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 201         (OP_VARLEN|OP_NODEFAULT), 255*8,
 202         -1 /* not initialized */ },
 203 { IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 204         OP_NODEFAULT,
 205         sizeof (int), -1 /* not initialized */ },
 206 { IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 207         OP_NODEFAULT,
 208         sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
 209 { IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 210         sizeof (int), 0 },
 211 { IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 212         sizeof (int), 0 },
 213 { IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 214         sizeof (int), 0 },
 215 
 216 /* Enable receipt of ancillary data */
 217 { IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 218         sizeof (int), 0 },
 219 { IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 220         sizeof (int), 0 },
 221 { IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 222         sizeof (int), 0 },
 223 { _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 224         sizeof (int), 0 },
 225 { IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 226         sizeof (int), 0 },
 227 { IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 228         sizeof (int), 0 },
 229 { IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 230         sizeof (int), 0 },
 231 { IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 232         sizeof (int), 0 },
 233 
 234 { IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
 235         sizeof (ipsec_req_t), -1 /* not initialized */ },
 236 { IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 237         sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
 238 };
 239 
 240 /*
 241  * Table of all supported levels
 242  * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have
 243  * any supported options so we need this info separately.
 244  *
 245  * This is needed only for topmost tpi providers and is used only by
 246  * XTI interfaces.
 247  */
 248 optlevel_t      tcp_valid_levels_arr[] = {
 249         XTI_GENERIC,    /* accepted level; tcp_opt_arr has no entries for it */
 250         SOL_SOCKET,     /* the remaining levels all have tcp_opt_arr entries */
 251         IPPROTO_TCP,
 252         IPPROTO_IP,
 253         IPPROTO_IPV6
 254 };
 255 
 256 
     /*
      * Sizes of the two tables above, computed with A_CNT(); both are stored
      * in the tcp_opt_obj initializer below.
      */
 257 #define TCP_OPT_ARR_CNT         A_CNT(tcp_opt_arr)
 258 #define TCP_VALID_LEVELS_CNT    A_CNT(tcp_valid_levels_arr)
 259 
     /* Not assigned anywhere in the portion of this file that sets up the tables. */
 260 uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */
 261 
 262 /*
 263  * Initialize option database object for TCP
 264  *
 265  * This object describes the option database that is passed to the
 266  * {sock,tpi}optcom_req() interface routines, which take care of option
 267  * management and the associated methods.
 268  */
 269 
 270 optdb_obj_t tcp_opt_obj = {
     /*
      * Positional initializer: the values must stay in the order in which
      * optdb_obj_t declares its members.  tcp_tpi_opt_get and tcp_tpi_opt_set
      * are defined outside the part of the file shown here.
      */
 271         tcp_opt_default,        /* TCP default value function pointer */
 272         tcp_tpi_opt_get,        /* TCP get function pointer */
 273         tcp_tpi_opt_set,        /* TCP set function pointer */
 274         TCP_OPT_ARR_CNT,        /* TCP option database count of entries */
 275         tcp_opt_arr,            /* TCP option database */
 276         TCP_VALID_LEVELS_CNT,   /* TCP valid level count of entries */
 277         tcp_valid_levels_arr    /* TCP valid level array */
 278 };
 279 
 280 static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND;
     /*
      * Ceiling for TCP_INIT_CWND: tcp_opt_set() returns EINVAL when a request
      * that already needed secpolicy_ip_config() approval asks for more than
      * this many segments.
      */
 281 
 282 /*
 283  * Some TCP options can be "set" by requesting them in the option
 284  * buffer. This is needed for XTI feature test though we do not
 285  * allow it in general. We interpret that this mechanism is more
 286  * applicable to OSI protocols and need not be allowed in general.
 287  * This routine filters out options for which it is not allowed (most)
 288  * and lets through those (few) for which it is. [ The XTI interface
 289  * test suite specifics will imply that any XTI_GENERIC level XTI_* if
 290  * ever implemented will have to be allowed here ].
 291  */
 292 static boolean_t
 293 tcp_allow_connopt_set(int level, int name)
 294 {
 295 
 296         switch (level) {
 297         case IPPROTO_TCP:
 298                 switch (name) {
 299                 case TCP_NODELAY:
 300                         return (B_TRUE);
 301                 default:
 302                         return (B_FALSE);
 303                 }
 304                 /*NOTREACHED*/
 305         default:
 306                 return (B_FALSE);
 307         }
 308         /*NOTREACHED*/
 309 }
 310 
 311 /*
 312  * This routine gets default values of certain options whose default
 313  * values are maintained by protocol specific code
 314  */
 315 /* ARGSUSED */
 316 static int
 317 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
 318 {
 319         int32_t *i1 = (int32_t *)ptr;
 320         tcp_stack_t     *tcps = Q_TO_TCP(q)->tcp_tcps;
 321 
 322         switch (level) {
 323         case IPPROTO_TCP:
 324                 switch (name) {
 325                 case TCP_NOTIFY_THRESHOLD:
 326                         *i1 = tcps->tcps_ip_notify_interval;
 327                         break;
 328                 case TCP_ABORT_THRESHOLD:
 329                         *i1 = tcps->tcps_ip_abort_interval;
 330                         break;
 331                 case TCP_CONN_NOTIFY_THRESHOLD:
 332                         *i1 = tcps->tcps_ip_notify_cinterval;
 333                         break;
 334                 case TCP_CONN_ABORT_THRESHOLD:
 335                         *i1 = tcps->tcps_ip_abort_cinterval;
 336                         break;
 337                 default:
 338                         return (-1);
 339                 }
 340                 break;
 341         case IPPROTO_IP:
 342                 switch (name) {
 343                 case IP_TTL:
 344                         *i1 = tcps->tcps_ipv4_ttl;
 345                         break;
 346                 default:
 347                         return (-1);
 348                 }
 349                 break;
 350         case IPPROTO_IPV6:
 351                 switch (name) {
 352                 case IPV6_UNICAST_HOPS:
 353                         *i1 = tcps->tcps_ipv6_hoplimit;
 354                         break;
 355                 default:
 356                         return (-1);
 357                 }
 358                 break;
 359         default:
 360                 return (-1);
 361         }
 362         return (sizeof (int));
 363 }
 364 
 365 /*
 366  * TCP routine to get the values of options.
 367  */
 368 int
 369 tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
 370 {
 371         int             *i1 = (int *)ptr;
 372         tcp_t           *tcp = connp->conn_tcp;
 373         conn_opt_arg_t  coas;
 374         int             retval;
 375 
 376         coas.coa_connp = connp;
 377         coas.coa_ixa = connp->conn_ixa;
 378         coas.coa_ipp = &connp->conn_xmit_ipp;
 379         coas.coa_ancillary = B_FALSE;
 380         coas.coa_changed = 0;
 381 
 382         switch (level) {
 383         case SOL_SOCKET:
 384                 switch (name) {
 385                 case SO_SND_COPYAVOID:
 386                         *i1 = tcp->tcp_snd_zcopy_on ?
 387                             SO_SND_COPYAVOID : 0;
 388                         return (sizeof (int));
 389                 case SO_ACCEPTCONN:
 390                         *i1 = (tcp->tcp_state == TCPS_LISTEN);
 391                         return (sizeof (int));
 392                 }
 393                 break;
 394         case IPPROTO_TCP:
 395                 switch (name) {
 396                 case TCP_NODELAY:
 397                         *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
 398                         return (sizeof (int));
 399                 case TCP_MAXSEG:
 400                         *i1 = tcp->tcp_mss;
 401                         return (sizeof (int));
 402                 case TCP_NOTIFY_THRESHOLD:
 403                         *i1 = (int)tcp->tcp_first_timer_threshold;
 404                         return (sizeof (int));
 405                 case TCP_ABORT_THRESHOLD:
 406                         *i1 = tcp->tcp_second_timer_threshold;
 407                         return (sizeof (int));
 408                 case TCP_CONN_NOTIFY_THRESHOLD:
 409                         *i1 = tcp->tcp_first_ctimer_threshold;
 410                         return (sizeof (int));
 411                 case TCP_CONN_ABORT_THRESHOLD:
 412                         *i1 = tcp->tcp_second_ctimer_threshold;
 413                         return (sizeof (int));
 414                 case TCP_INIT_CWND:
 415                         *i1 = tcp->tcp_init_cwnd;
 416                         return (sizeof (int));
 417                 case TCP_KEEPALIVE_THRESHOLD:
 418                         *i1 = tcp->tcp_ka_interval;
 419                         return (sizeof (int));
 420 
 421                 /*
 422                  * TCP_KEEPIDLE expects value in seconds, but
 423                  * tcp_ka_interval is in milliseconds.
 424                  */
 425                 case TCP_KEEPIDLE:
 426                         *i1 = tcp->tcp_ka_interval / 1000;
 427                         return (sizeof (int));
 428                 case TCP_KEEPCNT:
 429                         *i1 = tcp->tcp_ka_cnt;
 430                         return (sizeof (int));
 431 
 432                 /*
 433                  * TCP_KEEPINTVL expects value in seconds, but
 434                  * tcp_ka_rinterval is in milliseconds.
 435                  */
 436                 case TCP_KEEPINTVL:
 437                         *i1 = tcp->tcp_ka_rinterval / 1000;
 438                         return (sizeof (int));
 439                 case TCP_KEEPALIVE_ABORT_THRESHOLD:
 440                         *i1 = tcp->tcp_ka_abort_thres;
 441                         return (sizeof (int));
 442                 case TCP_CONGESTION: {
 443                         size_t len = strlcpy((char *)ptr, CC_ALGO(tcp)->name,
 444                             CC_ALGO_NAME_MAX);
 445                         if (len >= CC_ALGO_NAME_MAX)
 446                                 return (-1);
 447                         return (len + 1);
 448                 }
 449                 case TCP_CORK:
 450                         *i1 = tcp->tcp_cork;
 451                         return (sizeof (int));
 452                 case TCP_RTO_INITIAL:
 453                         *i1 = tcp->tcp_rto_initial;
 454                         return (sizeof (uint32_t));
 455                 case TCP_RTO_MIN:
 456                         *i1 = tcp->tcp_rto_min;
 457                         return (sizeof (uint32_t));
 458                 case TCP_RTO_MAX:
 459                         *i1 = tcp->tcp_rto_max;
 460                         return (sizeof (uint32_t));
 461                 case TCP_LINGER2:
 462                         *i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS;
 463                         return (sizeof (int));
 464                 }
 465                 break;
 466         case IPPROTO_IP:
 467                 if (connp->conn_family != AF_INET)
 468                         return (-1);
 469                 switch (name) {
 470                 case IP_OPTIONS:
 471                 case T_IP_OPTIONS:
 472                         /* Caller ensures enough space */
 473                         return (ip_opt_get_user(connp, ptr));
 474                 default:
 475                         break;
 476                 }
 477                 break;
 478 
 479         case IPPROTO_IPV6:
 480                 /*
 481                  * IPPROTO_IPV6 options are only supported for sockets
 482                  * that are using IPv6 on the wire.
 483                  */
 484                 if (connp->conn_ipversion != IPV6_VERSION) {
 485                         return (-1);
 486                 }
 487                 switch (name) {
 488                 case IPV6_PATHMTU:
 489                         if (tcp->tcp_state < TCPS_ESTABLISHED)
 490                                 return (-1);
 491                         break;
 492                 }
 493                 break;
 494         }
 495         mutex_enter(&connp->conn_lock);
 496         retval = conn_opt_get(&coas, level, name, ptr);
 497         mutex_exit(&connp->conn_lock);
 498         return (retval);
 499 }
 500 
 501 /*
 502  * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
 503  * Parameters are assumed to be verified by the caller.
 504  */
 505 /* ARGSUSED */
 506 int
 507 tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
 508     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
 509     void *thisdg_attrs, cred_t *cr)
 510 {
 511         tcp_t   *tcp = connp->conn_tcp;
 512         int     *i1 = (int *)invalp;
 513         boolean_t onoff = (*i1 == 0) ? 0 : 1;
 514         boolean_t checkonly;
 515         int     reterr;
 516         tcp_stack_t     *tcps = tcp->tcp_tcps;
 517         conn_opt_arg_t  coas;
 518         uint32_t        val = *((uint32_t *)invalp);
 519 
 520         coas.coa_connp = connp;
 521         coas.coa_ixa = connp->conn_ixa;
 522         coas.coa_ipp = &connp->conn_xmit_ipp;
 523         coas.coa_ancillary = B_FALSE;
 524         coas.coa_changed = 0;
 525 
 526         switch (optset_context) {
 527         case SETFN_OPTCOM_CHECKONLY:
 528                 checkonly = B_TRUE;
 529                 /*
 530                  * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
 531                  * inlen != 0 implies value supplied and
 532                  *      we have to "pretend" to set it.
 533                  * inlen == 0 implies that there is no
 534                  *      value part in T_CHECK request and just validation
 535                  * done elsewhere should be enough, we just return here.
 536                  */
 537                 if (inlen == 0) {
 538                         *outlenp = 0;
 539                         return (0);
 540                 }
 541                 break;
 542         case SETFN_OPTCOM_NEGOTIATE:
 543                 checkonly = B_FALSE;
 544                 break;
 545         case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
 546         case SETFN_CONN_NEGOTIATE:
 547                 checkonly = B_FALSE;
 548                 /*
 549                  * Negotiating local and "association-related" options
 550                  * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
 551                  * primitives is allowed by XTI, but we choose
 552                  * to not implement this style negotiation for Internet
 553                  * protocols (We interpret it is a must for OSI world but
 554                  * optional for Internet protocols) for all options.
 555                  * [ Will do only for the few options that enable test
 556                  * suites that our XTI implementation of this feature
 557                  * works for transports that do allow it ]
 558                  */
 559                 if (!tcp_allow_connopt_set(level, name)) {
 560                         *outlenp = 0;
 561                         return (EINVAL);
 562                 }
 563                 break;
 564         default:
 565                 /*
 566                  * We should never get here
 567                  */
 568                 *outlenp = 0;
 569                 return (EINVAL);
 570         }
 571 
 572         ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
 573             (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
 574 
 575         /*
 576          * For TCP, we should have no ancillary data sent down
 577          * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
 578          * has to be zero.
 579          */
 580         ASSERT(thisdg_attrs == NULL);
 581 
 582         /*
 583          * For fixed length options, no sanity check
 584          * of passed in length is done. It is assumed *_optcom_req()
 585          * routines do the right thing.
 586          */
 587         switch (level) {
 588         case SOL_SOCKET:
 589                 switch (name) {
 590                 case SO_KEEPALIVE:
 591                         if (checkonly) {
 592                                 /* check only case */
 593                                 break;
 594                         }
 595 
 596                         if (!onoff) {
 597                                 if (connp->conn_keepalive) {
 598                                         if (tcp->tcp_ka_tid != 0) {
 599                                                 (void) TCP_TIMER_CANCEL(tcp,
 600                                                     tcp->tcp_ka_tid);
 601                                                 tcp->tcp_ka_tid = 0;
 602                                         }
 603                                         connp->conn_keepalive = 0;
 604                                 }
 605                                 break;
 606                         }
 607                         if (!connp->conn_keepalive) {
 608                                 /* Crank up the keepalive timer */
 609                                 tcp->tcp_ka_last_intrvl = 0;
 610                                 tcp->tcp_ka_tid = TCP_TIMER(tcp,
 611                                     tcp_keepalive_timer, tcp->tcp_ka_interval);
 612                                 connp->conn_keepalive = 1;
 613                         }
 614                         break;
 615                 case SO_SNDBUF: {
 616                         if (*i1 > tcps->tcps_max_buf) {
 617                                 *outlenp = 0;
 618                                 return (ENOBUFS);
 619                         }
 620                         if (checkonly)
 621                                 break;
 622 
 623                         connp->conn_sndbuf = *i1;
 624                         if (tcps->tcps_snd_lowat_fraction != 0) {
 625                                 connp->conn_sndlowat = connp->conn_sndbuf /
 626                                     tcps->tcps_snd_lowat_fraction;
 627                         }
 628                         (void) tcp_maxpsz_set(tcp, B_TRUE);
 629                         /*
 630                          * If we are flow-controlled, recheck the condition.
 631                          * There are apps that increase SO_SNDBUF size when
 632                          * flow-controlled (EWOULDBLOCK), and expect the flow
 633                          * control condition to be lifted right away.
 634                          */
 635                         mutex_enter(&tcp->tcp_non_sq_lock);
 636                         if (tcp->tcp_flow_stopped &&
 637                             TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) {
 638                                 tcp_clrqfull(tcp);
 639                         }
 640                         mutex_exit(&tcp->tcp_non_sq_lock);
 641                         *outlenp = inlen;
 642                         return (0);
 643                 }
 644                 case SO_RCVBUF:
 645                         if (*i1 > tcps->tcps_max_buf) {
 646                                 *outlenp = 0;
 647                                 return (ENOBUFS);
 648                         }
 649                         /* Silently ignore zero */
 650                         if (!checkonly && *i1 != 0) {
 651                                 *i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
 652                                 (void) tcp_rwnd_set(tcp, *i1);
 653                         }
 654                         /*
 655                          * XXX should we return the rwnd here
 656                          * and tcp_opt_get ?
 657                          */
 658                         *outlenp = inlen;
 659                         return (0);
 660                 case SO_SND_COPYAVOID:
 661                         if (!checkonly) {
 662                                 if (tcp->tcp_loopback ||
 663                                     (onoff != 1) || !tcp_zcopy_check(tcp)) {
 664                                         *outlenp = 0;
 665                                         return (EOPNOTSUPP);
 666                                 }
 667                                 tcp->tcp_snd_zcopy_aware = 1;
 668                         }
 669                         *outlenp = inlen;
 670                         return (0);
 671                 }
 672                 break;
 673         case IPPROTO_TCP:
 674                 switch (name) {
 675                 case TCP_NODELAY:
 676                         if (!checkonly)
 677                                 tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
 678                         break;
 679                 case TCP_NOTIFY_THRESHOLD:
 680                         if (!checkonly)
 681                                 tcp->tcp_first_timer_threshold = *i1;
 682                         break;
 683                 case TCP_ABORT_THRESHOLD:
 684                         if (!checkonly)
 685                                 tcp->tcp_second_timer_threshold = *i1;
 686                         break;
 687                 case TCP_CONN_NOTIFY_THRESHOLD:
 688                         if (!checkonly)
 689                                 tcp->tcp_first_ctimer_threshold = *i1;
 690                         break;
 691                 case TCP_CONN_ABORT_THRESHOLD:
 692                         if (!checkonly)
 693                                 tcp->tcp_second_ctimer_threshold = *i1;
 694                         break;
 695                 case TCP_RECVDSTADDR:
 696                         if (tcp->tcp_state > TCPS_LISTEN) {
 697                                 *outlenp = 0;
 698                                 return (EOPNOTSUPP);
 699                         }
 700                         /* Setting done in conn_opt_set */
 701                         break;
 702                 case TCP_INIT_CWND:
 703                         if (checkonly)
 704                                 break;
 705 
 706                         /*
 707                          * Only allow socket with network configuration
 708                          * privilege to set the initial cwnd to be larger
 709                          * than allowed by RFC 3390.
 710                          */
 711                         if (val > MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
 712                                 if ((reterr = secpolicy_ip_config(cr, B_TRUE))
 713                                     != 0) {
 714                                         *outlenp = 0;
 715                                         return (reterr);
 716                                 }
 717                                 if (val > tcp_max_init_cwnd) {
 718                                         *outlenp = 0;
 719                                         return (EINVAL);
 720                                 }
 721                         }
 722 
 723                         tcp->tcp_init_cwnd = val;
 724 
 725                         /*
 726                          * If the socket is connected, AND no outbound data
 727                          * has been sent, reset the actual cwnd values.
 728                          */
 729                         if (tcp->tcp_state == TCPS_ESTABLISHED &&
 730                             tcp->tcp_iss == tcp->tcp_snxt - 1) {
 731                                 tcp->tcp_cwnd =
 732                                     MIN(tcp->tcp_rwnd, val * tcp->tcp_mss);
 733                         }
 734                         break;
 735 
 736                 /*
 737                  * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD
 738                  * is in milliseconds. TCP_KEEPIDLE is introduced for
 739                  * compatibility with other Unix flavors.
 740                  * We can fall through TCP_KEEPALIVE_THRESHOLD logic after
 741                  * converting the input to milliseconds.
 742                  */
 743                 case TCP_KEEPIDLE:
 744                         *i1 *= 1000;
 745                         /* FALLTHRU */
 746 
 747                 case TCP_KEEPALIVE_THRESHOLD:
 748                         if (checkonly)
 749                                 break;
 750 
 751                         if (*i1 < tcps->tcps_keepalive_interval_low ||
 752                             *i1 > tcps->tcps_keepalive_interval_high) {
 753                                 *outlenp = 0;
 754                                 return (EINVAL);
 755                         }
 756                         if (*i1 != tcp->tcp_ka_interval) {
 757                                 tcp->tcp_ka_interval = *i1;
 758                                 /*
 759                                  * Check if we need to restart the
 760                                  * keepalive timer.
 761                                  */
 762                                 if (tcp->tcp_ka_tid != 0) {
 763                                         ASSERT(connp->conn_keepalive);
 764                                         (void) TCP_TIMER_CANCEL(tcp,
 765                                             tcp->tcp_ka_tid);
 766                                         tcp->tcp_ka_last_intrvl = 0;
 767                                         tcp->tcp_ka_tid = TCP_TIMER(tcp,
 768                                             tcp_keepalive_timer,
 769                                             tcp->tcp_ka_interval);
 770                                 }
 771                         }
 772                         break;
 773 
 774                 /*
 775                  * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt.
 776                  * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the
 777                  * three members - tcp_ka_abort_thres, tcp_ka_rinterval and
 778                  * tcp_ka_cnt.
 779                  */
 780                 case TCP_KEEPCNT:
 781                         if (checkonly)
 782                                 break;
 783 
 784                         if (*i1 == 0) {
 785                                 return (EINVAL);
 786                         } else if (tcp->tcp_ka_rinterval == 0) {
 787                                 /*
 788                                  * When TCP_KEEPCNT is specified without first
 789                                  * specifying a TCP_KEEPINTVL, we infer an
 790                                  * interval based on a tunable specific to our
 791                                  * stack: the tcp_keepalive_abort_interval.
 792                                  * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in
 793                                  * the unlikely event that that has been set.)
 794                                  * Given the abort interval's default value of
 795                                  * 480 seconds, low TCP_KEEPCNT values can
 796                                  * result in intervals that exceed the default
 797                                  * maximum RTO of 60 seconds.  Rather than
 798                                  * fail in these cases, we (implicitly) clamp
 799                                  * the interval at the maximum RTO; if the
 800                                  * TCP_KEEPCNT is shortly followed by a
 801                                  * TCP_KEEPINTVL (as we expect), the abort
 802                                  * threshold will be recalculated correctly --
 803                                  * and if a TCP_KEEPINTVL is not forthcoming,
 804                                  * keep-alive will at least operate reasonably
 805                                  * given the underconfigured state.
 806                                  */
 807                                 uint32_t interval;
 808 
 809                                 interval = tcp->tcp_ka_abort_thres / *i1;
 810 
 811                                 if (interval < tcp->tcp_rto_min)
 812                                         interval = tcp->tcp_rto_min;
 813 
 814                                 if (interval > tcp->tcp_rto_max)
 815                                         interval = tcp->tcp_rto_max;
 816 
 817                                 tcp->tcp_ka_rinterval = interval;
 818                         } else {
 819                                 if ((*i1 * tcp->tcp_ka_rinterval) <
 820                                     tcps->tcps_keepalive_abort_interval_low ||
 821                                     (*i1 * tcp->tcp_ka_rinterval) >
 822                                     tcps->tcps_keepalive_abort_interval_high)
 823                                         return (EINVAL);
 824                                 tcp->tcp_ka_abort_thres =
 825                                     (*i1 * tcp->tcp_ka_rinterval);
 826                         }
 827                         tcp->tcp_ka_cnt = *i1;
 828                         break;
 829                 case TCP_KEEPINTVL:
 830                         /*
 831                          * TCP_KEEPINTVL is specified in seconds, but
 832                          * tcp_ka_rinterval is in milliseconds.
 833                          */
 834 
 835                         if (checkonly)
 836                                 break;
 837 
 838                         if ((*i1 * 1000) < tcp->tcp_rto_min ||
 839                             (*i1 * 1000) > tcp->tcp_rto_max)
 840                                 return (EINVAL);
 841 
 842                         if (tcp->tcp_ka_cnt == 0) {
 843                                 tcp->tcp_ka_cnt =
 844                                     tcp->tcp_ka_abort_thres / (*i1 * 1000);
 845                         } else {
 846                                 if ((*i1 * tcp->tcp_ka_cnt * 1000) <
 847                                     tcps->tcps_keepalive_abort_interval_low ||
 848                                     (*i1 * tcp->tcp_ka_cnt * 1000) >
 849                                     tcps->tcps_keepalive_abort_interval_high)
 850                                         return (EINVAL);
 851                                 tcp->tcp_ka_abort_thres =
 852                                     (*i1 * tcp->tcp_ka_cnt * 1000);
 853                         }
 854                         tcp->tcp_ka_rinterval = *i1 * 1000;
 855                         break;
 856                 case TCP_KEEPALIVE_ABORT_THRESHOLD:
 857                         if (!checkonly) {
 858                                 if (*i1 <
 859                                     tcps->tcps_keepalive_abort_interval_low ||
 860                                     *i1 >
 861                                     tcps->tcps_keepalive_abort_interval_high) {
 862                                         *outlenp = 0;
 863                                         return (EINVAL);
 864                                 }
 865                                 tcp->tcp_ka_abort_thres = *i1;
 866                                 tcp->tcp_ka_cnt = 0;
 867                                 tcp->tcp_ka_rinterval = 0;
 868                         }
 869                         break;
 870                 case TCP_CONGESTION: {
 871                         struct cc_algo *algo;
 872 
 873                         if (checkonly) {
 874                                 break;
 875                         }
 876 
 877                         /*
 878                          * Make sure the string is NUL-terminated. Some
 879                          * consumers pass only the number of characters
 880                          * in the string, and don't include the NUL
 881                          * terminator, so we set it for them.
 882                          */
 883                         if (inlen < CC_ALGO_NAME_MAX) {
 884                                 invalp[inlen] = '\0';
 885                         }
 886                         invalp[CC_ALGO_NAME_MAX - 1] = '\0';
 887 
 888                         if ((algo = cc_load_algo((char *)invalp)) == NULL) {
 889                                 return (ENOENT);
 890                         }
 891 
 892                         if (CC_ALGO(tcp)->cb_destroy != NULL) {
 893                                 CC_ALGO(tcp)->cb_destroy(&tcp->tcp_ccv);
 894                         }
 895 
 896                         CC_DATA(tcp) = NULL;
 897                         CC_ALGO(tcp) = algo;
 898 
 899                         if (CC_ALGO(tcp)->cb_init != NULL) {
 900                                 VERIFY0(CC_ALGO(tcp)->cb_init(&tcp->tcp_ccv));
 901                         }
 902 
 903                         break;
 904                 }
 905                 case TCP_CORK:
 906                         if (!checkonly) {
 907                                 /*
 908                                  * if tcp->tcp_cork was set and is now
 909                                  * being unset, we have to make sure that
 910                                  * the remaining data gets sent out. Also
 911                                  * unset tcp->tcp_cork so that tcp_wput_data()
 912                                  * can send data even if it is less than mss
 913                                  */
 914                                 if (tcp->tcp_cork && onoff == 0 &&
 915                                     tcp->tcp_unsent > 0) {
 916                                         tcp->tcp_cork = B_FALSE;
 917                                         tcp_wput_data(tcp, NULL, B_FALSE);
 918                                 }
 919                                 tcp->tcp_cork = onoff;
 920                         }
 921                         break;
 922                 case TCP_RTO_INITIAL:
 923                         if (checkonly || val == 0)
 924                                 break;
 925 
 926                         /*
 927                          * Sanity checks
 928                          *
 929                          * The initial RTO should be bounded by the minimum
 930                          * and maximum RTO.  And it should also be smaller
 931                          * than the connect attempt abort timeout.  Otherwise,
 932                          * the connection won't be aborted in a period
 933                          * reasonably close to that timeout.
 934                          */
 935                         if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max ||
 936                             val > tcp->tcp_second_ctimer_threshold ||
 937                             val < tcps->tcps_rexmit_interval_initial_low ||
 938                             val > tcps->tcps_rexmit_interval_initial_high) {
 939                                 *outlenp = 0;
 940                                 return (EINVAL);
 941                         }
 942                         tcp->tcp_rto_initial = val;
 943 
 944                         /*
 945                          * If TCP has not sent anything, need to re-calculate
 946                          * tcp_rto.  Otherwise, this option change does not
 947                          * really affect anything.
 948                          */
 949                         if (tcp->tcp_state >= TCPS_SYN_SENT)
 950                                 break;
 951 
 952                         tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2;
 953                         tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1;
 954                         tcp->tcp_rto = tcp_calculate_rto(tcp, tcps,
 955                             tcps->tcps_conn_grace_period);
 956                         break;
 957                 case TCP_RTO_MIN:
 958                         if (checkonly || val == 0)
 959                                 break;
 960 
 961                         if (val < tcps->tcps_rexmit_interval_min_low ||
 962                             val > tcps->tcps_rexmit_interval_min_high ||
 963                             val > tcp->tcp_rto_max) {
 964                                 *outlenp = 0;
 965                                 return (EINVAL);
 966                         }
 967                         tcp->tcp_rto_min = val;
 968                         if (tcp->tcp_rto < val)
 969                                 tcp->tcp_rto = val;
 970                         break;
 971                 case TCP_RTO_MAX:
 972                         if (checkonly || val == 0)
 973                                 break;
 974 
 975                         /*
 976                          * Sanity checks
 977                          *
 978                          * The maximum RTO should not be larger than the
 979                          * connection abort timeout.  Otherwise, the
 980                          * connection won't be aborted in a period reasonably
 981                          * close to that timeout.
 982                          */
 983                         if (val < tcps->tcps_rexmit_interval_max_low ||
 984                             val > tcps->tcps_rexmit_interval_max_high ||
 985                             val < tcp->tcp_rto_min ||
 986                             val > tcp->tcp_second_timer_threshold) {
 987                                 *outlenp = 0;
 988                                 return (EINVAL);
 989                         }
 990                         tcp->tcp_rto_max = val;
 991                         if (tcp->tcp_rto > val)
 992                                 tcp->tcp_rto = val;
 993                         break;
 994                 case TCP_LINGER2:
 995                         if (checkonly || *i1 == 0)
 996                                 break;
 997 
 998                         /*
 999                          * Note that the option value's unit is seconds.  And
1000                          * the value should be bigger than the private
1001                          * parameter tcp_fin_wait_2_flush_interval's lower
1002                          * bound and smaller than the current value of that
1003                          * parameter.  It should be smaller than the current
1004                          * value to avoid an app setting TCP_LINGER2 to a big
1005                          * value, causing resource to be held up too long in
1006                          * FIN-WAIT-2 state.
1007                          */
1008                         if (*i1 < 0 ||
1009                             tcps->tcps_fin_wait_2_flush_interval_low/SECONDS >
1010                             *i1 ||
1011                             tcps->tcps_fin_wait_2_flush_interval/SECONDS <
1012                             *i1) {
1013                                 *outlenp = 0;
1014                                 return (EINVAL);
1015                         }
1016                         tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS;
1017                         break;
1018                 default:
1019                         break;
1020                 }
1021                 break;
1022         case IPPROTO_IP:
1023                 if (connp->conn_family != AF_INET) {
1024                         *outlenp = 0;
1025                         return (EINVAL);
1026                 }
1027                 switch (name) {
1028                 case IP_SEC_OPT:
1029                         /*
1030                          * We should not allow policy setting after
1031                          * we start listening for connections.
1032                          */
1033                         if (tcp->tcp_state == TCPS_LISTEN) {
1034                                 return (EINVAL);
1035                         }
1036                         break;
1037                 case IP_RECVTOS:
1038                         if (!checkonly) {
1039                                 /*
1040                                  * Force it to be sent up with the next msg
1041                                  * by setting it to a value which cannot
1042                                  * appear in a packet (TOS is only 8-bits)
1043                                  */
1044                                 tcp->tcp_recvtos = 0xffffffffU;
1045                         }
1046                         break;
1047                 }
1048                 break;
1049         case IPPROTO_IPV6:
1050                 /*
1051                  * IPPROTO_IPV6 options are only supported for sockets
1052                  * that are using IPv6 on the wire.
1053                  */
1054                 if (connp->conn_ipversion != IPV6_VERSION) {
1055                         *outlenp = 0;
1056                         return (EINVAL);
1057                 }
1058 
1059                 switch (name) {
1060                 case IPV6_RECVPKTINFO:
1061                         if (!checkonly) {
1062                                 /* Force it to be sent up with the next msg */
1063                                 tcp->tcp_recvifindex = 0;
1064                         }
1065                         break;
1066                 case IPV6_RECVTCLASS:
1067                         if (!checkonly) {
1068                                 /* Force it to be sent up with the next msg */
1069                                 tcp->tcp_recvtclass = 0xffffffffU;
1070                         }
1071                         break;
1072                 case IPV6_RECVHOPLIMIT:
1073                         if (!checkonly) {
1074                                 /* Force it to be sent up with the next msg */
1075                                 tcp->tcp_recvhops = 0xffffffffU;
1076                         }
1077                         break;
1078                 case IPV6_PKTINFO:
1079                         /* This is an extra check for TCP */
1080                         if (inlen == sizeof (struct in6_pktinfo)) {
1081                                 struct in6_pktinfo *pkti;
1082 
1083                                 pkti = (struct in6_pktinfo *)invalp;
1084                                 /*
1085                                  * RFC 3542 states that ipi6_addr must be
1086                                  * the unspecified address when setting the
1087                                  * IPV6_PKTINFO sticky socket option on a
1088                                  * TCP socket.
1089                                  */
1090                                 if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
1091                                         return (EINVAL);
1092                         }
1093                         break;
1094                 case IPV6_SEC_OPT:
1095                         /*
1096                          * We should not allow policy setting after
1097                          * we start listening for connections.
1098                          */
1099                         if (tcp->tcp_state == TCPS_LISTEN) {
1100                                 return (EINVAL);
1101                         }
1102                         break;
1103                 }
1104                 break;
1105         }
1106         reterr = conn_opt_set(&coas, level, name, inlen, invalp,
1107             checkonly, cr);
1108         if (reterr != 0) {
1109                 *outlenp = 0;
1110                 return (reterr);
1111         }
1112 
1113         /*
1114          * Common case of OK return with outval same as inval
1115          */
1116         if (invalp != outvalp) {
1117                 /* don't trust bcopy for identical src/dst */
1118                 (void) bcopy(invalp, outvalp, inlen);
1119         }
1120         *outlenp = inlen;
1121 
1122         if (coas.coa_changed & COA_HEADER_CHANGED) {
1123                 /* If we are connected we rebuild the headers */
1124                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1125                     !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1126                         reterr = tcp_build_hdrs(tcp);
1127                         if (reterr != 0)
1128                                 return (reterr);
1129                 }
1130         }
1131         if (coas.coa_changed & COA_ROUTE_CHANGED) {
1132                 in6_addr_t nexthop;
1133 
1134                 /*
1135                  * If we are connected we re-cache the information.
1136                  * We ignore errors to preserve BSD behavior.
1137                  * Note that we don't redo IPsec policy lookup here
1138                  * since the final destination (or source) didn't change.
1139                  */
1140                 ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
1141                     &connp->conn_faddr_v6, &nexthop);
1142 
1143                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1144                     !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1145                         (void) ip_attr_connect(connp, connp->conn_ixa,
1146                             &connp->conn_laddr_v6, &connp->conn_faddr_v6,
1147                             &nexthop, connp->conn_fport, NULL, NULL,
1148                             IPDF_VERIFY_DST);
1149                 }
1150         }
1151         if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
1152                 connp->conn_wq->q_hiwat = connp->conn_sndbuf;
1153         }
1154         if (coas.coa_changed & COA_WROFF_CHANGED) {
1155                 connp->conn_wroff = connp->conn_ht_iphc_allocated +
1156                     tcps->tcps_wroff_xtra;
1157                 (void) proto_set_tx_wroff(connp->conn_rq, connp,
1158                     connp->conn_wroff);
1159         }
1160         if (coas.coa_changed & COA_OOBINLINE_CHANGED) {
1161                 if (IPCL_IS_NONSTR(connp))
1162                         proto_set_rx_oob_opt(connp, onoff);
1163         }
1164         return (0);
1165 }