1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  24  * Copyright 2019 Joyent, Inc.
  25  * Copyright (c) 2016 by Delphix. All rights reserved.
  26  */
  27 
  28 #include <sys/types.h>
  29 #include <sys/stream.h>
  30 #define _SUN_TPI_VERSION 2
  31 #include <sys/tihdr.h>
  32 #include <sys/socket.h>
  33 #include <sys/xti_xtiopt.h>
  34 #include <sys/xti_inet.h>
  35 #include <sys/policy.h>
  36 
  37 #include <inet/cc.h>
  38 #include <inet/common.h>
  39 #include <netinet/ip6.h>
  40 #include <inet/ip.h>
  41 
  42 #include <netinet/in.h>
  43 #include <netinet/tcp.h>
  44 #include <inet/optcom.h>
  45 #include <inet/proto_set.h>
  46 #include <inet/tcp_impl.h>
  47 
  48 static int      tcp_opt_default(queue_t *, int, int, uchar_t *);
  49 
  50 /*
  51  * Table of all known options handled on a TCP protocol stack.
  52  *
  53  * Note: This table contains options processed by both TCP and IP levels
  54  *       and is the superset of options that can be performed on a TCP over IP
  55  *       stack.
  56  */
  57 opdes_t tcp_opt_arr[] = {
  58 
  59 { SO_LINGER,    SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
  60         sizeof (struct linger), 0 },
  61 
  62 { SO_DEBUG,     SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  63 { SO_KEEPALIVE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  64 { SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  65 { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
  66         },
  67 { SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  68 { SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  69 { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  70 { SO_TYPE,      SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
  71 { SO_SNDBUF,    SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  72 { SO_RCVBUF,    SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  73 { SO_SNDTIMEO,  SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
  74         sizeof (struct timeval), 0 },
  75 { SO_RCVTIMEO,  SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
  76         sizeof (struct timeval), 0 },
  77 { SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
  78         },
  79 { SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  80 { SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
  81         0 },
  82 { SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
  83         0 },
  84 { SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
  85         0 },
  86 { SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
  87         0 },
  88 { SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  89 
  90 { SO_DOMAIN,    SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
  91 
  92 { SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
  93 
  94 { TCP_NODELAY,  IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
  95         },
  96 { TCP_MAXSEG,   IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t),
  97         536 },
  98 
  99 { TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
 100         OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
 101 
 102 { TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
 103         OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
 104 
 105 { TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
 106         OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
 107 
 108 { TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
 109         OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
 110 
 111 { TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
 112         0 },
 113 
 114 { TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0,
 115         sizeof (int), 0 },
 116 
 117 { TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
 118         },
 119 
 120 { TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0,
 121         sizeof (int), 0 },
 122 
 123 { TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
 124         sizeof (int), 0 },
 125 
 126 { TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 127 
 128 { TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 129 
 130 { TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 131 
 132 { TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
 133         sizeof (int), 0 },
 134 
 135 { TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 136 
 137 { TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
 138 
 139 { TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
 140 
 141 { TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
 142 
 143 { TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 144 
 145 { TCP_CONGESTION, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
 146         OP_VARLEN, CC_ALGO_NAME_MAX, 0 },
 147 
 148 { IP_OPTIONS,   IPPROTO_IP, OA_RW, OA_RW, OP_NP,
 149         (OP_VARLEN|OP_NODEFAULT),
 150         IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
 151 { T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
 152         (OP_VARLEN|OP_NODEFAULT),
 153         IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
 154 
 155 { IP_TOS,       IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 156 { T_IP_TOS,     IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 157 { IP_TTL,       IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
 158         sizeof (int), -1 /* not initialized */ },
 159 
 160 { IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
 161         sizeof (ipsec_req_t), -1 /* not initialized */ },
 162 
 163 { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
 164         sizeof (int),   0 /* no ifindex */ },
 165 
 166 { IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
 167         sizeof (int), 0 },
 168 
 169 { IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
 170         sizeof (int), -1 /* not initialized */ },
 171 
 172 { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 173         sizeof (int),   0 /* no ifindex */ },
 174 
 175 { IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 176 
 177 { IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
 178         sizeof (in_addr_t),     -1 /* not initialized  */ },
 179 
 180 { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
 181         sizeof (int), 0 },
 182 
 183 { IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 184         (OP_NODEFAULT|OP_VARLEN),
 185         sizeof (struct in6_pktinfo), -1 /* not initialized */ },
 186 { IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 187         OP_NODEFAULT,
 188         sizeof (sin6_t), -1 /* not initialized */ },
 189 { IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 190         (OP_VARLEN|OP_NODEFAULT), 255*8,
 191         -1 /* not initialized */ },
 192 { IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 193         (OP_VARLEN|OP_NODEFAULT), 255*8,
 194         -1 /* not initialized */ },
 195 { IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 196         (OP_VARLEN|OP_NODEFAULT), 255*8,
 197         -1 /* not initialized */ },
 198 { IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 199         (OP_VARLEN|OP_NODEFAULT), 255*8,
 200         -1 /* not initialized */ },
 201 { IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 202         OP_NODEFAULT,
 203         sizeof (int), -1 /* not initialized */ },
 204 { IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 205         OP_NODEFAULT,
 206         sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
 207 { IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 208         sizeof (int), 0 },
 209 { IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 210         sizeof (int), 0 },
 211 { IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 212         sizeof (int), 0 },
 213 
 214 /* Enable receipt of ancillary data */
 215 { IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 216         sizeof (int), 0 },
 217 { IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 218         sizeof (int), 0 },
 219 { IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 220         sizeof (int), 0 },
 221 { _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 222         sizeof (int), 0 },
 223 { IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 224         sizeof (int), 0 },
 225 { IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 226         sizeof (int), 0 },
 227 { IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 228         sizeof (int), 0 },
 229 { IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 230         sizeof (int), 0 },
 231 
 232 { IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
 233         sizeof (ipsec_req_t), -1 /* not initialized */ },
 234 { IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 235         sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
 236 };
 237 
 238 /*
 239  * Table of all supported levels
 240  * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have
 241  * any supported options so we need this info separately.
 242  *
 243  * This is needed only for topmost tpi providers and is used only by
 244  * XTI interfaces.
 245  */
 246 optlevel_t      tcp_valid_levels_arr[] = {
 247         XTI_GENERIC,
 248         SOL_SOCKET,
 249         IPPROTO_TCP,
 250         IPPROTO_IP,
 251         IPPROTO_IPV6
 252 };
 253 
 254 
 255 #define TCP_OPT_ARR_CNT         A_CNT(tcp_opt_arr)
 256 #define TCP_VALID_LEVELS_CNT    A_CNT(tcp_valid_levels_arr)
 257 
 258 uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */
 259 
 260 /*
 261  * Initialize option database object for TCP
 262  *
 263  * This object represents database of options to search passed to
 264  * {sock,tpi}optcom_req() interface routine to take care of option
 265  * management and associated methods.
 266  */
 267 
 268 optdb_obj_t tcp_opt_obj = {
 269         tcp_opt_default,        /* TCP default value function pointer */
 270         tcp_tpi_opt_get,        /* TCP get function pointer */
 271         tcp_tpi_opt_set,        /* TCP set function pointer */
 272         TCP_OPT_ARR_CNT,        /* TCP option database count of entries */
 273         tcp_opt_arr,            /* TCP option database */
 274         TCP_VALID_LEVELS_CNT,   /* TCP valid level count of entries */
 275         tcp_valid_levels_arr    /* TCP valid level array */
 276 };
 277 
 278 static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND;
 279 
 280 /*
 281  * Some TCP options can be "set" by requesting them in the option
 282  * buffer. This is needed for XTI feature test though we do not
 283  * allow it in general. We interpret that this mechanism is more
 284  * applicable to OSI protocols and need not be allowed in general.
 285  * This routine filters out options for which it is not allowed (most)
 286  * and lets through those (few) for which it is. [ The XTI interface
 287  * test suite specifics will imply that any XTI_GENERIC level XTI_* if
 288  * ever implemented will have to be allowed here ].
 289  */
 290 static boolean_t
 291 tcp_allow_connopt_set(int level, int name)
 292 {
 293 
 294         switch (level) {
 295         case IPPROTO_TCP:
 296                 switch (name) {
 297                 case TCP_NODELAY:
 298                         return (B_TRUE);
 299                 default:
 300                         return (B_FALSE);
 301                 }
 302                 /*NOTREACHED*/
 303         default:
 304                 return (B_FALSE);
 305         }
 306         /*NOTREACHED*/
 307 }
 308 
 309 /*
 310  * This routine gets default values of certain options whose default
 311  * values are maintained by protocol specific code
 312  */
 313 /* ARGSUSED */
 314 static int
 315 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
 316 {
 317         int32_t *i1 = (int32_t *)ptr;
 318         tcp_stack_t     *tcps = Q_TO_TCP(q)->tcp_tcps;
 319 
 320         switch (level) {
 321         case IPPROTO_TCP:
 322                 switch (name) {
 323                 case TCP_NOTIFY_THRESHOLD:
 324                         *i1 = tcps->tcps_ip_notify_interval;
 325                         break;
 326                 case TCP_ABORT_THRESHOLD:
 327                         *i1 = tcps->tcps_ip_abort_interval;
 328                         break;
 329                 case TCP_CONN_NOTIFY_THRESHOLD:
 330                         *i1 = tcps->tcps_ip_notify_cinterval;
 331                         break;
 332                 case TCP_CONN_ABORT_THRESHOLD:
 333                         *i1 = tcps->tcps_ip_abort_cinterval;
 334                         break;
 335                 default:
 336                         return (-1);
 337                 }
 338                 break;
 339         case IPPROTO_IP:
 340                 switch (name) {
 341                 case IP_TTL:
 342                         *i1 = tcps->tcps_ipv4_ttl;
 343                         break;
 344                 default:
 345                         return (-1);
 346                 }
 347                 break;
 348         case IPPROTO_IPV6:
 349                 switch (name) {
 350                 case IPV6_UNICAST_HOPS:
 351                         *i1 = tcps->tcps_ipv6_hoplimit;
 352                         break;
 353                 default:
 354                         return (-1);
 355                 }
 356                 break;
 357         default:
 358                 return (-1);
 359         }
 360         return (sizeof (int));
 361 }
 362 
 363 /*
 364  * TCP routine to get the values of options.
 365  */
 366 int
 367 tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
 368 {
 369         int             *i1 = (int *)ptr;
 370         tcp_t           *tcp = connp->conn_tcp;
 371         conn_opt_arg_t  coas;
 372         int             retval;
 373 
 374         coas.coa_connp = connp;
 375         coas.coa_ixa = connp->conn_ixa;
 376         coas.coa_ipp = &connp->conn_xmit_ipp;
 377         coas.coa_ancillary = B_FALSE;
 378         coas.coa_changed = 0;
 379 
 380         switch (level) {
 381         case SOL_SOCKET:
 382                 switch (name) {
 383                 case SO_SND_COPYAVOID:
 384                         *i1 = tcp->tcp_snd_zcopy_on ?
 385                             SO_SND_COPYAVOID : 0;
 386                         return (sizeof (int));
 387                 case SO_ACCEPTCONN:
 388                         *i1 = (tcp->tcp_state == TCPS_LISTEN);
 389                         return (sizeof (int));
 390                 }
 391                 break;
 392         case IPPROTO_TCP:
 393                 switch (name) {
 394                 case TCP_NODELAY:
 395                         *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
 396                         return (sizeof (int));
 397                 case TCP_MAXSEG:
 398                         *i1 = tcp->tcp_mss;
 399                         return (sizeof (int));
 400                 case TCP_NOTIFY_THRESHOLD:
 401                         *i1 = (int)tcp->tcp_first_timer_threshold;
 402                         return (sizeof (int));
 403                 case TCP_ABORT_THRESHOLD:
 404                         *i1 = tcp->tcp_second_timer_threshold;
 405                         return (sizeof (int));
 406                 case TCP_CONN_NOTIFY_THRESHOLD:
 407                         *i1 = tcp->tcp_first_ctimer_threshold;
 408                         return (sizeof (int));
 409                 case TCP_CONN_ABORT_THRESHOLD:
 410                         *i1 = tcp->tcp_second_ctimer_threshold;
 411                         return (sizeof (int));
 412                 case TCP_INIT_CWND:
 413                         *i1 = tcp->tcp_init_cwnd;
 414                         return (sizeof (int));
 415                 case TCP_KEEPALIVE_THRESHOLD:
 416                         *i1 = tcp->tcp_ka_interval;
 417                         return (sizeof (int));
 418 
 419                 /*
 420                  * TCP_KEEPIDLE expects value in seconds, but
 421                  * tcp_ka_interval is in milliseconds.
 422                  */
 423                 case TCP_KEEPIDLE:
 424                         *i1 = tcp->tcp_ka_interval / 1000;
 425                         return (sizeof (int));
 426                 case TCP_KEEPCNT:
 427                         *i1 = tcp->tcp_ka_cnt;
 428                         return (sizeof (int));
 429 
 430                 /*
 431                  * TCP_KEEPINTVL expects value in seconds, but
 432                  * tcp_ka_rinterval is in milliseconds.
 433                  */
 434                 case TCP_KEEPINTVL:
 435                         *i1 = tcp->tcp_ka_rinterval / 1000;
 436                         return (sizeof (int));
 437                 case TCP_KEEPALIVE_ABORT_THRESHOLD:
 438                         *i1 = tcp->tcp_ka_abort_thres;
 439                         return (sizeof (int));
 440                 case TCP_CONGESTION: {
 441                         size_t len = strlcpy((char *)ptr, CC_ALGO(tcp)->name,
 442                             CC_ALGO_NAME_MAX);
 443                         if (len >= CC_ALGO_NAME_MAX)
 444                                 return (-1);
 445                         return (len + 1);
 446                 }
 447                 case TCP_CORK:
 448                         *i1 = tcp->tcp_cork;
 449                         return (sizeof (int));
 450                 case TCP_RTO_INITIAL:
 451                         *i1 = tcp->tcp_rto_initial;
 452                         return (sizeof (uint32_t));
 453                 case TCP_RTO_MIN:
 454                         *i1 = tcp->tcp_rto_min;
 455                         return (sizeof (uint32_t));
 456                 case TCP_RTO_MAX:
 457                         *i1 = tcp->tcp_rto_max;
 458                         return (sizeof (uint32_t));
 459                 case TCP_LINGER2:
 460                         *i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS;
 461                         return (sizeof (int));
 462                 }
 463                 break;
 464         case IPPROTO_IP:
 465                 if (connp->conn_family != AF_INET)
 466                         return (-1);
 467                 switch (name) {
 468                 case IP_OPTIONS:
 469                 case T_IP_OPTIONS:
 470                         /* Caller ensures enough space */
 471                         return (ip_opt_get_user(connp, ptr));
 472                 default:
 473                         break;
 474                 }
 475                 break;
 476 
 477         case IPPROTO_IPV6:
 478                 /*
 479                  * IPPROTO_IPV6 options are only supported for sockets
 480                  * that are using IPv6 on the wire.
 481                  */
 482                 if (connp->conn_ipversion != IPV6_VERSION) {
 483                         return (-1);
 484                 }
 485                 switch (name) {
 486                 case IPV6_PATHMTU:
 487                         if (tcp->tcp_state < TCPS_ESTABLISHED)
 488                                 return (-1);
 489                         break;
 490                 }
 491                 break;
 492         }
 493         mutex_enter(&connp->conn_lock);
 494         retval = conn_opt_get(&coas, level, name, ptr);
 495         mutex_exit(&connp->conn_lock);
 496         return (retval);
 497 }
 498 
 499 /*
 500  * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
 501  * Parameters are assumed to be verified by the caller.
 502  */
 503 /* ARGSUSED */
 504 int
 505 tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
 506     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
 507     void *thisdg_attrs, cred_t *cr)
 508 {
 509         tcp_t   *tcp = connp->conn_tcp;
 510         int     *i1 = (int *)invalp;
 511         boolean_t onoff = (*i1 == 0) ? 0 : 1;
 512         boolean_t checkonly;
 513         int     reterr;
 514         tcp_stack_t     *tcps = tcp->tcp_tcps;
 515         conn_opt_arg_t  coas;
 516         uint32_t        val = *((uint32_t *)invalp);
 517 
 518         coas.coa_connp = connp;
 519         coas.coa_ixa = connp->conn_ixa;
 520         coas.coa_ipp = &connp->conn_xmit_ipp;
 521         coas.coa_ancillary = B_FALSE;
 522         coas.coa_changed = 0;
 523 
 524         switch (optset_context) {
 525         case SETFN_OPTCOM_CHECKONLY:
 526                 checkonly = B_TRUE;
 527                 /*
 528                  * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
 529                  * inlen != 0 implies value supplied and
 530                  *      we have to "pretend" to set it.
 531                  * inlen == 0 implies that there is no
 532                  *      value part in T_CHECK request and just validation
 533                  * done elsewhere should be enough, we just return here.
 534                  */
 535                 if (inlen == 0) {
 536                         *outlenp = 0;
 537                         return (0);
 538                 }
 539                 break;
 540         case SETFN_OPTCOM_NEGOTIATE:
 541                 checkonly = B_FALSE;
 542                 break;
 543         case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
 544         case SETFN_CONN_NEGOTIATE:
 545                 checkonly = B_FALSE;
 546                 /*
 547                  * Negotiating local and "association-related" options
 548                  * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
 549                  * primitives is allowed by XTI, but we choose
 550                  * to not implement this style negotiation for Internet
 551                  * protocols (We interpret it is a must for OSI world but
 552                  * optional for Internet protocols) for all options.
 553                  * [ Will do only for the few options that enable test
 554                  * suites that our XTI implementation of this feature
 555                  * works for transports that do allow it ]
 556                  */
 557                 if (!tcp_allow_connopt_set(level, name)) {
 558                         *outlenp = 0;
 559                         return (EINVAL);
 560                 }
 561                 break;
 562         default:
 563                 /*
 564                  * We should never get here
 565                  */
 566                 *outlenp = 0;
 567                 return (EINVAL);
 568         }
 569 
 570         ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
 571             (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
 572 
 573         /*
 574          * For TCP, we should have no ancillary data sent down
 575          * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
 576          * has to be zero.
 577          */
 578         ASSERT(thisdg_attrs == NULL);
 579 
 580         /*
 581          * For fixed length options, no sanity check
 582          * of passed in length is done. It is assumed *_optcom_req()
 583          * routines do the right thing.
 584          */
 585         switch (level) {
 586         case SOL_SOCKET:
 587                 switch (name) {
 588                 case SO_KEEPALIVE:
 589                         if (checkonly) {
 590                                 /* check only case */
 591                                 break;
 592                         }
 593 
 594                         if (!onoff) {
 595                                 if (connp->conn_keepalive) {
 596                                         if (tcp->tcp_ka_tid != 0) {
 597                                                 (void) TCP_TIMER_CANCEL(tcp,
 598                                                     tcp->tcp_ka_tid);
 599                                                 tcp->tcp_ka_tid = 0;
 600                                         }
 601                                         connp->conn_keepalive = 0;
 602                                 }
 603                                 break;
 604                         }
 605                         if (!connp->conn_keepalive) {
 606                                 /* Crank up the keepalive timer */
 607                                 tcp->tcp_ka_last_intrvl = 0;
 608                                 tcp->tcp_ka_tid = TCP_TIMER(tcp,
 609                                     tcp_keepalive_timer, tcp->tcp_ka_interval);
 610                                 connp->conn_keepalive = 1;
 611                         }
 612                         break;
 613                 case SO_SNDBUF: {
 614                         if (*i1 > tcps->tcps_max_buf) {
 615                                 *outlenp = 0;
 616                                 return (ENOBUFS);
 617                         }
 618                         if (checkonly)
 619                                 break;
 620 
 621                         connp->conn_sndbuf = *i1;
 622                         if (tcps->tcps_snd_lowat_fraction != 0) {
 623                                 connp->conn_sndlowat = connp->conn_sndbuf /
 624                                     tcps->tcps_snd_lowat_fraction;
 625                         }
 626                         (void) tcp_maxpsz_set(tcp, B_TRUE);
 627                         /*
 628                          * If we are flow-controlled, recheck the condition.
 629                          * There are apps that increase SO_SNDBUF size when
 630                          * flow-controlled (EWOULDBLOCK), and expect the flow
 631                          * control condition to be lifted right away.
 632                          */
 633                         mutex_enter(&tcp->tcp_non_sq_lock);
 634                         if (tcp->tcp_flow_stopped &&
 635                             TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) {
 636                                 tcp_clrqfull(tcp);
 637                         }
 638                         mutex_exit(&tcp->tcp_non_sq_lock);
 639                         *outlenp = inlen;
 640                         return (0);
 641                 }
 642                 case SO_RCVBUF:
 643                         if (*i1 > tcps->tcps_max_buf) {
 644                                 *outlenp = 0;
 645                                 return (ENOBUFS);
 646                         }
 647                         /* Silently ignore zero */
 648                         if (!checkonly && *i1 != 0) {
 649                                 *i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
 650                                 (void) tcp_rwnd_set(tcp, *i1);
 651                         }
 652                         /*
 653                          * XXX should we return the rwnd here
 654                          * and tcp_opt_get ?
 655                          */
 656                         *outlenp = inlen;
 657                         return (0);
 658                 case SO_SND_COPYAVOID:
 659                         if (!checkonly) {
 660                                 if (tcp->tcp_loopback ||
 661                                     (onoff != 1) || !tcp_zcopy_check(tcp)) {
 662                                         *outlenp = 0;
 663                                         return (EOPNOTSUPP);
 664                                 }
 665                                 tcp->tcp_snd_zcopy_aware = 1;
 666                         }
 667                         *outlenp = inlen;
 668                         return (0);
 669                 }
 670                 break;
 671         case IPPROTO_TCP:
 672                 switch (name) {
 673                 case TCP_NODELAY:
 674                         if (!checkonly)
 675                                 tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
 676                         break;
 677                 case TCP_NOTIFY_THRESHOLD:
 678                         if (!checkonly)
 679                                 tcp->tcp_first_timer_threshold = *i1;
 680                         break;
 681                 case TCP_ABORT_THRESHOLD:
 682                         if (!checkonly)
 683                                 tcp->tcp_second_timer_threshold = *i1;
 684                         break;
 685                 case TCP_CONN_NOTIFY_THRESHOLD:
 686                         if (!checkonly)
 687                                 tcp->tcp_first_ctimer_threshold = *i1;
 688                         break;
 689                 case TCP_CONN_ABORT_THRESHOLD:
 690                         if (!checkonly)
 691                                 tcp->tcp_second_ctimer_threshold = *i1;
 692                         break;
 693                 case TCP_RECVDSTADDR:
 694                         if (tcp->tcp_state > TCPS_LISTEN) {
 695                                 *outlenp = 0;
 696                                 return (EOPNOTSUPP);
 697                         }
 698                         /* Setting done in conn_opt_set */
 699                         break;
 700                 case TCP_INIT_CWND:
 701                         if (checkonly)
 702                                 break;
 703 
 704                         /*
 705                          * Only allow socket with network configuration
 706                          * privilege to set the initial cwnd to be larger
 707                          * than allowed by RFC 3390.
 708                          */
 709                         if (val > MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
 710                                 if ((reterr = secpolicy_ip_config(cr, B_TRUE))
 711                                     != 0) {
 712                                         *outlenp = 0;
 713                                         return (reterr);
 714                                 }
 715                                 if (val > tcp_max_init_cwnd) {
 716                                         *outlenp = 0;
 717                                         return (EINVAL);
 718                                 }
 719                         }
 720 
 721                         tcp->tcp_init_cwnd = val;
 722 
 723                         /*
 724                          * If the socket is connected, AND no outbound data
 725                          * has been sent, reset the actual cwnd values.
 726                          */
 727                         if (tcp->tcp_state == TCPS_ESTABLISHED &&
 728                             tcp->tcp_iss == tcp->tcp_snxt - 1) {
 729                                 tcp->tcp_cwnd =
 730                                     MIN(tcp->tcp_rwnd, val * tcp->tcp_mss);
 731                         }
 732                         break;
 733 
 734                 /*
 735                  * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD
 736                  * is in milliseconds. TCP_KEEPIDLE is introduced for
 737                  * compatibility with other Unix flavors.
 738                  * We can fall through TCP_KEEPALIVE_THRESHOLD logic after
 739                  * converting the input to milliseconds.
 740                  */
 741                 case TCP_KEEPIDLE:
 742                         *i1 *= 1000;
 743                         /* FALLTHRU */
 744 
 745                 case TCP_KEEPALIVE_THRESHOLD:
 746                         if (checkonly)
 747                                 break;
 748 
 749                         if (*i1 < tcps->tcps_keepalive_interval_low ||
 750                             *i1 > tcps->tcps_keepalive_interval_high) {
 751                                 *outlenp = 0;
 752                                 return (EINVAL);
 753                         }
 754                         if (*i1 != tcp->tcp_ka_interval) {
 755                                 tcp->tcp_ka_interval = *i1;
 756                                 /*
 757                                  * Check if we need to restart the
 758                                  * keepalive timer.
 759                                  */
 760                                 if (tcp->tcp_ka_tid != 0) {
 761                                         ASSERT(connp->conn_keepalive);
 762                                         (void) TCP_TIMER_CANCEL(tcp,
 763                                             tcp->tcp_ka_tid);
 764                                         tcp->tcp_ka_last_intrvl = 0;
 765                                         tcp->tcp_ka_tid = TCP_TIMER(tcp,
 766                                             tcp_keepalive_timer,
 767                                             tcp->tcp_ka_interval);
 768                                 }
 769                         }
 770                         break;
 771 
 772                 /*
 773                  * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt.
 774                  * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the
 775                  * three members - tcp_ka_abort_thres, tcp_ka_rinterval and
 776                  * tcp_ka_cnt.
 777                  */
 778                 case TCP_KEEPCNT:
 779                         if (checkonly)
 780                                 break;
 781 
 782                         if (*i1 == 0) {
 783                                 return (EINVAL);
 784                         } else if (tcp->tcp_ka_rinterval == 0) {
 785                                 /*
 786                                  * When TCP_KEEPCNT is specified without first
 787                                  * specifying a TCP_KEEPINTVL, we infer an
 788                                  * interval based on a tunable specific to our
 789                                  * stack: the tcp_keepalive_abort_interval.
 790                                  * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in
 791                                  * the unlikely event that that has been set.)
 792                                  * Given the abort interval's default value of
 793                                  * 480 seconds, low TCP_KEEPCNT values can
 794                                  * result in intervals that exceed the default
 795                                  * maximum RTO of 60 seconds.  Rather than
 796                                  * fail in these cases, we (implicitly) clamp
 797                                  * the interval at the maximum RTO; if the
 798                                  * TCP_KEEPCNT is shortly followed by a
 799                                  * TCP_KEEPINTVL (as we expect), the abort
 800                                  * threshold will be recalculated correctly --
 801                                  * and if a TCP_KEEPINTVL is not forthcoming,
 802                                  * keep-alive will at least operate reasonably
 803                                  * given the underconfigured state.
 804                                  */
 805                                 uint32_t interval;
 806 
 807                                 interval = tcp->tcp_ka_abort_thres / *i1;
 808 
 809                                 if (interval < tcp->tcp_rto_min)
 810                                         interval = tcp->tcp_rto_min;
 811 
 812                                 if (interval > tcp->tcp_rto_max)
 813                                         interval = tcp->tcp_rto_max;
 814 
 815                                 tcp->tcp_ka_rinterval = interval;
 816                         } else {
 817                                 if ((*i1 * tcp->tcp_ka_rinterval) <
 818                                     tcps->tcps_keepalive_abort_interval_low ||
 819                                     (*i1 * tcp->tcp_ka_rinterval) >
 820                                     tcps->tcps_keepalive_abort_interval_high)
 821                                         return (EINVAL);
 822                                 tcp->tcp_ka_abort_thres =
 823                                     (*i1 * tcp->tcp_ka_rinterval);
 824                         }
 825                         tcp->tcp_ka_cnt = *i1;
 826                         break;
 827                 case TCP_KEEPINTVL:
 828                         /*
 829                          * TCP_KEEPINTVL is specified in seconds, but
 830                          * tcp_ka_rinterval is in milliseconds.
 831                          */
 832 
 833                         if (checkonly)
 834                                 break;
 835 
 836                         if ((*i1 * 1000) < tcp->tcp_rto_min ||
 837                             (*i1 * 1000) > tcp->tcp_rto_max)
 838                                 return (EINVAL);
 839 
 840                         if (tcp->tcp_ka_cnt == 0) {
 841                                 tcp->tcp_ka_cnt =
 842                                     tcp->tcp_ka_abort_thres / (*i1 * 1000);
 843                         } else {
 844                                 if ((*i1 * tcp->tcp_ka_cnt * 1000) <
 845                                     tcps->tcps_keepalive_abort_interval_low ||
 846                                     (*i1 * tcp->tcp_ka_cnt * 1000) >
 847                                     tcps->tcps_keepalive_abort_interval_high)
 848                                         return (EINVAL);
 849                                 tcp->tcp_ka_abort_thres =
 850                                     (*i1 * tcp->tcp_ka_cnt * 1000);
 851                         }
 852                         tcp->tcp_ka_rinterval = *i1 * 1000;
 853                         break;
 854                 case TCP_KEEPALIVE_ABORT_THRESHOLD:
 855                         if (!checkonly) {
 856                                 if (*i1 <
 857                                     tcps->tcps_keepalive_abort_interval_low ||
 858                                     *i1 >
 859                                     tcps->tcps_keepalive_abort_interval_high) {
 860                                         *outlenp = 0;
 861                                         return (EINVAL);
 862                                 }
 863                                 tcp->tcp_ka_abort_thres = *i1;
 864                                 tcp->tcp_ka_cnt = 0;
 865                                 tcp->tcp_ka_rinterval = 0;
 866                         }
 867                         break;
 868                 case TCP_CONGESTION: {
 869                         struct cc_algo *algo;
 870 
 871                         if (checkonly) {
 872                                 break;
 873                         }
 874 
 875                         /*
 876                          * Make sure the string is NUL-terminated. Some
 877                          * consumers pass only the number of characters
 878                          * in the string, and don't include the NUL
 879                          * terminator, so we set it for them.
 880                          */
 881                         if (inlen < CC_ALGO_NAME_MAX) {
 882                                 invalp[inlen] = '\0';
 883                         }
 884                         invalp[CC_ALGO_NAME_MAX - 1] = '\0';
 885 
 886                         if ((algo = cc_load_algo((char *)invalp)) == NULL) {
 887                                 return (ENOENT);
 888                         }
 889 
 890                         if (CC_ALGO(tcp)->cb_destroy != NULL) {
 891                                 CC_ALGO(tcp)->cb_destroy(&tcp->tcp_ccv);
 892                         }
 893 
 894                         CC_DATA(tcp) = NULL;
 895                         CC_ALGO(tcp) = algo;
 896 
 897                         if (CC_ALGO(tcp)->cb_init != NULL) {
 898                                 VERIFY0(CC_ALGO(tcp)->cb_init(&tcp->tcp_ccv));
 899                         }
 900 
 901                         break;
 902                 }
 903                 case TCP_CORK:
 904                         if (!checkonly) {
 905                                 /*
 906                                  * if tcp->tcp_cork was set and is now
 907                                  * being unset, we have to make sure that
 908                                  * the remaining data gets sent out. Also
 909                                  * unset tcp->tcp_cork so that tcp_wput_data()
 910                                  * can send data even if it is less than mss
 911                                  */
 912                                 if (tcp->tcp_cork && onoff == 0 &&
 913                                     tcp->tcp_unsent > 0) {
 914                                         tcp->tcp_cork = B_FALSE;
 915                                         tcp_wput_data(tcp, NULL, B_FALSE);
 916                                 }
 917                                 tcp->tcp_cork = onoff;
 918                         }
 919                         break;
 920                 case TCP_RTO_INITIAL:
 921                         if (checkonly || val == 0)
 922                                 break;
 923 
 924                         /*
 925                          * Sanity checks
 926                          *
 927                          * The initial RTO should be bounded by the minimum
 928                          * and maximum RTO.  And it should also be smaller
 929                          * than the connect attempt abort timeout.  Otherwise,
 930                          * the connection won't be aborted in a period
 931                          * reasonably close to that timeout.
 932                          */
 933                         if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max ||
 934                             val > tcp->tcp_second_ctimer_threshold ||
 935                             val < tcps->tcps_rexmit_interval_initial_low ||
 936                             val > tcps->tcps_rexmit_interval_initial_high) {
 937                                 *outlenp = 0;
 938                                 return (EINVAL);
 939                         }
 940                         tcp->tcp_rto_initial = val;
 941 
 942                         /*
 943                          * If TCP has not sent anything, need to re-calculate
 944                          * tcp_rto.  Otherwise, this option change does not
 945                          * really affect anything.
 946                          */
 947                         if (tcp->tcp_state >= TCPS_SYN_SENT)
 948                                 break;
 949 
 950                         tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2;
 951                         tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1;
 952                         tcp->tcp_rto = tcp_calculate_rto(tcp, tcps,
 953                             tcps->tcps_conn_grace_period);
 954                         break;
 955                 case TCP_RTO_MIN:
 956                         if (checkonly || val == 0)
 957                                 break;
 958 
 959                         if (val < tcps->tcps_rexmit_interval_min_low ||
 960                             val > tcps->tcps_rexmit_interval_min_high ||
 961                             val > tcp->tcp_rto_max) {
 962                                 *outlenp = 0;
 963                                 return (EINVAL);
 964                         }
 965                         tcp->tcp_rto_min = val;
 966                         if (tcp->tcp_rto < val)
 967                                 tcp->tcp_rto = val;
 968                         break;
 969                 case TCP_RTO_MAX:
 970                         if (checkonly || val == 0)
 971                                 break;
 972 
 973                         /*
 974                          * Sanity checks
 975                          *
 976                          * The maximum RTO should not be larger than the
 977                          * connection abort timeout.  Otherwise, the
 978                          * connection won't be aborted in a period reasonably
 979                          * close to that timeout.
 980                          */
 981                         if (val < tcps->tcps_rexmit_interval_max_low ||
 982                             val > tcps->tcps_rexmit_interval_max_high ||
 983                             val < tcp->tcp_rto_min ||
 984                             val > tcp->tcp_second_timer_threshold) {
 985                                 *outlenp = 0;
 986                                 return (EINVAL);
 987                         }
 988                         tcp->tcp_rto_max = val;
 989                         if (tcp->tcp_rto > val)
 990                                 tcp->tcp_rto = val;
 991                         break;
 992                 case TCP_LINGER2:
 993                         if (checkonly || *i1 == 0)
 994                                 break;
 995 
 996                         /*
 997                          * Note that the option value's unit is seconds.  And
 998                          * the value should be bigger than the private
 999                          * parameter tcp_fin_wait_2_flush_interval's lower
1000                          * bound and smaller than the current value of that
1001                          * parameter.  It should be smaller than the current
1002                          * value to avoid an app setting TCP_LINGER2 to a big
1003                          * value, causing resources to be held up too long in
1004                          * FIN-WAIT-2 state.
1005                          */
1006                         if (*i1 < 0 ||
1007                             tcps->tcps_fin_wait_2_flush_interval_low/SECONDS >
1008                             *i1 ||
1009                             tcps->tcps_fin_wait_2_flush_interval/SECONDS <
1010                             *i1) {
1011                                 *outlenp = 0;
1012                                 return (EINVAL);
1013                         }
1014                         tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS;
1015                         break;
1016                 default:
1017                         break;
1018                 }
1019                 break;
1020         case IPPROTO_IP:
1021                 if (connp->conn_family != AF_INET) {
1022                         *outlenp = 0;
1023                         return (EINVAL);
1024                 }
1025                 switch (name) {
1026                 case IP_SEC_OPT:
1027                         /*
1028                          * We should not allow policy setting after
1029                          * we start listening for connections.
1030                          */
1031                         if (tcp->tcp_state == TCPS_LISTEN) {
1032                                 return (EINVAL);
1033                         }
1034                         break;
1035                 }
1036                 break;
1037         case IPPROTO_IPV6:
1038                 /*
1039                  * IPPROTO_IPV6 options are only supported for sockets
1040                  * that are using IPv6 on the wire.
1041                  */
1042                 if (connp->conn_ipversion != IPV6_VERSION) {
1043                         *outlenp = 0;
1044                         return (EINVAL);
1045                 }
1046 
1047                 switch (name) {
1048                 case IPV6_RECVPKTINFO:
1049                         if (!checkonly) {
1050                                 /* Force it to be sent up with the next msg */
1051                                 tcp->tcp_recvifindex = 0;
1052                         }
1053                         break;
1054                 case IPV6_RECVTCLASS:
1055                         if (!checkonly) {
1056                                 /* Force it to be sent up with the next msg */
1057                                 tcp->tcp_recvtclass = 0xffffffffU;
1058                         }
1059                         break;
1060                 case IPV6_RECVHOPLIMIT:
1061                         if (!checkonly) {
1062                                 /* Force it to be sent up with the next msg */
1063                                 tcp->tcp_recvhops = 0xffffffffU;
1064                         }
1065                         break;
1066                 case IPV6_PKTINFO:
1067                         /* This is an extra check for TCP */
1068                         if (inlen == sizeof (struct in6_pktinfo)) {
1069                                 struct in6_pktinfo *pkti;
1070 
1071                                 pkti = (struct in6_pktinfo *)invalp;
1072                                 /*
1073                                  * RFC 3542 states that ipi6_addr must be
1074                                  * the unspecified address when setting the
1075                                  * IPV6_PKTINFO sticky socket option on a
1076                                  * TCP socket.
1077                                  */
1078                                 if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
1079                                         return (EINVAL);
1080                         }
1081                         break;
1082                 case IPV6_SEC_OPT:
1083                         /*
1084                          * We should not allow policy setting after
1085                          * we start listening for connections.
1086                          */
1087                         if (tcp->tcp_state == TCPS_LISTEN) {
1088                                 return (EINVAL);
1089                         }
1090                         break;
1091                 }
1092                 break;
1093         }
1094         reterr = conn_opt_set(&coas, level, name, inlen, invalp,
1095             checkonly, cr);
1096         if (reterr != 0) {
1097                 *outlenp = 0;
1098                 return (reterr);
1099         }
1100 
1101         /*
1102          * Common case of OK return with outval same as inval
1103          */
1104         if (invalp != outvalp) {
1105                 /* don't trust bcopy for identical src/dst */
1106                 (void) bcopy(invalp, outvalp, inlen);
1107         }
1108         *outlenp = inlen;
1109 
1110         if (coas.coa_changed & COA_HEADER_CHANGED) {
1111                 /* If we are connected we rebuild the headers */
1112                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1113                     !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1114                         reterr = tcp_build_hdrs(tcp);
1115                         if (reterr != 0)
1116                                 return (reterr);
1117                 }
1118         }
1119         if (coas.coa_changed & COA_ROUTE_CHANGED) {
1120                 in6_addr_t nexthop;
1121 
1122                 /*
1123                  * If we are connected we re-cache the information.
1124                  * We ignore errors to preserve BSD behavior.
1125                  * Note that we don't redo IPsec policy lookup here
1126                  * since the final destination (or source) didn't change.
1127                  */
1128                 ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
1129                     &connp->conn_faddr_v6, &nexthop);
1130 
1131                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1132                     !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1133                         (void) ip_attr_connect(connp, connp->conn_ixa,
1134                             &connp->conn_laddr_v6, &connp->conn_faddr_v6,
1135                             &nexthop, connp->conn_fport, NULL, NULL,
1136                             IPDF_VERIFY_DST);
1137                 }
1138         }
1139         if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
1140                 connp->conn_wq->q_hiwat = connp->conn_sndbuf;
1141         }
1142         if (coas.coa_changed & COA_WROFF_CHANGED) {
1143                 connp->conn_wroff = connp->conn_ht_iphc_allocated +
1144                     tcps->tcps_wroff_xtra;
1145                 (void) proto_set_tx_wroff(connp->conn_rq, connp,
1146                     connp->conn_wroff);
1147         }
1148         if (coas.coa_changed & COA_OOBINLINE_CHANGED) {
1149                 if (IPCL_IS_NONSTR(connp))
1150                         proto_set_rx_oob_opt(connp, onoff);
1151         }
1152         return (0);
1153 }