1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2013 by Delphix. All rights reserved.
  24  */
  25 /* Copyright (c) 1990 Mentat Inc. */
  26 
  27 #include <sys/types.h>
  28 #include <sys/stream.h>
  29 #include <sys/stropts.h>
  30 #include <sys/strlog.h>
  31 #include <sys/strsun.h>
  32 #define _SUN_TPI_VERSION 2
  33 #include <sys/tihdr.h>
  34 #include <sys/timod.h>
  35 #include <sys/ddi.h>
  36 #include <sys/sunddi.h>
  37 #include <sys/strsubr.h>
  38 #include <sys/suntpi.h>
  39 #include <sys/xti_inet.h>
  40 #include <sys/cmn_err.h>
  41 #include <sys/kmem.h>
  42 #include <sys/cred.h>
  43 #include <sys/policy.h>
  44 #include <sys/priv.h>
  45 #include <sys/ucred.h>
  46 #include <sys/zone.h>
  47 
  48 #include <sys/sockio.h>
  49 #include <sys/socket.h>
  50 #include <sys/socketvar.h>
  51 #include <sys/vtrace.h>
  52 #include <sys/sdt.h>
  53 #include <sys/debug.h>
  54 #include <sys/isa_defs.h>
  55 #include <sys/random.h>
  56 #include <netinet/in.h>
  57 #include <netinet/ip6.h>
  58 #include <netinet/icmp6.h>
  59 #include <netinet/udp.h>
  60 
  61 #include <inet/common.h>
  62 #include <inet/ip.h>
  63 #include <inet/ip_impl.h>
  64 #include <inet/ipsec_impl.h>
  65 #include <inet/ip6.h>
  66 #include <inet/ip_ire.h>
  67 #include <inet/ip_if.h>
  68 #include <inet/ip_multi.h>
  69 #include <inet/ip_ndp.h>
  70 #include <inet/proto_set.h>
  71 #include <inet/mib2.h>
  72 #include <inet/nd.h>
  73 #include <inet/optcom.h>
  74 #include <inet/snmpcom.h>
  75 #include <inet/kstatcom.h>
  76 #include <inet/ipclassifier.h>
  77 
  78 #include <sys/tsol/label.h>
  79 #include <sys/tsol/tnet.h>
  80 
  81 #include <inet/rawip_impl.h>
  82 
  83 #include <sys/disp.h>
  84 
  85 /*
  86  * Synchronization notes:
  87  *
  88  * RAWIP is MT and uses the usual kernel synchronization primitives. We use
  89  * conn_lock to protect the icmp_t.
  90  *
  91  * Plumbing notes:
  92  * ICMP is always a device driver. For compatibility with mibopen() code
  93  * it is possible to I_PUSH "icmp", but that results in pushing a passthrough
  94  * dummy module.
  95  */
/*
 * Forward declarations.
 *
 * Everything here is file-local (static) except icmp_opt_set(),
 * icmp_opt_get(), rawip_getsockname() and rawip_getpeername(), which are
 * declared without static and therefore have external linkage.
 */
static void     icmp_addr_req(queue_t *q, mblk_t *mp);
static void     icmp_tpi_bind(queue_t *q, mblk_t *mp);
static void     icmp_bind_proto(icmp_t *icmp);
static int      icmp_build_hdr_template(conn_t *, const in6_addr_t *,
    const in6_addr_t *, uint32_t);
static void     icmp_capability_req(queue_t *q, mblk_t *mp);
static int      icmp_close(queue_t *q, int flags);
static void     icmp_close_free(conn_t *);
static void     icmp_tpi_connect(queue_t *q, mblk_t *mp);
static void     icmp_tpi_disconnect(queue_t *q, mblk_t *mp);
static void     icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
    int sys_error);
static void     icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
    t_scalar_t tlierr, int sys_error);
static void     icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2,
    ip_recv_attr_t *);
static void     icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp,
    ip_recv_attr_t *);
static void     icmp_info_req(queue_t *q, mblk_t *mp);
static void     icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
static conn_t   *icmp_open(int family, cred_t *credp, int *err, int flags);
static int      icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
                    cred_t *credp);
static int      icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
                    cred_t *credp);
static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
/* Option get/set: not static, so visible outside this file. */
int             icmp_opt_set(conn_t *connp, uint_t optset_context,
                    int level, int name, uint_t inlen,
                    uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
                    void *thisdg_attrs, cred_t *cr);
int             icmp_opt_get(conn_t *connp, int level, int name,
                    uchar_t *ptr);
static int      icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin,
                    sin6_t *sin6, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa);
static mblk_t   *icmp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *,
    const in6_addr_t *, const in6_addr_t *, uint32_t, mblk_t *, int *);
static mblk_t   *icmp_prepend_header_template(conn_t *, ip_xmit_attr_t *,
    mblk_t *, const in6_addr_t *, uint32_t, int *);
static int      icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
                    uchar_t *ptr, int len);
static void     icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
static void     icmp_tpi_unbind(queue_t *q, mblk_t *mp);
static void     icmp_wput(queue_t *q, mblk_t *mp);
static void     icmp_wput_fallback(queue_t *q, mblk_t *mp);
static void     icmp_wput_other(queue_t *q, mblk_t *mp);
static void     icmp_wput_iocdata(queue_t *q, mblk_t *mp);
static void     icmp_wput_restricted(queue_t *q, mblk_t *mp);
static void     icmp_ulp_recv(conn_t *, mblk_t *, uint_t);

/* Routines keyed by netstack id (presumably per-netstack setup/teardown). */
static void     *rawip_stack_init(netstackid_t stackid, netstack_t *ns);
static void     rawip_stack_fini(netstackid_t stackid, void *arg);

static void     *rawip_kstat_init(netstackid_t stackid);
static void     rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp);
static int      rawip_kstat_update(kstat_t *kp, int rw);
static void     rawip_stack_shutdown(netstackid_t stackid, void *arg);

/*
 * Common routines for TPI and socket module.  These take a conn_t rather
 * than a queue/message pair.
 */
static conn_t   *rawip_do_open(int, cred_t *, int *, int);
static void     rawip_do_close(conn_t *);
static int      rawip_do_bind(conn_t *, struct sockaddr *, socklen_t);
static int      rawip_do_unbind(conn_t *);
static int      rawip_do_connect(conn_t *, const struct sockaddr *, socklen_t,
    cred_t *, pid_t);

/* Socket name lookups: not static, so visible outside this file. */
int             rawip_getsockname(sock_lower_handle_t, struct sockaddr *,
                    socklen_t *, cred_t *);
int             rawip_getpeername(sock_lower_handle_t, struct sockaddr *,
                    socklen_t *, cred_t *);
 165 
/*
 * STREAMS module information referenced by each qinit defined below.
 * The initializer is positional; per module_info(9S) the order is
 * id number, name, min packet size, max packet size, high-water mark,
 * low-water mark.
 */
static struct module_info icmp_mod_info =  {
        5707, "icmp", 1, INFPSZ, 512, 128
};
 169 
 170 /*
 171  * Entry points for ICMP as a device.
 172  * We have separate open functions for the /dev/icmp and /dev/icmp6 devices.
 173  */
static struct qinit icmprinitv4 = {
        /*
         * Read side for the IPv4 device: no put or service routine here,
         * only the open (icmp_openv4) and close entry points plus the
         * shared module info.  Positional, in qinit(9S) field order.
         */
        NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info
};
 177 
static struct qinit icmprinitv6 = {
        /*
         * Read side for the IPv6 device: identical to icmprinitv4 except
         * that the open entry point is icmp_openv6.
         */
        NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info
};
 181 
/*
 * Write side, shared by both the IPv4 and IPv6 streamtabs below: icmp_wput
 * is the put routine and ip_wsrv the service routine.  No open/close entry
 * points are supplied on this side.
 */
static struct qinit icmpwinit = {
        (pfi_t)icmp_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, &icmp_mod_info
};
 185 
/*
 * ICMP write-side entry point during fallback.  Only a put routine
 * (icmp_wput_fallback) is supplied; there is no service routine.
 */
static struct qinit icmp_fallback_sock_winit = {
        (pfi_t)icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info
};
 190 
/*
 * For AF_INET aka /dev/icmp: pairs the IPv4 read-side qinit with the
 * common write-side qinit.
 */
struct streamtab icmpinfov4 = {
        &icmprinitv4, &icmpwinit
};
 195 
/*
 * For AF_INET6 aka /dev/icmp6: pairs the IPv6 read-side qinit with the
 * common write-side qinit.
 */
struct streamtab icmpinfov6 = {
        &icmprinitv6, &icmpwinit
};
 200 
/*
 * Default structure copied into T_INFO_ACK messages.  ADDR_size, OPT_size
 * and CURRENT_state are placeholders here, as noted on each field.
 */
static struct T_info_ack icmp_g_t_info_ack = {
        T_INFO_ACK,
        IP_MAXPACKET,    /* TSDU_size.  icmp allows maximum size messages. */
        T_INVALID,      /* ETSDU_size.  icmp does not support expedited data. */
        T_INVALID,      /* CDATA_size. icmp does not support connect data. */
        T_INVALID,      /* DDATA_size. icmp does not support disconnect data. */
        0,              /* ADDR_size - filled in later. */
        0,              /* OPT_size - not initialized here */
        IP_MAXPACKET,   /* TIDU_size.  icmp allows maximum size messages. */
        T_CLTS,         /* SERV_type.  icmp supports connection-less. */
        TS_UNBND,       /* CURRENT_state.  This is set from icmp_state. */
        (XPG4_1|SENDZERO) /* PROVIDER_flag */
};
 215 
 216 static int
 217 icmp_set_buf_prop(netstack_t *stack, cred_t *cr, mod_prop_info_t *pinfo,
 218     const char *ifname, const void *pval, uint_t flags)
 219 {
 220         return (mod_set_buf_prop(stack->netstack_icmp->is_propinfo_tbl,
 221             stack, cr, pinfo, ifname, pval, flags));
 222 }
 223 
 224 static int
 225 icmp_get_buf_prop(netstack_t *stack, mod_prop_info_t *pinfo, const char *ifname,
 226     void *val, uint_t psize, uint_t flags)
 227 {
 228         return (mod_get_buf_prop(stack->netstack_icmp->is_propinfo_tbl, stack,
 229             pinfo, ifname, val, psize, flags));
 230 }
 231 
 232 /*
 233  * All of these are alterable, within the min/max values given, at run time.
 234  *
 * Note: All those tunables which do not start with "_" are Committed and
 * therefore are public. See PSARC 2010/080.
 237  */
/*
 * The is_* accessor macros that follow this table index it by position, so
 * entries must not be reordered or inserted without updating those macros.
 * Each entry is: name, protocol, setter, getter, then two brace groups
 * (presumably {min, max, current} or {current} followed by {default} --
 * confirm against the mod_prop_info_t definition).
 */
static mod_prop_info_t icmp_propinfo_tbl[] = {
        /* tunable - 0 (is_wroff_extra) */
        { "_wroff_extra", MOD_PROTO_RAWIP,
            mod_set_uint32, mod_get_uint32,
            {0, 128, 32}, {32} },

        /* tunable - 1 (is_ipv4_ttl) */
        { "_ipv4_ttl", MOD_PROTO_RAWIP,
            mod_set_uint32, mod_get_uint32,
            {1, 255, 255}, {255} },

        /* tunable - 2 (is_ipv6_hoplimit) */
        { "_ipv6_hoplimit", MOD_PROTO_RAWIP,
            mod_set_uint32, mod_get_uint32,
            {0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS},
            {IPV6_DEFAULT_HOPS} },

        /* tunable - 3 (is_bsd_compat) */
        { "_bsd_compat", MOD_PROTO_RAWIP,
            mod_set_boolean, mod_get_boolean,
            {B_TRUE}, {B_TRUE} },

        /* tunable - 4 (is_xmit_hiwat); uses the ICMP-specific buf handlers */
        { "send_buf", MOD_PROTO_RAWIP,
            icmp_set_buf_prop, icmp_get_buf_prop,
            {4096, 65536, 8192}, {8192} },

        /* tunable - 5 (is_xmit_lowat) */
        { "_xmit_lowat", MOD_PROTO_RAWIP,
            mod_set_uint32, mod_get_uint32,
            {0, 65536, 1024}, {1024} },

        /* tunable - 6 (is_recv_hiwat); uses the ICMP-specific buf handlers */
        { "recv_buf", MOD_PROTO_RAWIP,
            icmp_set_buf_prop, icmp_get_buf_prop,
            {4096, 65536, 8192}, {8192} },

        /* tunable - 7 (is_max_buf) */
        { "max_buf", MOD_PROTO_RAWIP,
            mod_set_uint32, mod_get_uint32,
            {65536, ULP_MAX_BUF, 256*1024}, {256*1024} },

        /* tunable - 8 (is_pmtu_discovery) */
        { "_pmtu_discovery", MOD_PROTO_RAWIP,
            mod_set_boolean, mod_get_boolean,
            {B_FALSE}, {B_FALSE} },

        /* tunable - 9 (is_sendto_ignerr) */
        { "_sendto_ignerr", MOD_PROTO_RAWIP,
            mod_set_boolean, mod_get_boolean,
            {B_FALSE}, {B_FALSE} },

        /* "?" has no setter; its getter is mod_get_allprop */
        { "?", MOD_PROTO_RAWIP, NULL, mod_get_allprop, {0}, {0} },

        /* All-NULL entry, presumably the end-of-table marker */
        { NULL, 0, NULL, NULL, {0}, {0} }
};
 285 
/*
 * Shorthand for the current value of each tunable, for use as a member of
 * a structure that has an is_propinfo_tbl field.  The indices follow the
 * entry order of icmp_propinfo_tbl[] (0 = "_wroff_extra" ... 9 =
 * "_sendto_ignerr"); is_xmit_hiwat and is_recv_hiwat correspond to the
 * "send_buf" and "recv_buf" entries respectively.
 */
#define is_wroff_extra                  is_propinfo_tbl[0].prop_cur_uval
#define is_ipv4_ttl                     is_propinfo_tbl[1].prop_cur_uval
#define is_ipv6_hoplimit                is_propinfo_tbl[2].prop_cur_uval
#define is_bsd_compat                   is_propinfo_tbl[3].prop_cur_bval
#define is_xmit_hiwat                   is_propinfo_tbl[4].prop_cur_uval
#define is_xmit_lowat                   is_propinfo_tbl[5].prop_cur_uval
#define is_recv_hiwat                   is_propinfo_tbl[6].prop_cur_uval
#define is_max_buf                      is_propinfo_tbl[7].prop_cur_uval
#define is_pmtu_discovery               is_propinfo_tbl[8].prop_cur_bval
#define is_sendto_ignerr                is_propinfo_tbl[9].prop_cur_bval
 296 
/* Pointer to the union of TPI primitive structures. */
typedef union T_primitives *t_primp_t;
 298 
 299 /*
 300  * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message
 301  * passed to icmp_wput.
 302  * It calls IP to verify the local IP address, and calls IP to insert
 303  * the conn_t in the fanout table.
 304  * If everything is ok it then sends the T_BIND_ACK back up.
 305  */
 306 static void
 307 icmp_tpi_bind(queue_t *q, mblk_t *mp)
 308 {
 309         int     error;
 310         struct sockaddr *sa;
 311         struct T_bind_req *tbr;
 312         socklen_t       len;
 313         sin_t   *sin;
 314         sin6_t  *sin6;
 315         icmp_t          *icmp;
 316         conn_t  *connp = Q_TO_CONN(q);
 317         mblk_t *mp1;
 318         cred_t *cr;
 319 
 320         /*
 321          * All Solaris components should pass a db_credp
 322          * for this TPI message, hence we ASSERT.
 323          * But in case there is some other M_PROTO that looks
 324          * like a TPI message sent by some other kernel
 325          * component, we check and return an error.
 326          */
 327         cr = msg_getcred(mp, NULL);
 328         ASSERT(cr != NULL);
 329         if (cr == NULL) {
 330                 icmp_err_ack(q, mp, TSYSERR, EINVAL);
 331                 return;
 332         }
 333 
 334         icmp = connp->conn_icmp;
 335         if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
 336                 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
 337                     "icmp_bind: bad req, len %u",
 338                     (uint_t)(mp->b_wptr - mp->b_rptr));
 339                 icmp_err_ack(q, mp, TPROTO, 0);
 340                 return;
 341         }
 342 
 343         if (icmp->icmp_state != TS_UNBND) {
 344                 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
 345                     "icmp_bind: bad state, %u", icmp->icmp_state);
 346                 icmp_err_ack(q, mp, TOUTSTATE, 0);
 347                 return;
 348         }
 349 
 350         /*
 351          * Reallocate the message to make sure we have enough room for an
 352          * address.
 353          */
 354         mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
 355         if (mp1 == NULL) {
 356                 icmp_err_ack(q, mp, TSYSERR, ENOMEM);
 357                 return;
 358         }
 359         mp = mp1;
 360 
 361         /* Reset the message type in preparation for shipping it back. */
 362         DB_TYPE(mp) = M_PCPROTO;
 363         tbr = (struct T_bind_req *)mp->b_rptr;
 364         len = tbr->ADDR_length;
 365         switch (len) {
 366         case 0: /* request for a generic port */
 367                 tbr->ADDR_offset = sizeof (struct T_bind_req);
 368                 if (connp->conn_family == AF_INET) {
 369                         tbr->ADDR_length = sizeof (sin_t);
 370                         sin = (sin_t *)&tbr[1];
 371                         *sin = sin_null;
 372                         sin->sin_family = AF_INET;
 373                         mp->b_wptr = (uchar_t *)&sin[1];
 374                         sa = (struct sockaddr *)sin;
 375                         len = sizeof (sin_t);
 376                 } else {
 377                         ASSERT(connp->conn_family == AF_INET6);
 378                         tbr->ADDR_length = sizeof (sin6_t);
 379                         sin6 = (sin6_t *)&tbr[1];
 380                         *sin6 = sin6_null;
 381                         sin6->sin6_family = AF_INET6;
 382                         mp->b_wptr = (uchar_t *)&sin6[1];
 383                         sa = (struct sockaddr *)sin6;
 384                         len = sizeof (sin6_t);
 385                 }
 386                 break;
 387 
 388         case sizeof (sin_t):    /* Complete IPv4 address */
 389                 sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
 390                     sizeof (sin_t));
 391                 break;
 392 
 393         case sizeof (sin6_t):   /* Complete IPv6 address */
 394                 sa = (struct sockaddr *)mi_offset_param(mp,
 395                     tbr->ADDR_offset, sizeof (sin6_t));
 396                 break;
 397 
 398         default:
 399                 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
 400                     "icmp_bind: bad ADDR_length %u", tbr->ADDR_length);
 401                 icmp_err_ack(q, mp, TBADADDR, 0);
 402                 return;
 403         }
 404 
 405         error = rawip_do_bind(connp, sa, len);
 406         if (error != 0) {
 407                 if (error > 0) {
 408                         icmp_err_ack(q, mp, TSYSERR, error);
 409                 } else {
 410                         icmp_err_ack(q, mp, -error, 0);
 411                 }
 412         } else {
 413                 tbr->PRIM_type = T_BIND_ACK;
 414                 qreply(q, mp);
 415         }
 416 }
 417 
/*
 * Bind connp to the local address in sa, which is a sin_t or a sin6_t as
 * selected by len.
 *
 * Returns 0 on success.  Failures are reported either as a positive errno
 * (EINVAL, EAFNOSUPPORT, EADDRNOTAVAIL, or the error handed back by
 * icmp_build_hdr_template() or ip_laddr_fanout_insert()) or as the negated
 * TLI error -TOUTSTATE when the endpoint is not in TS_UNBND.
 *
 * conn_lock is held for the state check and the conn_t updates and is
 * dropped before ip_laddr_fanout_insert() is called.  Failures after the
 * conn_t has been modified unwind through late_error, which clears the
 * addresses, local port and state again and rebuilds the header template.
 */
static int
rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len)
{
        sin_t           *sin;
        sin6_t          *sin6;
        icmp_t          *icmp = connp->conn_icmp;
        int             error = 0;
        ip_laddr_t      laddr_type = IPVL_UNICAST_UP;   /* INADDR_ANY */
        in_port_t       lport;          /* Network byte order */
        ipaddr_t        v4src;          /* Set if AF_INET */
        in6_addr_t      v6src;
        uint_t          scopeid = 0;
        zoneid_t        zoneid = IPCL_ZONEID(connp);
        ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;

        if (sa == NULL || !OK_32PTR((char *)sa)) {
                return (EINVAL);
        }

        /*
         * Extract the address and port.  An unspecified address skips the
         * verification call and keeps the IPVL_UNICAST_UP default.
         */
        switch (len) {
        case sizeof (sin_t):    /* Complete IPv4 address */
                sin = (sin_t *)sa;
                if (sin->sin_family != AF_INET ||
                    connp->conn_family != AF_INET) {
                        /* TSYSERR, EAFNOSUPPORT */
                        return (EAFNOSUPPORT);
                }
                v4src = sin->sin_addr.s_addr;
                /* The conn_t stores addresses in IPv6 (v4-mapped) form. */
                IN6_IPADDR_TO_V4MAPPED(v4src, &v6src);
                if (v4src != INADDR_ANY) {
                        laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst,
                            B_TRUE);
                }
                lport = sin->sin_port;
                break;
        case sizeof (sin6_t): /* Complete IPv6 address */
                sin6 = (sin6_t *)sa;
                if (sin6->sin6_family != AF_INET6 ||
                    connp->conn_family != AF_INET6) {
                        /* TSYSERR, EAFNOSUPPORT */
                        return (EAFNOSUPPORT);
                }
                /* No support for mapped addresses on raw sockets */
                if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
                        /* TSYSERR, EADDRNOTAVAIL */
                        return (EADDRNOTAVAIL);
                }
                v6src = sin6->sin6_addr;
                if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
                        /* The scope id is only honored for link-scope */
                        if (IN6_IS_ADDR_LINKSCOPE(&v6src))
                                scopeid = sin6->sin6_scope_id;
                        laddr_type = ip_laddr_verify_v6(&v6src, zoneid, ipst,
                            B_TRUE, scopeid);
                }
                lport = sin6->sin6_port;
                break;

        default:
                /* TBADADDR */
                return (EADDRNOTAVAIL);
        }

        /* Is the local address a valid unicast, multicast, or broadcast? */
        if (laddr_type == IPVL_BAD)
                return (EADDRNOTAVAIL);

        /*
         * The state must be TS_UNBND.  It is checked under conn_lock, which
         * stays held while the conn_t is updated below.
         */
        mutex_enter(&connp->conn_lock);
        if (icmp->icmp_state != TS_UNBND) {
                mutex_exit(&connp->conn_lock);
                return (-TOUTSTATE);
        }

        /*
         * Copy the source address into our icmp structure.  This address
         * may still be zero; if so, ip will fill in the correct address
         * each time an outbound packet is passed to it.
         * If we are binding to a broadcast or multicast address then
         * we just set the conn_bound_addr since we don't want to use
         * that as the source address when sending.
         */
        connp->conn_bound_addr_v6 = v6src;
        connp->conn_laddr_v6 = v6src;
        if (scopeid != 0) {
                connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET;
                connp->conn_ixa->ixa_scopeid = scopeid;
                connp->conn_incoming_ifindex = scopeid;
        } else {
                connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
                connp->conn_incoming_ifindex = connp->conn_bound_if;
        }

        /* IPVL_BAD was rejected above, so these cases are the only ones. */
        switch (laddr_type) {
        case IPVL_UNICAST_UP:
        case IPVL_UNICAST_DOWN:
                connp->conn_saddr_v6 = v6src;
                connp->conn_mcbc_bind = B_FALSE;
                break;
        case IPVL_MCAST:
        case IPVL_BCAST:
                /* ip_set_destination will pick a source address later */
                connp->conn_saddr_v6 = ipv6_all_zeros;
                connp->conn_mcbc_bind = B_TRUE;
                break;
        }

        /* Any errors after this point should use late_error */

        /*
         * Use sin_port/sin6_port since applications like psh use SOCK_RAW
         * with IPPROTO_TCP.
         */
        connp->conn_lport = lport;
        connp->conn_fport = 0;

        if (connp->conn_family == AF_INET) {
                ASSERT(connp->conn_ipversion == IPV4_VERSION);
        } else {
                ASSERT(connp->conn_ipversion == IPV6_VERSION);
        }

        icmp->icmp_state = TS_IDLE;

        /*
         * We create an initial header template here to make a subsequent
         * sendto have a starting point. Since conn_last_dst is zero the
         * first sendto will always follow the 'dst changed' code path.
         * Note that we defer massaging options and the related checksum
         * adjustment until we have a destination address.
         */
        error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
            &connp->conn_faddr_v6, connp->conn_flowinfo);
        if (error != 0) {
                /* late_error re-takes conn_lock, so drop it first. */
                mutex_exit(&connp->conn_lock);
                goto late_error;
        }
        /* Just in case */
        connp->conn_faddr_v6 = ipv6_all_zeros;
        connp->conn_v6lastdst = ipv6_all_zeros;
        mutex_exit(&connp->conn_lock);

        /* Called without conn_lock held. */
        error = ip_laddr_fanout_insert(connp);
        if (error != 0)
                goto late_error;

        /* Bind succeeded */
        return (0);

late_error:
        /* Entered with conn_lock dropped; undo the conn_t changes above. */
        mutex_enter(&connp->conn_lock);
        connp->conn_saddr_v6 = ipv6_all_zeros;
        connp->conn_bound_addr_v6 = ipv6_all_zeros;
        connp->conn_laddr_v6 = ipv6_all_zeros;
        if (scopeid != 0) {
                connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
                connp->conn_incoming_ifindex = connp->conn_bound_if;
        }
        icmp->icmp_state = TS_UNBND;
        connp->conn_v6lastdst = ipv6_all_zeros;
        connp->conn_lport = 0;

        /* Restore the header that was built above - different source address */
        (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
            &connp->conn_faddr_v6, connp->conn_flowinfo);
        mutex_exit(&connp->conn_lock);
        return (error);
}
 587 
 588 /*
 589  * Tell IP to just bind to the protocol.
 590  */
 591 static void
 592 icmp_bind_proto(icmp_t *icmp)
 593 {
 594         conn_t  *connp = icmp->icmp_connp;
 595 
 596         mutex_enter(&connp->conn_lock);
 597         connp->conn_saddr_v6 = ipv6_all_zeros;
 598         connp->conn_laddr_v6 = ipv6_all_zeros;
 599         connp->conn_faddr_v6 = ipv6_all_zeros;
 600         connp->conn_v6lastdst = ipv6_all_zeros;
 601         mutex_exit(&connp->conn_lock);
 602 
 603         (void) ip_laddr_fanout_insert(connp);
 604 }
 605 
 606 /*
 607  * This routine handles each T_CONN_REQ message passed to icmp.  It
 608  * associates a default destination address with the stream.
 609  *
 610  * After various error checks are completed, icmp_connect() lays
 611  * the target address and port into the composite header template.
 612  * Then we ask IP for information, including a source address if we didn't
 613  * already have one. Finally we send up the T_OK_ACK reply message.
 614  */
 615 static void
 616 icmp_tpi_connect(queue_t *q, mblk_t *mp)
 617 {
 618         conn_t  *connp = Q_TO_CONN(q);
 619         struct T_conn_req       *tcr;
 620         struct sockaddr *sa;
 621         socklen_t len;
 622         int error;
 623         cred_t *cr;
 624         pid_t pid;
 625         /*
 626          * All Solaris components should pass a db_credp
 627          * for this TPI message, hence we ASSERT.
 628          * But in case there is some other M_PROTO that looks
 629          * like a TPI message sent by some other kernel
 630          * component, we check and return an error.
 631          */
 632         cr = msg_getcred(mp, &pid);
 633         ASSERT(cr != NULL);
 634         if (cr == NULL) {
 635                 icmp_err_ack(q, mp, TSYSERR, EINVAL);
 636                 return;
 637         }
 638 
 639         tcr = (struct T_conn_req *)mp->b_rptr;
 640         /* Sanity checks */
 641         if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) {
 642                 icmp_err_ack(q, mp, TPROTO, 0);
 643                 return;
 644         }
 645 
 646         if (tcr->OPT_length != 0) {
 647                 icmp_err_ack(q, mp, TBADOPT, 0);
 648                 return;
 649         }
 650 
 651         len = tcr->DEST_length;
 652 
 653         switch (len) {
 654         default:
 655                 icmp_err_ack(q, mp, TBADADDR, 0);
 656                 return;
 657         case sizeof (sin_t):
 658                 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
 659                     sizeof (sin_t));
 660                 break;
 661         case sizeof (sin6_t):
 662                 sa = (struct sockaddr *)mi_offset_param(mp,
 663                     tcr->DEST_offset, sizeof (sin6_t));
 664                 break;
 665         }
 666 
 667         error = proto_verify_ip_addr(connp->conn_family, sa, len);
 668         if (error != 0) {
 669                 icmp_err_ack(q, mp, TSYSERR, error);
 670                 return;
 671         }
 672 
 673         error = rawip_do_connect(connp, sa, len, cr, pid);
 674         if (error != 0) {
 675                 if (error < 0) {
 676                         icmp_err_ack(q, mp, -error, 0);
 677                 } else {
 678                         icmp_err_ack(q, mp, 0, error);
 679                 }
 680         } else {
 681                 mblk_t *mp1;
 682 
 683                 /*
 684                  * We have to send a connection confirmation to
 685                  * keep TLI happy.
 686                  */
 687                 if (connp->conn_family == AF_INET) {
 688                         mp1 = mi_tpi_conn_con(NULL, (char *)sa,
 689                             sizeof (sin_t), NULL, 0);
 690                 } else {
 691                         ASSERT(connp->conn_family == AF_INET6);
 692                         mp1 = mi_tpi_conn_con(NULL, (char *)sa,
 693                             sizeof (sin6_t), NULL, 0);
 694                 }
 695                 if (mp1 == NULL) {
 696                         icmp_err_ack(q, mp, TSYSERR, ENOMEM);
 697                         return;
 698                 }
 699 
 700                 /*
 701                  * Send ok_ack for T_CONN_REQ
 702                  */
 703                 mp = mi_tpi_ok_ack_alloc(mp);
 704                 if (mp == NULL) {
 705                         /* Unable to reuse the T_CONN_REQ for the ack. */
 706                         icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM);
 707                         return;
 708                 }
 709                 putnext(connp->conn_rq, mp);
 710                 putnext(connp->conn_rq, mp1);
 711         }
 712 }
 713 
 714 static int
 715 rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
 716     cred_t *cr, pid_t pid)
 717 {
 718         icmp_t          *icmp;
 719         sin_t           *sin;
 720         sin6_t          *sin6;
 721         int             error;
 722         uint16_t        dstport;
 723         ipaddr_t        v4dst;
 724         in6_addr_t      v6dst;
 725         uint32_t        flowinfo;
 726         ip_xmit_attr_t  *ixa;
 727         ip_xmit_attr_t  *oldixa;
 728         uint_t          scopeid = 0;
 729         uint_t          srcid = 0;
 730         in6_addr_t      v6src = connp->conn_saddr_v6;
 731 
 732         icmp = connp->conn_icmp;
 733 
 734         if (sa == NULL || !OK_32PTR((char *)sa)) {
 735                 return (EINVAL);
 736         }
 737 
 738         ASSERT(sa != NULL && len != 0);
 739 
 740         /*
 741          * Determine packet type based on type of address passed in
 742          * the request should contain an IPv4 or IPv6 address.
 743          * Make sure that address family matches the type of
 744          * family of the address passed down.
 745          */
 746         switch (len) {
 747         case sizeof (sin_t):
 748                 sin = (sin_t *)sa;
 749 
 750                 v4dst = sin->sin_addr.s_addr;
 751                 dstport = sin->sin_port;
 752                 IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
 753                 ASSERT(connp->conn_ipversion == IPV4_VERSION);
 754                 break;
 755 
 756         case sizeof (sin6_t):
 757                 sin6 = (sin6_t *)sa;
 758 
 759                 /* No support for mapped addresses on raw sockets */
 760                 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 761                         return (EADDRNOTAVAIL);
 762                 }
 763                 v6dst = sin6->sin6_addr;
 764                 dstport = sin6->sin6_port;
 765                 ASSERT(connp->conn_ipversion == IPV6_VERSION);
 766                 flowinfo = sin6->sin6_flowinfo;
 767                 if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))
 768                         scopeid = sin6->sin6_scope_id;
 769                 srcid = sin6->__sin6_src_id;
 770                 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
 771                         ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
 772                             connp->conn_netstack);
 773                 }
 774                 break;
 775         }
 776 
 777         /*
 778          * If there is a different thread using conn_ixa then we get a new
 779          * copy and cut the old one loose from conn_ixa. Otherwise we use
 780          * conn_ixa and prevent any other thread from using/changing it.
 781          * Once connect() is done other threads can use conn_ixa since the
 782          * refcnt will be back at one.
 783          * We defer updating conn_ixa until later to handle any concurrent
 784          * conn_ixa_cleanup thread.
 785          */
 786         ixa = conn_get_ixa(connp, B_FALSE);
 787         if (ixa == NULL)
 788                 return (ENOMEM);
 789 
 790         mutex_enter(&connp->conn_lock);
 791         /*
 792          * This icmp_t must have bound already before doing a connect.
 793          * Reject if a connect is in progress (we drop conn_lock during
 794          * rawip_do_connect).
 795          */
 796         if (icmp->icmp_state == TS_UNBND || icmp->icmp_state == TS_WCON_CREQ) {
 797                 mutex_exit(&connp->conn_lock);
 798                 ixa_refrele(ixa);
 799                 return (-TOUTSTATE);
 800         }
 801 
 802         if (icmp->icmp_state == TS_DATA_XFER) {
 803                 /* Already connected - clear out state */
 804                 if (connp->conn_mcbc_bind)
 805                         connp->conn_saddr_v6 = ipv6_all_zeros;
 806                 else
 807                         connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
 808                 connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
 809                 connp->conn_faddr_v6 = ipv6_all_zeros;
 810                 icmp->icmp_state = TS_IDLE;
 811         }
 812 
 813         /*
 814          * Use sin_port/sin6_port since applications like psh use SOCK_RAW
 815          * with IPPROTO_TCP.
 816          */
 817         connp->conn_fport = dstport;
 818         if (connp->conn_ipversion == IPV4_VERSION) {
 819                 /*
 820                  * Interpret a zero destination to mean loopback.
 821                  * Update the T_CONN_REQ (sin/sin6) since it is used to
 822                  * generate the T_CONN_CON.
 823                  */
 824                 if (v4dst == INADDR_ANY) {
 825                         v4dst = htonl(INADDR_LOOPBACK);
 826                         IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
 827                         ASSERT(connp->conn_family == AF_INET);
 828                         sin->sin_addr.s_addr = v4dst;
 829                 }
 830                 connp->conn_faddr_v6 = v6dst;
 831                 connp->conn_flowinfo = 0;
 832         } else {
 833                 ASSERT(connp->conn_ipversion == IPV6_VERSION);
 834                 /*
 835                  * Interpret a zero destination to mean loopback.
 836                  * Update the T_CONN_REQ (sin/sin6) since it is used to
 837                  * generate the T_CONN_CON.
 838                  */
 839                 if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) {
 840                         v6dst = ipv6_loopback;
 841                         sin6->sin6_addr = v6dst;
 842                 }
 843                 connp->conn_faddr_v6 = v6dst;
 844                 connp->conn_flowinfo = flowinfo;
 845         }
 846 
 847         /*
 848          * We update our cred/cpid based on the caller of connect
 849          */
 850         if (connp->conn_cred != cr) {
 851                 crhold(cr);
 852                 crfree(connp->conn_cred);
 853                 connp->conn_cred = cr;
 854         }
 855         connp->conn_cpid = pid;
 856         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
 857         ixa->ixa_cred = cr;
 858         ixa->ixa_cpid = pid;
 859         if (is_system_labeled()) {
 860                 /* We need to restart with a label based on the cred */
 861                 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
 862         }
 863 
 864         if (scopeid != 0) {
 865                 ixa->ixa_flags |= IXAF_SCOPEID_SET;
 866                 ixa->ixa_scopeid = scopeid;
 867                 connp->conn_incoming_ifindex = scopeid;
 868         } else {
 869                 ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
 870                 connp->conn_incoming_ifindex = connp->conn_bound_if;
 871         }
 872 
 873         /*
 874          * conn_connect will drop conn_lock and reacquire it.
 875          * To prevent a send* from messing with this icmp_t while the lock
 876          * is dropped we set icmp_state and clear conn_v6lastdst.
 877          * That will make all send* fail with EISCONN.
 878          */
 879         connp->conn_v6lastdst = ipv6_all_zeros;
 880         icmp->icmp_state = TS_WCON_CREQ;
 881 
 882         error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC);
 883         mutex_exit(&connp->conn_lock);
 884         if (error != 0)
 885                 goto connect_failed;
 886 
 887         /*
 888          * The addresses have been verified. Time to insert in
 889          * the correct fanout list.
 890          */
 891         error = ipcl_conn_insert(connp);
 892         if (error != 0)
 893                 goto connect_failed;
 894 
 895         mutex_enter(&connp->conn_lock);
 896         error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
 897             &connp->conn_faddr_v6, connp->conn_flowinfo);
 898         if (error != 0) {
 899                 mutex_exit(&connp->conn_lock);
 900                 goto connect_failed;
 901         }
 902 
 903         icmp->icmp_state = TS_DATA_XFER;
 904         /* Record this as the "last" send even though we haven't sent any */
 905         connp->conn_v6lastdst = connp->conn_faddr_v6;
 906         connp->conn_lastipversion = connp->conn_ipversion;
 907         connp->conn_lastdstport = connp->conn_fport;
 908         connp->conn_lastflowinfo = connp->conn_flowinfo;
 909         connp->conn_lastscopeid = scopeid;
 910         connp->conn_lastsrcid = srcid;
 911         /* Also remember a source to use together with lastdst */
 912         connp->conn_v6lastsrc = v6src;
 913 
 914         oldixa = conn_replace_ixa(connp, ixa);
 915         mutex_exit(&connp->conn_lock);
 916         ixa_refrele(oldixa);
 917 
 918         ixa_refrele(ixa);
 919         return (0);
 920 
 921 connect_failed:
 922         if (ixa != NULL)
 923                 ixa_refrele(ixa);
 924         mutex_enter(&connp->conn_lock);
 925         icmp->icmp_state = TS_IDLE;
 926         /* In case the source address was set above */
 927         if (connp->conn_mcbc_bind)
 928                 connp->conn_saddr_v6 = ipv6_all_zeros;
 929         else
 930                 connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
 931         connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
 932         connp->conn_faddr_v6 = ipv6_all_zeros;
 933         connp->conn_v6lastdst = ipv6_all_zeros;
 934         connp->conn_flowinfo = 0;
 935 
 936         (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
 937             &connp->conn_faddr_v6, connp->conn_flowinfo);
 938         mutex_exit(&connp->conn_lock);
 939         return (error);
 940 }
 941 
 942 static void
 943 rawip_do_close(conn_t *connp)
 944 {
 945         ASSERT(connp != NULL && IPCL_IS_RAWIP(connp));
 946 
 947         ip_quiesce_conn(connp);
 948 
 949         if (!IPCL_IS_NONSTR(connp)) {
 950                 qprocsoff(connp->conn_rq);
 951         }
 952 
 953         icmp_close_free(connp);
 954 
 955         /*
 956          * Now we are truly single threaded on this stream, and can
 957          * delete the things hanging off the connp, and finally the connp.
 958          * We removed this connp from the fanout list, it cannot be
 959          * accessed thru the fanouts, and we already waited for the
 960          * conn_ref to drop to 0. We are already in close, so
 961          * there cannot be any other thread from the top. qprocsoff
 962          * has completed, and service has completed or won't run in
 963          * future.
 964          */
 965         ASSERT(connp->conn_ref == 1);
 966 
 967         if (!IPCL_IS_NONSTR(connp)) {
 968                 inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
 969         } else {
 970                 ip_free_helper_stream(connp);
 971         }
 972 
 973         connp->conn_ref--;
 974         ipcl_conn_destroy(connp);
 975 }
 976 
 977 static int
 978 icmp_close(queue_t *q, int flags)
 979 {
 980         conn_t  *connp;
 981 
 982         if (flags & SO_FALLBACK) {
 983                 /*
 984                  * stream is being closed while in fallback
 985                  * simply free the resources that were allocated
 986                  */
 987                 inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
 988                 qprocsoff(q);
 989                 goto done;
 990         }
 991 
 992         connp = Q_TO_CONN(q);
 993         (void) rawip_do_close(connp);
 994 done:
 995         q->q_ptr = WR(q)->q_ptr = NULL;
 996         return (0);
 997 }
 998 
 999 static void
1000 icmp_close_free(conn_t *connp)
1001 {
1002         icmp_t *icmp = connp->conn_icmp;
1003 
1004         if (icmp->icmp_filter != NULL) {
1005                 kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t));
1006                 icmp->icmp_filter = NULL;
1007         }
1008 
1009         /*
1010          * Clear any fields which the kmem_cache constructor clears.
1011          * Only icmp_connp needs to be preserved.
1012          * TBD: We should make this more efficient to avoid clearing
1013          * everything.
1014          */
1015         ASSERT(icmp->icmp_connp == connp);
1016         bzero(icmp, sizeof (icmp_t));
1017         icmp->icmp_connp = connp;
1018 }
1019 
1020 /*
1021  * This routine handles each T_DISCON_REQ message passed to icmp
1022  * as an indicating that ICMP is no longer connected. This results
1023  * in telling IP to restore the binding to just the local address.
1024  */
1025 static int
1026 icmp_do_disconnect(conn_t *connp)
1027 {
1028         icmp_t  *icmp = connp->conn_icmp;
1029         int     error;
1030 
1031         mutex_enter(&connp->conn_lock);
1032         if (icmp->icmp_state != TS_DATA_XFER) {
1033                 mutex_exit(&connp->conn_lock);
1034                 return (-TOUTSTATE);
1035         }
1036         if (connp->conn_mcbc_bind)
1037                 connp->conn_saddr_v6 = ipv6_all_zeros;
1038         else
1039                 connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
1040         connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
1041         connp->conn_faddr_v6 = ipv6_all_zeros;
1042         icmp->icmp_state = TS_IDLE;
1043 
1044         connp->conn_v6lastdst = ipv6_all_zeros;
1045         error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
1046             &connp->conn_faddr_v6, connp->conn_flowinfo);
1047         mutex_exit(&connp->conn_lock);
1048         if (error != 0)
1049                 return (error);
1050 
1051         /*
1052          * Tell IP to remove the full binding and revert
1053          * to the local address binding.
1054          */
1055         return (ip_laddr_fanout_insert(connp));
1056 }
1057 
1058 static void
1059 icmp_tpi_disconnect(queue_t *q, mblk_t *mp)
1060 {
1061         conn_t  *connp = Q_TO_CONN(q);
1062         int     error;
1063 
1064         /*
1065          * Allocate the largest primitive we need to send back
1066          * T_error_ack is > than T_ok_ack
1067          */
1068         mp = reallocb(mp, sizeof (struct T_error_ack), 1);
1069         if (mp == NULL) {
1070                 /* Unable to reuse the T_DISCON_REQ for the ack. */
1071                 icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM);
1072                 return;
1073         }
1074 
1075         error = icmp_do_disconnect(connp);
1076 
1077         if (error != 0) {
1078                 if (error > 0) {
1079                         icmp_err_ack(q, mp, 0, error);
1080                 } else {
1081                         icmp_err_ack(q, mp, -error, 0);
1082                 }
1083         } else {
1084                 mp = mi_tpi_ok_ack_alloc(mp);
1085                 ASSERT(mp != NULL);
1086                 qreply(q, mp);
1087         }
1088 }
1089 
1090 static int
1091 icmp_disconnect(conn_t *connp)
1092 {
1093         int     error;
1094 
1095         connp->conn_dgram_errind = B_FALSE;
1096 
1097         error = icmp_do_disconnect(connp);
1098 
1099         if (error < 0)
1100                 error = proto_tlitosyserr(-error);
1101         return (error);
1102 }
1103 
1104 /* This routine creates a T_ERROR_ACK message and passes it upstream. */
1105 static void
1106 icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
1107 {
1108         if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
1109                 qreply(q, mp);
1110 }
1111 
1112 /* Shorthand to generate and send TPI error acks to our client */
1113 static void
1114 icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
1115     t_scalar_t t_error, int sys_error)
1116 {
1117         struct T_error_ack      *teackp;
1118 
1119         if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
1120             M_PCPROTO, T_ERROR_ACK)) != NULL) {
1121                 teackp = (struct T_error_ack *)mp->b_rptr;
1122                 teackp->ERROR_prim = primitive;
1123                 teackp->TLI_error = t_error;
1124                 teackp->UNIX_error = sys_error;
1125                 qreply(q, mp);
1126         }
1127 }
1128 
1129 /*
1130  * icmp_icmp_input is called as conn_recvicmp to process ICMP messages.
1131  * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
1132  * Assumes that IP has pulled up everything up to and including the ICMP header.
1133  */
1134 /* ARGSUSED2 */
1135 static void
1136 icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
1137 {
1138         conn_t          *connp = (conn_t *)arg1;
1139         icmp_t          *icmp = connp->conn_icmp;
1140         icmph_t         *icmph;
1141         ipha_t          *ipha;
1142         int             iph_hdr_length;
1143         sin_t           sin;
1144         mblk_t          *mp1;
1145         int             error = 0;
1146 
1147         ipha = (ipha_t *)mp->b_rptr;
1148 
1149         ASSERT(OK_32PTR(mp->b_rptr));
1150 
1151         if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
1152                 ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
1153                 icmp_icmp_error_ipv6(connp, mp, ira);
1154                 return;
1155         }
1156         ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
1157 
1158         /* Skip past the outer IP and ICMP headers */
1159         ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length);
1160         iph_hdr_length = ira->ira_ip_hdr_length;
1161         icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
1162         ipha = (ipha_t *)&icmph[1]; /* Inner IP header */
1163 
1164         iph_hdr_length = IPH_HDR_LENGTH(ipha);
1165 
1166         switch (icmph->icmph_type) {
1167         case ICMP_DEST_UNREACHABLE:
1168                 switch (icmph->icmph_code) {
1169                 case ICMP_FRAGMENTATION_NEEDED: {
1170                         ipha_t          *ipha;
1171                         ip_xmit_attr_t  *ixa;
1172                         /*
1173                          * IP has already adjusted the path MTU.
1174                          * But we need to adjust DF for IPv4.
1175                          */
1176                         if (connp->conn_ipversion != IPV4_VERSION)
1177                                 break;
1178 
1179                         ixa = conn_get_ixa(connp, B_FALSE);
1180                         if (ixa == NULL || ixa->ixa_ire == NULL) {
1181                                 /*
1182                                  * Some other thread holds conn_ixa. We will
1183                                  * redo this on the next ICMP too big.
1184                                  */
1185                                 if (ixa != NULL)
1186                                         ixa_refrele(ixa);
1187                                 break;
1188                         }
1189                         (void) ip_get_pmtu(ixa);
1190 
1191                         mutex_enter(&connp->conn_lock);
1192                         ipha = (ipha_t *)connp->conn_ht_iphc;
1193                         if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
1194                                 ipha->ipha_fragment_offset_and_flags |=
1195                                     IPH_DF_HTONS;
1196                         } else {
1197                                 ipha->ipha_fragment_offset_and_flags &=
1198                                     ~IPH_DF_HTONS;
1199                         }
1200                         mutex_exit(&connp->conn_lock);
1201                         ixa_refrele(ixa);
1202                         break;
1203                 }
1204                 case ICMP_PORT_UNREACHABLE:
1205                 case ICMP_PROTOCOL_UNREACHABLE:
1206                         error = ECONNREFUSED;
1207                         break;
1208                 default:
1209                         /* Transient errors */
1210                         break;
1211                 }
1212                 break;
1213         default:
1214                 /* Transient errors */
1215                 break;
1216         }
1217         if (error == 0) {
1218                 freemsg(mp);
1219                 return;
1220         }
1221 
1222         /*
1223          * Deliver T_UDERROR_IND when the application has asked for it.
1224          * The socket layer enables this automatically when connected.
1225          */
1226         if (!connp->conn_dgram_errind) {
1227                 freemsg(mp);
1228                 return;
1229         }
1230 
1231         sin = sin_null;
1232         sin.sin_family = AF_INET;
1233         sin.sin_addr.s_addr = ipha->ipha_dst;
1234 
1235         if (IPCL_IS_NONSTR(connp)) {
1236                 mutex_enter(&connp->conn_lock);
1237                 if (icmp->icmp_state == TS_DATA_XFER) {
1238                         if (sin.sin_addr.s_addr == connp->conn_faddr_v4) {
1239                                 mutex_exit(&connp->conn_lock);
1240                                 (*connp->conn_upcalls->su_set_error)
1241                                     (connp->conn_upper_handle, error);
1242                                 goto done;
1243                         }
1244                 } else {
1245                         icmp->icmp_delayed_error = error;
1246                         *((sin_t *)&icmp->icmp_delayed_addr) = sin;
1247                 }
1248                 mutex_exit(&connp->conn_lock);
1249         } else {
1250                 mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0,
1251                     error);
1252                 if (mp1 != NULL)
1253                         putnext(connp->conn_rq, mp1);
1254         }
1255 done:
1256         freemsg(mp);
1257 }
1258 
1259 /*
1260  * icmp_icmp_error_ipv6 is called by icmp_icmp_error to process ICMP for IPv6.
1261  * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
1262  * Assumes that IP has pulled up all the extension headers as well as the
1263  * ICMPv6 header.
1264  */
1265 static void
1266 icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira)
1267 {
1268         icmp6_t         *icmp6;
1269         ip6_t           *ip6h, *outer_ip6h;
1270         uint16_t        iph_hdr_length;
1271         uint8_t         *nexthdrp;
1272         sin6_t          sin6;
1273         mblk_t          *mp1;
1274         int             error = 0;
1275         icmp_t          *icmp = connp->conn_icmp;
1276 
1277         outer_ip6h = (ip6_t *)mp->b_rptr;
1278 #ifdef DEBUG
1279         if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6)
1280                 iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h);
1281         else
1282                 iph_hdr_length = IPV6_HDR_LEN;
1283         ASSERT(iph_hdr_length == ira->ira_ip_hdr_length);
1284 #endif
1285         /* Skip past the outer IP and ICMP headers */
1286         iph_hdr_length = ira->ira_ip_hdr_length;
1287         icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
1288 
1289         ip6h = (ip6_t *)&icmp6[1];  /* Inner IP header */
1290         if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) {
1291                 freemsg(mp);
1292                 return;
1293         }
1294 
1295         switch (icmp6->icmp6_type) {
1296         case ICMP6_DST_UNREACH:
1297                 switch (icmp6->icmp6_code) {
1298                 case ICMP6_DST_UNREACH_NOPORT:
1299                         error = ECONNREFUSED;
1300                         break;
1301                 case ICMP6_DST_UNREACH_ADMIN:
1302                 case ICMP6_DST_UNREACH_NOROUTE:
1303                 case ICMP6_DST_UNREACH_BEYONDSCOPE:
1304                 case ICMP6_DST_UNREACH_ADDR:
1305                         /* Transient errors */
1306                         break;
1307                 default:
1308                         break;
1309                 }
1310                 break;
1311         case ICMP6_PACKET_TOO_BIG: {
1312                 struct T_unitdata_ind   *tudi;
1313                 struct T_opthdr         *toh;
1314                 size_t                  udi_size;
1315                 mblk_t                  *newmp;
1316                 t_scalar_t              opt_length = sizeof (struct T_opthdr) +
1317                     sizeof (struct ip6_mtuinfo);
1318                 sin6_t                  *sin6;
1319                 struct ip6_mtuinfo      *mtuinfo;
1320 
1321                 /*
1322                  * If the application has requested to receive path mtu
1323                  * information, send up an empty message containing an
1324                  * IPV6_PATHMTU ancillary data item.
1325                  */
1326                 if (!connp->conn_ipv6_recvpathmtu)
1327                         break;
1328 
1329                 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) +
1330                     opt_length;
1331                 if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) {
1332                         BUMP_MIB(&icmp->icmp_is->is_rawip_mib, rawipInErrors);
1333                         break;
1334                 }
1335 
1336                 /*
1337                  * newmp->b_cont is left to NULL on purpose.  This is an
1338                  * empty message containing only ancillary data.
1339                  */
1340                 newmp->b_datap->db_type = M_PROTO;
1341                 tudi = (struct T_unitdata_ind *)newmp->b_rptr;
1342                 newmp->b_wptr = (uchar_t *)tudi + udi_size;
1343                 tudi->PRIM_type = T_UNITDATA_IND;
1344                 tudi->SRC_length = sizeof (sin6_t);
1345                 tudi->SRC_offset = sizeof (struct T_unitdata_ind);
1346                 tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t);
1347                 tudi->OPT_length = opt_length;
1348 
1349                 sin6 = (sin6_t *)&tudi[1];
1350                 bzero(sin6, sizeof (sin6_t));
1351                 sin6->sin6_family = AF_INET6;
1352                 sin6->sin6_addr = connp->conn_faddr_v6;
1353 
1354                 toh = (struct T_opthdr *)&sin6[1];
1355                 toh->level = IPPROTO_IPV6;
1356                 toh->name = IPV6_PATHMTU;
1357                 toh->len = opt_length;
1358                 toh->status = 0;
1359 
1360                 mtuinfo = (struct ip6_mtuinfo *)&toh[1];
1361                 bzero(mtuinfo, sizeof (struct ip6_mtuinfo));
1362                 mtuinfo->ip6m_addr.sin6_family = AF_INET6;
1363                 mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst;
1364                 mtuinfo->ip6m_mtu = icmp6->icmp6_mtu;
1365                 /*
1366                  * We've consumed everything we need from the original
1367                  * message.  Free it, then send our empty message.
1368                  */
1369                 freemsg(mp);
1370                 icmp_ulp_recv(connp, newmp, msgdsize(newmp));
1371                 return;
1372         }
1373         case ICMP6_TIME_EXCEEDED:
1374                 /* Transient errors */
1375                 break;
1376         case ICMP6_PARAM_PROB:
1377                 /* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
1378                 if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
1379                     (uchar_t *)ip6h + icmp6->icmp6_pptr ==
1380                     (uchar_t *)nexthdrp) {
1381                         error = ECONNREFUSED;
1382                         break;
1383                 }
1384                 break;
1385         }
1386         if (error == 0) {
1387                 freemsg(mp);
1388                 return;
1389         }
1390 
1391         /*
1392          * Deliver T_UDERROR_IND when the application has asked for it.
1393          * The socket layer enables this automatically when connected.
1394          */
1395         if (!connp->conn_dgram_errind) {
1396                 freemsg(mp);
1397                 return;
1398         }
1399 
1400         sin6 = sin6_null;
1401         sin6.sin6_family = AF_INET6;
1402         sin6.sin6_addr = ip6h->ip6_dst;
1403         sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
1404         if (IPCL_IS_NONSTR(connp)) {
1405                 mutex_enter(&connp->conn_lock);
1406                 if (icmp->icmp_state == TS_DATA_XFER) {
1407                         if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr,
1408                             &connp->conn_faddr_v6)) {
1409                                 mutex_exit(&connp->conn_lock);
1410                                 (*connp->conn_upcalls->su_set_error)
1411                                     (connp->conn_upper_handle, error);
1412                                 goto done;
1413                         }
1414                 } else {
1415                         icmp->icmp_delayed_error = error;
1416                         *((sin6_t *)&icmp->icmp_delayed_addr) = sin6;
1417                 }
1418                 mutex_exit(&connp->conn_lock);
1419         } else {
1420                 mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
1421                     NULL, 0, error);
1422                 if (mp1 != NULL)
1423                         putnext(connp->conn_rq, mp1);
1424         }
1425 done:
1426         freemsg(mp);
1427 }
1428 
1429 /*
1430  * This routine responds to T_ADDR_REQ messages.  It is called by icmp_wput.
1431  * The local address is filled in if endpoint is bound. The remote address
1432  * is filled in if remote address has been precified ("connected endpoint")
1433  * (The concept of connected CLTS sockets is alien to published TPI
1434  *  but we support it anyway).
1435  */
1436 static void
1437 icmp_addr_req(queue_t *q, mblk_t *mp)
1438 {
1439         struct sockaddr *sa;
1440         mblk_t  *ackmp;
1441         struct T_addr_ack *taa;
1442         icmp_t  *icmp = Q_TO_ICMP(q);
1443         conn_t  *connp = icmp->icmp_connp;
1444         uint_t  addrlen;
1445 
1446         /* Make it large enough for worst case */
1447         ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
1448             2 * sizeof (sin6_t), 1);
1449         if (ackmp == NULL) {
1450                 icmp_err_ack(q, mp, TSYSERR, ENOMEM);
1451                 return;
1452         }
1453         taa = (struct T_addr_ack *)ackmp->b_rptr;
1454 
1455         bzero(taa, sizeof (struct T_addr_ack));
1456         ackmp->b_wptr = (uchar_t *)&taa[1];
1457 
1458         taa->PRIM_type = T_ADDR_ACK;
1459         ackmp->b_datap->db_type = M_PCPROTO;
1460 
1461         if (connp->conn_family == AF_INET)
1462                 addrlen = sizeof (sin_t);
1463         else
1464                 addrlen = sizeof (sin6_t);
1465 
1466         mutex_enter(&connp->conn_lock);
1467         /*
1468          * Note: Following code assumes 32 bit alignment of basic
1469          * data structures like sin_t and struct T_addr_ack.
1470          */
1471         if (icmp->icmp_state != TS_UNBND) {
1472                 /*
1473                  * Fill in local address first
1474                  */
1475                 taa->LOCADDR_offset = sizeof (*taa);
1476                 taa->LOCADDR_length = addrlen;
1477                 sa = (struct sockaddr *)&taa[1];
1478                 (void) conn_getsockname(connp, sa, &addrlen);
1479                 ackmp->b_wptr += addrlen;
1480         }
1481         if (icmp->icmp_state == TS_DATA_XFER) {
1482                 /*
1483                  * connected, fill remote address too
1484                  */
1485                 taa->REMADDR_length = addrlen;
1486                 /* assumed 32-bit alignment */
1487                 taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length;
1488                 sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset);
1489                 (void) conn_getpeername(connp, sa, &addrlen);
1490                 ackmp->b_wptr += addrlen;
1491         }
1492         mutex_exit(&connp->conn_lock);
1493         ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
1494         qreply(q, ackmp);
1495 }
1496 
1497 static void
1498 icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp)
1499 {
1500         conn_t          *connp = icmp->icmp_connp;
1501 
1502         *tap = icmp_g_t_info_ack;
1503 
1504         if (connp->conn_family == AF_INET6)
1505                 tap->ADDR_size = sizeof (sin6_t);
1506         else
1507                 tap->ADDR_size = sizeof (sin_t);
1508         tap->CURRENT_state = icmp->icmp_state;
1509         tap->OPT_size = icmp_max_optsize;
1510 }
1511 
1512 static void
1513 icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap,
1514     t_uscalar_t cap_bits1)
1515 {
1516         tcap->CAP_bits1 = 0;
1517 
1518         if (cap_bits1 & TC1_INFO) {
1519                 icmp_copy_info(&tcap->INFO_ack, icmp);
1520                 tcap->CAP_bits1 |= TC1_INFO;
1521         }
1522 }
1523 
1524 /*
1525  * This routine responds to T_CAPABILITY_REQ messages.  It is called by
1526  * icmp_wput.  Much of the T_CAPABILITY_ACK information is copied from
1527  * icmp_g_t_info_ack.  The current state of the stream is copied from
1528  * icmp_state.
1529  */
1530 static void
1531 icmp_capability_req(queue_t *q, mblk_t *mp)
1532 {
1533         icmp_t                  *icmp = Q_TO_ICMP(q);
1534         t_uscalar_t             cap_bits1;
1535         struct T_capability_ack *tcap;
1536 
1537         cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
1538 
1539         mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
1540             mp->b_datap->db_type, T_CAPABILITY_ACK);
1541         if (!mp)
1542                 return;
1543 
1544         tcap = (struct T_capability_ack *)mp->b_rptr;
1545 
1546         icmp_do_capability_ack(icmp, tcap, cap_bits1);
1547 
1548         qreply(q, mp);
1549 }
1550 
1551 /*
1552  * This routine responds to T_INFO_REQ messages.  It is called by icmp_wput.
1553  * Most of the T_INFO_ACK information is copied from icmp_g_t_info_ack.
1554  * The current state of the stream is copied from icmp_state.
1555  */
1556 static void
1557 icmp_info_req(queue_t *q, mblk_t *mp)
1558 {
1559         icmp_t  *icmp = Q_TO_ICMP(q);
1560 
1561         /* Create a T_INFO_ACK message. */
1562         mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
1563             T_INFO_ACK);
1564         if (!mp)
1565                 return;
1566         icmp_copy_info((struct T_info_ack *)mp->b_rptr, icmp);
1567         qreply(q, mp);
1568 }
1569 
1570 static int
1571 icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
1572     int family)
1573 {
1574         conn_t *connp;
1575         dev_t   conn_dev;
1576         int     error;
1577 
1578         /* If the stream is already open, return immediately. */
1579         if (q->q_ptr != NULL)
1580                 return (0);
1581 
1582         if (sflag == MODOPEN)
1583                 return (EINVAL);
1584 
1585         /*
1586          * Since ICMP is not used so heavily, allocating from the small
1587          * arena should be sufficient.
1588          */
1589         if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
1590                 return (EBUSY);
1591         }
1592 
1593         if (flag & SO_FALLBACK) {
1594                 /*
1595                  * Non streams socket needs a stream to fallback to
1596                  */
1597                 RD(q)->q_ptr = (void *)conn_dev;
1598                 WR(q)->q_qinfo = &icmp_fallback_sock_winit;
1599                 WR(q)->q_ptr = (void *)ip_minor_arena_sa;
1600                 qprocson(q);
1601                 return (0);
1602         }
1603 
1604         connp = rawip_do_open(family, credp, &error, KM_SLEEP);
1605         if (connp == NULL) {
1606                 ASSERT(error != 0);
1607                 inet_minor_free(ip_minor_arena_sa, conn_dev);
1608                 return (error);
1609         }
1610 
1611         *devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
1612         connp->conn_dev = conn_dev;
1613         connp->conn_minor_arena = ip_minor_arena_sa;
1614 
1615         /*
1616          * Initialize the icmp_t structure for this stream.
1617          */
1618         q->q_ptr = connp;
1619         WR(q)->q_ptr = connp;
1620         connp->conn_rq = q;
1621         connp->conn_wq = WR(q);
1622 
1623         WR(q)->q_hiwat = connp->conn_sndbuf;
1624         WR(q)->q_lowat = connp->conn_sndlowat;
1625 
1626         qprocson(q);
1627 
1628         /* Set the Stream head write offset. */
1629         (void) proto_set_tx_wroff(q, connp, connp->conn_wroff);
1630         (void) proto_set_rx_hiwat(connp->conn_rq, connp, connp->conn_rcvbuf);
1631 
1632         mutex_enter(&connp->conn_lock);
1633         connp->conn_state_flags &= ~CONN_INCIPIENT;
1634         mutex_exit(&connp->conn_lock);
1635 
1636         icmp_bind_proto(connp->conn_icmp);
1637 
1638         return (0);
1639 }
1640 
1641 /* For /dev/icmp aka AF_INET open */
1642 static int
1643 icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1644 {
1645         return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET));
1646 }
1647 
1648 /* For /dev/icmp6 aka AF_INET6 open */
1649 static int
1650 icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1651 {
1652         return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6));
1653 }
1654 
1655 /*
1656  * This is the open routine for icmp.  It allocates a icmp_t structure for
1657  * the stream and, on the first open of the module, creates an ND table.
1658  */
1659 static conn_t *
1660 rawip_do_open(int family, cred_t *credp, int *err, int flags)
1661 {
1662         icmp_t  *icmp;
1663         conn_t *connp;
1664         zoneid_t zoneid;
1665         netstack_t *ns;
1666         icmp_stack_t *is;
1667         int len;
1668         boolean_t isv6 = B_FALSE;
1669 
1670         *err = secpolicy_net_icmpaccess(credp);
1671         if (*err != 0)
1672                 return (NULL);
1673 
1674         if (family == AF_INET6)
1675                 isv6 = B_TRUE;
1676 
1677         ns = netstack_find_by_cred(credp);
1678         ASSERT(ns != NULL);
1679         is = ns->netstack_icmp;
1680         ASSERT(is != NULL);
1681 
1682         /*
1683          * For exclusive stacks we set the zoneid to zero
1684          * to make ICMP operate as if in the global zone.
1685          */
1686         if (ns->netstack_stackid != GLOBAL_NETSTACKID)
1687                 zoneid = GLOBAL_ZONEID;
1688         else
1689                 zoneid = crgetzoneid(credp);
1690 
1691         ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP);
1692 
1693         connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns);
1694         icmp = connp->conn_icmp;
1695 
1696         /*
1697          * ipcl_conn_create did a netstack_hold. Undo the hold that was
1698          * done by netstack_find_by_cred()
1699          */
1700         netstack_rele(ns);
1701 
1702         /*
1703          * Since this conn_t/icmp_t is not yet visible to anybody else we don't
1704          * need to lock anything.
1705          */
1706         ASSERT(connp->conn_proto == IPPROTO_ICMP);
1707         ASSERT(connp->conn_icmp == icmp);
1708         ASSERT(icmp->icmp_connp == connp);
1709 
1710         /* Set the initial state of the stream and the privilege status. */
1711         icmp->icmp_state = TS_UNBND;
1712         connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
1713         if (isv6) {
1714                 connp->conn_family = AF_INET6;
1715                 connp->conn_ipversion = IPV6_VERSION;
1716                 connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4;
1717                 connp->conn_proto = IPPROTO_ICMPV6;
1718                 /* May be changed by a SO_PROTOTYPE socket option. */
1719                 connp->conn_proto = IPPROTO_ICMPV6;
1720                 connp->conn_ixa->ixa_protocol = connp->conn_proto;
1721                 connp->conn_ixa->ixa_raw_cksum_offset = 2;
1722                 connp->conn_default_ttl = is->is_ipv6_hoplimit;
1723                 len = sizeof (ip6_t);
1724         } else {
1725                 connp->conn_family = AF_INET;
1726                 connp->conn_ipversion = IPV4_VERSION;
1727                 connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4;
1728                 /* May be changed by a SO_PROTOTYPE socket option. */
1729                 connp->conn_proto = IPPROTO_ICMP;
1730                 connp->conn_ixa->ixa_protocol = connp->conn_proto;
1731                 connp->conn_default_ttl = is->is_ipv4_ttl;
1732                 len = sizeof (ipha_t);
1733         }
1734         connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl;
1735 
1736         connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1737 
1738         /*
1739          * For the socket of protocol IPPROTO_RAW or when IP_HDRINCL is set,
1740          * the checksum is provided in the pre-built packet. We clear
1741          * IXAF_SET_ULP_CKSUM to tell IP that the application has sent a
1742          * complete IP header and not to compute the transport checksum.
1743          */
1744         connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM;
1745         /* conn_allzones can not be set this early, hence no IPCL_ZONEID */
1746         connp->conn_ixa->ixa_zoneid = zoneid;
1747 
1748         connp->conn_zoneid = zoneid;
1749 
1750         /*
1751          * If the caller has the process-wide flag set, then default to MAC
1752          * exempt mode.  This allows read-down to unlabeled hosts.
1753          */
1754         if (getpflags(NET_MAC_AWARE, credp) != 0)
1755                 connp->conn_mac_mode = CONN_MAC_AWARE;
1756 
1757         connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID);
1758 
1759         icmp->icmp_is = is;
1760 
1761         connp->conn_rcvbuf = is->is_recv_hiwat;
1762         connp->conn_sndbuf = is->is_xmit_hiwat;
1763         connp->conn_sndlowat = is->is_xmit_lowat;
1764         connp->conn_rcvlowat = icmp_mod_info.mi_lowat;
1765 
1766         connp->conn_wroff = len + is->is_wroff_extra;
1767         connp->conn_so_type = SOCK_RAW;
1768 
1769         connp->conn_recv = icmp_input;
1770         connp->conn_recvicmp = icmp_icmp_input;
1771         crhold(credp);
1772         connp->conn_cred = credp;
1773         connp->conn_cpid = curproc->p_pid;
1774         connp->conn_open_time = ddi_get_lbolt64();
1775         /* Cache things in ixa without an extra refhold */
1776         ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
1777         connp->conn_ixa->ixa_cred = connp->conn_cred;
1778         connp->conn_ixa->ixa_cpid = connp->conn_cpid;
1779         if (is_system_labeled())
1780                 connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
1781 
1782         connp->conn_flow_cntrld = B_FALSE;
1783 
1784         if (is->is_pmtu_discovery)
1785                 connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
1786 
1787         return (connp);
1788 }
1789 
1790 /*
1791  * Which ICMP options OK to set through T_UNITDATA_REQ...
1792  */
1793 /* ARGSUSED */
1794 static boolean_t
1795 icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name)
1796 {
1797         return (B_TRUE);
1798 }
1799 
1800 /*
1801  * This routine gets default values of certain options whose default
1802  * values are maintained by protcol specific code
1803  */
1804 int
1805 icmp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
1806 {
1807         icmp_t *icmp = Q_TO_ICMP(q);
1808         icmp_stack_t *is = icmp->icmp_is;
1809         int *i1 = (int *)ptr;
1810 
1811         switch (level) {
1812         case IPPROTO_IP:
1813                 switch (name) {
1814                 case IP_MULTICAST_TTL:
1815                         *ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL;
1816                         return (sizeof (uchar_t));
1817                 case IP_MULTICAST_LOOP:
1818                         *ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP;
1819                         return (sizeof (uchar_t));
1820                 }
1821                 break;
1822         case IPPROTO_IPV6:
1823                 switch (name) {
1824                 case IPV6_MULTICAST_HOPS:
1825                         *i1 = IP_DEFAULT_MULTICAST_TTL;
1826                         return (sizeof (int));
1827                 case IPV6_MULTICAST_LOOP:
1828                         *i1 = IP_DEFAULT_MULTICAST_LOOP;
1829                         return (sizeof (int));
1830                 case IPV6_UNICAST_HOPS:
1831                         *i1 = is->is_ipv6_hoplimit;
1832                         return (sizeof (int));
1833                 }
1834                 break;
1835         case IPPROTO_ICMPV6:
1836                 switch (name) {
1837                 case ICMP6_FILTER:
1838                         /* Make it look like "pass all" */
1839                         ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
1840                         return (sizeof (icmp6_filter_t));
1841                 }
1842                 break;
1843         }
1844         return (-1);
1845 }
1846 
1847 /*
1848  * This routine retrieves the current status of socket options.
1849  * It returns the size of the option retrieved, or -1.
1850  */
1851 int
1852 icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
1853 {
1854         icmp_t          *icmp = connp->conn_icmp;
1855         int             *i1 = (int *)ptr;
1856         conn_opt_arg_t  coas;
1857         int             retval;
1858 
1859         coas.coa_connp = connp;
1860         coas.coa_ixa = connp->conn_ixa;
1861         coas.coa_ipp = &connp->conn_xmit_ipp;
1862         coas.coa_ancillary = B_FALSE;
1863         coas.coa_changed = 0;
1864 
1865         /*
1866          * We assume that the optcom framework has checked for the set
1867          * of levels and names that are supported, hence we don't worry
1868          * about rejecting based on that.
1869          * First check for ICMP specific handling, then pass to common routine.
1870          */
1871         switch (level) {
1872         case IPPROTO_IP:
1873                 /*
1874                  * Only allow IPv4 option processing on IPv4 sockets.
1875                  */
1876                 if (connp->conn_family != AF_INET)
1877                         return (-1);
1878 
1879                 switch (name) {
1880                 case IP_OPTIONS:
1881                 case T_IP_OPTIONS:
1882                         /* Options are passed up with each packet */
1883                         return (0);
1884                 case IP_HDRINCL:
1885                         mutex_enter(&connp->conn_lock);
1886                         *i1 = (int)icmp->icmp_hdrincl;
1887                         mutex_exit(&connp->conn_lock);
1888                         return (sizeof (int));
1889                 }
1890                 break;
1891 
1892         case IPPROTO_IPV6:
1893                 /*
1894                  * Only allow IPv6 option processing on native IPv6 sockets.
1895                  */
1896                 if (connp->conn_family != AF_INET6)
1897                         return (-1);
1898 
1899                 switch (name) {
1900                 case IPV6_CHECKSUM:
1901                         /*
1902                          * Return offset or -1 if no checksum offset.
1903                          * Does not apply to IPPROTO_ICMPV6
1904                          */
1905                         if (connp->conn_proto == IPPROTO_ICMPV6)
1906                                 return (-1);
1907 
1908                         mutex_enter(&connp->conn_lock);
1909                         if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM)
1910                                 *i1 = connp->conn_ixa->ixa_raw_cksum_offset;
1911                         else
1912                                 *i1 = -1;
1913                         mutex_exit(&connp->conn_lock);
1914                         return (sizeof (int));
1915                 }
1916                 break;
1917 
1918         case IPPROTO_ICMPV6:
1919                 /*
1920                  * Only allow IPv6 option processing on native IPv6 sockets.
1921                  */
1922                 if (connp->conn_family != AF_INET6)
1923                         return (-1);
1924 
1925                 if (connp->conn_proto != IPPROTO_ICMPV6)
1926                         return (-1);
1927 
1928                 switch (name) {
1929                 case ICMP6_FILTER:
1930                         mutex_enter(&connp->conn_lock);
1931                         if (icmp->icmp_filter == NULL) {
1932                                 /* Make it look like "pass all" */
1933                                 ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
1934                         } else {
1935                                 (void) bcopy(icmp->icmp_filter, ptr,
1936                                     sizeof (icmp6_filter_t));
1937                         }
1938                         mutex_exit(&connp->conn_lock);
1939                         return (sizeof (icmp6_filter_t));
1940                 }
1941         }
1942         mutex_enter(&connp->conn_lock);
1943         retval = conn_opt_get(&coas, level, name, ptr);
1944         mutex_exit(&connp->conn_lock);
1945         return (retval);
1946 }
1947 
1948 /*
1949  * This routine retrieves the current status of socket options.
1950  * It returns the size of the option retrieved, or -1.
1951  */
1952 int
1953 icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
1954 {
1955         conn_t          *connp = Q_TO_CONN(q);
1956         int             err;
1957 
1958         err = icmp_opt_get(connp, level, name, ptr);
1959         return (err);
1960 }
1961 
1962 /*
1963  * This routine sets socket options.
1964  */
1965 int
1966 icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name,
1967     uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly)
1968 {
1969         conn_t          *connp = coa->coa_connp;
1970         ip_xmit_attr_t  *ixa = coa->coa_ixa;
1971         icmp_t          *icmp = connp->conn_icmp;
1972         icmp_stack_t    *is = icmp->icmp_is;
1973         int             *i1 = (int *)invalp;
1974         boolean_t       onoff = (*i1 == 0) ? 0 : 1;
1975         int             error;
1976 
1977         ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock));
1978 
1979         /*
1980          * For fixed length options, no sanity check
1981          * of passed in length is done. It is assumed *_optcom_req()
1982          * routines do the right thing.
1983          */
1984 
1985         switch (level) {
1986         case SOL_SOCKET:
1987                 switch (name) {
1988                 case SO_PROTOTYPE:
1989                         if ((*i1 & 0xFF) != IPPROTO_ICMP &&
1990                             (*i1 & 0xFF) != IPPROTO_ICMPV6 &&
1991                             secpolicy_net_rawaccess(cr) != 0) {
1992                                 return (EACCES);
1993                         }
1994                         if (checkonly)
1995                                 break;
1996 
1997                         mutex_enter(&connp->conn_lock);
1998                         connp->conn_proto = *i1 & 0xFF;
1999                         ixa->ixa_protocol = connp->conn_proto;
2000                         if ((connp->conn_proto == IPPROTO_RAW ||
2001                             connp->conn_proto == IPPROTO_IGMP) &&
2002                             connp->conn_family == AF_INET) {
2003                                 icmp->icmp_hdrincl = 1;
2004                                 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2005                         } else if (connp->conn_proto == IPPROTO_UDP ||
2006                             connp->conn_proto == IPPROTO_TCP ||
2007                             connp->conn_proto == IPPROTO_SCTP) {
2008                                 /* Used by test applications like psh */
2009                                 icmp->icmp_hdrincl = 0;
2010                                 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2011                         } else {
2012                                 icmp->icmp_hdrincl = 0;
2013                                 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
2014                         }
2015 
2016                         if (connp->conn_family == AF_INET6 &&
2017                             connp->conn_proto == IPPROTO_ICMPV6) {
2018                                 /* Set offset for icmp6_cksum */
2019                                 ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
2020                                 ixa->ixa_raw_cksum_offset = 2;
2021                         }
2022                         if (icmp->icmp_filter != NULL &&
2023                             connp->conn_proto != IPPROTO_ICMPV6) {
2024                                 kmem_free(icmp->icmp_filter,
2025                                     sizeof (icmp6_filter_t));
2026                                 icmp->icmp_filter = NULL;
2027                         }
2028                         mutex_exit(&connp->conn_lock);
2029 
2030                         coa->coa_changed |= COA_HEADER_CHANGED;
2031                         /*
2032                          * For SCTP, we don't use icmp_bind_proto() for
2033                          * raw socket binding.
2034                          */
2035                         if (connp->conn_proto == IPPROTO_SCTP)
2036                                 return (0);
2037 
2038                         coa->coa_changed |= COA_ICMP_BIND_NEEDED;
2039                         return (0);
2040 
2041                 case SO_SNDBUF:
2042                         if (*i1 > is->is_max_buf) {
2043                                 return (ENOBUFS);
2044                         }
2045                         break;
2046                 case SO_RCVBUF:
2047                         if (*i1 > is->is_max_buf) {
2048                                 return (ENOBUFS);
2049                         }
2050                         break;
2051                 }
2052                 break;
2053 
2054         case IPPROTO_IP:
2055                 /*
2056                  * Only allow IPv4 option processing on IPv4 sockets.
2057                  */
2058                 if (connp->conn_family != AF_INET)
2059                         return (EINVAL);
2060 
2061                 switch (name) {
2062                 case IP_HDRINCL:
2063                         if (!checkonly) {
2064                                 mutex_enter(&connp->conn_lock);
2065                                 icmp->icmp_hdrincl = onoff;
2066                                 if (onoff)
2067                                         ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2068                                 else
2069                                         ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
2070                                 mutex_exit(&connp->conn_lock);
2071                         }
2072                         break;
2073                 }
2074                 break;
2075 
2076         case IPPROTO_IPV6:
2077                 if (connp->conn_family != AF_INET6)
2078                         return (EINVAL);
2079 
2080                 switch (name) {
2081                 case IPV6_CHECKSUM:
2082                         /*
2083                          * Integer offset into the user data of where the
2084                          * checksum is located.
2085                          * Offset of -1 disables option.
2086                          * Does not apply to IPPROTO_ICMPV6.
2087                          */
2088                         if (connp->conn_proto == IPPROTO_ICMPV6 ||
2089                             coa->coa_ancillary) {
2090                                 return (EINVAL);
2091                         }
2092                         if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) {
2093                                 /* Negative or not 16 bit aligned offset */
2094                                 return (EINVAL);
2095                         }
2096                         if (checkonly)
2097                                 break;
2098 
2099                         mutex_enter(&connp->conn_lock);
2100                         if (*i1 == -1) {
2101                                 ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
2102                                 ixa->ixa_raw_cksum_offset = 0;
2103                                 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2104                         } else {
2105                                 ixa->ixa_flags |= IXAF_SET_RAW_CKSUM;
2106                                 ixa->ixa_raw_cksum_offset = *i1;
2107                                 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
2108                         }
2109                         mutex_exit(&connp->conn_lock);
2110                         break;
2111                 }
2112                 break;
2113 
2114         case IPPROTO_ICMPV6:
2115                 /*
2116                  * Only allow IPv6 option processing on IPv6 sockets.
2117                  */
2118                 if (connp->conn_family != AF_INET6)
2119                         return (EINVAL);
2120                 if (connp->conn_proto != IPPROTO_ICMPV6)
2121                         return (EINVAL);
2122 
2123                 switch (name) {
2124                 case ICMP6_FILTER:
2125                         if (checkonly)
2126                                 break;
2127 
2128                         if ((inlen != 0) &&
2129                             (inlen != sizeof (icmp6_filter_t)))
2130                                 return (EINVAL);
2131 
2132                         mutex_enter(&connp->conn_lock);
2133                         if (inlen == 0) {
2134                                 if (icmp->icmp_filter != NULL) {
2135                                         kmem_free(icmp->icmp_filter,
2136                                             sizeof (icmp6_filter_t));
2137                                         icmp->icmp_filter = NULL;
2138                                 }
2139                         } else {
2140                                 if (icmp->icmp_filter == NULL) {
2141                                         icmp->icmp_filter = kmem_alloc(
2142                                             sizeof (icmp6_filter_t),
2143                                             KM_NOSLEEP);
2144                                         if (icmp->icmp_filter == NULL) {
2145                                                 mutex_exit(&connp->conn_lock);
2146                                                 return (ENOBUFS);
2147                                         }
2148                                 }
2149                                 (void) bcopy(invalp, icmp->icmp_filter, inlen);
2150                         }
2151                         mutex_exit(&connp->conn_lock);
2152                         break;
2153                 }
2154                 break;
2155         }
2156         error = conn_opt_set(coa, level, name, inlen, invalp,
2157             checkonly, cr);
2158         return (error);
2159 }
2160 
2161 /*
2162  * This routine sets socket options.
2163  */
2164 int
2165 icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
2166     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
2167     void *thisdg_attrs, cred_t *cr)
2168 {
2169         icmp_t          *icmp = connp->conn_icmp;
2170         int             err;
2171         conn_opt_arg_t  coas, *coa;
2172         boolean_t       checkonly;
2173         icmp_stack_t    *is = icmp->icmp_is;
2174 
2175         switch (optset_context) {
2176         case SETFN_OPTCOM_CHECKONLY:
2177                 checkonly = B_TRUE;
2178                 /*
2179                  * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
2180                  * inlen != 0 implies value supplied and
2181                  *      we have to "pretend" to set it.
2182                  * inlen == 0 implies that there is no
2183                  *      value part in T_CHECK request and just validation
2184                  * done elsewhere should be enough, we just return here.
2185                  */
2186                 if (inlen == 0) {
2187                         *outlenp = 0;
2188                         return (0);
2189                 }
2190                 break;
2191         case SETFN_OPTCOM_NEGOTIATE:
2192                 checkonly = B_FALSE;
2193                 break;
2194         case SETFN_UD_NEGOTIATE:
2195         case SETFN_CONN_NEGOTIATE:
2196                 checkonly = B_FALSE;
2197                 /*
2198                  * Negotiating local and "association-related" options
2199                  * through T_UNITDATA_REQ.
2200                  *
2201                  * Following routine can filter out ones we do not
2202                  * want to be "set" this way.
2203                  */
2204                 if (!icmp_opt_allow_udr_set(level, name)) {
2205                         *outlenp = 0;
2206                         return (EINVAL);
2207                 }
2208                 break;
2209         default:
2210                 /*
2211                  * We should never get here
2212                  */
2213                 *outlenp = 0;
2214                 return (EINVAL);
2215         }
2216 
2217         ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
2218             (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
2219 
2220         if (thisdg_attrs != NULL) {
2221                 /* Options from T_UNITDATA_REQ */
2222                 coa = (conn_opt_arg_t *)thisdg_attrs;
2223                 ASSERT(coa->coa_connp == connp);
2224                 ASSERT(coa->coa_ixa != NULL);
2225                 ASSERT(coa->coa_ipp != NULL);
2226                 ASSERT(coa->coa_ancillary);
2227         } else {
2228                 coa = &coas;
2229                 coas.coa_connp = connp;
2230                 /* Get a reference on conn_ixa to prevent concurrent mods */
2231                 coas.coa_ixa = conn_get_ixa(connp, B_TRUE);
2232                 if (coas.coa_ixa == NULL) {
2233                         *outlenp = 0;
2234                         return (ENOMEM);
2235                 }
2236                 coas.coa_ipp = &connp->conn_xmit_ipp;
2237                 coas.coa_ancillary = B_FALSE;
2238                 coas.coa_changed = 0;
2239         }
2240 
2241         err = icmp_do_opt_set(coa, level, name, inlen, invalp,
2242             cr, checkonly);
2243         if (err != 0) {
2244 errout:
2245                 if (!coa->coa_ancillary)
2246                         ixa_refrele(coa->coa_ixa);
2247                 *outlenp = 0;
2248                 return (err);
2249         }
2250 
2251         /*
2252          * Common case of OK return with outval same as inval.
2253          */
2254         if (invalp != outvalp) {
2255                 /* don't trust bcopy for identical src/dst */
2256                 (void) bcopy(invalp, outvalp, inlen);
2257         }
2258         *outlenp = inlen;
2259 
2260         /*
2261          * If this was not ancillary data, then we rebuild the headers,
2262          * update the IRE/NCE, and IPsec as needed.
2263          * Since the label depends on the destination we go through
2264          * ip_set_destination first.
2265          */
2266         if (coa->coa_ancillary) {
2267                 return (0);
2268         }
2269 
2270         if (coa->coa_changed & COA_ROUTE_CHANGED) {
2271                 in6_addr_t saddr, faddr, nexthop;
2272                 in_port_t fport;
2273 
2274                 /*
2275                  * We clear lastdst to make sure we pick up the change
2276                  * next time sending.
2277                  * If we are connected we re-cache the information.
2278                  * We ignore errors to preserve BSD behavior.
2279                  * Note that we don't redo IPsec policy lookup here
2280                  * since the final destination (or source) didn't change.
2281                  */
2282                 mutex_enter(&connp->conn_lock);
2283                 connp->conn_v6lastdst = ipv6_all_zeros;
2284 
2285                 ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa,
2286                     &connp->conn_faddr_v6, &nexthop);
2287                 saddr = connp->conn_saddr_v6;
2288                 faddr = connp->conn_faddr_v6;
2289                 fport = connp->conn_fport;
2290                 mutex_exit(&connp->conn_lock);
2291 
2292                 if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) &&
2293                     !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) {
2294                         (void) ip_attr_connect(connp, coa->coa_ixa,
2295                             &saddr, &faddr, &nexthop, fport, NULL, NULL,
2296                             IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);
2297                 }
2298         }
2299 
2300         ixa_refrele(coa->coa_ixa);
2301 
2302         if (coa->coa_changed & COA_HEADER_CHANGED) {
2303                 /*
2304                  * Rebuild the header template if we are connected.
2305                  * Otherwise clear conn_v6lastdst so we rebuild the header
2306                  * in the data path.
2307                  */
2308                 mutex_enter(&connp->conn_lock);
2309                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
2310                     !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
2311                         err = icmp_build_hdr_template(connp,
2312                             &connp->conn_saddr_v6, &connp->conn_faddr_v6,
2313                             connp->conn_flowinfo);
2314                         if (err != 0) {
2315                                 mutex_exit(&connp->conn_lock);
2316                                 return (err);
2317                         }
2318                 } else {
2319                         connp->conn_v6lastdst = ipv6_all_zeros;
2320                 }
2321                 mutex_exit(&connp->conn_lock);
2322         }
2323         if (coa->coa_changed & COA_RCVBUF_CHANGED) {
2324                 (void) proto_set_rx_hiwat(connp->conn_rq, connp,
2325                     connp->conn_rcvbuf);
2326         }
2327         if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
2328                 connp->conn_wq->q_hiwat = connp->conn_sndbuf;
2329         }
2330         if (coa->coa_changed & COA_WROFF_CHANGED) {
2331                 /* Increase wroff if needed */
2332                 uint_t wroff;
2333 
2334                 mutex_enter(&connp->conn_lock);
2335                 wroff = connp->conn_ht_iphc_allocated + is->is_wroff_extra;
2336                 if (wroff > connp->conn_wroff) {
2337                         connp->conn_wroff = wroff;
2338                         mutex_exit(&connp->conn_lock);
2339                         (void) proto_set_tx_wroff(connp->conn_rq, connp, wroff);
2340                 } else {
2341                         mutex_exit(&connp->conn_lock);
2342                 }
2343         }
2344         if (coa->coa_changed & COA_ICMP_BIND_NEEDED) {
2345                 icmp_bind_proto(icmp);
2346         }
2347         return (err);
2348 }
2349 
2350 /* This routine sets socket options. */
2351 int
2352 icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
2353     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
2354     void *thisdg_attrs, cred_t *cr)
2355 {
2356         conn_t  *connp = Q_TO_CONN(q);
2357         int error;
2358 
2359         error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp,
2360             outlenp, outvalp, thisdg_attrs, cr);
2361         return (error);
2362 }
2363 
2364 /*
2365  * Setup IP headers.
2366  *
2367  * Note that IP_HDRINCL has ipha_protocol that is different than conn_proto,
2368  * but icmp_output_hdrincl restores ipha_protocol once we return.
2369  */
2370 mblk_t *
2371 icmp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
2372     const in6_addr_t *v6src, const in6_addr_t *v6dst, uint32_t flowinfo,
2373     mblk_t *data_mp, int *errorp)
2374 {
2375         mblk_t          *mp;
2376         icmp_stack_t    *is = connp->conn_netstack->netstack_icmp;
2377         uint_t          data_len;
2378         uint32_t        cksum;
2379 
2380         data_len = msgdsize(data_mp);
2381         mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, connp->conn_proto,
2382             flowinfo, 0, data_mp, data_len, is->is_wroff_extra, &cksum, errorp);
2383         if (mp == NULL) {
2384                 ASSERT(*errorp != 0);
2385                 return (NULL);
2386         }
2387 
2388         ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length;
2389 
2390         /*
2391          * If there was a routing option/header then conn_prepend_hdr
2392          * has massaged it and placed the pseudo-header checksum difference
2393          * in the cksum argument.
2394          *
2395          * Prepare for ICMPv6 checksum done in IP.
2396          *
2397          * We make it easy for IP to include our pseudo header
2398          * by putting our length (and any routing header adjustment)
2399          * in the ICMPv6 checksum field.
2400          * The IP source, destination, and length have already been set by
2401          * conn_prepend_hdr.
2402          */
2403         cksum += data_len;
2404         cksum = (cksum >> 16) + (cksum & 0xFFFF);
2405         ASSERT(cksum < 0x10000);
2406 
2407         if (ixa->ixa_flags & IXAF_IS_IPV4) {
2408                 ipha_t  *ipha = (ipha_t *)mp->b_rptr;
2409 
2410                 ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen);
2411         } else {
2412                 ip6_t   *ip6h = (ip6_t *)mp->b_rptr;
2413                 uint_t  cksum_offset = 0;
2414 
2415                 ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen);
2416 
2417                 if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) {
2418                         if (connp->conn_proto == IPPROTO_ICMPV6) {
2419                                 cksum_offset = ixa->ixa_ip_hdr_length +
2420                                     offsetof(icmp6_t, icmp6_cksum);
2421                         } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
2422                                 cksum_offset = ixa->ixa_ip_hdr_length +
2423                                     ixa->ixa_raw_cksum_offset;
2424                         }
2425                 }
2426                 if (cksum_offset != 0) {
2427                         uint16_t *ptr;
2428 
2429                         /* Make sure the checksum fits in the first mblk */
2430                         if (cksum_offset + sizeof (short) > MBLKL(mp)) {
2431                                 mblk_t *mp1;
2432 
2433                                 mp1 = msgpullup(mp,
2434                                     cksum_offset + sizeof (short));
2435                                 freemsg(mp);
2436                                 if (mp1 == NULL) {
2437                                         *errorp = ENOMEM;
2438                                         return (NULL);
2439                                 }
2440                                 mp = mp1;
2441                                 ip6h = (ip6_t *)mp->b_rptr;
2442                         }
2443                         ptr = (uint16_t *)(mp->b_rptr + cksum_offset);
2444                         *ptr = htons(cksum);
2445                 }
2446         }
2447 
2448         /* Note that we don't try to update wroff due to ancillary data */
2449         return (mp);
2450 }
2451 
2452 static int
2453 icmp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src,
2454     const in6_addr_t *v6dst, uint32_t flowinfo)
2455 {
2456         int             error;
2457 
2458         ASSERT(MUTEX_HELD(&connp->conn_lock));
2459         /*
2460          * We clear lastdst to make sure we don't use the lastdst path
2461          * next time sending since we might not have set v6dst yet.
2462          */
2463         connp->conn_v6lastdst = ipv6_all_zeros;
2464 
2465         error = conn_build_hdr_template(connp, 0, 0, v6src, v6dst, flowinfo);
2466         if (error != 0)
2467                 return (error);
2468 
2469         /*
2470          * Any routing header/option has been massaged. The checksum difference
2471          * is stored in conn_sum.
2472          */
2473         return (0);
2474 }
2475 
2476 static mblk_t *
2477 icmp_queue_fallback(icmp_t *icmp, mblk_t *mp)
2478 {
2479         ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock));
2480         if (IPCL_IS_NONSTR(icmp->icmp_connp)) {
2481                 /*
2482                  * fallback has started but messages have not been moved yet
2483                  */
2484                 if (icmp->icmp_fallback_queue_head == NULL) {
2485                         ASSERT(icmp->icmp_fallback_queue_tail == NULL);
2486                         icmp->icmp_fallback_queue_head = mp;
2487                         icmp->icmp_fallback_queue_tail = mp;
2488                 } else {
2489                         ASSERT(icmp->icmp_fallback_queue_tail != NULL);
2490                         icmp->icmp_fallback_queue_tail->b_next = mp;
2491                         icmp->icmp_fallback_queue_tail = mp;
2492                 }
2493                 return (NULL);
2494         } else {
2495                 /*
2496                  * Fallback completed, let the caller putnext() the mblk.
2497                  */
2498                 return (mp);
2499         }
2500 }
2501 
2502 /*
2503  * Deliver data to ULP. In case we have a socket, and it's falling back to
2504  * TPI, then we'll queue the mp for later processing.
2505  */
2506 static void
2507 icmp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len)
2508 {
2509         if (IPCL_IS_NONSTR(connp)) {
2510                 icmp_t *icmp = connp->conn_icmp;
2511                 int error;
2512 
2513                 ASSERT(len == msgdsize(mp));
2514                 if ((*connp->conn_upcalls->su_recv)
2515                     (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) {
2516                         mutex_enter(&icmp->icmp_recv_lock);
2517                         if (error == ENOSPC) {
2518                                 /*
2519                                  * let's confirm while holding the lock
2520                                  */
2521                                 if ((*connp->conn_upcalls->su_recv)
2522                                     (connp->conn_upper_handle, NULL, 0, 0,
2523                                     &error, NULL) < 0) {
2524                                         ASSERT(error == ENOSPC);
2525                                         if (error == ENOSPC) {
2526                                                 connp->conn_flow_cntrld =
2527                                                     B_TRUE;
2528                                         }
2529                                 }
2530                                 mutex_exit(&icmp->icmp_recv_lock);
2531                         } else {
2532                                 ASSERT(error == EOPNOTSUPP);
2533                                 mp = icmp_queue_fallback(icmp, mp);
2534                                 mutex_exit(&icmp->icmp_recv_lock);
2535                                 if (mp != NULL)
2536                                         putnext(connp->conn_rq, mp);
2537                         }
2538                 }
2539                 ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock));
2540         } else {
2541                 putnext(connp->conn_rq, mp);
2542         }
2543 }
2544 
2545 /*
2546  * This is the inbound data path.
2547  * IP has already pulled up the IP headers and verified alignment
2548  * etc.
2549  */
2550 /* ARGSUSED2 */
2551 static void
2552 icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
2553 {
2554         conn_t                  *connp = (conn_t *)arg1;
2555         struct T_unitdata_ind   *tudi;
2556         uchar_t                 *rptr;          /* Pointer to IP header */
2557         int                     ip_hdr_length;
2558         int                     udi_size;       /* Size of T_unitdata_ind */
2559         int                     pkt_len;
2560         icmp_t                  *icmp;
2561         ip_pkt_t                ipps;
2562         ip6_t                   *ip6h;
2563         mblk_t                  *mp1;
2564         crb_t                   recv_ancillary;
2565         icmp_stack_t            *is;
2566         sin_t                   *sin;
2567         sin6_t                  *sin6;
2568         ipha_t                  *ipha;
2569 
2570         ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2571 
2572         icmp = connp->conn_icmp;
2573         is = icmp->icmp_is;
2574         rptr = mp->b_rptr;
2575 
2576         ASSERT(DB_TYPE(mp) == M_DATA);
2577         ASSERT(OK_32PTR(rptr));
2578         ASSERT(ira->ira_pktlen == msgdsize(mp));
2579         pkt_len = ira->ira_pktlen;
2580 
2581         /*
2582          * Get a snapshot of these and allow other threads to change
2583          * them after that. We need the same recv_ancillary when determining
2584          * the size as when adding the ancillary data items.
2585          */
2586         mutex_enter(&connp->conn_lock);
2587         recv_ancillary = connp->conn_recv_ancillary;
2588         mutex_exit(&connp->conn_lock);
2589 
2590         ip_hdr_length = ira->ira_ip_hdr_length;
2591         ASSERT(MBLKL(mp) >= ip_hdr_length);  /* IP did a pullup */
2592 
2593         /* Initialize regardless of IP version */
2594         ipps.ipp_fields = 0;
2595 
2596         if (ira->ira_flags & IRAF_IS_IPV4) {
2597                 ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION);
2598                 ASSERT(MBLKL(mp) >= sizeof (ipha_t));
2599                 ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr));
2600 
2601                 ipha = (ipha_t *)mp->b_rptr;
2602                 if (recv_ancillary.crb_all != 0)
2603                         (void) ip_find_hdr_v4(ipha, &ipps, B_FALSE);
2604 
2605                 /*
2606                  * BSD for some reason adjusts ipha_length to exclude the
2607                  * IP header length. We do the same.
2608                  */
2609                 if (is->is_bsd_compat) {
2610                         ushort_t len;
2611 
2612                         len = ntohs(ipha->ipha_length);
2613                         if (mp->b_datap->db_ref > 1) {
2614                                 /*
2615                                  * Allocate a new IP header so that we can
2616                                  * modify ipha_length.
2617                                  */
2618                                 mblk_t  *mp1;
2619 
2620                                 mp1 = allocb(ip_hdr_length, BPRI_MED);
2621                                 if (mp1 == NULL) {
2622                                         freemsg(mp);
2623                                         BUMP_MIB(&is->is_rawip_mib,
2624                                             rawipInErrors);
2625                                         return;
2626                                 }
2627                                 bcopy(rptr, mp1->b_rptr, ip_hdr_length);
2628                                 mp->b_rptr = rptr + ip_hdr_length;
2629                                 rptr = mp1->b_rptr;
2630                                 ipha = (ipha_t *)rptr;
2631                                 mp1->b_cont = mp;
2632                                 mp1->b_wptr = rptr + ip_hdr_length;
2633                                 mp = mp1;
2634                         }
2635                         len -= ip_hdr_length;
2636                         ipha->ipha_length = htons(len);
2637                 }
2638 
2639                 /*
2640                  * For RAW sockets we not pass ICMP/IPv4 packets to AF_INET6
2641                  * sockets. This is ensured by icmp_bind and the IP fanout code.
2642                  */
2643                 ASSERT(connp->conn_family == AF_INET);
2644 
2645                 /*
2646                  * This is the inbound data path.  Packets are passed upstream
2647                  * as T_UNITDATA_IND messages with full IPv4 headers still
2648                  * attached.
2649                  */
2650 
2651                 /*
2652                  * Normally only send up the source address.
2653                  * If any ancillary data items are wanted we add those.
2654                  */
2655                 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
2656                 if (recv_ancillary.crb_all != 0) {
2657                         udi_size += conn_recvancillary_size(connp,
2658                             recv_ancillary, ira, mp, &ipps);
2659                 }
2660 
2661                 /* Allocate a message block for the T_UNITDATA_IND structure. */
2662                 mp1 = allocb(udi_size, BPRI_MED);
2663                 if (mp1 == NULL) {
2664                         freemsg(mp);
2665                         BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
2666                         return;
2667                 }
2668                 mp1->b_cont = mp;
2669                 tudi = (struct T_unitdata_ind *)mp1->b_rptr;
2670                 mp1->b_datap->db_type = M_PROTO;
2671                 mp1->b_wptr = (uchar_t *)tudi + udi_size;
2672                 tudi->PRIM_type = T_UNITDATA_IND;
2673                 tudi->SRC_length = sizeof (sin_t);
2674                 tudi->SRC_offset = sizeof (struct T_unitdata_ind);
2675                 sin = (sin_t *)&tudi[1];
2676                 *sin = sin_null;
2677                 sin->sin_family = AF_INET;
2678                 sin->sin_addr.s_addr = ipha->ipha_src;
2679                 *(uint32_t *)&sin->sin_zero[0] = 0;
2680                 *(uint32_t *)&sin->sin_zero[4] = 0;
2681                 tudi->OPT_offset =  sizeof (struct T_unitdata_ind) +
2682                     sizeof (sin_t);
2683                 udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t));
2684                 tudi->OPT_length = udi_size;
2685 
2686                 /*
2687                  * Add options if IP_RECVIF etc is set
2688                  */
2689                 if (udi_size != 0) {
2690                         conn_recvancillary_add(connp, recv_ancillary, ira,
2691                             &ipps, (uchar_t *)&sin[1], udi_size);
2692                 }
2693                 goto deliver;
2694         }
2695 
2696         ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION);
2697         /*
2698          * IPv6 packets can only be received by applications
2699          * that are prepared to receive IPv6 addresses.
2700          * The IP fanout must ensure this.
2701          */
2702         ASSERT(connp->conn_family == AF_INET6);
2703 
2704         /*
2705          * Handle IPv6 packets. We don't pass up the IP headers with the
2706          * payload for IPv6.
2707          */
2708 
2709         ip6h = (ip6_t *)rptr;
2710         if (recv_ancillary.crb_all != 0) {
2711                 /*
2712                  * Call on ip_find_hdr_v6 which gets individual lenghts of
2713                  * extension headers (and pointers to them).
2714                  */
2715                 uint8_t         nexthdr;
2716 
2717                 /* We don't care about the length or nextheader. */
2718                 (void) ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps, &nexthdr);
2719 
2720                 /*
2721                  * We do not pass up hop-by-hop options or any other
2722                  * extension header as part of the packet. Applications
2723                  * that want to see them have to specify IPV6_RECV* socket
2724                  * options. And conn_recvancillary_size/add explicitly
2725                  * drops the TX option from IPV6_HOPOPTS as it does for UDP.
2726                  *
2727                  * If we had multilevel ICMP sockets, then we'd want to
2728                  * modify conn_recvancillary_size/add to
2729                  * allow the user to see the label.
2730                  */
2731         }
2732 
2733         /*
2734          * Check a filter for ICMPv6 types if needed.
2735          * Verify raw checksums if needed.
2736          */
2737         mutex_enter(&connp->conn_lock);
2738         if (icmp->icmp_filter != NULL) {
2739                 int type;
2740 
2741                 /* Assumes that IP has done the pullupmsg */
2742                 type = mp->b_rptr[ip_hdr_length];
2743 
2744                 ASSERT(mp->b_rptr + ip_hdr_length <= mp->b_wptr);
2745                 if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) {
2746                         mutex_exit(&connp->conn_lock);
2747                         freemsg(mp);
2748                         return;
2749                 }
2750         }
2751         if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
2752                 /* Checksum */
2753                 uint16_t        *up;
2754                 uint32_t        sum;
2755                 int             remlen;
2756 
2757                 up = (uint16_t *)&ip6h->ip6_src;
2758 
2759                 remlen = msgdsize(mp) - ip_hdr_length;
2760                 sum = htons(connp->conn_proto + remlen)
2761                     + up[0] + up[1] + up[2] + up[3]
2762                     + up[4] + up[5] + up[6] + up[7]
2763                     + up[8] + up[9] + up[10] + up[11]
2764                     + up[12] + up[13] + up[14] + up[15];
2765                 sum = (sum & 0xffff) + (sum >> 16);
2766                 sum = IP_CSUM(mp, ip_hdr_length, sum);
2767                 if (sum != 0) {
2768                         /* IPv6 RAW checksum failed */
2769                         ip0dbg(("icmp_rput: RAW checksum failed %x\n", sum));
2770                         mutex_exit(&connp->conn_lock);
2771                         freemsg(mp);
2772                         BUMP_MIB(&is->is_rawip_mib, rawipInCksumErrs);
2773                         return;
2774                 }
2775         }
2776         mutex_exit(&connp->conn_lock);
2777 
2778         udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
2779 
2780         if (recv_ancillary.crb_all != 0) {
2781                 udi_size += conn_recvancillary_size(connp,
2782                     recv_ancillary, ira, mp, &ipps);
2783         }
2784 
2785         mp1 = allocb(udi_size, BPRI_MED);
2786         if (mp1 == NULL) {
2787                 freemsg(mp);
2788                 BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
2789                 return;
2790         }
2791         mp1->b_cont = mp;
2792         mp1->b_datap->db_type = M_PROTO;
2793         tudi = (struct T_unitdata_ind *)mp1->b_rptr;
2794         mp1->b_wptr = (uchar_t *)tudi + udi_size;
2795         tudi->PRIM_type = T_UNITDATA_IND;
2796         tudi->SRC_length = sizeof (sin6_t);
2797         tudi->SRC_offset = sizeof (struct T_unitdata_ind);
2798         tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
2799         udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t));
2800         tudi->OPT_length = udi_size;
2801         sin6 = (sin6_t *)&tudi[1];
2802         *sin6 = sin6_null;
2803         sin6->sin6_port = 0;
2804         sin6->sin6_family = AF_INET6;
2805 
2806         sin6->sin6_addr = ip6h->ip6_src;
2807         /* No sin6_flowinfo per API */
2808         sin6->sin6_flowinfo = 0;
2809         /* For link-scope pass up scope id */
2810         if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
2811                 sin6->sin6_scope_id = ira->ira_ruifindex;
2812         else
2813                 sin6->sin6_scope_id = 0;
2814         sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst,
2815             IPCL_ZONEID(connp), is->is_netstack);
2816 
2817         if (udi_size != 0) {
2818                 conn_recvancillary_add(connp, recv_ancillary, ira,
2819                     &ipps, (uchar_t *)&sin6[1], udi_size);
2820         }
2821 
2822         /* Skip all the IPv6 headers per API */
2823         mp->b_rptr += ip_hdr_length;
2824         pkt_len -= ip_hdr_length;
2825 
2826 deliver:
2827         BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
2828         icmp_ulp_recv(connp, mp1, pkt_len);
2829 }
2830 
2831 /*
2832  * return SNMP stuff in buffer in mpdata. We don't hold any lock and report
2833  * information that can be changing beneath us.
2834  */
2835 mblk_t *
2836 icmp_snmp_get(queue_t *q, mblk_t *mpctl)
2837 {
2838         mblk_t                  *mpdata;
2839         struct opthdr           *optp;
2840         conn_t                  *connp = Q_TO_CONN(q);
2841         icmp_stack_t            *is = connp->conn_netstack->netstack_icmp;
2842         mblk_t                  *mp2ctl;
2843 
2844         /*
2845          * make a copy of the original message
2846          */
2847         mp2ctl = copymsg(mpctl);
2848 
2849         if (mpctl == NULL ||
2850             (mpdata = mpctl->b_cont) == NULL) {
2851                 freemsg(mpctl);
2852                 freemsg(mp2ctl);
2853                 return (0);
2854         }
2855 
2856         /* fixed length structure for IPv4 and IPv6 counters */
2857         optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
2858         optp->level = EXPER_RAWIP;
2859         optp->name = 0;
2860         (void) snmp_append_data(mpdata, (char *)&is->is_rawip_mib,
2861             sizeof (is->is_rawip_mib));
2862         optp->len = msgdsize(mpdata);
2863         qreply(q, mpctl);
2864 
2865         return (mp2ctl);
2866 }
2867 
2868 /*
2869  * Return 0 if invalid set request, 1 otherwise, including non-rawip requests.
2870  * TODO:  If this ever actually tries to set anything, it needs to be
2871  * to do the appropriate locking.
2872  */
2873 /* ARGSUSED */
2874 int
2875 icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
2876     uchar_t *ptr, int len)
2877 {
2878         switch (level) {
2879         case EXPER_RAWIP:
2880                 return (0);
2881         default:
2882                 return (1);
2883         }
2884 }
2885 
2886 /*
2887  * This routine creates a T_UDERROR_IND message and passes it upstream.
2888  * The address and options are copied from the T_UNITDATA_REQ message
2889  * passed in mp.  This message is freed.
2890  */
2891 static void
2892 icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err)
2893 {
2894         struct T_unitdata_req *tudr;
2895         mblk_t  *mp1;
2896         uchar_t *destaddr;
2897         t_scalar_t destlen;
2898         uchar_t *optaddr;
2899         t_scalar_t optlen;
2900 
2901         if ((mp->b_wptr < mp->b_rptr) ||
2902             (MBLKL(mp)) < sizeof (struct T_unitdata_req)) {
2903                 goto done;
2904         }
2905         tudr = (struct T_unitdata_req *)mp->b_rptr;
2906         destaddr = mp->b_rptr + tudr->DEST_offset;
2907         if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr ||
2908             destaddr + tudr->DEST_length < mp->b_rptr ||
2909             destaddr + tudr->DEST_length > mp->b_wptr) {
2910                 goto done;
2911         }
2912         optaddr = mp->b_rptr + tudr->OPT_offset;
2913         if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr ||
2914             optaddr + tudr->OPT_length < mp->b_rptr ||
2915             optaddr + tudr->OPT_length > mp->b_wptr) {
2916                 goto done;
2917         }
2918         destlen = tudr->DEST_length;
2919         optlen = tudr->OPT_length;
2920 
2921         mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen,
2922             (char *)optaddr, optlen, err);
2923         if (mp1 != NULL)
2924                 qreply(q, mp1);
2925 
2926 done:
2927         freemsg(mp);
2928 }
2929 
2930 static int
2931 rawip_do_unbind(conn_t *connp)
2932 {
2933         icmp_t  *icmp = connp->conn_icmp;
2934 
2935         mutex_enter(&connp->conn_lock);
2936         /* If a bind has not been done, we can't unbind. */
2937         if (icmp->icmp_state == TS_UNBND) {
2938                 mutex_exit(&connp->conn_lock);
2939                 return (-TOUTSTATE);
2940         }
2941         connp->conn_saddr_v6 = ipv6_all_zeros;
2942         connp->conn_bound_addr_v6 = ipv6_all_zeros;
2943         connp->conn_laddr_v6 = ipv6_all_zeros;
2944         connp->conn_mcbc_bind = B_FALSE;
2945         connp->conn_lport = 0;
2946         connp->conn_fport = 0;
2947         /* In case we were also connected */
2948         connp->conn_faddr_v6 = ipv6_all_zeros;
2949         connp->conn_v6lastdst = ipv6_all_zeros;
2950 
2951         icmp->icmp_state = TS_UNBND;
2952 
2953         (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
2954             &connp->conn_faddr_v6, connp->conn_flowinfo);
2955         mutex_exit(&connp->conn_lock);
2956 
2957         ip_unbind(connp);
2958         return (0);
2959 }
2960 
2961 /*
2962  * This routine is called by icmp_wput to handle T_UNBIND_REQ messages.
2963  * After some error checking, the message is passed downstream to ip.
2964  */
2965 static void
2966 icmp_tpi_unbind(queue_t *q, mblk_t *mp)
2967 {
2968         conn_t  *connp = Q_TO_CONN(q);
2969         int     error;
2970 
2971         ASSERT(mp->b_cont == NULL);
2972         error = rawip_do_unbind(connp);
2973         if (error) {
2974                 if (error < 0) {
2975                         icmp_err_ack(q, mp, -error, 0);
2976                 } else {
2977                         icmp_err_ack(q, mp, 0, error);
2978                 }
2979                 return;
2980         }
2981 
2982         /*
2983          * Convert mp into a T_OK_ACK
2984          */
2985 
2986         mp = mi_tpi_ok_ack_alloc(mp);
2987 
2988         /*
2989          * should not happen in practice... T_OK_ACK is smaller than the
2990          * original message.
2991          */
2992         ASSERT(mp != NULL);
2993         ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK);
2994         qreply(q, mp);
2995 }
2996 
2997 /*
2998  * Process IPv4 packets that already include an IP header.
2999  * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and
3000  * IPPROTO_IGMP).
3001  * In this case we ignore the address and any options in the T_UNITDATA_REQ.
3002  *
3003  * The packet is assumed to have a base (20 byte) IP header followed
3004  * by the upper-layer protocol. We include any IP_OPTIONS including a
3005  * CIPSO label but otherwise preserve the base IP header.
3006  */
3007 static int
3008 icmp_output_hdrincl(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid)
3009 {
3010         icmp_t          *icmp = connp->conn_icmp;
3011         icmp_stack_t    *is = icmp->icmp_is;
3012         ipha_t          iphas;
3013         ipha_t          *ipha;
3014         int             ip_hdr_length;
3015         int             tp_hdr_len;
3016         ip_xmit_attr_t  *ixa;
3017         ip_pkt_t        *ipp;
3018         in6_addr_t      v6src;
3019         in6_addr_t      v6dst;
3020         in6_addr_t      v6nexthop;
3021         int             error;
3022         boolean_t       do_ipsec;
3023 
3024         /*
3025          * We need an exclusive copy of conn_ixa since the included IP
3026          * header could have any destination.
3027          * That copy has no pointers hence we
3028          * need to set them up once we've parsed the ancillary data.
3029          */
3030         ixa = conn_get_ixa_exclusive(connp);
3031         if (ixa == NULL) {
3032                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3033                 freemsg(mp);
3034                 return (ENOMEM);
3035         }
3036         ASSERT(cr != NULL);
3037         /*
3038          * Caller has a reference on cr; from db_credp or because we
3039          * are running in process context.
3040          */
3041         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3042         ixa->ixa_cred = cr;
3043         ixa->ixa_cpid = pid;
3044         if (is_system_labeled()) {
3045                 /* We need to restart with a label based on the cred */
3046                 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
3047         }
3048 
3049         /* In case previous destination was multicast or multirt */
3050         ip_attr_newdst(ixa);
3051 
3052         /* Get a copy of conn_xmit_ipp since the TX label might change it */
3053         ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP);
3054         if (ipp == NULL) {
3055                 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3056                 ixa->ixa_cred = connp->conn_cred; /* Restore */
3057                 ixa->ixa_cpid = connp->conn_cpid;
3058                 ixa_refrele(ixa);
3059                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3060                 freemsg(mp);
3061                 return (ENOMEM);
3062         }
3063         mutex_enter(&connp->conn_lock);
3064         error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP);
3065         mutex_exit(&connp->conn_lock);
3066         if (error != 0) {
3067                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3068                 freemsg(mp);
3069                 goto done;
3070         }
3071 
3072         /* Sanity check length of packet */
3073         ipha = (ipha_t *)mp->b_rptr;
3074 
3075         ip_hdr_length = IP_SIMPLE_HDR_LENGTH;
3076         if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) {
3077                 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
3078                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3079                         freemsg(mp);
3080                         goto done;
3081                 }
3082                 ipha = (ipha_t *)mp->b_rptr;
3083         }
3084         ipha->ipha_version_and_hdr_length =
3085             (IP_VERSION<<4) | (ip_hdr_length>>2);
3086 
3087         /*
3088          * We set IXAF_DONTFRAG if the application set DF which makes
3089          * IP not fragment.
3090          */
3091         ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF);
3092         if (ipha->ipha_fragment_offset_and_flags & htons(IPH_DF))
3093                 ixa->ixa_flags |= (IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
3094         else
3095                 ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
3096 
3097         /* Even for multicast and broadcast we honor the apps ttl */
3098         ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
3099 
3100         /*
3101          * No source verification for non-local addresses
3102          */
3103         if (ipha->ipha_src != INADDR_ANY &&
3104             ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid,
3105             is->is_netstack->netstack_ip, B_FALSE)
3106             != IPVL_UNICAST_UP) {
3107                 ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
3108         }
3109 
3110         if (ipha->ipha_dst == INADDR_ANY)
3111                 ipha->ipha_dst = htonl(INADDR_LOOPBACK);
3112 
3113         IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src);
3114         IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
3115 
3116         /* Defer IPsec if it might need to look at ICMP type/code */
3117         do_ipsec = ipha->ipha_protocol != IPPROTO_ICMP;
3118         ixa->ixa_flags |= IXAF_IS_IPV4;
3119 
3120         ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop);
3121         error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop,
3122             connp->conn_fport, &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
3123             (do_ipsec ? IPDF_IPSEC : 0));
3124         switch (error) {
3125         case 0:
3126                 break;
3127         case EADDRNOTAVAIL:
3128                 /*
3129                  * IXAF_VERIFY_SOURCE tells us to pick a better source.
3130                  * Don't have the application see that errno
3131                  */
3132                 error = ENETUNREACH;
3133                 goto failed;
3134         case ENETDOWN:
3135                 /*
3136                  * Have !ipif_addr_ready address; drop packet silently
3137                  * until we can get applications to not send until we
3138                  * are ready.
3139                  */
3140                 error = 0;
3141                 goto failed;
3142         case EHOSTUNREACH:
3143         case ENETUNREACH:
3144                 if (ixa->ixa_ire != NULL) {
3145                         /*
3146                          * Let conn_ip_output/ire_send_noroute return
3147                          * the error and send any local ICMP error.
3148                          */
3149                         error = 0;
3150                         break;
3151                 }
3152                 /* FALLTHRU */
3153         default:
3154         failed:
3155                 freemsg(mp);
3156                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3157                 goto done;
3158         }
3159         if (ipha->ipha_src == INADDR_ANY)
3160                 IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src);
3161 
3162         /*
3163          * We might be going to a different destination than last time,
3164          * thus check that TX allows the communication and compute any
3165          * needed label.
3166          *
3167          * TSOL Note: We have an exclusive ipp and ixa for this thread so we
3168          * don't have to worry about concurrent threads.
3169          */
3170         if (is_system_labeled()) {
3171                 /*
3172                  * Check whether Trusted Solaris policy allows communication
3173                  * with this host, and pretend that the destination is
3174                  * unreachable if not.
3175                  * Compute any needed label and place it in ipp_label_v4/v6.
3176                  *
3177                  * Later conn_build_hdr_template/conn_prepend_hdr takes
3178                  * ipp_label_v4/v6 to form the packet.
3179                  *
3180                  * Tsol note: We have ipp structure local to this thread so
3181                  * no locking is needed.
3182                  */
3183                 error = conn_update_label(connp, ixa, &v6dst, ipp);
3184                 if (error != 0) {
3185                         freemsg(mp);
3186                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3187                         goto done;
3188                 }
3189         }
3190 
3191         /*
3192          * Save away a copy of the IPv4 header the application passed down
3193          * and then prepend an IPv4 header complete with any IP options
3194          * including label.
3195          * We need a struct copy since icmp_prepend_hdr will reuse the available
3196          * space in the mblk.
3197          */
3198         iphas = *ipha;
3199         mp->b_rptr += IP_SIMPLE_HDR_LENGTH;
3200 
3201         mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, 0, mp, &error);
3202         if (mp == NULL) {
3203                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3204                 ASSERT(error != 0);
3205                 goto done;
3206         }
3207         if (ixa->ixa_pktlen > IP_MAXPACKET) {
3208                 error = EMSGSIZE;
3209                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3210                 freemsg(mp);
3211                 goto done;
3212         }
3213         /* Restore key parts of the header that the application passed down */
3214         ipha = (ipha_t *)mp->b_rptr;
3215         ipha->ipha_type_of_service = iphas.ipha_type_of_service;
3216         ipha->ipha_ident = iphas.ipha_ident;
3217         ipha->ipha_fragment_offset_and_flags =
3218             iphas.ipha_fragment_offset_and_flags;
3219         ipha->ipha_ttl = iphas.ipha_ttl;
3220         ipha->ipha_protocol = iphas.ipha_protocol;
3221         ipha->ipha_src = iphas.ipha_src;
3222         ipha->ipha_dst = iphas.ipha_dst;
3223 
3224         ixa->ixa_protocol = ipha->ipha_protocol;
3225 
3226         /*
3227          * Make sure that the IP header plus any transport header that is
3228          * checksumed by ip_output is in the first mblk. (ip_output assumes
3229          * that at least the checksum field is in the first mblk.)
3230          */
3231         switch (ipha->ipha_protocol) {
3232         case IPPROTO_UDP:
3233                 tp_hdr_len = 8;
3234                 break;
3235         case IPPROTO_TCP:
3236                 tp_hdr_len = 20;
3237                 break;
3238         default:
3239                 tp_hdr_len = 0;
3240                 break;
3241         }
3242         ip_hdr_length = IPH_HDR_LENGTH(ipha);
3243         if (mp->b_wptr - mp->b_rptr < ip_hdr_length + tp_hdr_len) {
3244                 if (!pullupmsg(mp, ip_hdr_length + tp_hdr_len)) {
3245                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3246                         if (mp->b_cont == NULL)
3247                                 error = EINVAL;
3248                         else
3249                                 error = ENOMEM;
3250                         freemsg(mp);
3251                         goto done;
3252                 }
3253         }
3254 
3255         if (!do_ipsec) {
3256                 /* Policy might differ for different ICMP type/code */
3257                 if (ixa->ixa_ipsec_policy != NULL) {
3258                         IPPOL_REFRELE(ixa->ixa_ipsec_policy);
3259                         ixa->ixa_ipsec_policy = NULL;
3260                         ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
3261                 }
3262                 mp = ip_output_attach_policy(mp, ipha, NULL, connp, ixa);
3263                 if (mp == NULL) {
3264                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3265                         error = EHOSTUNREACH;   /* IPsec policy failure */
3266                         goto done;
3267                 }
3268         }
3269 
3270         /* We're done.  Pass the packet to ip. */
3271         BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3272 
3273         error = conn_ip_output(mp, ixa);
3274         /* No rawipOutErrors if an error since IP increases its error counter */
3275         switch (error) {
3276         case 0:
3277                 break;
3278         case EWOULDBLOCK:
3279                 (void) ixa_check_drain_insert(connp, ixa);
3280                 error = 0;
3281                 break;
3282         case EADDRNOTAVAIL:
3283                 /*
3284                  * IXAF_VERIFY_SOURCE tells us to pick a better source.
3285                  * Don't have the application see that errno
3286                  */
3287                 error = ENETUNREACH;
3288                 break;
3289         }
3290 done:
3291         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3292         ixa->ixa_cred = connp->conn_cred; /* Restore */
3293         ixa->ixa_cpid = connp->conn_cpid;
3294         ixa_refrele(ixa);
3295         ip_pkt_free(ipp);
3296         kmem_free(ipp, sizeof (*ipp));
3297         return (error);
3298 }
3299 
3300 static mblk_t *
3301 icmp_output_attach_policy(mblk_t *mp, conn_t *connp, ip_xmit_attr_t *ixa)
3302 {
3303         ipha_t  *ipha = NULL;
3304         ip6_t   *ip6h = NULL;
3305 
3306         if (ixa->ixa_flags & IXAF_IS_IPV4)
3307                 ipha = (ipha_t *)mp->b_rptr;
3308         else
3309                 ip6h = (ip6_t *)mp->b_rptr;
3310 
3311         if (ixa->ixa_ipsec_policy != NULL) {
3312                 IPPOL_REFRELE(ixa->ixa_ipsec_policy);
3313                 ixa->ixa_ipsec_policy = NULL;
3314                 ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
3315         }
3316         return (ip_output_attach_policy(mp, ipha, ip6h, connp, ixa));
3317 }
3318 
3319 /*
3320  * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6
3321  * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from
3322  * the TPI options, otherwise we take them from msg_control.
3323  * If both sin and sin6 is set it is a connected socket and we use conn_faddr.
3324  * Always consumes mp; never consumes tudr_mp.
3325  */
3326 static int
3327 icmp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp,
3328     mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid)
3329 {
3330         icmp_t          *icmp = connp->conn_icmp;
3331         icmp_stack_t    *is = icmp->icmp_is;
3332         int             error;
3333         ip_xmit_attr_t  *ixa;
3334         ip_pkt_t        *ipp;
3335         in6_addr_t      v6src;
3336         in6_addr_t      v6dst;
3337         in6_addr_t      v6nexthop;
3338         in_port_t       dstport;
3339         uint32_t        flowinfo;
3340         uint_t          srcid;
3341         int             is_absreq_failure = 0;
3342         conn_opt_arg_t  coas, *coa;
3343 
3344         ASSERT(tudr_mp != NULL || msg != NULL);
3345 
3346         /*
3347          * Get ixa before checking state to handle a disconnect race.
3348          *
3349          * We need an exclusive copy of conn_ixa since the ancillary data
3350          * options might modify it. That copy has no pointers hence we
3351          * need to set them up once we've parsed the ancillary data.
3352          */
3353         ixa = conn_get_ixa_exclusive(connp);
3354         if (ixa == NULL) {
3355                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3356                 freemsg(mp);
3357                 return (ENOMEM);
3358         }
3359         ASSERT(cr != NULL);
3360         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3361         ixa->ixa_cred = cr;
3362         ixa->ixa_cpid = pid;
3363         if (is_system_labeled()) {
3364                 /* We need to restart with a label based on the cred */
3365                 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
3366         }
3367 
3368         /* In case previous destination was multicast or multirt */
3369         ip_attr_newdst(ixa);
3370 
3371         /* Get a copy of conn_xmit_ipp since the options might change it */
3372         ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP);
3373         if (ipp == NULL) {
3374                 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3375                 ixa->ixa_cred = connp->conn_cred; /* Restore */
3376                 ixa->ixa_cpid = connp->conn_cpid;
3377                 ixa_refrele(ixa);
3378                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3379                 freemsg(mp);
3380                 return (ENOMEM);
3381         }
3382         mutex_enter(&connp->conn_lock);
3383         error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP);
3384         mutex_exit(&connp->conn_lock);
3385         if (error != 0) {
3386                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3387                 freemsg(mp);
3388                 goto done;
3389         }
3390 
3391         /*
3392          * Parse the options and update ixa and ipp as a result.
3393          */
3394 
3395         coa = &coas;
3396         coa->coa_connp = connp;
3397         coa->coa_ixa = ixa;
3398         coa->coa_ipp = ipp;
3399         coa->coa_ancillary = B_TRUE;
3400         coa->coa_changed = 0;
3401 
3402         if (msg != NULL) {
3403                 error = process_auxiliary_options(connp, msg->msg_control,
3404                     msg->msg_controllen, coa, &icmp_opt_obj, icmp_opt_set, cr);
3405         } else {
3406                 struct T_unitdata_req *tudr;
3407 
3408                 tudr = (struct T_unitdata_req *)tudr_mp->b_rptr;
3409                 ASSERT(tudr->PRIM_type == T_UNITDATA_REQ);
3410                 error = tpi_optcom_buf(connp->conn_wq, tudr_mp,
3411                     &tudr->OPT_length, tudr->OPT_offset, cr, &icmp_opt_obj,
3412                     coa, &is_absreq_failure);
3413         }
3414         if (error != 0) {
3415                 /*
3416                  * Note: No special action needed in this
3417                  * module for "is_absreq_failure"
3418                  */
3419                 freemsg(mp);
3420                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3421                 goto done;
3422         }
3423         ASSERT(is_absreq_failure == 0);
3424 
3425         mutex_enter(&connp->conn_lock);
3426         /*
3427          * If laddr is unspecified then we look at sin6_src_id.
3428          * We will give precedence to a source address set with IPV6_PKTINFO
3429          * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
3430          * want ip_attr_connect to select a source (since it can fail) when
3431          * IPV6_PKTINFO is specified.
3432          * If this doesn't result in a source address then we get a source
3433          * from ip_attr_connect() below.
3434          */
3435         v6src = connp->conn_saddr_v6;
3436         if (sin != NULL) {
3437                 IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
3438                 dstport = sin->sin_port;
3439                 flowinfo = 0;
3440                 ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
3441                 ixa->ixa_flags |= IXAF_IS_IPV4;
3442         } else if (sin6 != NULL) {
3443                 v6dst = sin6->sin6_addr;
3444                 dstport = sin6->sin6_port;
3445                 flowinfo = sin6->sin6_flowinfo;
3446                 srcid = sin6->__sin6_src_id;
3447                 if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
3448                         ixa->ixa_scopeid = sin6->sin6_scope_id;
3449                         ixa->ixa_flags |= IXAF_SCOPEID_SET;
3450                 } else {
3451                         ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
3452                 }
3453                 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
3454                         ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
3455                             connp->conn_netstack);
3456                 }
3457                 if (IN6_IS_ADDR_V4MAPPED(&v6dst))
3458                         ixa->ixa_flags |= IXAF_IS_IPV4;
3459                 else
3460                         ixa->ixa_flags &= ~IXAF_IS_IPV4;
3461         } else {
3462                 /* Connected case */
3463                 v6dst = connp->conn_faddr_v6;
3464                 flowinfo = connp->conn_flowinfo;
3465         }
3466         mutex_exit(&connp->conn_lock);
3467         /* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. */
3468         if (ipp->ipp_fields & IPPF_ADDR) {
3469                 if (ixa->ixa_flags & IXAF_IS_IPV4) {
3470                         if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
3471                                 v6src = ipp->ipp_addr;
3472                 } else {
3473                         if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
3474                                 v6src = ipp->ipp_addr;
3475                 }
3476         }
3477         /*
3478          * Allow source not assigned to the system
3479          * only if it is not a local addresses
3480          */
3481         if (!V6_OR_V4_INADDR_ANY(v6src)) {
3482                 ip_laddr_t laddr_type;
3483 
3484                 if (ixa->ixa_flags & IXAF_IS_IPV4) {
3485                         ipaddr_t v4src;
3486 
3487                         IN6_V4MAPPED_TO_IPADDR(&v6src, v4src);
3488                         laddr_type = ip_laddr_verify_v4(v4src, ixa->ixa_zoneid,
3489                             is->is_netstack->netstack_ip, B_FALSE);
3490                 } else {
3491                         laddr_type = ip_laddr_verify_v6(&v6src, ixa->ixa_zoneid,
3492                             is->is_netstack->netstack_ip, B_FALSE, B_FALSE);
3493                 }
3494                 if (laddr_type != IPVL_UNICAST_UP)
3495                         ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
3496         }
3497 
3498         ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop);
3499         error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
3500             &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);
3501 
3502         switch (error) {
3503         case 0:
3504                 break;
3505         case EADDRNOTAVAIL:
3506                 /*
3507                  * IXAF_VERIFY_SOURCE tells us to pick a better source.
3508                  * Don't have the application see that errno
3509                  */
3510                 error = ENETUNREACH;
3511                 goto failed;
3512         case ENETDOWN:
3513                 /*
3514                  * Have !ipif_addr_ready address; drop packet silently
3515                  * until we can get applications to not send until we
3516                  * are ready.
3517                  */
3518                 error = 0;
3519                 goto failed;
3520         case EHOSTUNREACH:
3521         case ENETUNREACH:
3522                 if (ixa->ixa_ire != NULL) {
3523                         /*
3524                          * Let conn_ip_output/ire_send_noroute return
3525                          * the error and send any local ICMP error.
3526                          */
3527                         error = 0;
3528                         break;
3529                 }
3530                 /* FALLTHRU */
3531         default:
3532         failed:
3533                 freemsg(mp);
3534                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3535                 goto done;
3536         }
3537 
3538         /*
3539          * We might be going to a different destination than last time,
3540          * thus check that TX allows the communication and compute any
3541          * needed label.
3542          *
3543          * TSOL Note: We have an exclusive ipp and ixa for this thread so we
3544          * don't have to worry about concurrent threads.
3545          */
3546         if (is_system_labeled()) {
3547                 /*
3548                  * Check whether Trusted Solaris policy allows communication
3549                  * with this host, and pretend that the destination is
3550                  * unreachable if not.
3551                  * Compute any needed label and place it in ipp_label_v4/v6.
3552                  *
3553                  * Later conn_build_hdr_template/conn_prepend_hdr takes
3554                  * ipp_label_v4/v6 to form the packet.
3555                  *
3556                  * Tsol note: We have ipp structure local to this thread so
3557                  * no locking is needed.
3558                  */
3559                 error = conn_update_label(connp, ixa, &v6dst, ipp);
3560                 if (error != 0) {
3561                         freemsg(mp);
3562                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3563                         goto done;
3564                 }
3565         }
3566         mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, flowinfo, mp,
3567             &error);
3568         if (mp == NULL) {
3569                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3570                 ASSERT(error != 0);
3571                 goto done;
3572         }
3573         if (ixa->ixa_pktlen > IP_MAXPACKET) {
3574                 error = EMSGSIZE;
3575                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3576                 freemsg(mp);
3577                 goto done;
3578         }
3579 
3580         /* Policy might differ for different ICMP type/code */
3581         mp = icmp_output_attach_policy(mp, connp, ixa);
3582         if (mp == NULL) {
3583                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3584                 error = EHOSTUNREACH;   /* IPsec policy failure */
3585                 goto done;
3586         }
3587 
3588         /* We're done.  Pass the packet to ip. */
3589         BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3590 
3591         error = conn_ip_output(mp, ixa);
3592         if (!connp->conn_unspec_src)
3593                 ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
3594         /* No rawipOutErrors if an error since IP increases its error counter */
3595         switch (error) {
3596         case 0:
3597                 break;
3598         case EWOULDBLOCK:
3599                 (void) ixa_check_drain_insert(connp, ixa);
3600                 error = 0;
3601                 break;
3602         case EADDRNOTAVAIL:
3603                 /*
3604                  * IXAF_VERIFY_SOURCE tells us to pick a better source.
3605                  * Don't have the application see that errno
3606                  */
3607                 error = ENETUNREACH;
3608                 /* FALLTHRU */
3609         default:
3610                 mutex_enter(&connp->conn_lock);
3611                 /*
3612                  * Clear the source and v6lastdst so we call ip_attr_connect
3613                  * for the next packet and try to pick a better source.
3614                  */
3615                 if (connp->conn_mcbc_bind)
3616                         connp->conn_saddr_v6 = ipv6_all_zeros;
3617                 else
3618                         connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
3619                 connp->conn_v6lastdst = ipv6_all_zeros;
3620                 mutex_exit(&connp->conn_lock);
3621                 break;
3622         }
3623 done:
3624         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3625         ixa->ixa_cred = connp->conn_cred; /* Restore */
3626         ixa->ixa_cpid = connp->conn_cpid;
3627         ixa_refrele(ixa);
3628         ip_pkt_free(ipp);
3629         kmem_free(ipp, sizeof (*ipp));
3630         return (error);
3631 }
3632 
3633 /*
3634  * Handle sending an M_DATA for a connected socket.
3635  * Handles both IPv4 and IPv6.
3636  */
3637 int
3638 icmp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid)
3639 {
3640         icmp_t          *icmp = connp->conn_icmp;
3641         icmp_stack_t    *is = icmp->icmp_is;
3642         int             error;
3643         ip_xmit_attr_t  *ixa;
3644         boolean_t       do_ipsec;
3645 
3646         /*
3647          * If no other thread is using conn_ixa this just gets a reference to
3648          * conn_ixa. Otherwise we get a safe copy of conn_ixa.
3649          */
3650         ixa = conn_get_ixa(connp, B_FALSE);
3651         if (ixa == NULL) {
3652                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3653                 freemsg(mp);
3654                 return (ENOMEM);
3655         }
3656 
3657         ASSERT(cr != NULL);
3658         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3659         ixa->ixa_cred = cr;
3660         ixa->ixa_cpid = pid;
3661 
3662         /* Defer IPsec if it might need to look at ICMP type/code */
3663         switch (ixa->ixa_protocol) {
3664         case IPPROTO_ICMP:
3665         case IPPROTO_ICMPV6:
3666                 do_ipsec = B_FALSE;
3667                 break;
3668         default:
3669                 do_ipsec = B_TRUE;
3670         }
3671 
3672         mutex_enter(&connp->conn_lock);
3673         mp = icmp_prepend_header_template(connp, ixa, mp,
3674             &connp->conn_saddr_v6, connp->conn_flowinfo, &error);
3675 
3676         if (mp == NULL) {
3677                 ASSERT(error != 0);
3678                 mutex_exit(&connp->conn_lock);
3679                 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3680                 ixa->ixa_cred = connp->conn_cred; /* Restore */
3681                 ixa->ixa_cpid = connp->conn_cpid;
3682                 ixa_refrele(ixa);
3683                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3684                 freemsg(mp);
3685                 return (error);
3686         }
3687 
3688         if (!do_ipsec) {
3689                 /* Policy might differ for different ICMP type/code */
3690                 mp = icmp_output_attach_policy(mp, connp, ixa);
3691                 if (mp == NULL) {
3692                         mutex_exit(&connp->conn_lock);
3693                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3694                         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3695                         ixa->ixa_cred = connp->conn_cred; /* Restore */
3696                         ixa->ixa_cpid = connp->conn_cpid;
3697                         ixa_refrele(ixa);
3698                         return (EHOSTUNREACH);  /* IPsec policy failure */
3699                 }
3700         }
3701 
3702         /*
3703          * In case we got a safe copy of conn_ixa, or if opt_set made us a new
3704          * safe copy, then we need to fill in any pointers in it.
3705          */
3706         if (ixa->ixa_ire == NULL) {
3707                 in6_addr_t      faddr, saddr;
3708                 in6_addr_t      nexthop;
3709                 in_port_t       fport;
3710 
3711                 saddr = connp->conn_saddr_v6;
3712                 faddr = connp->conn_faddr_v6;
3713                 fport = connp->conn_fport;
3714                 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop);
3715                 mutex_exit(&connp->conn_lock);
3716 
3717                 error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop,
3718                     fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
3719                     (do_ipsec ? IPDF_IPSEC : 0));
3720                 switch (error) {
3721                 case 0:
3722                         break;
3723                 case EADDRNOTAVAIL:
3724                         /*
3725                          * IXAF_VERIFY_SOURCE tells us to pick a better source.
3726                          * Don't have the application see that errno
3727                          */
3728                         error = ENETUNREACH;
3729                         goto failed;
3730                 case ENETDOWN:
3731                         /*
3732                          * Have !ipif_addr_ready address; drop packet silently
3733                          * until we can get applications to not send until we
3734                          * are ready.
3735                          */
3736                         error = 0;
3737                         goto failed;
3738                 case EHOSTUNREACH:
3739                 case ENETUNREACH:
3740                         if (ixa->ixa_ire != NULL) {
3741                                 /*
3742                                  * Let conn_ip_output/ire_send_noroute return
3743                                  * the error and send any local ICMP error.
3744                                  */
3745                                 error = 0;
3746                                 break;
3747                         }
3748                         /* FALLTHRU */
3749                 default:
3750                 failed:
3751                         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3752                         ixa->ixa_cred = connp->conn_cred; /* Restore */
3753                         ixa->ixa_cpid = connp->conn_cpid;
3754                         ixa_refrele(ixa);
3755                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3756                         freemsg(mp);
3757                         return (error);
3758                 }
3759         } else {
3760                 /* Done with conn_t */
3761                 mutex_exit(&connp->conn_lock);
3762         }
3763 
3764         /* We're done.  Pass the packet to ip. */
3765         BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3766 
3767         error = conn_ip_output(mp, ixa);
3768         /* No rawipOutErrors if an error since IP increases its error counter */
3769         switch (error) {
3770         case 0:
3771                 break;
3772         case EWOULDBLOCK:
3773                 (void) ixa_check_drain_insert(connp, ixa);
3774                 error = 0;
3775                 break;
3776         case EADDRNOTAVAIL:
3777                 /*
3778                  * IXAF_VERIFY_SOURCE tells us to pick a better source.
3779                  * Don't have the application see that errno
3780                  */
3781                 error = ENETUNREACH;
3782                 break;
3783         }
3784         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3785         ixa->ixa_cred = connp->conn_cred; /* Restore */
3786         ixa->ixa_cpid = connp->conn_cpid;
3787         ixa_refrele(ixa);
3788         return (error);
3789 }
3790 
3791 /*
3792  * Handle sending an M_DATA to the last destination.
3793  * Handles both IPv4 and IPv6.
3794  *
3795  * NOTE: The caller must hold conn_lock and we drop it here.
3796  */
3797 int
3798 icmp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid,
3799     ip_xmit_attr_t *ixa)
3800 {
3801         icmp_t          *icmp = connp->conn_icmp;
3802         icmp_stack_t    *is = icmp->icmp_is;
3803         int             error;
3804         boolean_t       do_ipsec;
3805 
3806         ASSERT(MUTEX_HELD(&connp->conn_lock));
3807         ASSERT(ixa != NULL);
3808 
3809         ASSERT(cr != NULL);
3810         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3811         ixa->ixa_cred = cr;
3812         ixa->ixa_cpid = pid;
3813 
3814         /* Defer IPsec if it might need to look at ICMP type/code */
3815         switch (ixa->ixa_protocol) {
3816         case IPPROTO_ICMP:
3817         case IPPROTO_ICMPV6:
3818                 do_ipsec = B_FALSE;
3819                 break;
3820         default:
3821                 do_ipsec = B_TRUE;
3822         }
3823 
3824 
3825         mp = icmp_prepend_header_template(connp, ixa, mp,
3826             &connp->conn_v6lastsrc, connp->conn_lastflowinfo, &error);
3827 
3828         if (mp == NULL) {
3829                 ASSERT(error != 0);
3830                 mutex_exit(&connp->conn_lock);
3831                 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3832                 ixa->ixa_cred = connp->conn_cred; /* Restore */
3833                 ixa->ixa_cpid = connp->conn_cpid;
3834                 ixa_refrele(ixa);
3835                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3836                 freemsg(mp);
3837                 return (error);
3838         }
3839 
3840         if (!do_ipsec) {
3841                 /* Policy might differ for different ICMP type/code */
3842                 mp = icmp_output_attach_policy(mp, connp, ixa);
3843                 if (mp == NULL) {
3844                         mutex_exit(&connp->conn_lock);
3845                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3846                         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3847                         ixa->ixa_cred = connp->conn_cred; /* Restore */
3848                         ixa->ixa_cpid = connp->conn_cpid;
3849                         ixa_refrele(ixa);
3850                         return (EHOSTUNREACH);  /* IPsec policy failure */
3851                 }
3852         }
3853 
3854         /*
3855          * In case we got a safe copy of conn_ixa, or if opt_set made us a new
3856          * safe copy, then we need to fill in any pointers in it.
3857          */
3858         if (ixa->ixa_ire == NULL) {
3859                 in6_addr_t      lastdst, lastsrc;
3860                 in6_addr_t      nexthop;
3861                 in_port_t       lastport;
3862 
3863                 lastsrc = connp->conn_v6lastsrc;
3864                 lastdst = connp->conn_v6lastdst;
3865                 lastport = connp->conn_lastdstport;
3866                 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop);
3867                 mutex_exit(&connp->conn_lock);
3868 
3869                 error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst,
3870                     &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC |
3871                     IPDF_VERIFY_DST | (do_ipsec ? IPDF_IPSEC : 0));
3872                 switch (error) {
3873                 case 0:
3874                         break;
3875                 case EADDRNOTAVAIL:
3876                         /*
3877                          * IXAF_VERIFY_SOURCE tells us to pick a better source.
3878                          * Don't have the application see that errno
3879                          */
3880                         error = ENETUNREACH;
3881                         goto failed;
3882                 case ENETDOWN:
3883                         /*
3884                          * Have !ipif_addr_ready address; drop packet silently
3885                          * until we can get applications to not send until we
3886                          * are ready.
3887                          */
3888                         error = 0;
3889                         goto failed;
3890                 case EHOSTUNREACH:
3891                 case ENETUNREACH:
3892                         if (ixa->ixa_ire != NULL) {
3893                                 /*
3894                                  * Let conn_ip_output/ire_send_noroute return
3895                                  * the error and send any local ICMP error.
3896                                  */
3897                                 error = 0;
3898                                 break;
3899                         }
3900                         /* FALLTHRU */
3901                 default:
3902                 failed:
3903                         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3904                         ixa->ixa_cred = connp->conn_cred; /* Restore */
3905                         ixa->ixa_cpid = connp->conn_cpid;
3906                         ixa_refrele(ixa);
3907                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3908                         freemsg(mp);
3909                         return (error);
3910                 }
3911         } else {
3912                 /* Done with conn_t */
3913                 mutex_exit(&connp->conn_lock);
3914         }
3915 
3916         /* We're done.  Pass the packet to ip. */
3917         BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3918         error = conn_ip_output(mp, ixa);
3919         /* No rawipOutErrors if an error since IP increases its error counter */
3920         switch (error) {
3921         case 0:
3922                 break;
3923         case EWOULDBLOCK:
3924                 (void) ixa_check_drain_insert(connp, ixa);
3925                 error = 0;
3926                 break;
3927         case EADDRNOTAVAIL:
3928                 /*
3929                  * IXAF_VERIFY_SOURCE tells us to pick a better source.
3930                  * Don't have the application see that errno
3931                  */
3932                 error = ENETUNREACH;
3933                 /* FALLTHRU */
3934         default:
3935                 mutex_enter(&connp->conn_lock);
3936                 /*
3937                  * Clear the source and v6lastdst so we call ip_attr_connect
3938                  * for the next packet and try to pick a better source.
3939                  */
3940                 if (connp->conn_mcbc_bind)
3941                         connp->conn_saddr_v6 = ipv6_all_zeros;
3942                 else
3943                         connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
3944                 connp->conn_v6lastdst = ipv6_all_zeros;
3945                 mutex_exit(&connp->conn_lock);
3946                 break;
3947         }
3948         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3949         ixa->ixa_cred = connp->conn_cred; /* Restore */
3950         ixa->ixa_cpid = connp->conn_cpid;
3951         ixa_refrele(ixa);
3952         return (error);
3953 }
3954 
3955 
3956 /*
3957  * Prepend the header template and then fill in the source and
3958  * flowinfo. The caller needs to handle the destination address since
3959  * it's setting is different if rthdr or source route.
3960  *
3961  * Returns NULL is allocation failed or if the packet would exceed IP_MAXPACKET.
3962  * When it returns NULL it sets errorp.
3963  */
3964 static mblk_t *
3965 icmp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
3966     const in6_addr_t *v6src, uint32_t flowinfo, int *errorp)
3967 {
3968         icmp_t          *icmp = connp->conn_icmp;
3969         icmp_stack_t    *is = icmp->icmp_is;
3970         uint_t          pktlen;
3971         uint_t          copylen;
3972         uint8_t         *iph;
3973         uint_t          ip_hdr_length;
3974         uint32_t        cksum;
3975         ip_pkt_t        *ipp;
3976 
3977         ASSERT(MUTEX_HELD(&connp->conn_lock));
3978 
3979         /*
3980          * Copy the header template.
3981          */
3982         copylen = connp->conn_ht_iphc_len;
3983         pktlen = copylen + msgdsize(mp);
3984         if (pktlen > IP_MAXPACKET) {
3985                 freemsg(mp);
3986                 *errorp = EMSGSIZE;
3987                 return (NULL);
3988         }
3989         ixa->ixa_pktlen = pktlen;
3990 
3991         /* check/fix buffer config, setup pointers into it */
3992         iph = mp->b_rptr - copylen;
3993         if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) {
3994                 mblk_t *mp1;
3995 
3996                 mp1 = allocb(copylen + is->is_wroff_extra, BPRI_MED);
3997                 if (mp1 == NULL) {
3998                         freemsg(mp);
3999                         *errorp = ENOMEM;
4000                         return (NULL);
4001                 }
4002                 mp1->b_wptr = DB_LIM(mp1);
4003                 mp1->b_cont = mp;
4004                 mp = mp1;
4005                 iph = (mp->b_wptr - copylen);
4006         }
4007         mp->b_rptr = iph;
4008         bcopy(connp->conn_ht_iphc, iph, copylen);
4009         ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc);
4010 
4011         ixa->ixa_ip_hdr_length = ip_hdr_length;
4012 
4013         /*
4014          * Prepare for ICMPv6 checksum done in IP.
4015          *
4016          * icmp_build_hdr_template has already massaged any routing header
4017          * and placed the result in conn_sum.
4018          *
4019          * We make it easy for IP to include our pseudo header
4020          * by putting our length (and any routing header adjustment)
4021          * in the ICMPv6 checksum field.
4022          */
4023         cksum = pktlen - ip_hdr_length;
4024 
4025         cksum += connp->conn_sum;
4026         cksum = (cksum >> 16) + (cksum & 0xFFFF);
4027         ASSERT(cksum < 0x10000);
4028 
4029         ipp = &connp->conn_xmit_ipp;
4030         if (ixa->ixa_flags & IXAF_IS_IPV4) {
4031                 ipha_t  *ipha = (ipha_t *)iph;
4032 
4033                 ipha->ipha_length = htons((uint16_t)pktlen);
4034 
4035                 /* if IP_PKTINFO specified an addres it wins over bind() */
4036                 if ((ipp->ipp_fields & IPPF_ADDR) &&
4037                     IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
4038                         ASSERT(ipp->ipp_addr_v4 != INADDR_ANY);
4039                         ipha->ipha_src = ipp->ipp_addr_v4;
4040                 } else {
4041                         IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src);
4042                 }
4043         } else {
4044                 ip6_t *ip6h = (ip6_t *)iph;
4045                 uint_t  cksum_offset = 0;
4046 
4047                 ip6h->ip6_plen =  htons((uint16_t)(pktlen - IPV6_HDR_LEN));
4048 
4049                 /* if IP_PKTINFO specified an addres it wins over bind() */
4050                 if ((ipp->ipp_fields & IPPF_ADDR) &&
4051                     !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
4052                         ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr));
4053                         ip6h->ip6_src = ipp->ipp_addr;
4054                 } else {
4055                         ip6h->ip6_src = *v6src;
4056                 }
4057                 ip6h->ip6_vcf =
4058                     (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
4059                     (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
4060                 if (ipp->ipp_fields & IPPF_TCLASS) {
4061                         /* Overrides the class part of flowinfo */
4062                         ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
4063                             ipp->ipp_tclass);
4064                 }
4065 
4066                 if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) {
4067                         if (connp->conn_proto == IPPROTO_ICMPV6) {
4068                                 cksum_offset = ixa->ixa_ip_hdr_length +
4069                                     offsetof(icmp6_t, icmp6_cksum);
4070                         } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
4071                                 cksum_offset = ixa->ixa_ip_hdr_length +
4072                                     ixa->ixa_raw_cksum_offset;
4073                         }
4074                 }
4075                 if (cksum_offset != 0) {
4076                         uint16_t *ptr;
4077 
4078                         /* Make sure the checksum fits in the first mblk */
4079                         if (cksum_offset + sizeof (short) > MBLKL(mp)) {
4080                                 mblk_t *mp1;
4081 
4082                                 mp1 = msgpullup(mp,
4083                                     cksum_offset + sizeof (short));
4084                                 freemsg(mp);
4085                                 if (mp1 == NULL) {
4086                                         *errorp = ENOMEM;
4087                                         return (NULL);
4088                                 }
4089                                 mp = mp1;
4090                                 iph = mp->b_rptr;
4091                                 ip6h = (ip6_t *)iph;
4092                         }
4093                         ptr = (uint16_t *)(mp->b_rptr + cksum_offset);
4094                         *ptr = htons(cksum);
4095                 }
4096         }
4097 
4098         return (mp);
4099 }
4100 
4101 /*
4102  * This routine handles all messages passed downstream.  It either
4103  * consumes the message or passes it downstream; it never queues a
4104  * a message.
4105  */
4106 void
4107 icmp_wput(queue_t *q, mblk_t *mp)
4108 {
4109         sin6_t          *sin6;
4110         sin_t           *sin = NULL;
4111         uint_t          srcid;
4112         conn_t          *connp = Q_TO_CONN(q);
4113         icmp_t          *icmp = connp->conn_icmp;
4114         int             error = 0;
4115         struct sockaddr *addr = NULL;
4116         socklen_t       addrlen;
4117         icmp_stack_t    *is = icmp->icmp_is;
4118         struct T_unitdata_req *tudr;
4119         mblk_t          *data_mp;
4120         cred_t          *cr;
4121         pid_t           pid;
4122 
4123         /*
4124          * We directly handle several cases here: T_UNITDATA_REQ message
4125          * coming down as M_PROTO/M_PCPROTO and M_DATA messages for connected
4126          * socket.
4127          */
4128         switch (DB_TYPE(mp)) {
4129         case M_DATA:
4130                 /* sockfs never sends down M_DATA */
4131                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4132                 freemsg(mp);
4133                 return;
4134 
4135         case M_PROTO:
4136         case M_PCPROTO:
4137                 tudr = (struct T_unitdata_req *)mp->b_rptr;
4138                 if (MBLKL(mp) < sizeof (*tudr) ||
4139                     ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) {
4140                         icmp_wput_other(q, mp);
4141                         return;
4142                 }
4143                 break;
4144 
4145         default:
4146                 icmp_wput_other(q, mp);
4147                 return;
4148         }
4149 
4150         /* Handle valid T_UNITDATA_REQ here */
4151         data_mp = mp->b_cont;
4152         if (data_mp == NULL) {
4153                 error = EPROTO;
4154                 goto ud_error2;
4155         }
4156         mp->b_cont = NULL;
4157 
4158         if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) {
4159                 error = EADDRNOTAVAIL;
4160                 goto ud_error2;
4161         }
4162 
4163         /*
4164          * All Solaris components should pass a db_credp
4165          * for this message, hence we ASSERT.
4166          * On production kernels we return an error to be robust against
4167          * random streams modules sitting on top of us.
4168          */
4169         cr = msg_getcred(mp, &pid);
4170         ASSERT(cr != NULL);
4171         if (cr == NULL) {
4172                 error = EINVAL;
4173                 goto ud_error2;
4174         }
4175 
4176         /*
4177          * If a port has not been bound to the stream, fail.
4178          * This is not a problem when sockfs is directly
4179          * above us, because it will ensure that the socket
4180          * is first bound before allowing data to be sent.
4181          */
4182         if (icmp->icmp_state == TS_UNBND) {
4183                 error = EPROTO;
4184                 goto ud_error2;
4185         }
4186         addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset];
4187         addrlen = tudr->DEST_length;
4188 
4189         switch (connp->conn_family) {
4190         case AF_INET6:
4191                 sin6 = (sin6_t *)addr;
4192                 if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) ||
4193                     (sin6->sin6_family != AF_INET6)) {
4194                         error = EADDRNOTAVAIL;
4195                         goto ud_error2;
4196                 }
4197 
4198                 /* No support for mapped addresses on raw sockets */
4199                 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
4200                         error = EADDRNOTAVAIL;
4201                         goto ud_error2;
4202                 }
4203                 srcid = sin6->__sin6_src_id;
4204 
4205                 /*
4206                  * If the local address is a mapped address return
4207                  * an error.
4208                  * It would be possible to send an IPv6 packet but the
4209                  * response would never make it back to the application
4210                  * since it is bound to a mapped address.
4211                  */
4212                 if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
4213                         error = EADDRNOTAVAIL;
4214                         goto ud_error2;
4215                 }
4216 
4217                 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
4218                         sin6->sin6_addr = ipv6_loopback;
4219 
4220                 if (tudr->OPT_length != 0) {
4221                         /*
4222                          * If we are connected then the destination needs to be
4223                          * the same as the connected one.
4224                          */
4225                         if (icmp->icmp_state == TS_DATA_XFER &&
4226                             !conn_same_as_last_v6(connp, sin6)) {
4227                                 error = EISCONN;
4228                                 goto ud_error2;
4229                         }
4230                         error = icmp_output_ancillary(connp, NULL, sin6,
4231                             data_mp, mp, NULL, cr, pid);
4232                 } else {
4233                         ip_xmit_attr_t *ixa;
4234 
4235                         /*
4236                          * We have to allocate an ip_xmit_attr_t before we grab
4237                          * conn_lock and we need to hold conn_lock once we've
4238                          * checked conn_same_as_last_v6 to handle concurrent
4239                          * send* calls on a socket.
4240                          */
4241                         ixa = conn_get_ixa(connp, B_FALSE);
4242                         if (ixa == NULL) {
4243                                 error = ENOMEM;
4244                                 goto ud_error2;
4245                         }
4246                         mutex_enter(&connp->conn_lock);
4247 
4248                         if (conn_same_as_last_v6(connp, sin6) &&
4249                             connp->conn_lastsrcid == srcid &&
4250                             ipsec_outbound_policy_current(ixa)) {
4251                                 /* icmp_output_lastdst drops conn_lock */
4252                                 error = icmp_output_lastdst(connp, data_mp, cr,
4253                                     pid, ixa);
4254                         } else {
4255                                 /* icmp_output_newdst drops conn_lock */
4256                                 error = icmp_output_newdst(connp, data_mp, NULL,
4257                                     sin6, cr, pid, ixa);
4258                         }
4259                         ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
4260                 }
4261                 if (error == 0) {
4262                         freeb(mp);
4263                         return;
4264                 }
4265                 break;
4266 
4267         case AF_INET:
4268                 sin = (sin_t *)addr;
4269                 if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) ||
4270                     (sin->sin_family != AF_INET)) {
4271                         error = EADDRNOTAVAIL;
4272                         goto ud_error2;
4273                 }
4274                 if (sin->sin_addr.s_addr == INADDR_ANY)
4275                         sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
4276 
4277                 /* Protocol 255 contains full IP headers */
4278                 /* Read without holding lock */
4279                 if (icmp->icmp_hdrincl) {
4280                         if (MBLKL(data_mp) < IP_SIMPLE_HDR_LENGTH) {
4281                                 if (!pullupmsg(data_mp, IP_SIMPLE_HDR_LENGTH)) {
4282                                         error = EINVAL;
4283                                         goto ud_error2;
4284                                 }
4285                         }
4286                         error = icmp_output_hdrincl(connp, data_mp, cr, pid);
4287                         if (error == 0) {
4288                                 freeb(mp);
4289                                 return;
4290                         }
4291                         /* data_mp consumed above */
4292                         data_mp = NULL;
4293                         goto ud_error2;
4294                 }
4295 
4296                 if (tudr->OPT_length != 0) {
4297                         /*
4298                          * If we are connected then the destination needs to be
4299                          * the same as the connected one.
4300                          */
4301                         if (icmp->icmp_state == TS_DATA_XFER &&
4302                             !conn_same_as_last_v4(connp, sin)) {
4303                                 error = EISCONN;
4304                                 goto ud_error2;
4305                         }
4306                         error = icmp_output_ancillary(connp, sin, NULL,
4307                             data_mp, mp, NULL, cr, pid);
4308                 } else {
4309                         ip_xmit_attr_t *ixa;
4310 
4311                         /*
4312                          * We have to allocate an ip_xmit_attr_t before we grab
4313                          * conn_lock and we need to hold conn_lock once we've
4314                          * checked conn_same_as_last_v4 to handle concurrent
4315                          * send* calls on a socket.
4316                          */
4317                         ixa = conn_get_ixa(connp, B_FALSE);
4318                         if (ixa == NULL) {
4319                                 error = ENOMEM;
4320                                 goto ud_error2;
4321                         }
4322                         mutex_enter(&connp->conn_lock);
4323 
4324                         if (conn_same_as_last_v4(connp, sin) &&
4325                             ipsec_outbound_policy_current(ixa)) {
4326                                 /* icmp_output_lastdst drops conn_lock */
4327                                 error = icmp_output_lastdst(connp, data_mp, cr,
4328                                     pid, ixa);
4329                         } else {
4330                                 /* icmp_output_newdst drops conn_lock */
4331                                 error = icmp_output_newdst(connp, data_mp, sin,
4332                                     NULL, cr, pid, ixa);
4333                         }
4334                         ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
4335                 }
4336                 if (error == 0) {
4337                         freeb(mp);
4338                         return;
4339                 }
4340                 break;
4341         }
4342         ASSERT(mp != NULL);
4343         /* mp is freed by the following routine */
4344         icmp_ud_err(q, mp, (t_scalar_t)error);
4345         return;
4346 
4347 ud_error2:
4348         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4349         freemsg(data_mp);
4350         ASSERT(mp != NULL);
4351         /* mp is freed by the following routine */
4352         icmp_ud_err(q, mp, (t_scalar_t)error);
4353 }
4354 
4355 /*
4356  * Handle the case of the IP address or flow label being different
4357  * for both IPv4 and IPv6.
4358  *
4359  * NOTE: The caller must hold conn_lock and we drop it here.
4360  */
4361 static int
4362 icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6,
4363     cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa)
4364 {
4365         icmp_t          *icmp = connp->conn_icmp;
4366         icmp_stack_t    *is = icmp->icmp_is;
4367         int             error;
4368         ip_xmit_attr_t  *oldixa;
4369         boolean_t       do_ipsec;
4370         uint_t          srcid;
4371         uint32_t        flowinfo;
4372         in6_addr_t      v6src;
4373         in6_addr_t      v6dst;
4374         in6_addr_t      v6nexthop;
4375         in_port_t       dstport;
4376 
4377         ASSERT(MUTEX_HELD(&connp->conn_lock));
4378         ASSERT(ixa != NULL);
4379 
4380         /*
4381          * We hold conn_lock across all the use and modifications of
4382          * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they
4383          * stay consistent.
4384          */
4385 
4386         ASSERT(cr != NULL);
4387         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
4388         ixa->ixa_cred = cr;
4389         ixa->ixa_cpid = pid;
4390         if (is_system_labeled()) {
4391                 /* We need to restart with a label based on the cred */
4392                 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
4393         }
4394         /*
4395          * If we are connected then the destination needs to be the
4396          * same as the connected one, which is not the case here since we
4397          * checked for that above.
4398          */
4399         if (icmp->icmp_state == TS_DATA_XFER) {
4400                 mutex_exit(&connp->conn_lock);
4401                 error = EISCONN;
4402                 goto ud_error;
4403         }
4404 
4405         /* In case previous destination was multicast or multirt */
4406         ip_attr_newdst(ixa);
4407 
4408         /*
4409          * If laddr is unspecified then we look at sin6_src_id.
4410          * We will give precedence to a source address set with IPV6_PKTINFO
4411          * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
4412          * want ip_attr_connect to select a source (since it can fail) when
4413          * IPV6_PKTINFO is specified.
4414          * If this doesn't result in a source address then we get a source
4415          * from ip_attr_connect() below.
4416          */
4417         v6src = connp->conn_saddr_v6;
4418         if (sin != NULL) {
4419                 IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
4420                 dstport = sin->sin_port;
4421                 flowinfo = 0;
4422                 srcid = 0;
4423                 ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
4424                 if (srcid != 0 && V4_PART_OF_V6(&v6src) == INADDR_ANY) {
4425                         ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
4426                             connp->conn_netstack);
4427                 }
4428                 ixa->ixa_flags |= IXAF_IS_IPV4;
4429         } else {
4430                 v6dst = sin6->sin6_addr;
4431                 dstport = sin6->sin6_port;
4432                 flowinfo = sin6->sin6_flowinfo;
4433                 srcid = sin6->__sin6_src_id;
4434                 if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
4435                         ixa->ixa_scopeid = sin6->sin6_scope_id;
4436                         ixa->ixa_flags |= IXAF_SCOPEID_SET;
4437                 } else {
4438                         ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
4439                 }
4440                 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
4441                         ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
4442                             connp->conn_netstack);
4443                 }
4444                 if (IN6_IS_ADDR_V4MAPPED(&v6dst))
4445                         ixa->ixa_flags |= IXAF_IS_IPV4;
4446                 else
4447                         ixa->ixa_flags &= ~IXAF_IS_IPV4;
4448         }
4449         /* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. */
4450         if (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR) {
4451                 ip_pkt_t *ipp = &connp->conn_xmit_ipp;
4452 
4453                 if (ixa->ixa_flags & IXAF_IS_IPV4) {
4454                         if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
4455                                 v6src = ipp->ipp_addr;
4456                 } else {
4457                         if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
4458                                 v6src = ipp->ipp_addr;
4459                 }
4460         }
4461 
4462         /* Defer IPsec if it might need to look at ICMP type/code */
4463         switch (ixa->ixa_protocol) {
4464         case IPPROTO_ICMP:
4465         case IPPROTO_ICMPV6:
4466                 do_ipsec = B_FALSE;
4467                 break;
4468         default:
4469                 do_ipsec = B_TRUE;
4470         }
4471 
4472         ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop);
4473         mutex_exit(&connp->conn_lock);
4474 
4475         error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
4476             &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
4477             (do_ipsec ? IPDF_IPSEC : 0));
4478         switch (error) {
4479         case 0:
4480                 break;
4481         case EADDRNOTAVAIL:
4482                 /*
4483                  * IXAF_VERIFY_SOURCE tells us to pick a better source.
4484                  * Don't have the application see that errno
4485                  */
4486                 error = ENETUNREACH;
4487                 goto failed;
4488         case ENETDOWN:
4489                 /*
4490                  * Have !ipif_addr_ready address; drop packet silently
4491                  * until we can get applications to not send until we
4492                  * are ready.
4493                  */
4494                 error = 0;
4495                 goto failed;
4496         case EHOSTUNREACH:
4497         case ENETUNREACH:
4498                 if (ixa->ixa_ire != NULL) {
4499                         /*
4500                          * Let conn_ip_output/ire_send_noroute return
4501                          * the error and send any local ICMP error.
4502                          */
4503                         error = 0;
4504                         break;
4505                 }
4506                 /* FALLTHRU */
4507         default:
4508         failed:
4509                 goto ud_error;
4510         }
4511 
4512         mutex_enter(&connp->conn_lock);
4513         /*
4514          * While we dropped the lock some other thread might have connected
4515          * this socket. If so we bail out with EISCONN to ensure that the
4516          * connecting thread is the one that updates conn_ixa, conn_ht_*
4517          * and conn_*last*.
4518          */
4519         if (icmp->icmp_state == TS_DATA_XFER) {
4520                 mutex_exit(&connp->conn_lock);
4521                 error = EISCONN;
4522                 goto ud_error;
4523         }
4524 
4525         /*
4526          * We need to rebuild the headers if
4527          *  - we are labeling packets (could be different for different
4528          *    destinations)
4529          *  - we have a source route (or routing header) since we need to
4530          *    massage that to get the pseudo-header checksum
4531          *  - a socket option with COA_HEADER_CHANGED has been set which
4532          *    set conn_v6lastdst to zero.
4533          *
4534          * Otherwise the prepend function will just update the src, dst,
4535          * and flow label.
4536          */
4537         if (is_system_labeled()) {
4538                 /* TX MLP requires SCM_UCRED and don't have that here */
4539                 if (connp->conn_mlp_type != mlptSingle) {
4540                         mutex_exit(&connp->conn_lock);
4541                         error = ECONNREFUSED;
4542                         goto ud_error;
4543                 }
4544                 /*
4545                  * Check whether Trusted Solaris policy allows communication
4546                  * with this host, and pretend that the destination is
4547                  * unreachable if not.
4548                  * Compute any needed label and place it in ipp_label_v4/v6.
4549                  *
4550                  * Later conn_build_hdr_template/conn_prepend_hdr takes
4551                  * ipp_label_v4/v6 to form the packet.
4552                  *
4553                  * Tsol note: Since we hold conn_lock we know no other
4554                  * thread manipulates conn_xmit_ipp.
4555                  */
4556                 error = conn_update_label(connp, ixa, &v6dst,
4557                     &connp->conn_xmit_ipp);
4558                 if (error != 0) {
4559                         mutex_exit(&connp->conn_lock);
4560                         goto ud_error;
4561                 }
4562                 /* Rebuild the header template */
4563                 error = icmp_build_hdr_template(connp, &v6src, &v6dst,
4564                     flowinfo);
4565                 if (error != 0) {
4566                         mutex_exit(&connp->conn_lock);
4567                         goto ud_error;
4568                 }
4569         } else if (connp->conn_xmit_ipp.ipp_fields &
4570             (IPPF_IPV4_OPTIONS|IPPF_RTHDR) ||
4571             IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) {
4572                 /* Rebuild the header template */
4573                 error = icmp_build_hdr_template(connp, &v6src, &v6dst,
4574                     flowinfo);
4575                 if (error != 0) {
4576                         mutex_exit(&connp->conn_lock);
4577                         goto ud_error;
4578                 }
4579         } else {
4580                 /* Simply update the destination address if no source route */
4581                 if (ixa->ixa_flags & IXAF_IS_IPV4) {
4582                         ipha_t  *ipha = (ipha_t *)connp->conn_ht_iphc;
4583 
4584                         IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst);
4585                         if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
4586                                 ipha->ipha_fragment_offset_and_flags |=
4587                                     IPH_DF_HTONS;
4588                         } else {
4589                                 ipha->ipha_fragment_offset_and_flags &=
4590                                     ~IPH_DF_HTONS;
4591                         }
4592                 } else {
4593                         ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc;
4594                         ip6h->ip6_dst = v6dst;
4595                 }
4596         }
4597 
4598         /*
4599          * Remember the dst etc which corresponds to the built header
4600          * template and conn_ixa.
4601          */
4602         oldixa = conn_replace_ixa(connp, ixa);
4603         connp->conn_v6lastdst = v6dst;
4604         connp->conn_lastflowinfo = flowinfo;
4605         connp->conn_lastscopeid = ixa->ixa_scopeid;
4606         connp->conn_lastsrcid = srcid;
4607         /* Also remember a source to use together with lastdst */
4608         connp->conn_v6lastsrc = v6src;
4609 
4610         data_mp = icmp_prepend_header_template(connp, ixa, data_mp, &v6src,
4611             flowinfo, &error);
4612 
4613         /* Done with conn_t */
4614         mutex_exit(&connp->conn_lock);
4615         ixa_refrele(oldixa);
4616 
4617         if (data_mp == NULL) {
4618                 ASSERT(error != 0);
4619                 goto ud_error;
4620         }
4621 
4622         if (!do_ipsec) {
4623                 /* Policy might differ for different ICMP type/code */
4624                 data_mp = icmp_output_attach_policy(data_mp, connp, ixa);
4625                 if (data_mp == NULL) {
4626                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4627                         error = EHOSTUNREACH;   /* IPsec policy failure */
4628                         goto done;
4629                 }
4630         }
4631 
4632         /* We're done.  Pass the packet to ip. */
4633         BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
4634 
4635         error = conn_ip_output(data_mp, ixa);
4636         /* No rawipOutErrors if an error since IP increases its error counter */
4637         switch (error) {
4638         case 0:
4639                 break;
4640         case EWOULDBLOCK:
4641                 (void) ixa_check_drain_insert(connp, ixa);
4642                 error = 0;
4643                 break;
4644         case EADDRNOTAVAIL:
4645                 /*
4646                  * IXAF_VERIFY_SOURCE tells us to pick a better source.
4647                  * Don't have the application see that errno
4648                  */
4649                 error = ENETUNREACH;
4650                 /* FALLTHRU */
4651         default:
4652                 mutex_enter(&connp->conn_lock);
4653                 /*
4654                  * Clear the source and v6lastdst so we call ip_attr_connect
4655                  * for the next packet and try to pick a better source.
4656                  */
4657                 if (connp->conn_mcbc_bind)
4658                         connp->conn_saddr_v6 = ipv6_all_zeros;
4659                 else
4660                         connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
4661                 connp->conn_v6lastdst = ipv6_all_zeros;
4662                 mutex_exit(&connp->conn_lock);
4663                 break;
4664         }
4665 done:
4666         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
4667         ixa->ixa_cred = connp->conn_cred; /* Restore */
4668         ixa->ixa_cpid = connp->conn_cpid;
4669         ixa_refrele(ixa);
4670         return (error);
4671 
4672 ud_error:
4673         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
4674         ixa->ixa_cred = connp->conn_cred; /* Restore */
4675         ixa->ixa_cpid = connp->conn_cpid;
4676         ixa_refrele(ixa);
4677 
4678         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4679         freemsg(data_mp);
4680         return (error);
4681 }
4682 
4683 /* ARGSUSED */
4684 static void
4685 icmp_wput_fallback(queue_t *q, mblk_t *mp)
4686 {
4687 #ifdef DEBUG
4688         cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n");
4689 #endif
4690         freemsg(mp);
4691 }
4692 
4693 static void
4694 icmp_wput_other(queue_t *q, mblk_t *mp)
4695 {
4696         uchar_t *rptr = mp->b_rptr;
4697         struct iocblk *iocp;
4698         conn_t  *connp = Q_TO_CONN(q);
4699         icmp_t  *icmp = connp->conn_icmp;
4700         cred_t *cr;
4701 
4702         switch (mp->b_datap->db_type) {
4703         case M_PROTO:
4704         case M_PCPROTO:
4705                 if (mp->b_wptr - rptr < sizeof (t_scalar_t)) {
4706                         /*
4707                          * If the message does not contain a PRIM_type,
4708                          * throw it away.
4709                          */
4710                         freemsg(mp);
4711                         return;
4712                 }
4713                 switch (((t_primp_t)rptr)->type) {
4714                 case T_ADDR_REQ:
4715                         icmp_addr_req(q, mp);
4716                         return;
4717                 case O_T_BIND_REQ:
4718                 case T_BIND_REQ:
4719                         icmp_tpi_bind(q, mp);
4720                         return;
4721                 case T_CONN_REQ:
4722                         icmp_tpi_connect(q, mp);
4723                         return;
4724                 case T_CAPABILITY_REQ:
4725                         icmp_capability_req(q, mp);
4726                         return;
4727                 case T_INFO_REQ:
4728                         icmp_info_req(q, mp);
4729                         return;
4730                 case T_UNITDATA_REQ:
4731                         /*
4732                          * If a T_UNITDATA_REQ gets here, the address must
4733                          * be bad.  Valid T_UNITDATA_REQs are handled
4734                          * in icmp_wput.
4735                          */
4736                         icmp_ud_err(q, mp, EADDRNOTAVAIL);
4737                         return;
4738                 case T_UNBIND_REQ:
4739                         icmp_tpi_unbind(q, mp);
4740                         return;
4741                 case T_SVR4_OPTMGMT_REQ:
4742                         /*
4743                          * All Solaris components should pass a db_credp
4744                          * for this TPI message, hence we ASSERT.
4745                          * But in case there is some other M_PROTO that looks
4746                          * like a TPI message sent by some other kernel
4747                          * component, we check and return an error.
4748                          */
4749                         cr = msg_getcred(mp, NULL);
4750                         ASSERT(cr != NULL);
4751                         if (cr == NULL) {
4752                                 icmp_err_ack(q, mp, TSYSERR, EINVAL);
4753                                 return;
4754                         }
4755 
4756                         if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get,
4757                             cr)) {
4758                                 svr4_optcom_req(q, mp, cr, &icmp_opt_obj);
4759                         }
4760                         return;
4761 
4762                 case T_OPTMGMT_REQ:
4763                         /*
4764                          * All Solaris components should pass a db_credp
4765                          * for this TPI message, hence we ASSERT.
4766                          * But in case there is some other M_PROTO that looks
4767                          * like a TPI message sent by some other kernel
4768                          * component, we check and return an error.
4769                          */
4770                         cr = msg_getcred(mp, NULL);
4771                         ASSERT(cr != NULL);
4772                         if (cr == NULL) {
4773                                 icmp_err_ack(q, mp, TSYSERR, EINVAL);
4774                                 return;
4775                         }
4776                         tpi_optcom_req(q, mp, cr, &icmp_opt_obj);
4777                         return;
4778 
4779                 case T_DISCON_REQ:
4780                         icmp_tpi_disconnect(q, mp);
4781                         return;
4782 
4783                 /* The following TPI message is not supported by icmp. */
4784                 case O_T_CONN_RES:
4785                 case T_CONN_RES:
4786                         icmp_err_ack(q, mp, TNOTSUPPORT, 0);
4787                         return;
4788 
4789                 /* The following 3 TPI requests are illegal for icmp. */
4790                 case T_DATA_REQ:
4791                 case T_EXDATA_REQ:
4792                 case T_ORDREL_REQ:
4793                         icmp_err_ack(q, mp, TNOTSUPPORT, 0);
4794                         return;
4795                 default:
4796                         break;
4797                 }
4798                 break;
4799         case M_FLUSH:
4800                 if (*rptr & FLUSHW)
4801                         flushq(q, FLUSHDATA);
4802                 break;
4803         case M_IOCTL:
4804                 iocp = (struct iocblk *)mp->b_rptr;
4805                 switch (iocp->ioc_cmd) {
4806                 case TI_GETPEERNAME:
4807                         if (icmp->icmp_state != TS_DATA_XFER) {
4808                                 /*
4809                                  * If a default destination address has not
4810                                  * been associated with the stream, then we
4811                                  * don't know the peer's name.
4812                                  */
4813                                 iocp->ioc_error = ENOTCONN;
4814                                 iocp->ioc_count = 0;
4815                                 mp->b_datap->db_type = M_IOCACK;
4816                                 qreply(q, mp);
4817                                 return;
4818                         }
4819                         /* FALLTHRU */
4820                 case TI_GETMYNAME:
4821                         /*
4822                          * For TI_GETPEERNAME and TI_GETMYNAME, we first
4823                          * need to copyin the user's strbuf structure.
4824                          * Processing will continue in the M_IOCDATA case
4825                          * below.
4826                          */
4827                         mi_copyin(q, mp, NULL,
4828                             SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
4829                         return;
4830                 default:
4831                         break;
4832                 }
4833                 break;
4834         case M_IOCDATA:
4835                 icmp_wput_iocdata(q, mp);
4836                 return;
4837         default:
4838                 /* Unrecognized messages are passed through without change. */
4839                 break;
4840         }
4841         ip_wput_nondata(q, mp);
4842 }
4843 
4844 /*
4845  * icmp_wput_iocdata is called by icmp_wput_other to handle all M_IOCDATA
4846  * messages.
4847  */
4848 static void
4849 icmp_wput_iocdata(queue_t *q, mblk_t *mp)
4850 {
4851         mblk_t          *mp1;
4852         STRUCT_HANDLE(strbuf, sb);
4853         uint_t          addrlen;
4854         conn_t          *connp = Q_TO_CONN(q);
4855         icmp_t          *icmp = connp->conn_icmp;
4856 
4857         /* Make sure it is one of ours. */
4858         switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
4859         case TI_GETMYNAME:
4860         case TI_GETPEERNAME:
4861                 break;
4862         default:
4863                 ip_wput_nondata(q, mp);
4864                 return;
4865         }
4866 
4867         switch (mi_copy_state(q, mp, &mp1)) {
4868         case -1:
4869                 return;
4870         case MI_COPY_CASE(MI_COPY_IN, 1):
4871                 break;
4872         case MI_COPY_CASE(MI_COPY_OUT, 1):
4873                 /*
4874                  * The address has been copied out, so now
4875                  * copyout the strbuf.
4876                  */
4877                 mi_copyout(q, mp);
4878                 return;
4879         case MI_COPY_CASE(MI_COPY_OUT, 2):
4880                 /*
4881                  * The address and strbuf have been copied out.
4882                  * We're done, so just acknowledge the original
4883                  * M_IOCTL.
4884                  */
4885                 mi_copy_done(q, mp, 0);
4886                 return;
4887         default:
4888                 /*
4889                  * Something strange has happened, so acknowledge
4890                  * the original M_IOCTL with an EPROTO error.
4891                  */
4892                 mi_copy_done(q, mp, EPROTO);
4893                 return;
4894         }
4895 
4896         /*
4897          * Now we have the strbuf structure for TI_GETMYNAME
4898          * and TI_GETPEERNAME.  Next we copyout the requested
4899          * address and then we'll copyout the strbuf.
4900          */
4901         STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag,
4902             (void *)mp1->b_rptr);
4903 
4904         if (connp->conn_family == AF_INET)
4905                 addrlen = sizeof (sin_t);
4906         else
4907                 addrlen = sizeof (sin6_t);
4908 
4909         if (STRUCT_FGET(sb, maxlen) < addrlen) {
4910                 mi_copy_done(q, mp, EINVAL);
4911                 return;
4912         }
4913         switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
4914         case TI_GETMYNAME:
4915                 break;
4916         case TI_GETPEERNAME:
4917                 if (icmp->icmp_state != TS_DATA_XFER) {
4918                         mi_copy_done(q, mp, ENOTCONN);
4919                         return;
4920                 }
4921                 break;
4922         default:
4923                 mi_copy_done(q, mp, EPROTO);
4924                 return;
4925         }
4926         mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
4927         if (!mp1)
4928                 return;
4929 
4930         STRUCT_FSET(sb, len, addrlen);
4931         switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
4932         case TI_GETMYNAME:
4933                 (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr,
4934                     &addrlen);
4935                 break;
4936         case TI_GETPEERNAME:
4937                 (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr,
4938                     &addrlen);
4939                 break;
4940         }
4941         mp1->b_wptr += addrlen;
4942         /* Copy out the address */
4943         mi_copyout(q, mp);
4944 }
4945 
/*
 * Global initialization: record the maximum option size derived from the
 * icmp option table and register the per-netstack create/destroy callbacks.
 */
void
icmp_ddi_g_init(void)
{
        /* Computed once from the option descriptor array of icmp_opt_obj. */
        icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr,
            icmp_opt_obj.odb_opt_arr_cnt);

        /*
         * We want to be informed each time a stack is created or
         * destroyed in the kernel, so we can maintain the
         * set of icmp_stack_t's.  No shutdown callback is supplied (NULL).
         */
        netstack_register(NS_ICMP, rawip_stack_init, NULL, rawip_stack_fini);
}
4959 
/*
 * Global teardown: undo the netstack registration made for NS_ICMP in
 * icmp_ddi_g_init().
 */
void
icmp_ddi_g_destroy(void)
{
        netstack_unregister(NS_ICMP);
}
4965 
4966 #define INET_NAME       "ip"
4967 
4968 /*
4969  * Initialize the ICMP stack instance.
4970  */
4971 static void *
4972 rawip_stack_init(netstackid_t stackid, netstack_t *ns)
4973 {
4974         icmp_stack_t    *is;
4975         int             error = 0;
4976         size_t          arrsz;
4977         major_t         major;
4978 
4979         is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP);
4980         is->is_netstack = ns;
4981 
4982         arrsz = sizeof (icmp_propinfo_tbl);
4983         is->is_propinfo_tbl = (mod_prop_info_t *)kmem_alloc(arrsz, KM_SLEEP);
4984         bcopy(icmp_propinfo_tbl, is->is_propinfo_tbl, arrsz);
4985 
4986         is->is_ksp = rawip_kstat_init(stackid);
4987 
4988         major = mod_name_to_major(INET_NAME);
4989         error = ldi_ident_from_major(major, &is->is_ldi_ident);
4990         ASSERT(error == 0);
4991         return (is);
4992 }
4993 
4994 /*
4995  * Free the ICMP stack instance.
4996  */
4997 static void
4998 rawip_stack_fini(netstackid_t stackid, void *arg)
4999 {
5000         icmp_stack_t *is = (icmp_stack_t *)arg;
5001 
5002         kmem_free(is->is_propinfo_tbl, sizeof (icmp_propinfo_tbl));
5003         is->is_propinfo_tbl = NULL;
5004 
5005         rawip_kstat_fini(stackid, is->is_ksp);
5006         is->is_ksp = NULL;
5007         ldi_ident_release(is->is_ldi_ident);
5008         kmem_free(is, sizeof (*is));
5009 }
5010 
5011 static void *
5012 rawip_kstat_init(netstackid_t stackid) {
5013         kstat_t *ksp;
5014 
5015         rawip_named_kstat_t template = {
5016                 { "inDatagrams",        KSTAT_DATA_UINT32, 0 },
5017                 { "inCksumErrs",        KSTAT_DATA_UINT32, 0 },
5018                 { "inErrors",           KSTAT_DATA_UINT32, 0 },
5019                 { "outDatagrams",       KSTAT_DATA_UINT32, 0 },
5020                 { "outErrors",          KSTAT_DATA_UINT32, 0 },
5021         };
5022 
5023         ksp = kstat_create_netstack("icmp", 0, "rawip", "mib2",
5024                                         KSTAT_TYPE_NAMED,
5025                                         NUM_OF_FIELDS(rawip_named_kstat_t),
5026                                         0, stackid);
5027         if (ksp == NULL || ksp->ks_data == NULL)
5028                 return (NULL);
5029 
5030         bcopy(&template, ksp->ks_data, sizeof (template));
5031         ksp->ks_update = rawip_kstat_update;
5032         ksp->ks_private = (void *)(uintptr_t)stackid;
5033 
5034         kstat_install(ksp);
5035         return (ksp);
5036 }
5037 
5038 static void
5039 rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp)
5040 {
5041         if (ksp != NULL) {
5042                 ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
5043                 kstat_delete_netstack(ksp, stackid);
5044         }
5045 }
5046 
5047 static int
5048 rawip_kstat_update(kstat_t *ksp, int rw)
5049 {
5050         rawip_named_kstat_t *rawipkp;
5051         netstackid_t    stackid = (netstackid_t)(uintptr_t)ksp->ks_private;
5052         netstack_t      *ns;
5053         icmp_stack_t    *is;
5054 
5055         if ((ksp == NULL) || (ksp->ks_data == NULL))
5056                 return (EIO);
5057 
5058         if (rw == KSTAT_WRITE)
5059                 return (EACCES);
5060 
5061         rawipkp = (rawip_named_kstat_t *)ksp->ks_data;
5062 
5063         ns = netstack_find_by_stackid(stackid);
5064         if (ns == NULL)
5065                 return (-1);
5066         is = ns->netstack_icmp;
5067         if (is == NULL) {
5068                 netstack_rele(ns);
5069                 return (-1);
5070         }
5071         rawipkp->inDatagrams.value.ui32 =  is->is_rawip_mib.rawipInDatagrams;
5072         rawipkp->inCksumErrs.value.ui32 =  is->is_rawip_mib.rawipInCksumErrs;
5073         rawipkp->inErrors.value.ui32 =          is->is_rawip_mib.rawipInErrors;
5074         rawipkp->outDatagrams.value.ui32 = is->is_rawip_mib.rawipOutDatagrams;
5075         rawipkp->outErrors.value.ui32 =         is->is_rawip_mib.rawipOutErrors;
5076         netstack_rele(ns);
5077         return (0);
5078 }
5079 
/*
 * accept entry point for raw IP sockets.  None of the arguments are
 * examined; the call always fails with EOPNOTSUPP.
 */
/* ARGSUSED */
int
rawip_accept(sock_lower_handle_t lproto_handle,
    sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
    cred_t *cr)
{
        return (EOPNOTSUPP);
}
5088 
5089 /* ARGSUSED */
5090 int
5091 rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5092     socklen_t len, cred_t *cr)
5093 {
5094         conn_t  *connp = (conn_t *)proto_handle;
5095         int     error;
5096 
5097         /* All Solaris components should pass a cred for this operation. */
5098         ASSERT(cr != NULL);
5099 
5100         /* Binding to a NULL address really means unbind */
5101         if (sa == NULL)
5102                 error = rawip_do_unbind(connp);
5103         else
5104                 error = rawip_do_bind(connp, sa, len);
5105 
5106         if (error < 0) {
5107                 if (error == -TOUTSTATE)
5108                         error = EINVAL;
5109                 else
5110                         error = proto_tlitosyserr(-error);
5111         }
5112         return (error);
5113 }
5114 
5115 static int
5116 rawip_implicit_bind(conn_t *connp)
5117 {
5118         sin6_t sin6addr;
5119         sin_t *sin;
5120         sin6_t *sin6;
5121         socklen_t len;
5122         int error;
5123 
5124         if (connp->conn_family == AF_INET) {
5125                 len = sizeof (struct sockaddr_in);
5126                 sin = (sin_t *)&sin6addr;
5127                 *sin = sin_null;
5128                 sin->sin_family = AF_INET;
5129                 sin->sin_addr.s_addr = INADDR_ANY;
5130         } else {
5131                 ASSERT(connp->conn_family == AF_INET6);
5132                 len = sizeof (sin6_t);
5133                 sin6 = (sin6_t *)&sin6addr;
5134                 *sin6 = sin6_null;
5135                 sin6->sin6_family = AF_INET6;
5136                 V6_SET_ZERO(sin6->sin6_addr);
5137         }
5138 
5139         error = rawip_do_bind(connp, (struct sockaddr *)&sin6addr, len);
5140 
5141         return ((error < 0) ? proto_tlitosyserr(-error) : error);
5142 }
5143 
5144 static int
5145 rawip_unbind(conn_t *connp)
5146 {
5147         int error;
5148 
5149         error = rawip_do_unbind(connp);
5150         if (error < 0) {
5151                 error = proto_tlitosyserr(-error);
5152         }
5153         return (error);
5154 }
5155 
/*
 * listen entry point for raw IP sockets.  None of the arguments are
 * examined; the call always fails with EOPNOTSUPP.
 */
/* ARGSUSED */
int
rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
{
        return (EOPNOTSUPP);
}
5162 
5163 int
5164 rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
5165     socklen_t len, sock_connid_t *id, cred_t *cr)
5166 {
5167         conn_t  *connp = (conn_t *)proto_handle;
5168         icmp_t *icmp = connp->conn_icmp;
5169         int     error;
5170         boolean_t did_bind = B_FALSE;
5171         pid_t   pid = curproc->p_pid;
5172 
5173         /* All Solaris components should pass a cred for this operation. */
5174         ASSERT(cr != NULL);
5175 
5176         if (sa == NULL) {
5177                 /*
5178                  * Disconnect
5179                  * Make sure we are connected
5180                  */
5181                 if (icmp->icmp_state != TS_DATA_XFER)
5182                         return (EINVAL);
5183 
5184                 error = icmp_disconnect(connp);
5185                 return (error);
5186         }
5187 
5188         error = proto_verify_ip_addr(connp->conn_family, sa, len);
5189         if (error != 0)
5190                 return (error);
5191 
5192         /* do an implicit bind if necessary */
5193         if (icmp->icmp_state == TS_UNBND) {
5194                 error = rawip_implicit_bind(connp);
5195                 /*
5196                  * We could be racing with an actual bind, in which case
5197                  * we would see EPROTO. We cross our fingers and try
5198                  * to connect.
5199                  */
5200                 if (!(error == 0 || error == EPROTO))
5201                         return (error);
5202                 did_bind = B_TRUE;
5203         }
5204 
5205         /*
5206          * set SO_DGRAM_ERRIND
5207          */
5208         connp->conn_dgram_errind = B_TRUE;
5209 
5210         error = rawip_do_connect(connp, sa, len, cr, pid);
5211         if (error != 0 && did_bind) {
5212                 int unbind_err;
5213 
5214                 unbind_err = rawip_unbind(connp);
5215                 ASSERT(unbind_err == 0);
5216         }
5217 
5218         if (error == 0) {
5219                 *id = 0;
5220                 (*connp->conn_upcalls->su_connected)(connp->conn_upper_handle,
5221                     0, NULL, -1);
5222         } else if (error < 0) {
5223                 error = proto_tlitosyserr(-error);
5224         }
5225         return (error);
5226 }
5227 
/*
 * Fall back from a non-STREAMS (IPCL_NONSTR) raw socket to a STREAMS based
 * one using the queue pair `q'.
 *
 * The conn is attached to the queue pair, the stream head is told about the
 * write offset and receive high-water mark, and the helper stream is freed.
 * The local/peer addresses, capability ack and socket options are then
 * handed to quiesced_cb().  Any message that callback returns is placed at
 * the head of the icmp fallback queue, and the whole queue is pushed up the
 * read side before IPCL_NONSTR is cleared.
 *
 * direct_sockfs is not used.  Always returns 0.
 */
/* ARGSUSED2 */
int
rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
    boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb,
    sock_quiesce_arg_t *arg)
{
        conn_t  *connp = (conn_t *)proto_handle;
        icmp_t  *icmp;
        struct T_capability_ack tca;
        struct sockaddr_in6 laddr, faddr;
        socklen_t laddrlen, faddrlen;
        short opts;
        struct stroptions *stropt;
        mblk_t *mp, *stropt_mp;
        int error;

        icmp = connp->conn_icmp;

        /* Allocated up front, before any conn or queue state is modified. */
        stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL);

        /*
         * Set up the fallback stream that was allocated.  The values found
         * in the two q_ptr fields are saved in the conn before both q_ptrs
         * are overwritten with the conn itself.
         */
        connp->conn_dev = (dev_t)RD(q)->q_ptr;
        connp->conn_minor_arena = WR(q)->q_ptr;

        RD(q)->q_ptr = WR(q)->q_ptr = connp;

        WR(q)->q_qinfo = &icmpwinit;

        connp->conn_rq = RD(q);
        connp->conn_wq = WR(q);

        /* Notify stream head about options before sending up data */
        stropt_mp->b_datap->db_type = M_SETOPTS;
        stropt_mp->b_wptr += sizeof (*stropt);
        stropt = (struct stroptions *)stropt_mp->b_rptr;
        stropt->so_flags = SO_WROFF | SO_HIWAT;
        stropt->so_wroff = connp->conn_wroff;
        stropt->so_hiwat = connp->conn_rcvbuf;
        putnext(RD(q), stropt_mp);

        /*
         * Free the helper stream.
         */
        ip_free_helper_stream(connp);

        /*
         * Collect the information needed to sync with the sonode
         */
        icmp_do_capability_ack(icmp, &tca, TC1_INFO);

        laddrlen = faddrlen = sizeof (sin6_t);
        (void) rawip_getsockname((sock_lower_handle_t)connp,
            (struct sockaddr *)&laddr, &laddrlen, CRED());
        error = rawip_getpeername((sock_lower_handle_t)connp,
            (struct sockaddr *)&faddr, &faddrlen, CRED());
        /* A failed peer-name lookup is reported as a zero-length address. */
        if (error != 0)
                faddrlen = 0;
        opts = 0;
        if (connp->conn_dgram_errind)
                opts |= SO_DGRAM_ERRIND;
        if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
                opts |= SO_DONTROUTE;

        mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
            (struct sockaddr *)&laddr, laddrlen,
            (struct sockaddr *)&faddr, faddrlen, opts);

        /*
         * Attempts to send data up during fallback will result in it being
         * queued in icmp_t. Now we push up any queued packets.
         */
        mutex_enter(&icmp->icmp_recv_lock);
        /* What the callback returned goes ahead of the queued messages. */
        if (mp != NULL) {
                mp->b_next = icmp->icmp_fallback_queue_head;
                icmp->icmp_fallback_queue_head = mp;
        }
        while (icmp->icmp_fallback_queue_head != NULL) {
                mp = icmp->icmp_fallback_queue_head;
                icmp->icmp_fallback_queue_head = mp->b_next;
                mp->b_next = NULL;
                /* icmp_recv_lock is dropped around each putnext(). */
                mutex_exit(&icmp->icmp_recv_lock);
                putnext(RD(q), mp);
                mutex_enter(&icmp->icmp_recv_lock);
        }
        /* The head is NULL once the loop exits, so this clears the tail. */
        icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head;

        /*
         * No longer a STREAMS-less socket.  The flag is cleared while
         * icmp_recv_lock is still held.
         */
        mutex_enter(&connp->conn_lock);
        connp->conn_flags &= ~IPCL_NONSTR;
        mutex_exit(&connp->conn_lock);

        mutex_exit(&icmp->icmp_recv_lock);

        ASSERT(icmp->icmp_fallback_queue_head == NULL &&
            icmp->icmp_fallback_queue_tail == NULL);

        ASSERT(connp->conn_ref >= 1);

        return (0);
}
5332 
5333 /* ARGSUSED2 */
5334 sock_lower_handle_t
5335 rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
5336     uint_t *smodep, int *errorp, int flags, cred_t *credp)
5337 {
5338         conn_t *connp;
5339 
5340         if (type != SOCK_RAW || (family != AF_INET && family != AF_INET6)) {
5341                 *errorp = EPROTONOSUPPORT;
5342                 return (NULL);
5343         }
5344 
5345         connp = rawip_do_open(family, credp, errorp, flags);
5346         if (connp != NULL) {
5347                 connp->conn_flags |= IPCL_NONSTR;
5348 
5349                 mutex_enter(&connp->conn_lock);
5350                 connp->conn_state_flags &= ~CONN_INCIPIENT;
5351                 mutex_exit(&connp->conn_lock);
5352                 *sock_downcalls = &sock_rawip_downcalls;
5353                 *smodep = SM_ATOMIC;
5354         } else {
5355                 ASSERT(*errorp != 0);
5356         }
5357 
5358         return ((sock_lower_handle_t)connp);
5359 }
5360 
5361 /* ARGSUSED3 */
5362 void
5363 rawip_activate(sock_lower_handle_t proto_handle,
5364     sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags,
5365     cred_t *cr)
5366 {
5367         conn_t                  *connp = (conn_t *)proto_handle;
5368         struct sock_proto_props sopp;
5369 
5370         /* All Solaris components should pass a cred for this operation. */
5371         ASSERT(cr != NULL);
5372 
5373         connp->conn_upcalls = sock_upcalls;
5374         connp->conn_upper_handle = sock_handle;
5375 
5376         sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
5377             SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ;
5378         sopp.sopp_wroff = connp->conn_wroff;
5379         sopp.sopp_rxhiwat = connp->conn_rcvbuf;
5380         sopp.sopp_rxlowat = connp->conn_rcvlowat;
5381         sopp.sopp_maxblk = INFPSZ;
5382         sopp.sopp_maxpsz = IP_MAXPACKET;
5383         sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 0 :
5384             icmp_mod_info.mi_minpsz;
5385 
5386         (*connp->conn_upcalls->su_set_proto_props)
5387             (connp->conn_upper_handle, &sopp);
5388 
5389         icmp_bind_proto(connp->conn_icmp);
5390 }
5391 
5392 /* ARGSUSED3 */
5393 int
5394 rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5395     socklen_t *salenp, cred_t *cr)
5396 {
5397         conn_t  *connp = (conn_t *)proto_handle;
5398         icmp_t  *icmp = connp->conn_icmp;
5399         int     error;
5400 
5401         /* All Solaris components should pass a cred for this operation. */
5402         ASSERT(cr != NULL);
5403 
5404         mutex_enter(&connp->conn_lock);
5405         if (icmp->icmp_state != TS_DATA_XFER)
5406                 error = ENOTCONN;
5407         else
5408                 error = conn_getpeername(connp, sa, salenp);
5409         mutex_exit(&connp->conn_lock);
5410         return (error);
5411 }
5412 
5413 /* ARGSUSED3 */
5414 int
5415 rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5416     socklen_t *salenp, cred_t *cr)
5417 {
5418         conn_t  *connp = (conn_t *)proto_handle;
5419         int     error;
5420 
5421         /* All Solaris components should pass a cred for this operation. */
5422         ASSERT(cr != NULL);
5423 
5424         mutex_enter(&connp->conn_lock);
5425         error = conn_getsockname(connp, sa, salenp);
5426         mutex_exit(&connp->conn_lock);
5427         return (error);
5428 }
5429 
5430 int
5431 rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
5432     const void *optvalp, socklen_t optlen, cred_t *cr)
5433 {
5434         conn_t  *connp = (conn_t *)proto_handle;
5435         int error;
5436 
5437         /* All Solaris components should pass a cred for this operation. */
5438         ASSERT(cr != NULL);
5439 
5440         error = proto_opt_check(level, option_name, optlen, NULL,
5441             icmp_opt_obj.odb_opt_des_arr,
5442             icmp_opt_obj.odb_opt_arr_cnt,
5443             B_TRUE, B_FALSE, cr);
5444 
5445         if (error != 0) {
5446                 /*
5447                  * option not recognized
5448                  */
5449                 if (error < 0) {
5450                         error = proto_tlitosyserr(-error);
5451                 }
5452                 return (error);
5453         }
5454 
5455         error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level,
5456             option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen,
5457             (uchar_t *)optvalp, NULL, cr);
5458 
5459         ASSERT(error >= 0);
5460 
5461         return (error);
5462 }
5463 
5464 int
5465 rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
5466     void *optvalp, socklen_t *optlen, cred_t *cr)
5467 {
5468         int             error;
5469         conn_t          *connp = (conn_t *)proto_handle;
5470         t_uscalar_t     max_optbuf_len;
5471         void            *optvalp_buf;
5472         int             len;
5473 
5474         /* All Solaris components should pass a cred for this operation. */
5475         ASSERT(cr != NULL);
5476 
5477         error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
5478             icmp_opt_obj.odb_opt_des_arr,
5479             icmp_opt_obj.odb_opt_arr_cnt,
5480             B_FALSE, B_TRUE, cr);
5481 
5482         if (error != 0) {
5483                 if (error < 0) {
5484                         error = proto_tlitosyserr(-error);
5485                 }
5486                 return (error);
5487         }
5488 
5489         optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
5490         len = icmp_opt_get(connp, level, option_name, optvalp_buf);
5491         if (len == -1) {
5492                 kmem_free(optvalp_buf, max_optbuf_len);
5493                 return (EINVAL);
5494         }
5495 
5496         /*
5497          * update optlen and copy option value
5498          */
5499         t_uscalar_t size = MIN(len, *optlen);
5500 
5501         bcopy(optvalp_buf, optvalp, size);
5502         bcopy(&size, optlen, sizeof (size));
5503 
5504         kmem_free(optvalp_buf, max_optbuf_len);
5505         return (0);
5506 }
5507 
5508 /* ARGSUSED1 */
5509 int
5510 rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
5511 {
5512         conn_t  *connp = (conn_t *)proto_handle;
5513 
5514         /* All Solaris components should pass a cred for this operation. */
5515         ASSERT(cr != NULL);
5516 
5517         (void) rawip_do_close(connp);
5518         return (0);
5519 }
5520 
5521 /* ARGSUSED2 */
5522 int
5523 rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
5524 {
5525         conn_t  *connp = (conn_t *)proto_handle;
5526 
5527         /* All Solaris components should pass a cred for this operation. */
5528         ASSERT(cr != NULL);
5529 
5530         /* shut down the send side */
5531         if (how != SHUT_RD)
5532                 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
5533                     SOCK_OPCTL_SHUT_SEND, 0);
5534         /* shut down the recv side */
5535         if (how != SHUT_WR)
5536                 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
5537                     SOCK_OPCTL_SHUT_RECV, 0);
5538         return (0);
5539 }
5540 
5541 void
5542 rawip_clr_flowctrl(sock_lower_handle_t proto_handle)
5543 {
5544         conn_t  *connp = (conn_t *)proto_handle;
5545         icmp_t  *icmp = connp->conn_icmp;
5546 
5547         mutex_enter(&icmp->icmp_recv_lock);
5548         connp->conn_flow_cntrld = B_FALSE;
5549         mutex_exit(&icmp->icmp_recv_lock);
5550 }
5551 
5552 int
5553 rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
5554     int mode, int32_t *rvalp, cred_t *cr)
5555 {
5556         conn_t          *connp = (conn_t *)proto_handle;
5557         int             error;
5558 
5559         /* All Solaris components should pass a cred for this operation. */
5560         ASSERT(cr != NULL);
5561 
5562         /*
5563          * If we don't have a helper stream then create one.
5564          * ip_create_helper_stream takes care of locking the conn_t,
5565          * so this check for NULL is just a performance optimization.
5566          */
5567         if (connp->conn_helper_info == NULL) {
5568                 icmp_stack_t *is = connp->conn_icmp->icmp_is;
5569 
5570                 ASSERT(is->is_ldi_ident != NULL);
5571 
5572                 /*
5573                  * Create a helper stream for non-STREAMS socket.
5574                  */
5575                 error = ip_create_helper_stream(connp, is->is_ldi_ident);
5576                 if (error != 0) {
5577                         ip0dbg(("rawip_ioctl: create of IP helper stream "
5578                             "failed %d\n", error));
5579                         return (error);
5580                 }
5581         }
5582 
5583         switch (cmd) {
5584         case _SIOCSOCKFALLBACK:
5585         case TI_GETPEERNAME:
5586         case TI_GETMYNAME:
5587 #ifdef DEBUG
5588                 cmn_err(CE_CONT, "icmp_ioctl cmd 0x%x on non streams"
5589                     " socket", cmd);
5590 #endif
5591                 error = EINVAL;
5592                 break;
5593         default:
5594                 /*
5595                  * Pass on to IP using helper stream
5596                  */
5597                 error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
5598                     cmd, arg, mode, cr, rvalp);
5599                 break;
5600         }
5601         return (error);
5602 }
5603 
5604 int
5605 rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
5606     cred_t *cr)
5607 {
5608         sin6_t          *sin6;
5609         sin_t           *sin = NULL;
5610         uint_t          srcid;
5611         conn_t          *connp = (conn_t *)proto_handle;
5612         icmp_t          *icmp = connp->conn_icmp;
5613         int             error = 0;
5614         icmp_stack_t    *is = icmp->icmp_is;
5615         pid_t           pid = curproc->p_pid;
5616         ip_xmit_attr_t  *ixa;
5617 
5618         ASSERT(DB_TYPE(mp) == M_DATA);
5619 
5620         /* All Solaris components should pass a cred for this operation. */
5621         ASSERT(cr != NULL);
5622 
5623         /* do an implicit bind if necessary */
5624         if (icmp->icmp_state == TS_UNBND) {
5625                 error = rawip_implicit_bind(connp);
5626                 /*
5627                  * We could be racing with an actual bind, in which case
5628                  * we would see EPROTO. We cross our fingers and try
5629                  * to connect.
5630                  */
5631                 if (!(error == 0 || error == EPROTO)) {
5632                         freemsg(mp);
5633                         return (error);
5634                 }
5635         }
5636 
5637         /* Protocol 255 contains full IP headers */
5638         /* Read without holding lock */
5639         if (icmp->icmp_hdrincl) {
5640                 ASSERT(connp->conn_ipversion == IPV4_VERSION);
5641                 if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) {
5642                         if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
5643                                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5644                                 freemsg(mp);
5645                                 return (EINVAL);
5646                         }
5647                 }
5648                 error = icmp_output_hdrincl(connp, mp, cr, pid);
5649                 if (is->is_sendto_ignerr)
5650                         return (0);
5651                 else
5652                         return (error);
5653         }
5654 
5655         /* Connected? */
5656         if (msg->msg_name == NULL) {
5657                 if (icmp->icmp_state != TS_DATA_XFER) {
5658                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5659                         return (EDESTADDRREQ);
5660                 }
5661                 if (msg->msg_controllen != 0) {
5662                         error = icmp_output_ancillary(connp, NULL, NULL, mp,
5663                             NULL, msg, cr, pid);
5664                 } else {
5665                         error = icmp_output_connected(connp, mp, cr, pid);
5666                 }
5667                 if (is->is_sendto_ignerr)
5668                         return (0);
5669                 else
5670                         return (error);
5671         }
5672         if (icmp->icmp_state == TS_DATA_XFER) {
5673                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5674                 return (EISCONN);
5675         }
5676         error = proto_verify_ip_addr(connp->conn_family,
5677             (struct sockaddr *)msg->msg_name, msg->msg_namelen);
5678         if (error != 0) {
5679                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5680                 return (error);
5681         }
5682         switch (connp->conn_family) {
5683         case AF_INET6:
5684                 sin6 = (sin6_t *)msg->msg_name;
5685 
5686                 /* No support for mapped addresses on raw sockets */
5687                 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
5688                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5689                         return (EADDRNOTAVAIL);
5690                 }
5691                 srcid = sin6->__sin6_src_id;
5692 
5693                 /*
5694                  * If the local address is a mapped address return
5695                  * an error.
5696                  * It would be possible to send an IPv6 packet but the
5697                  * response would never make it back to the application
5698                  * since it is bound to a mapped address.
5699                  */
5700                 if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
5701                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5702                         return (EADDRNOTAVAIL);
5703                 }
5704 
5705                 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
5706                         sin6->sin6_addr = ipv6_loopback;
5707 
5708                 /*
5709                  * We have to allocate an ip_xmit_attr_t before we grab
5710                  * conn_lock and we need to hold conn_lock once we've check
5711                  * conn_same_as_last_v6 to handle concurrent send* calls on a
5712                  * socket.
5713                  */
5714                 if (msg->msg_controllen == 0) {
5715                         ixa = conn_get_ixa(connp, B_FALSE);
5716                         if (ixa == NULL) {
5717                                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5718                                 return (ENOMEM);
5719                         }
5720                 } else {
5721                         ixa = NULL;
5722                 }
5723                 mutex_enter(&connp->conn_lock);
5724                 if (icmp->icmp_delayed_error != 0) {
5725                         sin6_t  *sin2 = (sin6_t *)&icmp->icmp_delayed_addr;
5726 
5727                         error = icmp->icmp_delayed_error;
5728                         icmp->icmp_delayed_error = 0;
5729 
5730                         /* Compare IP address and family */
5731 
5732                         if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
5733                             &sin2->sin6_addr) &&
5734                             sin6->sin6_family == sin2->sin6_family) {
5735                                 mutex_exit(&connp->conn_lock);
5736                                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5737                                 if (ixa != NULL)
5738                                         ixa_refrele(ixa);
5739                                 return (error);
5740                         }
5741                 }
5742                 if (msg->msg_controllen != 0) {
5743                         mutex_exit(&connp->conn_lock);
5744                         ASSERT(ixa == NULL);
5745                         error = icmp_output_ancillary(connp, NULL, sin6, mp,
5746                             NULL, msg, cr, pid);
5747                 } else if (conn_same_as_last_v6(connp, sin6) &&
5748                     connp->conn_lastsrcid == srcid &&
5749                     ipsec_outbound_policy_current(ixa)) {
5750                         /* icmp_output_lastdst drops conn_lock */
5751                         error = icmp_output_lastdst(connp, mp, cr, pid, ixa);
5752                 } else {
5753                         /* icmp_output_newdst drops conn_lock */
5754                         error = icmp_output_newdst(connp, mp, NULL, sin6, cr,
5755                             pid, ixa);
5756                 }
5757                 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
5758                 if (is->is_sendto_ignerr)
5759                         return (0);
5760                 else
5761                         return (error);
5762         case AF_INET:
5763                 sin = (sin_t *)msg->msg_name;
5764 
5765                 if (sin->sin_addr.s_addr == INADDR_ANY)
5766                         sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
5767 
5768                 /*
5769                  * We have to allocate an ip_xmit_attr_t before we grab
5770                  * conn_lock and we need to hold conn_lock once we've check
5771                  * conn_same_as_last_v6 to handle concurrent send* on a socket.
5772                  */
5773                 if (msg->msg_controllen == 0) {
5774                         ixa = conn_get_ixa(connp, B_FALSE);
5775                         if (ixa == NULL) {
5776                                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5777                                 return (ENOMEM);
5778                         }
5779                 } else {
5780                         ixa = NULL;
5781                 }
5782                 mutex_enter(&connp->conn_lock);
5783                 if (icmp->icmp_delayed_error != 0) {
5784                         sin_t  *sin2 = (sin_t *)&icmp->icmp_delayed_addr;
5785 
5786                         error = icmp->icmp_delayed_error;
5787                         icmp->icmp_delayed_error = 0;
5788 
5789                         /* Compare IP address */
5790 
5791                         if (sin->sin_addr.s_addr == sin2->sin_addr.s_addr) {
5792                                 mutex_exit(&connp->conn_lock);
5793                                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5794                                 if (ixa != NULL)
5795                                         ixa_refrele(ixa);
5796                                 return (error);
5797                         }
5798                 }
5799 
5800                 if (msg->msg_controllen != 0) {
5801                         mutex_exit(&connp->conn_lock);
5802                         ASSERT(ixa == NULL);
5803                         error = icmp_output_ancillary(connp, sin, NULL, mp,
5804                             NULL, msg, cr, pid);
5805                 } else if (conn_same_as_last_v4(connp, sin) &&
5806                     ipsec_outbound_policy_current(ixa)) {
5807                         /* icmp_output_lastdst drops conn_lock */
5808                         error = icmp_output_lastdst(connp, mp, cr, pid, ixa);
5809                 } else {
5810                         /* icmp_output_newdst drops conn_lock */
5811                         error = icmp_output_newdst(connp, mp, sin, NULL, cr,
5812                             pid, ixa);
5813                 }
5814                 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
5815                 if (is->is_sendto_ignerr)
5816                         return (0);
5817                 else
5818                         return (error);
5819         default:
5820                 return (EINVAL);
5821         }
5822 }
5823 
5824 sock_downcalls_t sock_rawip_downcalls = {
5825         rawip_activate,
5826         rawip_accept,
5827         rawip_bind,
5828         rawip_listen,
5829         rawip_connect,
5830         rawip_getpeername,
5831         rawip_getsockname,
5832         rawip_getsockopt,
5833         rawip_setsockopt,
5834         rawip_send,
5835         NULL,
5836         NULL,
5837         NULL,
5838         rawip_shutdown,
5839         rawip_clr_flowctrl,
5840         rawip_ioctl,
5841         rawip_close
5842 };