1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 /* Copyright (c) 1990 Mentat Inc. */
  25 
  26 #include <sys/types.h>
  27 #include <sys/stream.h>
  28 #include <sys/stropts.h>
  29 #include <sys/strlog.h>
  30 #include <sys/strsun.h>
  31 #define _SUN_TPI_VERSION 2
  32 #include <sys/tihdr.h>
  33 #include <sys/timod.h>
  34 #include <sys/ddi.h>
  35 #include <sys/sunddi.h>
  36 #include <sys/strsubr.h>
  37 #include <sys/suntpi.h>
  38 #include <sys/xti_inet.h>
  39 #include <sys/cmn_err.h>
  40 #include <sys/kmem.h>
  41 #include <sys/cred.h>
  42 #include <sys/policy.h>
  43 #include <sys/priv.h>
  44 #include <sys/ucred.h>
  45 #include <sys/zone.h>
  46 
  47 #include <sys/sockio.h>
  48 #include <sys/socket.h>
  49 #include <sys/socketvar.h>
  50 #include <sys/vtrace.h>
  51 #include <sys/sdt.h>
  52 #include <sys/debug.h>
  53 #include <sys/isa_defs.h>
  54 #include <sys/random.h>
  55 #include <netinet/in.h>
  56 #include <netinet/ip6.h>
  57 #include <netinet/icmp6.h>
  58 #include <netinet/udp.h>
  59 
  60 #include <inet/common.h>
  61 #include <inet/ip.h>
  62 #include <inet/ip_impl.h>
  63 #include <inet/ipsec_impl.h>
  64 #include <inet/ip6.h>
  65 #include <inet/ip_ire.h>
  66 #include <inet/ip_if.h>
  67 #include <inet/ip_multi.h>
  68 #include <inet/ip_ndp.h>
  69 #include <inet/proto_set.h>
  70 #include <inet/mib2.h>
  71 #include <inet/nd.h>
  72 #include <inet/optcom.h>
  73 #include <inet/snmpcom.h>
  74 #include <inet/kstatcom.h>
  75 #include <inet/ipclassifier.h>
  76 
  77 #include <sys/tsol/label.h>
  78 #include <sys/tsol/tnet.h>
  79 
  80 #include <inet/rawip_impl.h>
  81 
  82 #include <sys/disp.h>
  83 
  84 /*
  85  * Synchronization notes:
  86  *
  87  * RAWIP is MT and uses the usual kernel synchronization primitives. We use
  88  * conn_lock to protect the icmp_t.
  89  *
  90  * Plumbing notes:
  91  * ICMP is always a device driver. For compatibility with mibopen() code
  92  * it is possible to I_PUSH "icmp", but that results in pushing a passthrough
  93  * dummy module.
  94  */
  95 
  96 static void     icmp_addr_req(queue_t *q, mblk_t *mp);
  97 static void     icmp_tpi_bind(queue_t *q, mblk_t *mp);
  98 static void     icmp_bind_proto(icmp_t *icmp);
  99 static int      icmp_build_hdr_template(conn_t *, const in6_addr_t *,
 100     const in6_addr_t *, uint32_t);
 101 static void     icmp_capability_req(queue_t *q, mblk_t *mp);
 102 static int      icmp_close(queue_t *q, int flags);
 103 static void     icmp_close_free(conn_t *);
 104 static void     icmp_tpi_connect(queue_t *q, mblk_t *mp);
 105 static void     icmp_tpi_disconnect(queue_t *q, mblk_t *mp);
 106 static void     icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
 107     int sys_error);
 108 static void     icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
 109     t_scalar_t tlierr, int sys_error);
 110 static void     icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2,
 111     ip_recv_attr_t *);
 112 static void     icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp,
 113     ip_recv_attr_t *);
 114 static void     icmp_info_req(queue_t *q, mblk_t *mp);
 115 static void     icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
 116 static conn_t   *icmp_open(int family, cred_t *credp, int *err, int flags);
 117 static int      icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
 118                     cred_t *credp);
 119 static int      icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
 120                     cred_t *credp);
 121 static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
 122 int             icmp_opt_set(conn_t *connp, uint_t optset_context,
 123                     int level, int name, uint_t inlen,
 124                     uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
 125                     void *thisdg_attrs, cred_t *cr);
 126 int             icmp_opt_get(conn_t *connp, int level, int name,
 127                     uchar_t *ptr);
 128 static int      icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin,
 129                     sin6_t *sin6, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa);
 130 static mblk_t   *icmp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *,
 131     const in6_addr_t *, const in6_addr_t *, uint32_t, mblk_t *, int *);
 132 static mblk_t   *icmp_prepend_header_template(conn_t *, ip_xmit_attr_t *,
 133     mblk_t *, const in6_addr_t *, uint32_t, int *);
 134 static int      icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
 135                     uchar_t *ptr, int len);
 136 static void     icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
 137 static void     icmp_tpi_unbind(queue_t *q, mblk_t *mp);
 138 static void     icmp_wput(queue_t *q, mblk_t *mp);
 139 static void     icmp_wput_fallback(queue_t *q, mblk_t *mp);
 140 static void     icmp_wput_other(queue_t *q, mblk_t *mp);
 141 static void     icmp_wput_iocdata(queue_t *q, mblk_t *mp);
 142 static void     icmp_wput_restricted(queue_t *q, mblk_t *mp);
 143 static void     icmp_ulp_recv(conn_t *, mblk_t *, uint_t);
 144 
 145 static void     *rawip_stack_init(netstackid_t stackid, netstack_t *ns);
 146 static void     rawip_stack_fini(netstackid_t stackid, void *arg);
 147 
 148 static void     *rawip_kstat_init(netstackid_t stackid);
 149 static void     rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp);
 150 static int      rawip_kstat_update(kstat_t *kp, int rw);
 151 static void     rawip_stack_shutdown(netstackid_t stackid, void *arg);
 152 
 153 /* Common routines for TPI and socket module */
 154 static conn_t   *rawip_do_open(int, cred_t *, int *, int);
 155 static void     rawip_do_close(conn_t *);
 156 static int      rawip_do_bind(conn_t *, struct sockaddr *, socklen_t);
 157 static int      rawip_do_unbind(conn_t *);
 158 static int      rawip_do_connect(conn_t *, const struct sockaddr *, socklen_t,
 159     cred_t *, pid_t);
 160 
 161 int             rawip_getsockname(sock_lower_handle_t, struct sockaddr *,
 162                     socklen_t *, cred_t *);
 163 int             rawip_getpeername(sock_lower_handle_t, struct sockaddr *,
 164                     socklen_t *, cred_t *);
 165 
/*
 * STREAMS module information referenced by the qinit structures in this
 * file. The initializer is positional; per module_info(9S) the fields are,
 * in order: module id number, module name, minimum packet size, maximum
 * packet size, high-water mark and low-water mark.
 */
static struct module_info icmp_mod_info =  {
        5707, "icmp", 1, INFPSZ, 512, 128
};
 169 
/*
 * Entry points for ICMP as a device.
 * We have separate open functions for the /dev/icmp and /dev/icmp6 devices.
 *
 * These two structures are installed as the read side of the streamtabs
 * icmpinfov4 and icmpinfov6. The initializers are positional; per qinit(9S)
 * the slots are: put, service, open, close, admin, module_info. Only the
 * open and close routines are supplied here, the put and service slots are
 * NULL.
 */
static struct qinit icmprinitv4 = {
        NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info
};

/* Same as icmprinitv4 except that the open routine is icmp_openv6. */
static struct qinit icmprinitv6 = {
        NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info
};
 181 
/*
 * Write-side qinit shared by icmpinfov4 and icmpinfov6: icmp_wput is the put
 * procedure and ip_wsrv the service procedure. The open/close slots are NULL
 * here; those routines are supplied by the read-side structures.
 */
static struct qinit icmpwinit = {
        (pfi_t)icmp_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, &icmp_mod_info
};
 185 
/*
 * ICMP entry point during fallback.
 * Only a put procedure (icmp_wput_fallback) is provided; there is no
 * service procedure.
 * NOTE(review): "fallback" presumably refers to the socket-to-TPI fallback
 * path; the code installing this qinit is not visible here - confirm.
 */
static struct qinit icmp_fallback_sock_winit = {
        (pfi_t)icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info
};
 190 
/*
 * For AF_INET aka /dev/icmp.
 * Pairs the IPv4 read-side qinit with the common write-side qinit.
 */
struct streamtab icmpinfov4 = {
        &icmprinitv4, &icmpwinit
};

/*
 * For AF_INET6 aka /dev/icmp6.
 * Pairs the IPv6 read-side qinit with the common write-side qinit.
 */
struct streamtab icmpinfov6 = {
        &icmprinitv6, &icmpwinit
};
 200 
/*
 * Default structure copied into T_INFO_ACK messages.
 * ADDR_size, OPT_size and CURRENT_state are only placeholders in this
 * template (see the per-field comments); they are expected to be filled in
 * by the code that copies the template.
 */
static struct T_info_ack icmp_g_t_info_ack = {
        T_INFO_ACK,
        IP_MAXPACKET,    /* TSDU_size.  icmp allows maximum size messages. */
        T_INVALID,      /* ETSDU_size.  icmp does not support expedited data. */
        T_INVALID,      /* CDATA_size. icmp does not support connect data. */
        T_INVALID,      /* DDATA_size. icmp does not support disconnect data. */
        0,              /* ADDR_size - filled in later. */
        0,              /* OPT_size - not initialized here */
        IP_MAXPACKET,   /* TIDU_size.  icmp allows maximum size messages. */
        T_CLTS,         /* SERV_type.  icmp supports connection-less. */
        TS_UNBND,       /* CURRENT_state.  This is set from icmp_state. */
        (XPG4_1|SENDZERO) /* PROVIDER_flag */
};
 215 
 216 /*
 217  * All of these are alterable, within the min/max values given, at run time.
 218  *
 * Note: All those tunables which do not start with "_" are Committed and
 * therefore are public. See PSARC 2010/080.
 221  */
/*
 * Table of rawip tunables.
 *
 * Each entry is: property name, owning protocol, setter, getter, current
 * value information and default value. For the uint32 properties the first
 * brace group reads as {min, max, current} and the second as {default};
 * for booleans both groups hold a single value (NOTE(review): inferred from
 * the initializers and the prop_cur_uval/prop_cur_bval accessors below -
 * confirm against the mod_prop_info_t declaration).
 *
 * The position of every entry is significant: the is_* accessor macros that
 * follow the table index it by number, so entries must not be reordered or
 * inserted without updating those macros.
 */
static mod_prop_info_t icmp_propinfo_tbl[] = {
        /* tunable - 0 */
        { "_wroff_extra", MOD_PROTO_RAWIP,
            mod_set_uint32, mod_get_uint32,
            {0, 128, 32}, {32} },

        /* tunable - 1 */
        { "_ipv4_ttl", MOD_PROTO_RAWIP,
            mod_set_uint32, mod_get_uint32,
            {1, 255, 255}, {255} },

        /* tunable - 2 */
        { "_ipv6_hoplimit", MOD_PROTO_RAWIP,
            mod_set_uint32, mod_get_uint32,
            {0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS},
            {IPV6_DEFAULT_HOPS} },

        /* tunable - 3 */
        { "_bsd_compat", MOD_PROTO_RAWIP,
            mod_set_boolean, mod_get_boolean,
            {B_TRUE}, {B_TRUE} },

        /* tunable - 4 (accessed as is_xmit_hiwat) */
        { "send_maxbuf", MOD_PROTO_RAWIP,
            mod_set_uint32, mod_get_uint32,
            {4096, 65536, 8192}, {8192} },

        /* tunable - 5 */
        { "_xmit_lowat", MOD_PROTO_RAWIP,
            mod_set_uint32, mod_get_uint32,
            {0, 65536, 1024}, {1024} },

        /* tunable - 6 (accessed as is_recv_hiwat) */
        { "recv_maxbuf", MOD_PROTO_RAWIP,
            mod_set_uint32, mod_get_uint32,
            {4096, 65536, 8192}, {8192} },

        /* tunable - 7 */
        { "_max_buf", MOD_PROTO_RAWIP,
            mod_set_uint32, mod_get_uint32,
            {65536, 1024*1024*1024, 256*1024}, {256 * 1024} },

        /* tunable - 8 */
        { "_pmtu_discovery", MOD_PROTO_RAWIP,
            mod_set_boolean, mod_get_boolean,
            {B_FALSE}, {B_FALSE} },

        /* tunable - 9 */
        { "_sendto_ignerr", MOD_PROTO_RAWIP,
            mod_set_boolean, mod_get_boolean,
            {B_FALSE}, {B_FALSE} },

        /* "?" has no setter; its getter is mod_get_allprop */
        { "?", MOD_PROTO_RAWIP, NULL, mod_get_allprop, {0}, {0} },

        /* All-NULL terminator entry */
        { NULL, 0, NULL, NULL, {0}, {0} }
};
 269 
/*
 * Shorthand accessors for the current value of each tunable. They expand to
 * a member expression on is_propinfo_tbl, so they are meant to be applied to
 * an object that has that member (e.g. is->is_wroff_extra).
 * NOTE(review): is_propinfo_tbl is presumably the per-stack copy of
 * icmp_propinfo_tbl - confirm against the stack structure declaration.
 *
 * The numeric indices must match the order of the entries in
 * icmp_propinfo_tbl: index 4 is "send_maxbuf" (is_xmit_hiwat) and index 6
 * is "recv_maxbuf" (is_recv_hiwat).
 */
#define is_wroff_extra                  is_propinfo_tbl[0].prop_cur_uval
#define is_ipv4_ttl                     is_propinfo_tbl[1].prop_cur_uval
#define is_ipv6_hoplimit                is_propinfo_tbl[2].prop_cur_uval
#define is_bsd_compat                   is_propinfo_tbl[3].prop_cur_bval
#define is_xmit_hiwat                   is_propinfo_tbl[4].prop_cur_uval
#define is_xmit_lowat                   is_propinfo_tbl[5].prop_cur_uval
#define is_recv_hiwat                   is_propinfo_tbl[6].prop_cur_uval
#define is_max_buf                      is_propinfo_tbl[7].prop_cur_uval
#define is_pmtu_discovery               is_propinfo_tbl[8].prop_cur_bval
#define is_sendto_ignerr                is_propinfo_tbl[9].prop_cur_bval
 280 
/* Convenience name for a pointer to the union of all TPI primitives. */
typedef union T_primitives *t_primp_t;
 282 
 283 /*
 284  * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message
 285  * passed to icmp_wput.
 286  * It calls IP to verify the local IP address, and calls IP to insert
 287  * the conn_t in the fanout table.
 288  * If everything is ok it then sends the T_BIND_ACK back up.
 289  */
 290 static void
 291 icmp_tpi_bind(queue_t *q, mblk_t *mp)
 292 {
 293         int     error;
 294         struct sockaddr *sa;
 295         struct T_bind_req *tbr;
 296         socklen_t       len;
 297         sin_t   *sin;
 298         sin6_t  *sin6;
 299         icmp_t          *icmp;
 300         conn_t  *connp = Q_TO_CONN(q);
 301         mblk_t *mp1;
 302         cred_t *cr;
 303 
 304         /*
 305          * All Solaris components should pass a db_credp
 306          * for this TPI message, hence we ASSERT.
 307          * But in case there is some other M_PROTO that looks
 308          * like a TPI message sent by some other kernel
 309          * component, we check and return an error.
 310          */
 311         cr = msg_getcred(mp, NULL);
 312         ASSERT(cr != NULL);
 313         if (cr == NULL) {
 314                 icmp_err_ack(q, mp, TSYSERR, EINVAL);
 315                 return;
 316         }
 317 
 318         icmp = connp->conn_icmp;
 319         if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
 320                 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
 321                     "icmp_bind: bad req, len %u",
 322                     (uint_t)(mp->b_wptr - mp->b_rptr));
 323                 icmp_err_ack(q, mp, TPROTO, 0);
 324                 return;
 325         }
 326 
 327         if (icmp->icmp_state != TS_UNBND) {
 328                 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
 329                     "icmp_bind: bad state, %u", icmp->icmp_state);
 330                 icmp_err_ack(q, mp, TOUTSTATE, 0);
 331                 return;
 332         }
 333 
 334         /*
 335          * Reallocate the message to make sure we have enough room for an
 336          * address.
 337          */
 338         mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
 339         if (mp1 == NULL) {
 340                 icmp_err_ack(q, mp, TSYSERR, ENOMEM);
 341                 return;
 342         }
 343         mp = mp1;
 344 
 345         /* Reset the message type in preparation for shipping it back. */
 346         DB_TYPE(mp) = M_PCPROTO;
 347         tbr = (struct T_bind_req *)mp->b_rptr;
 348         len = tbr->ADDR_length;
 349         switch (len) {
 350         case 0: /* request for a generic port */
 351                 tbr->ADDR_offset = sizeof (struct T_bind_req);
 352                 if (connp->conn_family == AF_INET) {
 353                         tbr->ADDR_length = sizeof (sin_t);
 354                         sin = (sin_t *)&tbr[1];
 355                         *sin = sin_null;
 356                         sin->sin_family = AF_INET;
 357                         mp->b_wptr = (uchar_t *)&sin[1];
 358                         sa = (struct sockaddr *)sin;
 359                         len = sizeof (sin_t);
 360                 } else {
 361                         ASSERT(connp->conn_family == AF_INET6);
 362                         tbr->ADDR_length = sizeof (sin6_t);
 363                         sin6 = (sin6_t *)&tbr[1];
 364                         *sin6 = sin6_null;
 365                         sin6->sin6_family = AF_INET6;
 366                         mp->b_wptr = (uchar_t *)&sin6[1];
 367                         sa = (struct sockaddr *)sin6;
 368                         len = sizeof (sin6_t);
 369                 }
 370                 break;
 371 
 372         case sizeof (sin_t):    /* Complete IPv4 address */
 373                 sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
 374                     sizeof (sin_t));
 375                 break;
 376 
 377         case sizeof (sin6_t):   /* Complete IPv6 address */
 378                 sa = (struct sockaddr *)mi_offset_param(mp,
 379                     tbr->ADDR_offset, sizeof (sin6_t));
 380                 break;
 381 
 382         default:
 383                 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
 384                     "icmp_bind: bad ADDR_length %u", tbr->ADDR_length);
 385                 icmp_err_ack(q, mp, TBADADDR, 0);
 386                 return;
 387         }
 388 
 389         error = rawip_do_bind(connp, sa, len);
 390         if (error != 0) {
 391                 if (error > 0) {
 392                         icmp_err_ack(q, mp, TSYSERR, error);
 393                 } else {
 394                         icmp_err_ack(q, mp, -error, 0);
 395                 }
 396         } else {
 397                 tbr->PRIM_type = T_BIND_ACK;
 398                 qreply(q, mp);
 399         }
 400 }
 401 
 402 static int
 403 rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len)
 404 {
 405         sin_t           *sin;
 406         sin6_t          *sin6;
 407         icmp_t          *icmp = connp->conn_icmp;
 408         int             error = 0;
 409         ip_laddr_t      laddr_type = IPVL_UNICAST_UP;   /* INADDR_ANY */
 410         in_port_t       lport;          /* Network byte order */
 411         ipaddr_t        v4src;          /* Set if AF_INET */
 412         in6_addr_t      v6src;
 413         uint_t          scopeid = 0;
 414         zoneid_t        zoneid = IPCL_ZONEID(connp);
 415         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
 416 
 417         if (sa == NULL || !OK_32PTR((char *)sa)) {
 418                 return (EINVAL);
 419         }
 420 
 421         switch (len) {
 422         case sizeof (sin_t):    /* Complete IPv4 address */
 423                 sin = (sin_t *)sa;
 424                 if (sin->sin_family != AF_INET ||
 425                     connp->conn_family != AF_INET) {
 426                         /* TSYSERR, EAFNOSUPPORT */
 427                         return (EAFNOSUPPORT);
 428                 }
 429                 v4src = sin->sin_addr.s_addr;
 430                 IN6_IPADDR_TO_V4MAPPED(v4src, &v6src);
 431                 if (v4src != INADDR_ANY) {
 432                         laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst,
 433                             B_TRUE);
 434                 }
 435                 lport = sin->sin_port;
 436                 break;
 437         case sizeof (sin6_t): /* Complete IPv6 address */
 438                 sin6 = (sin6_t *)sa;
 439                 if (sin6->sin6_family != AF_INET6 ||
 440                     connp->conn_family != AF_INET6) {
 441                         /* TSYSERR, EAFNOSUPPORT */
 442                         return (EAFNOSUPPORT);
 443                 }
 444                 /* No support for mapped addresses on raw sockets */
 445                 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 446                         /* TSYSERR, EADDRNOTAVAIL */
 447                         return (EADDRNOTAVAIL);
 448                 }
 449                 v6src = sin6->sin6_addr;
 450                 if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
 451                         if (IN6_IS_ADDR_LINKSCOPE(&v6src))
 452                                 scopeid = sin6->sin6_scope_id;
 453                         laddr_type = ip_laddr_verify_v6(&v6src, zoneid, ipst,
 454                             B_TRUE, scopeid);
 455                 }
 456                 lport = sin6->sin6_port;
 457                 break;
 458 
 459         default:
 460                 /* TBADADDR */
 461                 return (EADDRNOTAVAIL);
 462         }
 463 
 464         /* Is the local address a valid unicast, multicast, or broadcast? */
 465         if (laddr_type == IPVL_BAD)
 466                 return (EADDRNOTAVAIL);
 467 
 468         /*
 469          * The state must be TS_UNBND.
 470          */
 471         mutex_enter(&connp->conn_lock);
 472         if (icmp->icmp_state != TS_UNBND) {
 473                 mutex_exit(&connp->conn_lock);
 474                 return (-TOUTSTATE);
 475         }
 476 
 477         /*
 478          * Copy the source address into our icmp structure.  This address
 479          * may still be zero; if so, ip will fill in the correct address
 480          * each time an outbound packet is passed to it.
 481          * If we are binding to a broadcast or multicast address then
 482          * we just set the conn_bound_addr since we don't want to use
 483          * that as the source address when sending.
 484          */
 485         connp->conn_bound_addr_v6 = v6src;
 486         connp->conn_laddr_v6 = v6src;
 487         if (scopeid != 0) {
 488                 connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET;
 489                 connp->conn_ixa->ixa_scopeid = scopeid;
 490                 connp->conn_incoming_ifindex = scopeid;
 491         } else {
 492                 connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
 493                 connp->conn_incoming_ifindex = connp->conn_bound_if;
 494         }
 495 
 496         switch (laddr_type) {
 497         case IPVL_UNICAST_UP:
 498         case IPVL_UNICAST_DOWN:
 499                 connp->conn_saddr_v6 = v6src;
 500                 connp->conn_mcbc_bind = B_FALSE;
 501                 break;
 502         case IPVL_MCAST:
 503         case IPVL_BCAST:
 504                 /* ip_set_destination will pick a source address later */
 505                 connp->conn_saddr_v6 = ipv6_all_zeros;
 506                 connp->conn_mcbc_bind = B_TRUE;
 507                 break;
 508         }
 509 
 510         /* Any errors after this point should use late_error */
 511 
 512         /*
 513          * Use sin_port/sin6_port since applications like psh use SOCK_RAW
 514          * with IPPROTO_TCP.
 515          */
 516         connp->conn_lport = lport;
 517         connp->conn_fport = 0;
 518 
 519         if (connp->conn_family == AF_INET) {
 520                 ASSERT(connp->conn_ipversion == IPV4_VERSION);
 521         } else {
 522                 ASSERT(connp->conn_ipversion == IPV6_VERSION);
 523         }
 524 
 525         icmp->icmp_state = TS_IDLE;
 526 
 527         /*
 528          * We create an initial header template here to make a subsequent
 529          * sendto have a starting point. Since conn_last_dst is zero the
 530          * first sendto will always follow the 'dst changed' code path.
 531          * Note that we defer massaging options and the related checksum
 532          * adjustment until we have a destination address.
 533          */
 534         error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
 535             &connp->conn_faddr_v6, connp->conn_flowinfo);
 536         if (error != 0) {
 537                 mutex_exit(&connp->conn_lock);
 538                 goto late_error;
 539         }
 540         /* Just in case */
 541         connp->conn_faddr_v6 = ipv6_all_zeros;
 542         connp->conn_v6lastdst = ipv6_all_zeros;
 543         mutex_exit(&connp->conn_lock);
 544 
 545         error = ip_laddr_fanout_insert(connp);
 546         if (error != 0)
 547                 goto late_error;
 548 
 549         /* Bind succeeded */
 550         return (0);
 551 
 552 late_error:
 553         mutex_enter(&connp->conn_lock);
 554         connp->conn_saddr_v6 = ipv6_all_zeros;
 555         connp->conn_bound_addr_v6 = ipv6_all_zeros;
 556         connp->conn_laddr_v6 = ipv6_all_zeros;
 557         if (scopeid != 0) {
 558                 connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
 559                 connp->conn_incoming_ifindex = connp->conn_bound_if;
 560         }
 561         icmp->icmp_state = TS_UNBND;
 562         connp->conn_v6lastdst = ipv6_all_zeros;
 563         connp->conn_lport = 0;
 564 
 565         /* Restore the header that was built above - different source address */
 566         (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
 567             &connp->conn_faddr_v6, connp->conn_flowinfo);
 568         mutex_exit(&connp->conn_lock);
 569         return (error);
 570 }
 571 
 572 /*
 573  * Tell IP to just bind to the protocol.
 574  */
 575 static void
 576 icmp_bind_proto(icmp_t *icmp)
 577 {
 578         conn_t  *connp = icmp->icmp_connp;
 579 
 580         mutex_enter(&connp->conn_lock);
 581         connp->conn_saddr_v6 = ipv6_all_zeros;
 582         connp->conn_laddr_v6 = ipv6_all_zeros;
 583         connp->conn_faddr_v6 = ipv6_all_zeros;
 584         connp->conn_v6lastdst = ipv6_all_zeros;
 585         mutex_exit(&connp->conn_lock);
 586 
 587         (void) ip_laddr_fanout_insert(connp);
 588 }
 589 
 590 /*
 591  * This routine handles each T_CONN_REQ message passed to icmp.  It
 592  * associates a default destination address with the stream.
 593  *
 594  * After various error checks are completed, icmp_connect() lays
 595  * the target address and port into the composite header template.
 596  * Then we ask IP for information, including a source address if we didn't
 597  * already have one. Finally we send up the T_OK_ACK reply message.
 598  */
 599 static void
 600 icmp_tpi_connect(queue_t *q, mblk_t *mp)
 601 {
 602         conn_t  *connp = Q_TO_CONN(q);
 603         struct T_conn_req       *tcr;
 604         struct sockaddr *sa;
 605         socklen_t len;
 606         int error;
 607         cred_t *cr;
 608         pid_t pid;
 609         /*
 610          * All Solaris components should pass a db_credp
 611          * for this TPI message, hence we ASSERT.
 612          * But in case there is some other M_PROTO that looks
 613          * like a TPI message sent by some other kernel
 614          * component, we check and return an error.
 615          */
 616         cr = msg_getcred(mp, &pid);
 617         ASSERT(cr != NULL);
 618         if (cr == NULL) {
 619                 icmp_err_ack(q, mp, TSYSERR, EINVAL);
 620                 return;
 621         }
 622 
 623         tcr = (struct T_conn_req *)mp->b_rptr;
 624         /* Sanity checks */
 625         if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) {
 626                 icmp_err_ack(q, mp, TPROTO, 0);
 627                 return;
 628         }
 629 
 630         if (tcr->OPT_length != 0) {
 631                 icmp_err_ack(q, mp, TBADOPT, 0);
 632                 return;
 633         }
 634 
 635         len = tcr->DEST_length;
 636 
 637         switch (len) {
 638         default:
 639                 icmp_err_ack(q, mp, TBADADDR, 0);
 640                 return;
 641         case sizeof (sin_t):
 642                 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
 643                     sizeof (sin_t));
 644                 break;
 645         case sizeof (sin6_t):
 646                 sa = (struct sockaddr *)mi_offset_param(mp,
 647                     tcr->DEST_offset, sizeof (sin6_t));
 648                 break;
 649         }
 650 
 651         error = proto_verify_ip_addr(connp->conn_family, sa, len);
 652         if (error != 0) {
 653                 icmp_err_ack(q, mp, TSYSERR, error);
 654                 return;
 655         }
 656 
 657         error = rawip_do_connect(connp, sa, len, cr, pid);
 658         if (error != 0) {
 659                 if (error < 0) {
 660                         icmp_err_ack(q, mp, -error, 0);
 661                 } else {
 662                         icmp_err_ack(q, mp, 0, error);
 663                 }
 664         } else {
 665                 mblk_t *mp1;
 666 
 667                 /*
 668                  * We have to send a connection confirmation to
 669                  * keep TLI happy.
 670                  */
 671                 if (connp->conn_family == AF_INET) {
 672                         mp1 = mi_tpi_conn_con(NULL, (char *)sa,
 673                             sizeof (sin_t), NULL, 0);
 674                 } else {
 675                         ASSERT(connp->conn_family == AF_INET6);
 676                         mp1 = mi_tpi_conn_con(NULL, (char *)sa,
 677                             sizeof (sin6_t), NULL, 0);
 678                 }
 679                 if (mp1 == NULL) {
 680                         icmp_err_ack(q, mp, TSYSERR, ENOMEM);
 681                         return;
 682                 }
 683 
 684                 /*
 685                  * Send ok_ack for T_CONN_REQ
 686                  */
 687                 mp = mi_tpi_ok_ack_alloc(mp);
 688                 if (mp == NULL) {
 689                         /* Unable to reuse the T_CONN_REQ for the ack. */
 690                         icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM);
 691                         return;
 692                 }
 693                 putnext(connp->conn_rq, mp);
 694                 putnext(connp->conn_rq, mp1);
 695         }
 696 }
 697 
 698 static int
 699 rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
 700     cred_t *cr, pid_t pid)
 701 {
 702         icmp_t          *icmp;
 703         sin_t           *sin;
 704         sin6_t          *sin6;
 705         int             error;
 706         uint16_t        dstport;
 707         ipaddr_t        v4dst;
 708         in6_addr_t      v6dst;
 709         uint32_t        flowinfo;
 710         ip_xmit_attr_t  *ixa;
 711         ip_xmit_attr_t  *oldixa;
 712         uint_t          scopeid = 0;
 713         uint_t          srcid = 0;
 714         in6_addr_t      v6src = connp->conn_saddr_v6;
 715 
 716         icmp = connp->conn_icmp;
 717 
 718         if (sa == NULL || !OK_32PTR((char *)sa)) {
 719                 return (EINVAL);
 720         }
 721 
 722         ASSERT(sa != NULL && len != 0);
 723 
 724         /*
 725          * Determine packet type based on type of address passed in
 726          * the request should contain an IPv4 or IPv6 address.
 727          * Make sure that address family matches the type of
 728          * family of the address passed down.
 729          */
 730         switch (len) {
 731         case sizeof (sin_t):
 732                 sin = (sin_t *)sa;
 733 
 734                 v4dst = sin->sin_addr.s_addr;
 735                 dstport = sin->sin_port;
 736                 IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
 737                 ASSERT(connp->conn_ipversion == IPV4_VERSION);
 738                 break;
 739 
 740         case sizeof (sin6_t):
 741                 sin6 = (sin6_t *)sa;
 742 
 743                 /* No support for mapped addresses on raw sockets */
 744                 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 745                         return (EADDRNOTAVAIL);
 746                 }
 747                 v6dst = sin6->sin6_addr;
 748                 dstport = sin6->sin6_port;
 749                 ASSERT(connp->conn_ipversion == IPV6_VERSION);
 750                 flowinfo = sin6->sin6_flowinfo;
 751                 if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))
 752                         scopeid = sin6->sin6_scope_id;
 753                 srcid = sin6->__sin6_src_id;
 754                 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
 755                         ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
 756                             connp->conn_netstack);
 757                 }
 758                 break;
 759         }
 760 
 761         /*
 762          * If there is a different thread using conn_ixa then we get a new
 763          * copy and cut the old one loose from conn_ixa. Otherwise we use
 764          * conn_ixa and prevent any other thread from using/changing it.
 765          * Once connect() is done other threads can use conn_ixa since the
 766          * refcnt will be back at one.
 767          * We defer updating conn_ixa until later to handle any concurrent
 768          * conn_ixa_cleanup thread.
 769          */
 770         ixa = conn_get_ixa(connp, B_FALSE);
 771         if (ixa == NULL)
 772                 return (ENOMEM);
 773 
 774         mutex_enter(&connp->conn_lock);
 775         /*
 776          * This icmp_t must have bound already before doing a connect.
 777          * Reject if a connect is in progress (we drop conn_lock during
 778          * rawip_do_connect).
 779          */
 780         if (icmp->icmp_state == TS_UNBND || icmp->icmp_state == TS_WCON_CREQ) {
 781                 mutex_exit(&connp->conn_lock);
 782                 ixa_refrele(ixa);
 783                 return (-TOUTSTATE);
 784         }
 785 
 786         if (icmp->icmp_state == TS_DATA_XFER) {
 787                 /* Already connected - clear out state */
 788                 if (connp->conn_mcbc_bind)
 789                         connp->conn_saddr_v6 = ipv6_all_zeros;
 790                 else
 791                         connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
 792                 connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
 793                 connp->conn_faddr_v6 = ipv6_all_zeros;
 794                 icmp->icmp_state = TS_IDLE;
 795         }
 796 
 797         /*
 798          * Use sin_port/sin6_port since applications like psh use SOCK_RAW
 799          * with IPPROTO_TCP.
 800          */
 801         connp->conn_fport = dstport;
 802         if (connp->conn_ipversion == IPV4_VERSION) {
 803                 /*
 804                  * Interpret a zero destination to mean loopback.
 805                  * Update the T_CONN_REQ (sin/sin6) since it is used to
 806                  * generate the T_CONN_CON.
 807                  */
 808                 if (v4dst == INADDR_ANY) {
 809                         v4dst = htonl(INADDR_LOOPBACK);
 810                         IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
 811                         ASSERT(connp->conn_family == AF_INET);
 812                         sin->sin_addr.s_addr = v4dst;
 813                 }
 814                 connp->conn_faddr_v6 = v6dst;
 815                 connp->conn_flowinfo = 0;
 816         } else {
 817                 ASSERT(connp->conn_ipversion == IPV6_VERSION);
 818                 /*
 819                  * Interpret a zero destination to mean loopback.
 820                  * Update the T_CONN_REQ (sin/sin6) since it is used to
 821                  * generate the T_CONN_CON.
 822                  */
 823                 if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) {
 824                         v6dst = ipv6_loopback;
 825                         sin6->sin6_addr = v6dst;
 826                 }
 827                 connp->conn_faddr_v6 = v6dst;
 828                 connp->conn_flowinfo = flowinfo;
 829         }
 830 
 831         /*
 832          * We update our cred/cpid based on the caller of connect
 833          */
 834         if (connp->conn_cred != cr) {
 835                 crhold(cr);
 836                 crfree(connp->conn_cred);
 837                 connp->conn_cred = cr;
 838         }
 839         connp->conn_cpid = pid;
 840         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
 841         ixa->ixa_cred = cr;
 842         ixa->ixa_cpid = pid;
 843         if (is_system_labeled()) {
 844                 /* We need to restart with a label based on the cred */
 845                 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
 846         }
 847 
 848         if (scopeid != 0) {
 849                 ixa->ixa_flags |= IXAF_SCOPEID_SET;
 850                 ixa->ixa_scopeid = scopeid;
 851                 connp->conn_incoming_ifindex = scopeid;
 852         } else {
 853                 ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
 854                 connp->conn_incoming_ifindex = connp->conn_bound_if;
 855         }
 856 
 857         /*
 858          * conn_connect will drop conn_lock and reacquire it.
 859          * To prevent a send* from messing with this icmp_t while the lock
 860          * is dropped we set icmp_state and clear conn_v6lastdst.
 861          * That will make all send* fail with EISCONN.
 862          */
 863         connp->conn_v6lastdst = ipv6_all_zeros;
 864         icmp->icmp_state = TS_WCON_CREQ;
 865 
 866         error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC);
 867         mutex_exit(&connp->conn_lock);
 868         if (error != 0)
 869                 goto connect_failed;
 870 
 871         /*
 872          * The addresses have been verified. Time to insert in
 873          * the correct fanout list.
 874          */
 875         error = ipcl_conn_insert(connp);
 876         if (error != 0)
 877                 goto connect_failed;
 878 
 879         mutex_enter(&connp->conn_lock);
 880         error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
 881             &connp->conn_faddr_v6, connp->conn_flowinfo);
 882         if (error != 0) {
 883                 mutex_exit(&connp->conn_lock);
 884                 goto connect_failed;
 885         }
 886 
 887         icmp->icmp_state = TS_DATA_XFER;
 888         /* Record this as the "last" send even though we haven't sent any */
 889         connp->conn_v6lastdst = connp->conn_faddr_v6;
 890         connp->conn_lastipversion = connp->conn_ipversion;
 891         connp->conn_lastdstport = connp->conn_fport;
 892         connp->conn_lastflowinfo = connp->conn_flowinfo;
 893         connp->conn_lastscopeid = scopeid;
 894         connp->conn_lastsrcid = srcid;
 895         /* Also remember a source to use together with lastdst */
 896         connp->conn_v6lastsrc = v6src;
 897 
 898         oldixa = conn_replace_ixa(connp, ixa);
 899         mutex_exit(&connp->conn_lock);
 900         ixa_refrele(oldixa);
 901 
 902         ixa_refrele(ixa);
 903         return (0);
 904 
 905 connect_failed:
 906         if (ixa != NULL)
 907                 ixa_refrele(ixa);
 908         mutex_enter(&connp->conn_lock);
 909         icmp->icmp_state = TS_IDLE;
 910         /* In case the source address was set above */
 911         if (connp->conn_mcbc_bind)
 912                 connp->conn_saddr_v6 = ipv6_all_zeros;
 913         else
 914                 connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
 915         connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
 916         connp->conn_faddr_v6 = ipv6_all_zeros;
 917         connp->conn_v6lastdst = ipv6_all_zeros;
 918         connp->conn_flowinfo = 0;
 919 
 920         (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
 921             &connp->conn_faddr_v6, connp->conn_flowinfo);
 922         mutex_exit(&connp->conn_lock);
 923         return (error);
 924 }
 925 
 926 static void
 927 rawip_do_close(conn_t *connp)
 928 {
 929         ASSERT(connp != NULL && IPCL_IS_RAWIP(connp));
 930 
 931         ip_quiesce_conn(connp);
 932 
 933         if (!IPCL_IS_NONSTR(connp)) {
 934                 qprocsoff(connp->conn_rq);
 935         }
 936 
 937         icmp_close_free(connp);
 938 
 939         /*
 940          * Now we are truly single threaded on this stream, and can
 941          * delete the things hanging off the connp, and finally the connp.
 942          * We removed this connp from the fanout list, it cannot be
 943          * accessed thru the fanouts, and we already waited for the
 944          * conn_ref to drop to 0. We are already in close, so
 945          * there cannot be any other thread from the top. qprocsoff
 946          * has completed, and service has completed or won't run in
 947          * future.
 948          */
 949         ASSERT(connp->conn_ref == 1);
 950 
 951         if (!IPCL_IS_NONSTR(connp)) {
 952                 inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
 953         } else {
 954                 ip_free_helper_stream(connp);
 955         }
 956 
 957         connp->conn_ref--;
 958         ipcl_conn_destroy(connp);
 959 }
 960 
 961 static int
 962 icmp_close(queue_t *q, int flags)
 963 {
 964         conn_t  *connp;
 965 
 966         if (flags & SO_FALLBACK) {
 967                 /*
 968                  * stream is being closed while in fallback
 969                  * simply free the resources that were allocated
 970                  */
 971                 inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
 972                 qprocsoff(q);
 973                 goto done;
 974         }
 975 
 976         connp = Q_TO_CONN(q);
 977         (void) rawip_do_close(connp);
 978 done:
 979         q->q_ptr = WR(q)->q_ptr = NULL;
 980         return (0);
 981 }
 982 
 983 static void
 984 icmp_close_free(conn_t *connp)
 985 {
 986         icmp_t *icmp = connp->conn_icmp;
 987 
 988         if (icmp->icmp_filter != NULL) {
 989                 kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t));
 990                 icmp->icmp_filter = NULL;
 991         }
 992 
 993         /*
 994          * Clear any fields which the kmem_cache constructor clears.
 995          * Only icmp_connp needs to be preserved.
 996          * TBD: We should make this more efficient to avoid clearing
 997          * everything.
 998          */
 999         ASSERT(icmp->icmp_connp == connp);
1000         bzero(icmp, sizeof (icmp_t));
1001         icmp->icmp_connp = connp;
1002 }
1003 
1004 /*
1005  * This routine handles each T_DISCON_REQ message passed to icmp
1006  * as an indicating that ICMP is no longer connected. This results
1007  * in telling IP to restore the binding to just the local address.
1008  */
1009 static int
1010 icmp_do_disconnect(conn_t *connp)
1011 {
1012         icmp_t  *icmp = connp->conn_icmp;
1013         int     error;
1014 
1015         mutex_enter(&connp->conn_lock);
1016         if (icmp->icmp_state != TS_DATA_XFER) {
1017                 mutex_exit(&connp->conn_lock);
1018                 return (-TOUTSTATE);
1019         }
1020         if (connp->conn_mcbc_bind)
1021                 connp->conn_saddr_v6 = ipv6_all_zeros;
1022         else
1023                 connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
1024         connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
1025         connp->conn_faddr_v6 = ipv6_all_zeros;
1026         icmp->icmp_state = TS_IDLE;
1027 
1028         connp->conn_v6lastdst = ipv6_all_zeros;
1029         error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
1030             &connp->conn_faddr_v6, connp->conn_flowinfo);
1031         mutex_exit(&connp->conn_lock);
1032         if (error != 0)
1033                 return (error);
1034 
1035         /*
1036          * Tell IP to remove the full binding and revert
1037          * to the local address binding.
1038          */
1039         return (ip_laddr_fanout_insert(connp));
1040 }
1041 
1042 static void
1043 icmp_tpi_disconnect(queue_t *q, mblk_t *mp)
1044 {
1045         conn_t  *connp = Q_TO_CONN(q);
1046         int     error;
1047 
1048         /*
1049          * Allocate the largest primitive we need to send back
1050          * T_error_ack is > than T_ok_ack
1051          */
1052         mp = reallocb(mp, sizeof (struct T_error_ack), 1);
1053         if (mp == NULL) {
1054                 /* Unable to reuse the T_DISCON_REQ for the ack. */
1055                 icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM);
1056                 return;
1057         }
1058 
1059         error = icmp_do_disconnect(connp);
1060 
1061         if (error != 0) {
1062                 if (error > 0) {
1063                         icmp_err_ack(q, mp, 0, error);
1064                 } else {
1065                         icmp_err_ack(q, mp, -error, 0);
1066                 }
1067         } else {
1068                 mp = mi_tpi_ok_ack_alloc(mp);
1069                 ASSERT(mp != NULL);
1070                 qreply(q, mp);
1071         }
1072 }
1073 
1074 static int
1075 icmp_disconnect(conn_t *connp)
1076 {
1077         int     error;
1078 
1079         connp->conn_dgram_errind = B_FALSE;
1080 
1081         error = icmp_do_disconnect(connp);
1082 
1083         if (error < 0)
1084                 error = proto_tlitosyserr(-error);
1085         return (error);
1086 }
1087 
1088 /* This routine creates a T_ERROR_ACK message and passes it upstream. */
1089 static void
1090 icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
1091 {
1092         if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
1093                 qreply(q, mp);
1094 }
1095 
1096 /* Shorthand to generate and send TPI error acks to our client */
1097 static void
1098 icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
1099     t_scalar_t t_error, int sys_error)
1100 {
1101         struct T_error_ack      *teackp;
1102 
1103         if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
1104             M_PCPROTO, T_ERROR_ACK)) != NULL) {
1105                 teackp = (struct T_error_ack *)mp->b_rptr;
1106                 teackp->ERROR_prim = primitive;
1107                 teackp->TLI_error = t_error;
1108                 teackp->UNIX_error = sys_error;
1109                 qreply(q, mp);
1110         }
1111 }
1112 
1113 /*
1114  * icmp_icmp_input is called as conn_recvicmp to process ICMP messages.
1115  * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
1116  * Assumes that IP has pulled up everything up to and including the ICMP header.
1117  */
1118 /* ARGSUSED2 */
1119 static void
1120 icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
1121 {
1122         conn_t          *connp = (conn_t *)arg1;
1123         icmp_t          *icmp = connp->conn_icmp;
1124         icmph_t         *icmph;
1125         ipha_t          *ipha;
1126         int             iph_hdr_length;
1127         sin_t           sin;
1128         mblk_t          *mp1;
1129         int             error = 0;
1130 
1131         ipha = (ipha_t *)mp->b_rptr;
1132 
1133         ASSERT(OK_32PTR(mp->b_rptr));
1134 
1135         if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
1136                 ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
1137                 icmp_icmp_error_ipv6(connp, mp, ira);
1138                 return;
1139         }
1140         ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
1141 
1142         /* Skip past the outer IP and ICMP headers */
1143         ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length);
1144         iph_hdr_length = ira->ira_ip_hdr_length;
1145         icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
1146         ipha = (ipha_t *)&icmph[1]; /* Inner IP header */
1147 
1148         iph_hdr_length = IPH_HDR_LENGTH(ipha);
1149 
1150         switch (icmph->icmph_type) {
1151         case ICMP_DEST_UNREACHABLE:
1152                 switch (icmph->icmph_code) {
1153                 case ICMP_FRAGMENTATION_NEEDED: {
1154                         ipha_t          *ipha;
1155                         ip_xmit_attr_t  *ixa;
1156                         /*
1157                          * IP has already adjusted the path MTU.
1158                          * But we need to adjust DF for IPv4.
1159                          */
1160                         if (connp->conn_ipversion != IPV4_VERSION)
1161                                 break;
1162 
1163                         ixa = conn_get_ixa(connp, B_FALSE);
1164                         if (ixa == NULL || ixa->ixa_ire == NULL) {
1165                                 /*
1166                                  * Some other thread holds conn_ixa. We will
1167                                  * redo this on the next ICMP too big.
1168                                  */
1169                                 if (ixa != NULL)
1170                                         ixa_refrele(ixa);
1171                                 break;
1172                         }
1173                         (void) ip_get_pmtu(ixa);
1174 
1175                         mutex_enter(&connp->conn_lock);
1176                         ipha = (ipha_t *)connp->conn_ht_iphc;
1177                         if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
1178                                 ipha->ipha_fragment_offset_and_flags |=
1179                                     IPH_DF_HTONS;
1180                         } else {
1181                                 ipha->ipha_fragment_offset_and_flags &=
1182                                     ~IPH_DF_HTONS;
1183                         }
1184                         mutex_exit(&connp->conn_lock);
1185                         ixa_refrele(ixa);
1186                         break;
1187                 }
1188                 case ICMP_PORT_UNREACHABLE:
1189                 case ICMP_PROTOCOL_UNREACHABLE:
1190                         error = ECONNREFUSED;
1191                         break;
1192                 default:
1193                         /* Transient errors */
1194                         break;
1195                 }
1196                 break;
1197         default:
1198                 /* Transient errors */
1199                 break;
1200         }
1201         if (error == 0) {
1202                 freemsg(mp);
1203                 return;
1204         }
1205 
1206         /*
1207          * Deliver T_UDERROR_IND when the application has asked for it.
1208          * The socket layer enables this automatically when connected.
1209          */
1210         if (!connp->conn_dgram_errind) {
1211                 freemsg(mp);
1212                 return;
1213         }
1214 
1215         sin = sin_null;
1216         sin.sin_family = AF_INET;
1217         sin.sin_addr.s_addr = ipha->ipha_dst;
1218 
1219         if (IPCL_IS_NONSTR(connp)) {
1220                 mutex_enter(&connp->conn_lock);
1221                 if (icmp->icmp_state == TS_DATA_XFER) {
1222                         if (sin.sin_addr.s_addr == connp->conn_faddr_v4) {
1223                                 mutex_exit(&connp->conn_lock);
1224                                 (*connp->conn_upcalls->su_set_error)
1225                                     (connp->conn_upper_handle, error);
1226                                 goto done;
1227                         }
1228                 } else {
1229                         icmp->icmp_delayed_error = error;
1230                         *((sin_t *)&icmp->icmp_delayed_addr) = sin;
1231                 }
1232                 mutex_exit(&connp->conn_lock);
1233         } else {
1234                 mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0,
1235                     error);
1236                 if (mp1 != NULL)
1237                         putnext(connp->conn_rq, mp1);
1238         }
1239 done:
1240         freemsg(mp);
1241 }
1242 
1243 /*
1244  * icmp_icmp_error_ipv6 is called by icmp_icmp_error to process ICMP for IPv6.
1245  * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
1246  * Assumes that IP has pulled up all the extension headers as well as the
1247  * ICMPv6 header.
1248  */
1249 static void
1250 icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira)
1251 {
1252         icmp6_t         *icmp6;
1253         ip6_t           *ip6h, *outer_ip6h;
1254         uint16_t        iph_hdr_length;
1255         uint8_t         *nexthdrp;
1256         sin6_t          sin6;
1257         mblk_t          *mp1;
1258         int             error = 0;
1259         icmp_t          *icmp = connp->conn_icmp;
1260 
1261         outer_ip6h = (ip6_t *)mp->b_rptr;
1262 #ifdef DEBUG
1263         if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6)
1264                 iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h);
1265         else
1266                 iph_hdr_length = IPV6_HDR_LEN;
1267         ASSERT(iph_hdr_length == ira->ira_ip_hdr_length);
1268 #endif
1269         /* Skip past the outer IP and ICMP headers */
1270         iph_hdr_length = ira->ira_ip_hdr_length;
1271         icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
1272 
1273         ip6h = (ip6_t *)&icmp6[1];  /* Inner IP header */
1274         if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) {
1275                 freemsg(mp);
1276                 return;
1277         }
1278 
1279         switch (icmp6->icmp6_type) {
1280         case ICMP6_DST_UNREACH:
1281                 switch (icmp6->icmp6_code) {
1282                 case ICMP6_DST_UNREACH_NOPORT:
1283                         error = ECONNREFUSED;
1284                         break;
1285                 case ICMP6_DST_UNREACH_ADMIN:
1286                 case ICMP6_DST_UNREACH_NOROUTE:
1287                 case ICMP6_DST_UNREACH_BEYONDSCOPE:
1288                 case ICMP6_DST_UNREACH_ADDR:
1289                         /* Transient errors */
1290                         break;
1291                 default:
1292                         break;
1293                 }
1294                 break;
1295         case ICMP6_PACKET_TOO_BIG: {
1296                 struct T_unitdata_ind   *tudi;
1297                 struct T_opthdr         *toh;
1298                 size_t                  udi_size;
1299                 mblk_t                  *newmp;
1300                 t_scalar_t              opt_length = sizeof (struct T_opthdr) +
1301                     sizeof (struct ip6_mtuinfo);
1302                 sin6_t                  *sin6;
1303                 struct ip6_mtuinfo      *mtuinfo;
1304 
1305                 /*
1306                  * If the application has requested to receive path mtu
1307                  * information, send up an empty message containing an
1308                  * IPV6_PATHMTU ancillary data item.
1309                  */
1310                 if (!connp->conn_ipv6_recvpathmtu)
1311                         break;
1312 
1313                 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) +
1314                     opt_length;
1315                 if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) {
1316                         BUMP_MIB(&icmp->icmp_is->is_rawip_mib, rawipInErrors);
1317                         break;
1318                 }
1319 
1320                 /*
1321                  * newmp->b_cont is left to NULL on purpose.  This is an
1322                  * empty message containing only ancillary data.
1323                  */
1324                 newmp->b_datap->db_type = M_PROTO;
1325                 tudi = (struct T_unitdata_ind *)newmp->b_rptr;
1326                 newmp->b_wptr = (uchar_t *)tudi + udi_size;
1327                 tudi->PRIM_type = T_UNITDATA_IND;
1328                 tudi->SRC_length = sizeof (sin6_t);
1329                 tudi->SRC_offset = sizeof (struct T_unitdata_ind);
1330                 tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t);
1331                 tudi->OPT_length = opt_length;
1332 
1333                 sin6 = (sin6_t *)&tudi[1];
1334                 bzero(sin6, sizeof (sin6_t));
1335                 sin6->sin6_family = AF_INET6;
1336                 sin6->sin6_addr = connp->conn_faddr_v6;
1337 
1338                 toh = (struct T_opthdr *)&sin6[1];
1339                 toh->level = IPPROTO_IPV6;
1340                 toh->name = IPV6_PATHMTU;
1341                 toh->len = opt_length;
1342                 toh->status = 0;
1343 
1344                 mtuinfo = (struct ip6_mtuinfo *)&toh[1];
1345                 bzero(mtuinfo, sizeof (struct ip6_mtuinfo));
1346                 mtuinfo->ip6m_addr.sin6_family = AF_INET6;
1347                 mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst;
1348                 mtuinfo->ip6m_mtu = icmp6->icmp6_mtu;
1349                 /*
1350                  * We've consumed everything we need from the original
1351                  * message.  Free it, then send our empty message.
1352                  */
1353                 freemsg(mp);
1354                 icmp_ulp_recv(connp, newmp, msgdsize(newmp));
1355                 return;
1356         }
1357         case ICMP6_TIME_EXCEEDED:
1358                 /* Transient errors */
1359                 break;
1360         case ICMP6_PARAM_PROB:
1361                 /* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
1362                 if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
1363                     (uchar_t *)ip6h + icmp6->icmp6_pptr ==
1364                     (uchar_t *)nexthdrp) {
1365                         error = ECONNREFUSED;
1366                         break;
1367                 }
1368                 break;
1369         }
1370         if (error == 0) {
1371                 freemsg(mp);
1372                 return;
1373         }
1374 
1375         /*
1376          * Deliver T_UDERROR_IND when the application has asked for it.
1377          * The socket layer enables this automatically when connected.
1378          */
1379         if (!connp->conn_dgram_errind) {
1380                 freemsg(mp);
1381                 return;
1382         }
1383 
1384         sin6 = sin6_null;
1385         sin6.sin6_family = AF_INET6;
1386         sin6.sin6_addr = ip6h->ip6_dst;
1387         sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
1388         if (IPCL_IS_NONSTR(connp)) {
1389                 mutex_enter(&connp->conn_lock);
1390                 if (icmp->icmp_state == TS_DATA_XFER) {
1391                         if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr,
1392                             &connp->conn_faddr_v6)) {
1393                                 mutex_exit(&connp->conn_lock);
1394                                 (*connp->conn_upcalls->su_set_error)
1395                                     (connp->conn_upper_handle, error);
1396                                 goto done;
1397                         }
1398                 } else {
1399                         icmp->icmp_delayed_error = error;
1400                         *((sin6_t *)&icmp->icmp_delayed_addr) = sin6;
1401                 }
1402                 mutex_exit(&connp->conn_lock);
1403         } else {
1404                 mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
1405                     NULL, 0, error);
1406                 if (mp1 != NULL)
1407                         putnext(connp->conn_rq, mp1);
1408         }
1409 done:
1410         freemsg(mp);
1411 }
1412 
1413 /*
1414  * This routine responds to T_ADDR_REQ messages.  It is called by icmp_wput.
1415  * The local address is filled in if endpoint is bound. The remote address
1416  * is filled in if remote address has been precified ("connected endpoint")
1417  * (The concept of connected CLTS sockets is alien to published TPI
1418  *  but we support it anyway).
1419  */
1420 static void
1421 icmp_addr_req(queue_t *q, mblk_t *mp)
1422 {
1423         struct sockaddr *sa;
1424         mblk_t  *ackmp;
1425         struct T_addr_ack *taa;
1426         icmp_t  *icmp = Q_TO_ICMP(q);
1427         conn_t  *connp = icmp->icmp_connp;
1428         uint_t  addrlen;
1429 
1430         /* Make it large enough for worst case */
1431         ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
1432             2 * sizeof (sin6_t), 1);
1433         if (ackmp == NULL) {
1434                 icmp_err_ack(q, mp, TSYSERR, ENOMEM);
1435                 return;
1436         }
1437         taa = (struct T_addr_ack *)ackmp->b_rptr;
1438 
1439         bzero(taa, sizeof (struct T_addr_ack));
1440         ackmp->b_wptr = (uchar_t *)&taa[1];
1441 
1442         taa->PRIM_type = T_ADDR_ACK;
1443         ackmp->b_datap->db_type = M_PCPROTO;
1444 
1445         if (connp->conn_family == AF_INET)
1446                 addrlen = sizeof (sin_t);
1447         else
1448                 addrlen = sizeof (sin6_t);
1449 
1450         mutex_enter(&connp->conn_lock);
1451         /*
1452          * Note: Following code assumes 32 bit alignment of basic
1453          * data structures like sin_t and struct T_addr_ack.
1454          */
1455         if (icmp->icmp_state != TS_UNBND) {
1456                 /*
1457                  * Fill in local address first
1458                  */
1459                 taa->LOCADDR_offset = sizeof (*taa);
1460                 taa->LOCADDR_length = addrlen;
1461                 sa = (struct sockaddr *)&taa[1];
1462                 (void) conn_getsockname(connp, sa, &addrlen);
1463                 ackmp->b_wptr += addrlen;
1464         }
1465         if (icmp->icmp_state == TS_DATA_XFER) {
1466                 /*
1467                  * connected, fill remote address too
1468                  */
1469                 taa->REMADDR_length = addrlen;
1470                 /* assumed 32-bit alignment */
1471                 taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length;
1472                 sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset);
1473                 (void) conn_getpeername(connp, sa, &addrlen);
1474                 ackmp->b_wptr += addrlen;
1475         }
1476         mutex_exit(&connp->conn_lock);
1477         ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
1478         qreply(q, ackmp);
1479 }
1480 
1481 static void
1482 icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp)
1483 {
1484         conn_t          *connp = icmp->icmp_connp;
1485 
1486         *tap = icmp_g_t_info_ack;
1487 
1488         if (connp->conn_family == AF_INET6)
1489                 tap->ADDR_size = sizeof (sin6_t);
1490         else
1491                 tap->ADDR_size = sizeof (sin_t);
1492         tap->CURRENT_state = icmp->icmp_state;
1493         tap->OPT_size = icmp_max_optsize;
1494 }
1495 
1496 static void
1497 icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap,
1498     t_uscalar_t cap_bits1)
1499 {
1500         tcap->CAP_bits1 = 0;
1501 
1502         if (cap_bits1 & TC1_INFO) {
1503                 icmp_copy_info(&tcap->INFO_ack, icmp);
1504                 tcap->CAP_bits1 |= TC1_INFO;
1505         }
1506 }
1507 
1508 /*
1509  * This routine responds to T_CAPABILITY_REQ messages.  It is called by
1510  * icmp_wput.  Much of the T_CAPABILITY_ACK information is copied from
1511  * icmp_g_t_info_ack.  The current state of the stream is copied from
1512  * icmp_state.
1513  */
1514 static void
1515 icmp_capability_req(queue_t *q, mblk_t *mp)
1516 {
1517         icmp_t                  *icmp = Q_TO_ICMP(q);
1518         t_uscalar_t             cap_bits1;
1519         struct T_capability_ack *tcap;
1520 
1521         cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
1522 
1523         mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
1524             mp->b_datap->db_type, T_CAPABILITY_ACK);
1525         if (!mp)
1526                 return;
1527 
1528         tcap = (struct T_capability_ack *)mp->b_rptr;
1529 
1530         icmp_do_capability_ack(icmp, tcap, cap_bits1);
1531 
1532         qreply(q, mp);
1533 }
1534 
1535 /*
1536  * This routine responds to T_INFO_REQ messages.  It is called by icmp_wput.
1537  * Most of the T_INFO_ACK information is copied from icmp_g_t_info_ack.
1538  * The current state of the stream is copied from icmp_state.
1539  */
1540 static void
1541 icmp_info_req(queue_t *q, mblk_t *mp)
1542 {
1543         icmp_t  *icmp = Q_TO_ICMP(q);
1544 
1545         /* Create a T_INFO_ACK message. */
1546         mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
1547             T_INFO_ACK);
1548         if (!mp)
1549                 return;
1550         icmp_copy_info((struct T_info_ack *)mp->b_rptr, icmp);
1551         qreply(q, mp);
1552 }
1553 
1554 static int
1555 icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
1556     int family)
1557 {
1558         conn_t *connp;
1559         dev_t   conn_dev;
1560         int     error;
1561 
1562         /* If the stream is already open, return immediately. */
1563         if (q->q_ptr != NULL)
1564                 return (0);
1565 
1566         if (sflag == MODOPEN)
1567                 return (EINVAL);
1568 
1569         /*
1570          * Since ICMP is not used so heavily, allocating from the small
1571          * arena should be sufficient.
1572          */
1573         if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
1574                 return (EBUSY);
1575         }
1576 
1577         if (flag & SO_FALLBACK) {
1578                 /*
1579                  * Non streams socket needs a stream to fallback to
1580                  */
1581                 RD(q)->q_ptr = (void *)conn_dev;
1582                 WR(q)->q_qinfo = &icmp_fallback_sock_winit;
1583                 WR(q)->q_ptr = (void *)ip_minor_arena_sa;
1584                 qprocson(q);
1585                 return (0);
1586         }
1587 
1588         connp = rawip_do_open(family, credp, &error, KM_SLEEP);
1589         if (connp == NULL) {
1590                 ASSERT(error != 0);
1591                 inet_minor_free(ip_minor_arena_sa, conn_dev);
1592                 return (error);
1593         }
1594 
1595         *devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
1596         connp->conn_dev = conn_dev;
1597         connp->conn_minor_arena = ip_minor_arena_sa;
1598 
1599         /*
1600          * Initialize the icmp_t structure for this stream.
1601          */
1602         q->q_ptr = connp;
1603         WR(q)->q_ptr = connp;
1604         connp->conn_rq = q;
1605         connp->conn_wq = WR(q);
1606 
1607         WR(q)->q_hiwat = connp->conn_sndbuf;
1608         WR(q)->q_lowat = connp->conn_sndlowat;
1609 
1610         qprocson(q);
1611 
1612         /* Set the Stream head write offset. */
1613         (void) proto_set_tx_wroff(q, connp, connp->conn_wroff);
1614         (void) proto_set_rx_hiwat(connp->conn_rq, connp, connp->conn_rcvbuf);
1615 
1616         mutex_enter(&connp->conn_lock);
1617         connp->conn_state_flags &= ~CONN_INCIPIENT;
1618         mutex_exit(&connp->conn_lock);
1619 
1620         icmp_bind_proto(connp->conn_icmp);
1621 
1622         return (0);
1623 }
1624 
1625 /* For /dev/icmp aka AF_INET open */
1626 static int
1627 icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1628 {
1629         return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET));
1630 }
1631 
1632 /* For /dev/icmp6 aka AF_INET6 open */
1633 static int
1634 icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1635 {
1636         return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6));
1637 }
1638 
1639 /*
1640  * This is the open routine for icmp.  It allocates a icmp_t structure for
1641  * the stream and, on the first open of the module, creates an ND table.
1642  */
1643 static conn_t *
1644 rawip_do_open(int family, cred_t *credp, int *err, int flags)
1645 {
1646         icmp_t  *icmp;
1647         conn_t *connp;
1648         zoneid_t zoneid;
1649         netstack_t *ns;
1650         icmp_stack_t *is;
1651         int len;
1652         boolean_t isv6 = B_FALSE;
1653 
1654         *err = secpolicy_net_icmpaccess(credp);
1655         if (*err != 0)
1656                 return (NULL);
1657 
1658         if (family == AF_INET6)
1659                 isv6 = B_TRUE;
1660 
1661         ns = netstack_find_by_cred(credp);
1662         ASSERT(ns != NULL);
1663         is = ns->netstack_icmp;
1664         ASSERT(is != NULL);
1665 
1666         /*
1667          * For exclusive stacks we set the zoneid to zero
1668          * to make ICMP operate as if in the global zone.
1669          */
1670         if (ns->netstack_stackid != GLOBAL_NETSTACKID)
1671                 zoneid = GLOBAL_ZONEID;
1672         else
1673                 zoneid = crgetzoneid(credp);
1674 
1675         ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP);
1676 
1677         connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns);
1678         icmp = connp->conn_icmp;
1679 
1680         /*
1681          * ipcl_conn_create did a netstack_hold. Undo the hold that was
1682          * done by netstack_find_by_cred()
1683          */
1684         netstack_rele(ns);
1685 
1686         /*
1687          * Since this conn_t/icmp_t is not yet visible to anybody else we don't
1688          * need to lock anything.
1689          */
1690         ASSERT(connp->conn_proto == IPPROTO_ICMP);
1691         ASSERT(connp->conn_icmp == icmp);
1692         ASSERT(icmp->icmp_connp == connp);
1693 
1694         /* Set the initial state of the stream and the privilege status. */
1695         icmp->icmp_state = TS_UNBND;
1696         connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
1697         if (isv6) {
1698                 connp->conn_family = AF_INET6;
1699                 connp->conn_ipversion = IPV6_VERSION;
1700                 connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4;
1701                 connp->conn_proto = IPPROTO_ICMPV6;
1702                 /* May be changed by a SO_PROTOTYPE socket option. */
1703                 connp->conn_proto = IPPROTO_ICMPV6;
1704                 connp->conn_ixa->ixa_protocol = connp->conn_proto;
1705                 connp->conn_ixa->ixa_raw_cksum_offset = 2;
1706                 connp->conn_default_ttl = is->is_ipv6_hoplimit;
1707                 len = sizeof (ip6_t);
1708         } else {
1709                 connp->conn_family = AF_INET;
1710                 connp->conn_ipversion = IPV4_VERSION;
1711                 connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4;
1712                 /* May be changed by a SO_PROTOTYPE socket option. */
1713                 connp->conn_proto = IPPROTO_ICMP;
1714                 connp->conn_ixa->ixa_protocol = connp->conn_proto;
1715                 connp->conn_default_ttl = is->is_ipv4_ttl;
1716                 len = sizeof (ipha_t);
1717         }
1718         connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl;
1719 
1720         connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1721 
1722         /*
1723          * For the socket of protocol IPPROTO_RAW or when IP_HDRINCL is set,
1724          * the checksum is provided in the pre-built packet. We clear
1725          * IXAF_SET_ULP_CKSUM to tell IP that the application has sent a
1726          * complete IP header and not to compute the transport checksum.
1727          */
1728         connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM;
1729         /* conn_allzones can not be set this early, hence no IPCL_ZONEID */
1730         connp->conn_ixa->ixa_zoneid = zoneid;
1731 
1732         connp->conn_zoneid = zoneid;
1733 
1734         /*
1735          * If the caller has the process-wide flag set, then default to MAC
1736          * exempt mode.  This allows read-down to unlabeled hosts.
1737          */
1738         if (getpflags(NET_MAC_AWARE, credp) != 0)
1739                 connp->conn_mac_mode = CONN_MAC_AWARE;
1740 
1741         connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID);
1742 
1743         icmp->icmp_is = is;
1744 
1745         connp->conn_rcvbuf = is->is_recv_hiwat;
1746         connp->conn_sndbuf = is->is_xmit_hiwat;
1747         connp->conn_sndlowat = is->is_xmit_lowat;
1748         connp->conn_rcvlowat = icmp_mod_info.mi_lowat;
1749 
1750         connp->conn_wroff = len + is->is_wroff_extra;
1751         connp->conn_so_type = SOCK_RAW;
1752 
1753         connp->conn_recv = icmp_input;
1754         connp->conn_recvicmp = icmp_icmp_input;
1755         crhold(credp);
1756         connp->conn_cred = credp;
1757         connp->conn_cpid = curproc->p_pid;
1758         connp->conn_open_time = ddi_get_lbolt64();
1759         /* Cache things in ixa without an extra refhold */
1760         ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
1761         connp->conn_ixa->ixa_cred = connp->conn_cred;
1762         connp->conn_ixa->ixa_cpid = connp->conn_cpid;
1763         if (is_system_labeled())
1764                 connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
1765 
1766         connp->conn_flow_cntrld = B_FALSE;
1767 
1768         if (is->is_pmtu_discovery)
1769                 connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
1770 
1771         return (connp);
1772 }
1773 
1774 /*
1775  * Which ICMP options OK to set through T_UNITDATA_REQ...
1776  */
1777 /* ARGSUSED */
1778 static boolean_t
1779 icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name)
1780 {
1781         return (B_TRUE);
1782 }
1783 
1784 /*
1785  * This routine gets default values of certain options whose default
1786  * values are maintained by protcol specific code
1787  */
1788 int
1789 icmp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
1790 {
1791         icmp_t *icmp = Q_TO_ICMP(q);
1792         icmp_stack_t *is = icmp->icmp_is;
1793         int *i1 = (int *)ptr;
1794 
1795         switch (level) {
1796         case IPPROTO_IP:
1797                 switch (name) {
1798                 case IP_MULTICAST_TTL:
1799                         *ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL;
1800                         return (sizeof (uchar_t));
1801                 case IP_MULTICAST_LOOP:
1802                         *ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP;
1803                         return (sizeof (uchar_t));
1804                 }
1805                 break;
1806         case IPPROTO_IPV6:
1807                 switch (name) {
1808                 case IPV6_MULTICAST_HOPS:
1809                         *i1 = IP_DEFAULT_MULTICAST_TTL;
1810                         return (sizeof (int));
1811                 case IPV6_MULTICAST_LOOP:
1812                         *i1 = IP_DEFAULT_MULTICAST_LOOP;
1813                         return (sizeof (int));
1814                 case IPV6_UNICAST_HOPS:
1815                         *i1 = is->is_ipv6_hoplimit;
1816                         return (sizeof (int));
1817                 }
1818                 break;
1819         case IPPROTO_ICMPV6:
1820                 switch (name) {
1821                 case ICMP6_FILTER:
1822                         /* Make it look like "pass all" */
1823                         ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
1824                         return (sizeof (icmp6_filter_t));
1825                 }
1826                 break;
1827         }
1828         return (-1);
1829 }
1830 
1831 /*
1832  * This routine retrieves the current status of socket options.
1833  * It returns the size of the option retrieved, or -1.
1834  */
1835 int
1836 icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
1837 {
1838         icmp_t          *icmp = connp->conn_icmp;
1839         int             *i1 = (int *)ptr;
1840         conn_opt_arg_t  coas;
1841         int             retval;
1842 
1843         coas.coa_connp = connp;
1844         coas.coa_ixa = connp->conn_ixa;
1845         coas.coa_ipp = &connp->conn_xmit_ipp;
1846         coas.coa_ancillary = B_FALSE;
1847         coas.coa_changed = 0;
1848 
1849         /*
1850          * We assume that the optcom framework has checked for the set
1851          * of levels and names that are supported, hence we don't worry
1852          * about rejecting based on that.
1853          * First check for ICMP specific handling, then pass to common routine.
1854          */
1855         switch (level) {
1856         case IPPROTO_IP:
1857                 /*
1858                  * Only allow IPv4 option processing on IPv4 sockets.
1859                  */
1860                 if (connp->conn_family != AF_INET)
1861                         return (-1);
1862 
1863                 switch (name) {
1864                 case IP_OPTIONS:
1865                 case T_IP_OPTIONS:
1866                         /* Options are passed up with each packet */
1867                         return (0);
1868                 case IP_HDRINCL:
1869                         mutex_enter(&connp->conn_lock);
1870                         *i1 = (int)icmp->icmp_hdrincl;
1871                         mutex_exit(&connp->conn_lock);
1872                         return (sizeof (int));
1873                 }
1874                 break;
1875 
1876         case IPPROTO_IPV6:
1877                 /*
1878                  * Only allow IPv6 option processing on native IPv6 sockets.
1879                  */
1880                 if (connp->conn_family != AF_INET6)
1881                         return (-1);
1882 
1883                 switch (name) {
1884                 case IPV6_CHECKSUM:
1885                         /*
1886                          * Return offset or -1 if no checksum offset.
1887                          * Does not apply to IPPROTO_ICMPV6
1888                          */
1889                         if (connp->conn_proto == IPPROTO_ICMPV6)
1890                                 return (-1);
1891 
1892                         mutex_enter(&connp->conn_lock);
1893                         if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM)
1894                                 *i1 = connp->conn_ixa->ixa_raw_cksum_offset;
1895                         else
1896                                 *i1 = -1;
1897                         mutex_exit(&connp->conn_lock);
1898                         return (sizeof (int));
1899                 }
1900                 break;
1901 
1902         case IPPROTO_ICMPV6:
1903                 /*
1904                  * Only allow IPv6 option processing on native IPv6 sockets.
1905                  */
1906                 if (connp->conn_family != AF_INET6)
1907                         return (-1);
1908 
1909                 if (connp->conn_proto != IPPROTO_ICMPV6)
1910                         return (-1);
1911 
1912                 switch (name) {
1913                 case ICMP6_FILTER:
1914                         mutex_enter(&connp->conn_lock);
1915                         if (icmp->icmp_filter == NULL) {
1916                                 /* Make it look like "pass all" */
1917                                 ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
1918                         } else {
1919                                 (void) bcopy(icmp->icmp_filter, ptr,
1920                                     sizeof (icmp6_filter_t));
1921                         }
1922                         mutex_exit(&connp->conn_lock);
1923                         return (sizeof (icmp6_filter_t));
1924                 }
1925         }
1926         mutex_enter(&connp->conn_lock);
1927         retval = conn_opt_get(&coas, level, name, ptr);
1928         mutex_exit(&connp->conn_lock);
1929         return (retval);
1930 }
1931 
1932 /*
1933  * This routine retrieves the current status of socket options.
1934  * It returns the size of the option retrieved, or -1.
1935  */
1936 int
1937 icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
1938 {
1939         conn_t          *connp = Q_TO_CONN(q);
1940         int             err;
1941 
1942         err = icmp_opt_get(connp, level, name, ptr);
1943         return (err);
1944 }
1945 
1946 /*
1947  * This routine sets socket options.
1948  */
1949 int
1950 icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name,
1951     uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly)
1952 {
1953         conn_t          *connp = coa->coa_connp;
1954         ip_xmit_attr_t  *ixa = coa->coa_ixa;
1955         icmp_t          *icmp = connp->conn_icmp;
1956         icmp_stack_t    *is = icmp->icmp_is;
1957         int             *i1 = (int *)invalp;
1958         boolean_t       onoff = (*i1 == 0) ? 0 : 1;
1959         int             error;
1960 
1961         ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock));
1962 
1963         /*
1964          * For fixed length options, no sanity check
1965          * of passed in length is done. It is assumed *_optcom_req()
1966          * routines do the right thing.
1967          */
1968 
1969         switch (level) {
1970         case SOL_SOCKET:
1971                 switch (name) {
1972                 case SO_PROTOTYPE:
1973                         if ((*i1 & 0xFF) != IPPROTO_ICMP &&
1974                             (*i1 & 0xFF) != IPPROTO_ICMPV6 &&
1975                             secpolicy_net_rawaccess(cr) != 0) {
1976                                 return (EACCES);
1977                         }
1978                         if (checkonly)
1979                                 break;
1980 
1981                         mutex_enter(&connp->conn_lock);
1982                         connp->conn_proto = *i1 & 0xFF;
1983                         ixa->ixa_protocol = connp->conn_proto;
1984                         if ((connp->conn_proto == IPPROTO_RAW ||
1985                             connp->conn_proto == IPPROTO_IGMP) &&
1986                             connp->conn_family == AF_INET) {
1987                                 icmp->icmp_hdrincl = 1;
1988                                 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
1989                         } else if (connp->conn_proto == IPPROTO_UDP ||
1990                             connp->conn_proto == IPPROTO_TCP ||
1991                             connp->conn_proto == IPPROTO_SCTP) {
1992                                 /* Used by test applications like psh */
1993                                 icmp->icmp_hdrincl = 0;
1994                                 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
1995                         } else {
1996                                 icmp->icmp_hdrincl = 0;
1997                                 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
1998                         }
1999 
2000                         if (connp->conn_family == AF_INET6 &&
2001                             connp->conn_proto == IPPROTO_ICMPV6) {
2002                                 /* Set offset for icmp6_cksum */
2003                                 ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
2004                                 ixa->ixa_raw_cksum_offset = 2;
2005                         }
2006                         if (icmp->icmp_filter != NULL &&
2007                             connp->conn_proto != IPPROTO_ICMPV6) {
2008                                 kmem_free(icmp->icmp_filter,
2009                                     sizeof (icmp6_filter_t));
2010                                 icmp->icmp_filter = NULL;
2011                         }
2012                         mutex_exit(&connp->conn_lock);
2013 
2014                         coa->coa_changed |= COA_HEADER_CHANGED;
2015                         /*
2016                          * For SCTP, we don't use icmp_bind_proto() for
2017                          * raw socket binding.
2018                          */
2019                         if (connp->conn_proto == IPPROTO_SCTP)
2020                                 return (0);
2021 
2022                         coa->coa_changed |= COA_ICMP_BIND_NEEDED;
2023                         return (0);
2024 
2025                 case SO_SNDBUF:
2026                         if (*i1 > is->is_max_buf) {
2027                                 return (ENOBUFS);
2028                         }
2029                         break;
2030                 case SO_RCVBUF:
2031                         if (*i1 > is->is_max_buf) {
2032                                 return (ENOBUFS);
2033                         }
2034                         break;
2035                 }
2036                 break;
2037 
2038         case IPPROTO_IP:
2039                 /*
2040                  * Only allow IPv4 option processing on IPv4 sockets.
2041                  */
2042                 if (connp->conn_family != AF_INET)
2043                         return (EINVAL);
2044 
2045                 switch (name) {
2046                 case IP_HDRINCL:
2047                         if (!checkonly) {
2048                                 mutex_enter(&connp->conn_lock);
2049                                 icmp->icmp_hdrincl = onoff;
2050                                 if (onoff)
2051                                         ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2052                                 else
2053                                         ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
2054                                 mutex_exit(&connp->conn_lock);
2055                         }
2056                         break;
2057                 }
2058                 break;
2059 
2060         case IPPROTO_IPV6:
2061                 if (connp->conn_family != AF_INET6)
2062                         return (EINVAL);
2063 
2064                 switch (name) {
2065                 case IPV6_CHECKSUM:
2066                         /*
2067                          * Integer offset into the user data of where the
2068                          * checksum is located.
2069                          * Offset of -1 disables option.
2070                          * Does not apply to IPPROTO_ICMPV6.
2071                          */
2072                         if (connp->conn_proto == IPPROTO_ICMPV6 ||
2073                             coa->coa_ancillary) {
2074                                 return (EINVAL);
2075                         }
2076                         if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) {
2077                                 /* Negative or not 16 bit aligned offset */
2078                                 return (EINVAL);
2079                         }
2080                         if (checkonly)
2081                                 break;
2082 
2083                         mutex_enter(&connp->conn_lock);
2084                         if (*i1 == -1) {
2085                                 ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
2086                                 ixa->ixa_raw_cksum_offset = 0;
2087                                 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2088                         } else {
2089                                 ixa->ixa_flags |= IXAF_SET_RAW_CKSUM;
2090                                 ixa->ixa_raw_cksum_offset = *i1;
2091                                 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
2092                         }
2093                         mutex_exit(&connp->conn_lock);
2094                         break;
2095                 }
2096                 break;
2097 
2098         case IPPROTO_ICMPV6:
2099                 /*
2100                  * Only allow IPv6 option processing on IPv6 sockets.
2101                  */
2102                 if (connp->conn_family != AF_INET6)
2103                         return (EINVAL);
2104                 if (connp->conn_proto != IPPROTO_ICMPV6)
2105                         return (EINVAL);
2106 
2107                 switch (name) {
2108                 case ICMP6_FILTER:
2109                         if (checkonly)
2110                                 break;
2111 
2112                         if ((inlen != 0) &&
2113                             (inlen != sizeof (icmp6_filter_t)))
2114                                 return (EINVAL);
2115 
2116                         mutex_enter(&connp->conn_lock);
2117                         if (inlen == 0) {
2118                                 if (icmp->icmp_filter != NULL) {
2119                                         kmem_free(icmp->icmp_filter,
2120                                             sizeof (icmp6_filter_t));
2121                                         icmp->icmp_filter = NULL;
2122                                 }
2123                         } else {
2124                                 if (icmp->icmp_filter == NULL) {
2125                                         icmp->icmp_filter = kmem_alloc(
2126                                             sizeof (icmp6_filter_t),
2127                                             KM_NOSLEEP);
2128                                         if (icmp->icmp_filter == NULL) {
2129                                                 mutex_exit(&connp->conn_lock);
2130                                                 return (ENOBUFS);
2131                                         }
2132                                 }
2133                                 (void) bcopy(invalp, icmp->icmp_filter, inlen);
2134                         }
2135                         mutex_exit(&connp->conn_lock);
2136                         break;
2137                 }
2138                 break;
2139         }
2140         error = conn_opt_set(coa, level, name, inlen, invalp,
2141             checkonly, cr);
2142         return (error);
2143 }
2144 
2145 /*
2146  * This routine sets socket options.
2147  */
2148 int
2149 icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
2150     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
2151     void *thisdg_attrs, cred_t *cr)
2152 {
2153         icmp_t          *icmp = connp->conn_icmp;
2154         int             err;
2155         conn_opt_arg_t  coas, *coa;
2156         boolean_t       checkonly;
2157         icmp_stack_t    *is = icmp->icmp_is;
2158 
2159         switch (optset_context) {
2160         case SETFN_OPTCOM_CHECKONLY:
2161                 checkonly = B_TRUE;
2162                 /*
2163                  * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
2164                  * inlen != 0 implies value supplied and
2165                  *      we have to "pretend" to set it.
2166                  * inlen == 0 implies that there is no
2167                  *      value part in T_CHECK request and just validation
2168                  * done elsewhere should be enough, we just return here.
2169                  */
2170                 if (inlen == 0) {
2171                         *outlenp = 0;
2172                         return (0);
2173                 }
2174                 break;
2175         case SETFN_OPTCOM_NEGOTIATE:
2176                 checkonly = B_FALSE;
2177                 break;
2178         case SETFN_UD_NEGOTIATE:
2179         case SETFN_CONN_NEGOTIATE:
2180                 checkonly = B_FALSE;
2181                 /*
2182                  * Negotiating local and "association-related" options
2183                  * through T_UNITDATA_REQ.
2184                  *
2185                  * Following routine can filter out ones we do not
2186                  * want to be "set" this way.
2187                  */
2188                 if (!icmp_opt_allow_udr_set(level, name)) {
2189                         *outlenp = 0;
2190                         return (EINVAL);
2191                 }
2192                 break;
2193         default:
2194                 /*
2195                  * We should never get here
2196                  */
2197                 *outlenp = 0;
2198                 return (EINVAL);
2199         }
2200 
2201         ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
2202             (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
2203 
2204         if (thisdg_attrs != NULL) {
2205                 /* Options from T_UNITDATA_REQ */
2206                 coa = (conn_opt_arg_t *)thisdg_attrs;
2207                 ASSERT(coa->coa_connp == connp);
2208                 ASSERT(coa->coa_ixa != NULL);
2209                 ASSERT(coa->coa_ipp != NULL);
2210                 ASSERT(coa->coa_ancillary);
2211         } else {
2212                 coa = &coas;
2213                 coas.coa_connp = connp;
2214                 /* Get a reference on conn_ixa to prevent concurrent mods */
2215                 coas.coa_ixa = conn_get_ixa(connp, B_TRUE);
2216                 if (coas.coa_ixa == NULL) {
2217                         *outlenp = 0;
2218                         return (ENOMEM);
2219                 }
2220                 coas.coa_ipp = &connp->conn_xmit_ipp;
2221                 coas.coa_ancillary = B_FALSE;
2222                 coas.coa_changed = 0;
2223         }
2224 
2225         err = icmp_do_opt_set(coa, level, name, inlen, invalp,
2226             cr, checkonly);
2227         if (err != 0) {
2228 errout:
2229                 if (!coa->coa_ancillary)
2230                         ixa_refrele(coa->coa_ixa);
2231                 *outlenp = 0;
2232                 return (err);
2233         }
2234 
2235         /*
2236          * Common case of OK return with outval same as inval.
2237          */
2238         if (invalp != outvalp) {
2239                 /* don't trust bcopy for identical src/dst */
2240                 (void) bcopy(invalp, outvalp, inlen);
2241         }
2242         *outlenp = inlen;
2243 
2244         /*
2245          * If this was not ancillary data, then we rebuild the headers,
2246          * update the IRE/NCE, and IPsec as needed.
2247          * Since the label depends on the destination we go through
2248          * ip_set_destination first.
2249          */
2250         if (coa->coa_ancillary) {
2251                 return (0);
2252         }
2253 
2254         if (coa->coa_changed & COA_ROUTE_CHANGED) {
2255                 in6_addr_t saddr, faddr, nexthop;
2256                 in_port_t fport;
2257 
2258                 /*
2259                  * We clear lastdst to make sure we pick up the change
2260                  * next time sending.
2261                  * If we are connected we re-cache the information.
2262                  * We ignore errors to preserve BSD behavior.
2263                  * Note that we don't redo IPsec policy lookup here
2264                  * since the final destination (or source) didn't change.
2265                  */
2266                 mutex_enter(&connp->conn_lock);
2267                 connp->conn_v6lastdst = ipv6_all_zeros;
2268 
2269                 ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa,
2270                     &connp->conn_faddr_v6, &nexthop);
2271                 saddr = connp->conn_saddr_v6;
2272                 faddr = connp->conn_faddr_v6;
2273                 fport = connp->conn_fport;
2274                 mutex_exit(&connp->conn_lock);
2275 
2276                 if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) &&
2277                     !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) {
2278                         (void) ip_attr_connect(connp, coa->coa_ixa,
2279                             &saddr, &faddr, &nexthop, fport, NULL, NULL,
2280                             IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);
2281                 }
2282         }
2283 
2284         ixa_refrele(coa->coa_ixa);
2285 
2286         if (coa->coa_changed & COA_HEADER_CHANGED) {
2287                 /*
2288                  * Rebuild the header template if we are connected.
2289                  * Otherwise clear conn_v6lastdst so we rebuild the header
2290                  * in the data path.
2291                  */
2292                 mutex_enter(&connp->conn_lock);
2293                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
2294                     !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
2295                         err = icmp_build_hdr_template(connp,
2296                             &connp->conn_saddr_v6, &connp->conn_faddr_v6,
2297                             connp->conn_flowinfo);
2298                         if (err != 0) {
2299                                 mutex_exit(&connp->conn_lock);
2300                                 return (err);
2301                         }
2302                 } else {
2303                         connp->conn_v6lastdst = ipv6_all_zeros;
2304                 }
2305                 mutex_exit(&connp->conn_lock);
2306         }
2307         if (coa->coa_changed & COA_RCVBUF_CHANGED) {
2308                 (void) proto_set_rx_hiwat(connp->conn_rq, connp,
2309                     connp->conn_rcvbuf);
2310         }
2311         if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
2312                 connp->conn_wq->q_hiwat = connp->conn_sndbuf;
2313         }
2314         if (coa->coa_changed & COA_WROFF_CHANGED) {
2315                 /* Increase wroff if needed */
2316                 uint_t wroff;
2317 
2318                 mutex_enter(&connp->conn_lock);
2319                 wroff = connp->conn_ht_iphc_allocated + is->is_wroff_extra;
2320                 if (wroff > connp->conn_wroff) {
2321                         connp->conn_wroff = wroff;
2322                         mutex_exit(&connp->conn_lock);
2323                         (void) proto_set_tx_wroff(connp->conn_rq, connp, wroff);
2324                 } else {
2325                         mutex_exit(&connp->conn_lock);
2326                 }
2327         }
2328         if (coa->coa_changed & COA_ICMP_BIND_NEEDED) {
2329                 icmp_bind_proto(icmp);
2330         }
2331         return (err);
2332 }
2333 
2334 /* This routine sets socket options. */
2335 int
2336 icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
2337     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
2338     void *thisdg_attrs, cred_t *cr)
2339 {
2340         conn_t  *connp = Q_TO_CONN(q);
2341         int error;
2342 
2343         error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp,
2344             outlenp, outvalp, thisdg_attrs, cr);
2345         return (error);
2346 }
2347 
2348 /*
2349  * Setup IP headers.
2350  *
2351  * Note that IP_HDRINCL has ipha_protocol that is different than conn_proto,
2352  * but icmp_output_hdrincl restores ipha_protocol once we return.
2353  */
2354 mblk_t *
2355 icmp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
2356     const in6_addr_t *v6src, const in6_addr_t *v6dst, uint32_t flowinfo,
2357     mblk_t *data_mp, int *errorp)
2358 {
2359         mblk_t          *mp;
2360         icmp_stack_t    *is = connp->conn_netstack->netstack_icmp;
2361         uint_t          data_len;
2362         uint32_t        cksum;
2363 
2364         data_len = msgdsize(data_mp);
2365         mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, connp->conn_proto,
2366             flowinfo, 0, data_mp, data_len, is->is_wroff_extra, &cksum, errorp);
2367         if (mp == NULL) {
2368                 ASSERT(*errorp != 0);
2369                 return (NULL);
2370         }
2371 
2372         ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length;
2373 
2374         /*
2375          * If there was a routing option/header then conn_prepend_hdr
2376          * has massaged it and placed the pseudo-header checksum difference
2377          * in the cksum argument.
2378          *
2379          * Prepare for ICMPv6 checksum done in IP.
2380          *
2381          * We make it easy for IP to include our pseudo header
2382          * by putting our length (and any routing header adjustment)
2383          * in the ICMPv6 checksum field.
2384          * The IP source, destination, and length have already been set by
2385          * conn_prepend_hdr.
2386          */
2387         cksum += data_len;
2388         cksum = (cksum >> 16) + (cksum & 0xFFFF);
2389         ASSERT(cksum < 0x10000);
2390 
2391         if (ixa->ixa_flags & IXAF_IS_IPV4) {
2392                 ipha_t  *ipha = (ipha_t *)mp->b_rptr;
2393 
2394                 ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen);
2395         } else {
2396                 ip6_t   *ip6h = (ip6_t *)mp->b_rptr;
2397                 uint_t  cksum_offset = 0;
2398 
2399                 ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen);
2400 
2401                 if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) {
2402                         if (connp->conn_proto == IPPROTO_ICMPV6) {
2403                                 cksum_offset = ixa->ixa_ip_hdr_length +
2404                                     offsetof(icmp6_t, icmp6_cksum);
2405                         } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
2406                                 cksum_offset = ixa->ixa_ip_hdr_length +
2407                                     ixa->ixa_raw_cksum_offset;
2408                         }
2409                 }
2410                 if (cksum_offset != 0) {
2411                         uint16_t *ptr;
2412 
2413                         /* Make sure the checksum fits in the first mblk */
2414                         if (cksum_offset + sizeof (short) > MBLKL(mp)) {
2415                                 mblk_t *mp1;
2416 
2417                                 mp1 = msgpullup(mp,
2418                                     cksum_offset + sizeof (short));
2419                                 freemsg(mp);
2420                                 if (mp1 == NULL) {
2421                                         *errorp = ENOMEM;
2422                                         return (NULL);
2423                                 }
2424                                 mp = mp1;
2425                                 ip6h = (ip6_t *)mp->b_rptr;
2426                         }
2427                         ptr = (uint16_t *)(mp->b_rptr + cksum_offset);
2428                         *ptr = htons(cksum);
2429                 }
2430         }
2431 
2432         /* Note that we don't try to update wroff due to ancillary data */
2433         return (mp);
2434 }
2435 
2436 static int
2437 icmp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src,
2438     const in6_addr_t *v6dst, uint32_t flowinfo)
2439 {
2440         int             error;
2441 
2442         ASSERT(MUTEX_HELD(&connp->conn_lock));
2443         /*
2444          * We clear lastdst to make sure we don't use the lastdst path
2445          * next time sending since we might not have set v6dst yet.
2446          */
2447         connp->conn_v6lastdst = ipv6_all_zeros;
2448 
2449         error = conn_build_hdr_template(connp, 0, 0, v6src, v6dst, flowinfo);
2450         if (error != 0)
2451                 return (error);
2452 
2453         /*
2454          * Any routing header/option has been massaged. The checksum difference
2455          * is stored in conn_sum.
2456          */
2457         return (0);
2458 }
2459 
2460 static mblk_t *
2461 icmp_queue_fallback(icmp_t *icmp, mblk_t *mp)
2462 {
2463         ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock));
2464         if (IPCL_IS_NONSTR(icmp->icmp_connp)) {
2465                 /*
2466                  * fallback has started but messages have not been moved yet
2467                  */
2468                 if (icmp->icmp_fallback_queue_head == NULL) {
2469                         ASSERT(icmp->icmp_fallback_queue_tail == NULL);
2470                         icmp->icmp_fallback_queue_head = mp;
2471                         icmp->icmp_fallback_queue_tail = mp;
2472                 } else {
2473                         ASSERT(icmp->icmp_fallback_queue_tail != NULL);
2474                         icmp->icmp_fallback_queue_tail->b_next = mp;
2475                         icmp->icmp_fallback_queue_tail = mp;
2476                 }
2477                 return (NULL);
2478         } else {
2479                 /*
2480                  * Fallback completed, let the caller putnext() the mblk.
2481                  */
2482                 return (mp);
2483         }
2484 }
2485 
2486 /*
2487  * Deliver data to ULP. In case we have a socket, and it's falling back to
2488  * TPI, then we'll queue the mp for later processing.
2489  */
2490 static void
2491 icmp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len)
2492 {
2493         if (IPCL_IS_NONSTR(connp)) {
2494                 icmp_t *icmp = connp->conn_icmp;
2495                 int error;
2496 
2497                 ASSERT(len == msgdsize(mp));
2498                 if ((*connp->conn_upcalls->su_recv)
2499                     (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) {
2500                         mutex_enter(&icmp->icmp_recv_lock);
2501                         if (error == ENOSPC) {
2502                                 /*
2503                                  * let's confirm while holding the lock
2504                                  */
2505                                 if ((*connp->conn_upcalls->su_recv)
2506                                     (connp->conn_upper_handle, NULL, 0, 0,
2507                                     &error, NULL) < 0) {
2508                                         ASSERT(error == ENOSPC);
2509                                         if (error == ENOSPC) {
2510                                                 connp->conn_flow_cntrld =
2511                                                     B_TRUE;
2512                                         }
2513                                 }
2514                                 mutex_exit(&icmp->icmp_recv_lock);
2515                         } else {
2516                                 ASSERT(error == EOPNOTSUPP);
2517                                 mp = icmp_queue_fallback(icmp, mp);
2518                                 mutex_exit(&icmp->icmp_recv_lock);
2519                                 if (mp != NULL)
2520                                         putnext(connp->conn_rq, mp);
2521                         }
2522                 }
2523                 ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock));
2524         } else {
2525                 putnext(connp->conn_rq, mp);
2526         }
2527 }
2528 
2529 /*
2530  * This is the inbound data path.
2531  * IP has already pulled up the IP headers and verified alignment
2532  * etc.
2533  */
2534 /* ARGSUSED2 */
2535 static void
2536 icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
2537 {
2538         conn_t                  *connp = (conn_t *)arg1;
2539         struct T_unitdata_ind   *tudi;
2540         uchar_t                 *rptr;          /* Pointer to IP header */
2541         int                     ip_hdr_length;
2542         int                     udi_size;       /* Size of T_unitdata_ind */
2543         int                     pkt_len;
2544         icmp_t                  *icmp;
2545         ip_pkt_t                ipps;
2546         ip6_t                   *ip6h;
2547         mblk_t                  *mp1;
2548         crb_t                   recv_ancillary;
2549         icmp_stack_t            *is;
2550         sin_t                   *sin;
2551         sin6_t                  *sin6;
2552         ipha_t                  *ipha;
2553 
2554         ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2555 
2556         icmp = connp->conn_icmp;
2557         is = icmp->icmp_is;
2558         rptr = mp->b_rptr;
2559 
2560         ASSERT(DB_TYPE(mp) == M_DATA);
2561         ASSERT(OK_32PTR(rptr));
2562         ASSERT(ira->ira_pktlen == msgdsize(mp));
2563         pkt_len = ira->ira_pktlen;
2564 
2565         /*
2566          * Get a snapshot of these and allow other threads to change
2567          * them after that. We need the same recv_ancillary when determining
2568          * the size as when adding the ancillary data items.
2569          */
2570         mutex_enter(&connp->conn_lock);
2571         recv_ancillary = connp->conn_recv_ancillary;
2572         mutex_exit(&connp->conn_lock);
2573 
2574         ip_hdr_length = ira->ira_ip_hdr_length;
2575         ASSERT(MBLKL(mp) >= ip_hdr_length);  /* IP did a pullup */
2576 
2577         /* Initialize regardless of IP version */
2578         ipps.ipp_fields = 0;
2579 
2580         if (ira->ira_flags & IRAF_IS_IPV4) {
2581                 ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION);
2582                 ASSERT(MBLKL(mp) >= sizeof (ipha_t));
2583                 ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr));
2584 
2585                 ipha = (ipha_t *)mp->b_rptr;
2586                 if (recv_ancillary.crb_all != 0)
2587                         (void) ip_find_hdr_v4(ipha, &ipps, B_FALSE);
2588 
2589                 /*
2590                  * BSD for some reason adjusts ipha_length to exclude the
2591                  * IP header length. We do the same.
2592                  */
2593                 if (is->is_bsd_compat) {
2594                         ushort_t len;
2595 
2596                         len = ntohs(ipha->ipha_length);
2597                         if (mp->b_datap->db_ref > 1) {
2598                                 /*
2599                                  * Allocate a new IP header so that we can
2600                                  * modify ipha_length.
2601                                  */
2602                                 mblk_t  *mp1;
2603 
2604                                 mp1 = allocb(ip_hdr_length, BPRI_MED);
2605                                 if (mp1 == NULL) {
2606                                         freemsg(mp);
2607                                         BUMP_MIB(&is->is_rawip_mib,
2608                                             rawipInErrors);
2609                                         return;
2610                                 }
2611                                 bcopy(rptr, mp1->b_rptr, ip_hdr_length);
2612                                 mp->b_rptr = rptr + ip_hdr_length;
2613                                 rptr = mp1->b_rptr;
2614                                 ipha = (ipha_t *)rptr;
2615                                 mp1->b_cont = mp;
2616                                 mp1->b_wptr = rptr + ip_hdr_length;
2617                                 mp = mp1;
2618                         }
2619                         len -= ip_hdr_length;
2620                         ipha->ipha_length = htons(len);
2621                 }
2622 
2623                 /*
2624                  * For RAW sockets we not pass ICMP/IPv4 packets to AF_INET6
2625                  * sockets. This is ensured by icmp_bind and the IP fanout code.
2626                  */
2627                 ASSERT(connp->conn_family == AF_INET);
2628 
2629                 /*
2630                  * This is the inbound data path.  Packets are passed upstream
2631                  * as T_UNITDATA_IND messages with full IPv4 headers still
2632                  * attached.
2633                  */
2634 
2635                 /*
2636                  * Normally only send up the source address.
2637                  * If any ancillary data items are wanted we add those.
2638                  */
2639                 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
2640                 if (recv_ancillary.crb_all != 0) {
2641                         udi_size += conn_recvancillary_size(connp,
2642                             recv_ancillary, ira, mp, &ipps);
2643                 }
2644 
2645                 /* Allocate a message block for the T_UNITDATA_IND structure. */
2646                 mp1 = allocb(udi_size, BPRI_MED);
2647                 if (mp1 == NULL) {
2648                         freemsg(mp);
2649                         BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
2650                         return;
2651                 }
2652                 mp1->b_cont = mp;
2653                 tudi = (struct T_unitdata_ind *)mp1->b_rptr;
2654                 mp1->b_datap->db_type = M_PROTO;
2655                 mp1->b_wptr = (uchar_t *)tudi + udi_size;
2656                 tudi->PRIM_type = T_UNITDATA_IND;
2657                 tudi->SRC_length = sizeof (sin_t);
2658                 tudi->SRC_offset = sizeof (struct T_unitdata_ind);
2659                 sin = (sin_t *)&tudi[1];
2660                 *sin = sin_null;
2661                 sin->sin_family = AF_INET;
2662                 sin->sin_addr.s_addr = ipha->ipha_src;
2663                 *(uint32_t *)&sin->sin_zero[0] = 0;
2664                 *(uint32_t *)&sin->sin_zero[4] = 0;
2665                 tudi->OPT_offset =  sizeof (struct T_unitdata_ind) +
2666                     sizeof (sin_t);
2667                 udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t));
2668                 tudi->OPT_length = udi_size;
2669 
2670                 /*
2671                  * Add options if IP_RECVIF etc is set
2672                  */
2673                 if (udi_size != 0) {
2674                         conn_recvancillary_add(connp, recv_ancillary, ira,
2675                             &ipps, (uchar_t *)&sin[1], udi_size);
2676                 }
2677                 goto deliver;
2678         }
2679 
2680         ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION);
2681         /*
2682          * IPv6 packets can only be received by applications
2683          * that are prepared to receive IPv6 addresses.
2684          * The IP fanout must ensure this.
2685          */
2686         ASSERT(connp->conn_family == AF_INET6);
2687 
2688         /*
2689          * Handle IPv6 packets. We don't pass up the IP headers with the
2690          * payload for IPv6.
2691          */
2692 
2693         ip6h = (ip6_t *)rptr;
2694         if (recv_ancillary.crb_all != 0) {
2695                 /*
2696                  * Call on ip_find_hdr_v6 which gets individual lenghts of
2697                  * extension headers (and pointers to them).
2698                  */
2699                 uint8_t         nexthdr;
2700 
2701                 /* We don't care about the length or nextheader. */
2702                 (void) ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps, &nexthdr);
2703 
2704                 /*
2705                  * We do not pass up hop-by-hop options or any other
2706                  * extension header as part of the packet. Applications
2707                  * that want to see them have to specify IPV6_RECV* socket
2708                  * options. And conn_recvancillary_size/add explicitly
2709                  * drops the TX option from IPV6_HOPOPTS as it does for UDP.
2710                  *
2711                  * If we had multilevel ICMP sockets, then we'd want to
2712                  * modify conn_recvancillary_size/add to
2713                  * allow the user to see the label.
2714                  */
2715         }
2716 
2717         /*
2718          * Check a filter for ICMPv6 types if needed.
2719          * Verify raw checksums if needed.
2720          */
2721         mutex_enter(&connp->conn_lock);
2722         if (icmp->icmp_filter != NULL) {
2723                 int type;
2724 
2725                 /* Assumes that IP has done the pullupmsg */
2726                 type = mp->b_rptr[ip_hdr_length];
2727 
2728                 ASSERT(mp->b_rptr + ip_hdr_length <= mp->b_wptr);
2729                 if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) {
2730                         mutex_exit(&connp->conn_lock);
2731                         freemsg(mp);
2732                         return;
2733                 }
2734         }
2735         if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
2736                 /* Checksum */
2737                 uint16_t        *up;
2738                 uint32_t        sum;
2739                 int             remlen;
2740 
2741                 up = (uint16_t *)&ip6h->ip6_src;
2742 
2743                 remlen = msgdsize(mp) - ip_hdr_length;
2744                 sum = htons(connp->conn_proto + remlen)
2745                     + up[0] + up[1] + up[2] + up[3]
2746                     + up[4] + up[5] + up[6] + up[7]
2747                     + up[8] + up[9] + up[10] + up[11]
2748                     + up[12] + up[13] + up[14] + up[15];
2749                 sum = (sum & 0xffff) + (sum >> 16);
2750                 sum = IP_CSUM(mp, ip_hdr_length, sum);
2751                 if (sum != 0) {
2752                         /* IPv6 RAW checksum failed */
2753                         ip0dbg(("icmp_rput: RAW checksum failed %x\n", sum));
2754                         mutex_exit(&connp->conn_lock);
2755                         freemsg(mp);
2756                         BUMP_MIB(&is->is_rawip_mib, rawipInCksumErrs);
2757                         return;
2758                 }
2759         }
2760         mutex_exit(&connp->conn_lock);
2761 
2762         udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
2763 
2764         if (recv_ancillary.crb_all != 0) {
2765                 udi_size += conn_recvancillary_size(connp,
2766                     recv_ancillary, ira, mp, &ipps);
2767         }
2768 
2769         mp1 = allocb(udi_size, BPRI_MED);
2770         if (mp1 == NULL) {
2771                 freemsg(mp);
2772                 BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
2773                 return;
2774         }
2775         mp1->b_cont = mp;
2776         mp1->b_datap->db_type = M_PROTO;
2777         tudi = (struct T_unitdata_ind *)mp1->b_rptr;
2778         mp1->b_wptr = (uchar_t *)tudi + udi_size;
2779         tudi->PRIM_type = T_UNITDATA_IND;
2780         tudi->SRC_length = sizeof (sin6_t);
2781         tudi->SRC_offset = sizeof (struct T_unitdata_ind);
2782         tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
2783         udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t));
2784         tudi->OPT_length = udi_size;
2785         sin6 = (sin6_t *)&tudi[1];
2786         *sin6 = sin6_null;
2787         sin6->sin6_port = 0;
2788         sin6->sin6_family = AF_INET6;
2789 
2790         sin6->sin6_addr = ip6h->ip6_src;
2791         /* No sin6_flowinfo per API */
2792         sin6->sin6_flowinfo = 0;
2793         /* For link-scope pass up scope id */
2794         if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
2795                 sin6->sin6_scope_id = ira->ira_ruifindex;
2796         else
2797                 sin6->sin6_scope_id = 0;
2798         sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst,
2799             IPCL_ZONEID(connp), is->is_netstack);
2800 
2801         if (udi_size != 0) {
2802                 conn_recvancillary_add(connp, recv_ancillary, ira,
2803                     &ipps, (uchar_t *)&sin6[1], udi_size);
2804         }
2805 
2806         /* Skip all the IPv6 headers per API */
2807         mp->b_rptr += ip_hdr_length;
2808         pkt_len -= ip_hdr_length;
2809 
2810 deliver:
2811         BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
2812         icmp_ulp_recv(connp, mp1, pkt_len);
2813 }
2814 
2815 /*
2816  * return SNMP stuff in buffer in mpdata. We don't hold any lock and report
2817  * information that can be changing beneath us.
2818  */
2819 mblk_t *
2820 icmp_snmp_get(queue_t *q, mblk_t *mpctl)
2821 {
2822         mblk_t                  *mpdata;
2823         struct opthdr           *optp;
2824         conn_t                  *connp = Q_TO_CONN(q);
2825         icmp_stack_t            *is = connp->conn_netstack->netstack_icmp;
2826         mblk_t                  *mp2ctl;
2827 
2828         /*
2829          * make a copy of the original message
2830          */
2831         mp2ctl = copymsg(mpctl);
2832 
2833         if (mpctl == NULL ||
2834             (mpdata = mpctl->b_cont) == NULL) {
2835                 freemsg(mpctl);
2836                 freemsg(mp2ctl);
2837                 return (0);
2838         }
2839 
2840         /* fixed length structure for IPv4 and IPv6 counters */
2841         optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
2842         optp->level = EXPER_RAWIP;
2843         optp->name = 0;
2844         (void) snmp_append_data(mpdata, (char *)&is->is_rawip_mib,
2845             sizeof (is->is_rawip_mib));
2846         optp->len = msgdsize(mpdata);
2847         qreply(q, mpctl);
2848 
2849         return (mp2ctl);
2850 }
2851 
2852 /*
2853  * Return 0 if invalid set request, 1 otherwise, including non-rawip requests.
2854  * TODO:  If this ever actually tries to set anything, it needs to be
2855  * to do the appropriate locking.
2856  */
2857 /* ARGSUSED */
2858 int
2859 icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
2860     uchar_t *ptr, int len)
2861 {
2862         switch (level) {
2863         case EXPER_RAWIP:
2864                 return (0);
2865         default:
2866                 return (1);
2867         }
2868 }
2869 
2870 /*
2871  * This routine creates a T_UDERROR_IND message and passes it upstream.
2872  * The address and options are copied from the T_UNITDATA_REQ message
2873  * passed in mp.  This message is freed.
2874  */
2875 static void
2876 icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err)
2877 {
2878         struct T_unitdata_req *tudr;
2879         mblk_t  *mp1;
2880         uchar_t *destaddr;
2881         t_scalar_t destlen;
2882         uchar_t *optaddr;
2883         t_scalar_t optlen;
2884 
2885         if ((mp->b_wptr < mp->b_rptr) ||
2886             (MBLKL(mp)) < sizeof (struct T_unitdata_req)) {
2887                 goto done;
2888         }
2889         tudr = (struct T_unitdata_req *)mp->b_rptr;
2890         destaddr = mp->b_rptr + tudr->DEST_offset;
2891         if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr ||
2892             destaddr + tudr->DEST_length < mp->b_rptr ||
2893             destaddr + tudr->DEST_length > mp->b_wptr) {
2894                 goto done;
2895         }
2896         optaddr = mp->b_rptr + tudr->OPT_offset;
2897         if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr ||
2898             optaddr + tudr->OPT_length < mp->b_rptr ||
2899             optaddr + tudr->OPT_length > mp->b_wptr) {
2900                 goto done;
2901         }
2902         destlen = tudr->DEST_length;
2903         optlen = tudr->OPT_length;
2904 
2905         mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen,
2906             (char *)optaddr, optlen, err);
2907         if (mp1 != NULL)
2908                 qreply(q, mp1);
2909 
2910 done:
2911         freemsg(mp);
2912 }
2913 
2914 static int
2915 rawip_do_unbind(conn_t *connp)
2916 {
2917         icmp_t  *icmp = connp->conn_icmp;
2918 
2919         mutex_enter(&connp->conn_lock);
2920         /* If a bind has not been done, we can't unbind. */
2921         if (icmp->icmp_state == TS_UNBND) {
2922                 mutex_exit(&connp->conn_lock);
2923                 return (-TOUTSTATE);
2924         }
2925         connp->conn_saddr_v6 = ipv6_all_zeros;
2926         connp->conn_bound_addr_v6 = ipv6_all_zeros;
2927         connp->conn_laddr_v6 = ipv6_all_zeros;
2928         connp->conn_mcbc_bind = B_FALSE;
2929         connp->conn_lport = 0;
2930         connp->conn_fport = 0;
2931         /* In case we were also connected */
2932         connp->conn_faddr_v6 = ipv6_all_zeros;
2933         connp->conn_v6lastdst = ipv6_all_zeros;
2934 
2935         icmp->icmp_state = TS_UNBND;
2936 
2937         (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
2938             &connp->conn_faddr_v6, connp->conn_flowinfo);
2939         mutex_exit(&connp->conn_lock);
2940 
2941         ip_unbind(connp);
2942         return (0);
2943 }
2944 
2945 /*
2946  * This routine is called by icmp_wput to handle T_UNBIND_REQ messages.
2947  * After some error checking, the message is passed downstream to ip.
2948  */
2949 static void
2950 icmp_tpi_unbind(queue_t *q, mblk_t *mp)
2951 {
2952         conn_t  *connp = Q_TO_CONN(q);
2953         int     error;
2954 
2955         ASSERT(mp->b_cont == NULL);
2956         error = rawip_do_unbind(connp);
2957         if (error) {
2958                 if (error < 0) {
2959                         icmp_err_ack(q, mp, -error, 0);
2960                 } else {
2961                         icmp_err_ack(q, mp, 0, error);
2962                 }
2963                 return;
2964         }
2965 
2966         /*
2967          * Convert mp into a T_OK_ACK
2968          */
2969 
2970         mp = mi_tpi_ok_ack_alloc(mp);
2971 
2972         /*
2973          * should not happen in practice... T_OK_ACK is smaller than the
2974          * original message.
2975          */
2976         ASSERT(mp != NULL);
2977         ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK);
2978         qreply(q, mp);
2979 }
2980 
2981 /*
2982  * Process IPv4 packets that already include an IP header.
2983  * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and
2984  * IPPROTO_IGMP).
2985  * In this case we ignore the address and any options in the T_UNITDATA_REQ.
2986  *
2987  * The packet is assumed to have a base (20 byte) IP header followed
2988  * by the upper-layer protocol. We include any IP_OPTIONS including a
2989  * CIPSO label but otherwise preserve the base IP header.
2990  */
2991 static int
2992 icmp_output_hdrincl(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid)
2993 {
2994         icmp_t          *icmp = connp->conn_icmp;
2995         icmp_stack_t    *is = icmp->icmp_is;
2996         ipha_t          iphas;
2997         ipha_t          *ipha;
2998         int             ip_hdr_length;
2999         int             tp_hdr_len;
3000         ip_xmit_attr_t  *ixa;
3001         ip_pkt_t        *ipp;
3002         in6_addr_t      v6src;
3003         in6_addr_t      v6dst;
3004         in6_addr_t      v6nexthop;
3005         int             error;
3006         boolean_t       do_ipsec;
3007 
3008         /*
3009          * We need an exclusive copy of conn_ixa since the included IP
3010          * header could have any destination.
3011          * That copy has no pointers hence we
3012          * need to set them up once we've parsed the ancillary data.
3013          */
3014         ixa = conn_get_ixa_exclusive(connp);
3015         if (ixa == NULL) {
3016                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3017                 freemsg(mp);
3018                 return (ENOMEM);
3019         }
3020         ASSERT(cr != NULL);
3021         /*
3022          * Caller has a reference on cr; from db_credp or because we
3023          * are running in process context.
3024          */
3025         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3026         ixa->ixa_cred = cr;
3027         ixa->ixa_cpid = pid;
3028         if (is_system_labeled()) {
3029                 /* We need to restart with a label based on the cred */
3030                 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
3031         }
3032 
3033         /* In case previous destination was multicast or multirt */
3034         ip_attr_newdst(ixa);
3035 
3036         /* Get a copy of conn_xmit_ipp since the TX label might change it */
3037         ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP);
3038         if (ipp == NULL) {
3039                 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3040                 ixa->ixa_cred = connp->conn_cred; /* Restore */
3041                 ixa->ixa_cpid = connp->conn_cpid;
3042                 ixa_refrele(ixa);
3043                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3044                 freemsg(mp);
3045                 return (ENOMEM);
3046         }
3047         mutex_enter(&connp->conn_lock);
3048         error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP);
3049         mutex_exit(&connp->conn_lock);
3050         if (error != 0) {
3051                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3052                 freemsg(mp);
3053                 goto done;
3054         }
3055 
3056         /* Sanity check length of packet */
3057         ipha = (ipha_t *)mp->b_rptr;
3058 
3059         ip_hdr_length = IP_SIMPLE_HDR_LENGTH;
3060         if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) {
3061                 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
3062                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3063                         freemsg(mp);
3064                         goto done;
3065                 }
3066                 ipha = (ipha_t *)mp->b_rptr;
3067         }
3068         ipha->ipha_version_and_hdr_length =
3069             (IP_VERSION<<4) | (ip_hdr_length>>2);
3070 
3071         /*
3072          * We set IXAF_DONTFRAG if the application set DF which makes
3073          * IP not fragment.
3074          */
3075         ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF);
3076         if (ipha->ipha_fragment_offset_and_flags & htons(IPH_DF))
3077                 ixa->ixa_flags |= (IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
3078         else
3079                 ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
3080 
3081         /* Even for multicast and broadcast we honor the apps ttl */
3082         ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
3083 
3084         /*
3085          * No source verification for non-local addresses
3086          */
3087         if (ipha->ipha_src != INADDR_ANY &&
3088             ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid,
3089             is->is_netstack->netstack_ip, B_FALSE)
3090             != IPVL_UNICAST_UP) {
3091                 ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
3092         }
3093 
3094         if (ipha->ipha_dst == INADDR_ANY)
3095                 ipha->ipha_dst = htonl(INADDR_LOOPBACK);
3096 
3097         IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src);
3098         IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
3099 
3100         /* Defer IPsec if it might need to look at ICMP type/code */
3101         do_ipsec = ipha->ipha_protocol != IPPROTO_ICMP;
3102         ixa->ixa_flags |= IXAF_IS_IPV4;
3103 
3104         ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop);
3105         error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop,
3106             connp->conn_fport, &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
3107             (do_ipsec ? IPDF_IPSEC : 0));
3108         switch (error) {
3109         case 0:
3110                 break;
3111         case EADDRNOTAVAIL:
3112                 /*
3113                  * IXAF_VERIFY_SOURCE tells us to pick a better source.
3114                  * Don't have the application see that errno
3115                  */
3116                 error = ENETUNREACH;
3117                 goto failed;
3118         case ENETDOWN:
3119                 /*
3120                  * Have !ipif_addr_ready address; drop packet silently
3121                  * until we can get applications to not send until we
3122                  * are ready.
3123                  */
3124                 error = 0;
3125                 goto failed;
3126         case EHOSTUNREACH:
3127         case ENETUNREACH:
3128                 if (ixa->ixa_ire != NULL) {
3129                         /*
3130                          * Let conn_ip_output/ire_send_noroute return
3131                          * the error and send any local ICMP error.
3132                          */
3133                         error = 0;
3134                         break;
3135                 }
3136                 /* FALLTHRU */
3137         default:
3138         failed:
3139                 freemsg(mp);
3140                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3141                 goto done;
3142         }
3143         if (ipha->ipha_src == INADDR_ANY)
3144                 IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src);
3145 
3146         /*
3147          * We might be going to a different destination than last time,
3148          * thus check that TX allows the communication and compute any
3149          * needed label.
3150          *
3151          * TSOL Note: We have an exclusive ipp and ixa for this thread so we
3152          * don't have to worry about concurrent threads.
3153          */
3154         if (is_system_labeled()) {
3155                 /*
3156                  * Check whether Trusted Solaris policy allows communication
3157                  * with this host, and pretend that the destination is
3158                  * unreachable if not.
3159                  * Compute any needed label and place it in ipp_label_v4/v6.
3160                  *
3161                  * Later conn_build_hdr_template/conn_prepend_hdr takes
3162                  * ipp_label_v4/v6 to form the packet.
3163                  *
3164                  * Tsol note: We have ipp structure local to this thread so
3165                  * no locking is needed.
3166                  */
3167                 error = conn_update_label(connp, ixa, &v6dst, ipp);
3168                 if (error != 0) {
3169                         freemsg(mp);
3170                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3171                         goto done;
3172                 }
3173         }
3174 
3175         /*
3176          * Save away a copy of the IPv4 header the application passed down
3177          * and then prepend an IPv4 header complete with any IP options
3178          * including label.
3179          * We need a struct copy since icmp_prepend_hdr will reuse the available
3180          * space in the mblk.
3181          */
3182         iphas = *ipha;
3183         mp->b_rptr += IP_SIMPLE_HDR_LENGTH;
3184 
3185         mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, 0, mp, &error);
3186         if (mp == NULL) {
3187                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3188                 ASSERT(error != 0);
3189                 goto done;
3190         }
3191         if (ixa->ixa_pktlen > IP_MAXPACKET) {
3192                 error = EMSGSIZE;
3193                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3194                 freemsg(mp);
3195                 goto done;
3196         }
3197         /* Restore key parts of the header that the application passed down */
3198         ipha = (ipha_t *)mp->b_rptr;
3199         ipha->ipha_type_of_service = iphas.ipha_type_of_service;
3200         ipha->ipha_ident = iphas.ipha_ident;
3201         ipha->ipha_fragment_offset_and_flags =
3202             iphas.ipha_fragment_offset_and_flags;
3203         ipha->ipha_ttl = iphas.ipha_ttl;
3204         ipha->ipha_protocol = iphas.ipha_protocol;
3205         ipha->ipha_src = iphas.ipha_src;
3206         ipha->ipha_dst = iphas.ipha_dst;
3207 
3208         ixa->ixa_protocol = ipha->ipha_protocol;
3209 
3210         /*
3211          * Make sure that the IP header plus any transport header that is
3212          * checksumed by ip_output is in the first mblk. (ip_output assumes
3213          * that at least the checksum field is in the first mblk.)
3214          */
3215         switch (ipha->ipha_protocol) {
3216         case IPPROTO_UDP:
3217                 tp_hdr_len = 8;
3218                 break;
3219         case IPPROTO_TCP:
3220                 tp_hdr_len = 20;
3221                 break;
3222         default:
3223                 tp_hdr_len = 0;
3224                 break;
3225         }
3226         ip_hdr_length = IPH_HDR_LENGTH(ipha);
3227         if (mp->b_wptr - mp->b_rptr < ip_hdr_length + tp_hdr_len) {
3228                 if (!pullupmsg(mp, ip_hdr_length + tp_hdr_len)) {
3229                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3230                         if (mp->b_cont == NULL)
3231                                 error = EINVAL;
3232                         else
3233                                 error = ENOMEM;
3234                         freemsg(mp);
3235                         goto done;
3236                 }
3237         }
3238 
3239         if (!do_ipsec) {
3240                 /* Policy might differ for different ICMP type/code */
3241                 if (ixa->ixa_ipsec_policy != NULL) {
3242                         IPPOL_REFRELE(ixa->ixa_ipsec_policy);
3243                         ixa->ixa_ipsec_policy = NULL;
3244                         ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
3245                 }
3246                 mp = ip_output_attach_policy(mp, ipha, NULL, connp, ixa);
3247                 if (mp == NULL) {
3248                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3249                         error = EHOSTUNREACH;   /* IPsec policy failure */
3250                         goto done;
3251                 }
3252         }
3253 
3254         /* We're done.  Pass the packet to ip. */
3255         BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3256 
3257         error = conn_ip_output(mp, ixa);
3258         /* No rawipOutErrors if an error since IP increases its error counter */
3259         switch (error) {
3260         case 0:
3261                 break;
3262         case EWOULDBLOCK:
3263                 (void) ixa_check_drain_insert(connp, ixa);
3264                 error = 0;
3265                 break;
3266         case EADDRNOTAVAIL:
3267                 /*
3268                  * IXAF_VERIFY_SOURCE tells us to pick a better source.
3269                  * Don't have the application see that errno
3270                  */
3271                 error = ENETUNREACH;
3272                 break;
3273         }
3274 done:
3275         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3276         ixa->ixa_cred = connp->conn_cred; /* Restore */
3277         ixa->ixa_cpid = connp->conn_cpid;
3278         ixa_refrele(ixa);
3279         ip_pkt_free(ipp);
3280         kmem_free(ipp, sizeof (*ipp));
3281         return (error);
3282 }
3283 
3284 static mblk_t *
3285 icmp_output_attach_policy(mblk_t *mp, conn_t *connp, ip_xmit_attr_t *ixa)
3286 {
3287         ipha_t  *ipha = NULL;
3288         ip6_t   *ip6h = NULL;
3289 
3290         if (ixa->ixa_flags & IXAF_IS_IPV4)
3291                 ipha = (ipha_t *)mp->b_rptr;
3292         else
3293                 ip6h = (ip6_t *)mp->b_rptr;
3294 
3295         if (ixa->ixa_ipsec_policy != NULL) {
3296                 IPPOL_REFRELE(ixa->ixa_ipsec_policy);
3297                 ixa->ixa_ipsec_policy = NULL;
3298                 ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
3299         }
3300         return (ip_output_attach_policy(mp, ipha, ip6h, connp, ixa));
3301 }
3302 
3303 /*
3304  * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6
3305  * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from
3306  * the TPI options, otherwise we take them from msg_control.
3307  * If both sin and sin6 is set it is a connected socket and we use conn_faddr.
3308  * Always consumes mp; never consumes tudr_mp.
3309  */
3310 static int
3311 icmp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp,
3312     mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid)
3313 {
3314         icmp_t          *icmp = connp->conn_icmp;
3315         icmp_stack_t    *is = icmp->icmp_is;
3316         int             error;
3317         ip_xmit_attr_t  *ixa;
3318         ip_pkt_t        *ipp;
3319         in6_addr_t      v6src;
3320         in6_addr_t      v6dst;
3321         in6_addr_t      v6nexthop;
3322         in_port_t       dstport;
3323         uint32_t        flowinfo;
3324         uint_t          srcid;
3325         int             is_absreq_failure = 0;
3326         conn_opt_arg_t  coas, *coa;
3327 
3328         ASSERT(tudr_mp != NULL || msg != NULL);
3329 
3330         /*
3331          * Get ixa before checking state to handle a disconnect race.
3332          *
3333          * We need an exclusive copy of conn_ixa since the ancillary data
3334          * options might modify it. That copy has no pointers hence we
3335          * need to set them up once we've parsed the ancillary data.
3336          */
3337         ixa = conn_get_ixa_exclusive(connp);
3338         if (ixa == NULL) {
3339                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3340                 freemsg(mp);
3341                 return (ENOMEM);
3342         }
3343         ASSERT(cr != NULL);
3344         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3345         ixa->ixa_cred = cr;
3346         ixa->ixa_cpid = pid;
3347         if (is_system_labeled()) {
3348                 /* We need to restart with a label based on the cred */
3349                 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
3350         }
3351 
3352         /* In case previous destination was multicast or multirt */
3353         ip_attr_newdst(ixa);
3354 
3355         /* Get a copy of conn_xmit_ipp since the options might change it */
3356         ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP);
3357         if (ipp == NULL) {
3358                 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3359                 ixa->ixa_cred = connp->conn_cred; /* Restore */
3360                 ixa->ixa_cpid = connp->conn_cpid;
3361                 ixa_refrele(ixa);
3362                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3363                 freemsg(mp);
3364                 return (ENOMEM);
3365         }
3366         mutex_enter(&connp->conn_lock);
3367         error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP);
3368         mutex_exit(&connp->conn_lock);
3369         if (error != 0) {
3370                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3371                 freemsg(mp);
3372                 goto done;
3373         }
3374 
3375         /*
3376          * Parse the options and update ixa and ipp as a result.
3377          */
3378 
3379         coa = &coas;
3380         coa->coa_connp = connp;
3381         coa->coa_ixa = ixa;
3382         coa->coa_ipp = ipp;
3383         coa->coa_ancillary = B_TRUE;
3384         coa->coa_changed = 0;
3385 
3386         if (msg != NULL) {
3387                 error = process_auxiliary_options(connp, msg->msg_control,
3388                     msg->msg_controllen, coa, &icmp_opt_obj, icmp_opt_set, cr);
3389         } else {
3390                 struct T_unitdata_req *tudr;
3391 
3392                 tudr = (struct T_unitdata_req *)tudr_mp->b_rptr;
3393                 ASSERT(tudr->PRIM_type == T_UNITDATA_REQ);
3394                 error = tpi_optcom_buf(connp->conn_wq, tudr_mp,
3395                     &tudr->OPT_length, tudr->OPT_offset, cr, &icmp_opt_obj,
3396                     coa, &is_absreq_failure);
3397         }
3398         if (error != 0) {
3399                 /*
3400                  * Note: No special action needed in this
3401                  * module for "is_absreq_failure"
3402                  */
3403                 freemsg(mp);
3404                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3405                 goto done;
3406         }
3407         ASSERT(is_absreq_failure == 0);
3408 
3409         mutex_enter(&connp->conn_lock);
3410         /*
3411          * If laddr is unspecified then we look at sin6_src_id.
3412          * We will give precedence to a source address set with IPV6_PKTINFO
3413          * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
3414          * want ip_attr_connect to select a source (since it can fail) when
3415          * IPV6_PKTINFO is specified.
3416          * If this doesn't result in a source address then we get a source
3417          * from ip_attr_connect() below.
3418          */
3419         v6src = connp->conn_saddr_v6;
3420         if (sin != NULL) {
3421                 IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
3422                 dstport = sin->sin_port;
3423                 flowinfo = 0;
3424                 ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
3425                 ixa->ixa_flags |= IXAF_IS_IPV4;
3426         } else if (sin6 != NULL) {
3427                 v6dst = sin6->sin6_addr;
3428                 dstport = sin6->sin6_port;
3429                 flowinfo = sin6->sin6_flowinfo;
3430                 srcid = sin6->__sin6_src_id;
3431                 if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
3432                         ixa->ixa_scopeid = sin6->sin6_scope_id;
3433                         ixa->ixa_flags |= IXAF_SCOPEID_SET;
3434                 } else {
3435                         ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
3436                 }
3437                 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
3438                         ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
3439                             connp->conn_netstack);
3440                 }
3441                 if (IN6_IS_ADDR_V4MAPPED(&v6dst))
3442                         ixa->ixa_flags |= IXAF_IS_IPV4;
3443                 else
3444                         ixa->ixa_flags &= ~IXAF_IS_IPV4;
3445         } else {
3446                 /* Connected case */
3447                 v6dst = connp->conn_faddr_v6;
3448                 flowinfo = connp->conn_flowinfo;
3449         }
3450         mutex_exit(&connp->conn_lock);
3451         /* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. */
3452         if (ipp->ipp_fields & IPPF_ADDR) {
3453                 if (ixa->ixa_flags & IXAF_IS_IPV4) {
3454                         if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
3455                                 v6src = ipp->ipp_addr;
3456                 } else {
3457                         if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
3458                                 v6src = ipp->ipp_addr;
3459                 }
3460         }
3461         /*
3462          * Allow source not assigned to the system
3463          * only if it is not a local addresses
3464          */
3465         if (!V6_OR_V4_INADDR_ANY(v6src)) {
3466                 ip_laddr_t laddr_type;
3467 
3468                 if (ixa->ixa_flags & IXAF_IS_IPV4) {
3469                         ipaddr_t v4src;
3470 
3471                         IN6_V4MAPPED_TO_IPADDR(&v6src, v4src);
3472                         laddr_type = ip_laddr_verify_v4(v4src, ixa->ixa_zoneid,
3473                             is->is_netstack->netstack_ip, B_FALSE);
3474                 } else {
3475                         laddr_type = ip_laddr_verify_v6(&v6src, ixa->ixa_zoneid,
3476                             is->is_netstack->netstack_ip, B_FALSE, B_FALSE);
3477                 }
3478                 if (laddr_type != IPVL_UNICAST_UP)
3479                         ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
3480         }
3481 
3482         ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop);
3483         error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
3484             &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);
3485 
3486         switch (error) {
3487         case 0:
3488                 break;
3489         case EADDRNOTAVAIL:
3490                 /*
3491                  * IXAF_VERIFY_SOURCE tells us to pick a better source.
3492                  * Don't have the application see that errno
3493                  */
3494                 error = ENETUNREACH;
3495                 goto failed;
3496         case ENETDOWN:
3497                 /*
3498                  * Have !ipif_addr_ready address; drop packet silently
3499                  * until we can get applications to not send until we
3500                  * are ready.
3501                  */
3502                 error = 0;
3503                 goto failed;
3504         case EHOSTUNREACH:
3505         case ENETUNREACH:
3506                 if (ixa->ixa_ire != NULL) {
3507                         /*
3508                          * Let conn_ip_output/ire_send_noroute return
3509                          * the error and send any local ICMP error.
3510                          */
3511                         error = 0;
3512                         break;
3513                 }
3514                 /* FALLTHRU */
3515         default:
3516         failed:
3517                 freemsg(mp);
3518                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3519                 goto done;
3520         }
3521 
3522         /*
3523          * We might be going to a different destination than last time,
3524          * thus check that TX allows the communication and compute any
3525          * needed label.
3526          *
3527          * TSOL Note: We have an exclusive ipp and ixa for this thread so we
3528          * don't have to worry about concurrent threads.
3529          */
3530         if (is_system_labeled()) {
3531                 /*
3532                  * Check whether Trusted Solaris policy allows communication
3533                  * with this host, and pretend that the destination is
3534                  * unreachable if not.
3535                  * Compute any needed label and place it in ipp_label_v4/v6.
3536                  *
3537                  * Later conn_build_hdr_template/conn_prepend_hdr takes
3538                  * ipp_label_v4/v6 to form the packet.
3539                  *
3540                  * Tsol note: We have ipp structure local to this thread so
3541                  * no locking is needed.
3542                  */
3543                 error = conn_update_label(connp, ixa, &v6dst, ipp);
3544                 if (error != 0) {
3545                         freemsg(mp);
3546                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3547                         goto done;
3548                 }
3549         }
3550         mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, flowinfo, mp,
3551             &error);
3552         if (mp == NULL) {
3553                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3554                 ASSERT(error != 0);
3555                 goto done;
3556         }
3557         if (ixa->ixa_pktlen > IP_MAXPACKET) {
3558                 error = EMSGSIZE;
3559                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3560                 freemsg(mp);
3561                 goto done;
3562         }
3563 
3564         /* Policy might differ for different ICMP type/code */
3565         mp = icmp_output_attach_policy(mp, connp, ixa);
3566         if (mp == NULL) {
3567                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3568                 error = EHOSTUNREACH;   /* IPsec policy failure */
3569                 goto done;
3570         }
3571 
3572         /* We're done.  Pass the packet to ip. */
3573         BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3574 
3575         error = conn_ip_output(mp, ixa);
3576         if (!connp->conn_unspec_src)
3577                 ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
3578         /* No rawipOutErrors if an error since IP increases its error counter */
3579         switch (error) {
3580         case 0:
3581                 break;
3582         case EWOULDBLOCK:
3583                 (void) ixa_check_drain_insert(connp, ixa);
3584                 error = 0;
3585                 break;
3586         case EADDRNOTAVAIL:
3587                 /*
3588                  * IXAF_VERIFY_SOURCE tells us to pick a better source.
3589                  * Don't have the application see that errno
3590                  */
3591                 error = ENETUNREACH;
3592                 /* FALLTHRU */
3593         default:
3594                 mutex_enter(&connp->conn_lock);
3595                 /*
3596                  * Clear the source and v6lastdst so we call ip_attr_connect
3597                  * for the next packet and try to pick a better source.
3598                  */
3599                 if (connp->conn_mcbc_bind)
3600                         connp->conn_saddr_v6 = ipv6_all_zeros;
3601                 else
3602                         connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
3603                 connp->conn_v6lastdst = ipv6_all_zeros;
3604                 mutex_exit(&connp->conn_lock);
3605                 break;
3606         }
3607 done:
3608         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3609         ixa->ixa_cred = connp->conn_cred; /* Restore */
3610         ixa->ixa_cpid = connp->conn_cpid;
3611         ixa_refrele(ixa);
3612         ip_pkt_free(ipp);
3613         kmem_free(ipp, sizeof (*ipp));
3614         return (error);
3615 }
3616 
3617 /*
3618  * Handle sending an M_DATA for a connected socket.
3619  * Handles both IPv4 and IPv6.
3620  */
3621 int
3622 icmp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid)
3623 {
3624         icmp_t          *icmp = connp->conn_icmp;
3625         icmp_stack_t    *is = icmp->icmp_is;
3626         int             error;
3627         ip_xmit_attr_t  *ixa;
3628         boolean_t       do_ipsec;
3629 
3630         /*
3631          * If no other thread is using conn_ixa this just gets a reference to
3632          * conn_ixa. Otherwise we get a safe copy of conn_ixa.
3633          */
3634         ixa = conn_get_ixa(connp, B_FALSE);
3635         if (ixa == NULL) {
3636                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3637                 freemsg(mp);
3638                 return (ENOMEM);
3639         }
3640 
3641         ASSERT(cr != NULL);
3642         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3643         ixa->ixa_cred = cr;
3644         ixa->ixa_cpid = pid;
3645 
3646         /* Defer IPsec if it might need to look at ICMP type/code */
3647         switch (ixa->ixa_protocol) {
3648         case IPPROTO_ICMP:
3649         case IPPROTO_ICMPV6:
3650                 do_ipsec = B_FALSE;
3651                 break;
3652         default:
3653                 do_ipsec = B_TRUE;
3654         }
3655 
3656         mutex_enter(&connp->conn_lock);
3657         mp = icmp_prepend_header_template(connp, ixa, mp,
3658             &connp->conn_saddr_v6, connp->conn_flowinfo, &error);
3659 
3660         if (mp == NULL) {
3661                 ASSERT(error != 0);
3662                 mutex_exit(&connp->conn_lock);
3663                 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3664                 ixa->ixa_cred = connp->conn_cred; /* Restore */
3665                 ixa->ixa_cpid = connp->conn_cpid;
3666                 ixa_refrele(ixa);
3667                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3668                 freemsg(mp);
3669                 return (error);
3670         }
3671 
3672         if (!do_ipsec) {
3673                 /* Policy might differ for different ICMP type/code */
3674                 mp = icmp_output_attach_policy(mp, connp, ixa);
3675                 if (mp == NULL) {
3676                         mutex_exit(&connp->conn_lock);
3677                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3678                         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3679                         ixa->ixa_cred = connp->conn_cred; /* Restore */
3680                         ixa->ixa_cpid = connp->conn_cpid;
3681                         ixa_refrele(ixa);
3682                         return (EHOSTUNREACH);  /* IPsec policy failure */
3683                 }
3684         }
3685 
3686         /*
3687          * In case we got a safe copy of conn_ixa, or if opt_set made us a new
3688          * safe copy, then we need to fill in any pointers in it.
3689          */
3690         if (ixa->ixa_ire == NULL) {
3691                 in6_addr_t      faddr, saddr;
3692                 in6_addr_t      nexthop;
3693                 in_port_t       fport;
3694 
3695                 saddr = connp->conn_saddr_v6;
3696                 faddr = connp->conn_faddr_v6;
3697                 fport = connp->conn_fport;
3698                 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop);
3699                 mutex_exit(&connp->conn_lock);
3700 
3701                 error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop,
3702                     fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
3703                     (do_ipsec ? IPDF_IPSEC : 0));
3704                 switch (error) {
3705                 case 0:
3706                         break;
3707                 case EADDRNOTAVAIL:
3708                         /*
3709                          * IXAF_VERIFY_SOURCE tells us to pick a better source.
3710                          * Don't have the application see that errno
3711                          */
3712                         error = ENETUNREACH;
3713                         goto failed;
3714                 case ENETDOWN:
3715                         /*
3716                          * Have !ipif_addr_ready address; drop packet silently
3717                          * until we can get applications to not send until we
3718                          * are ready.
3719                          */
3720                         error = 0;
3721                         goto failed;
3722                 case EHOSTUNREACH:
3723                 case ENETUNREACH:
3724                         if (ixa->ixa_ire != NULL) {
3725                                 /*
3726                                  * Let conn_ip_output/ire_send_noroute return
3727                                  * the error and send any local ICMP error.
3728                                  */
3729                                 error = 0;
3730                                 break;
3731                         }
3732                         /* FALLTHRU */
3733                 default:
3734                 failed:
3735                         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3736                         ixa->ixa_cred = connp->conn_cred; /* Restore */
3737                         ixa->ixa_cpid = connp->conn_cpid;
3738                         ixa_refrele(ixa);
3739                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3740                         freemsg(mp);
3741                         return (error);
3742                 }
3743         } else {
3744                 /* Done with conn_t */
3745                 mutex_exit(&connp->conn_lock);
3746         }
3747 
3748         /* We're done.  Pass the packet to ip. */
3749         BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3750 
3751         error = conn_ip_output(mp, ixa);
3752         /* No rawipOutErrors if an error since IP increases its error counter */
3753         switch (error) {
3754         case 0:
3755                 break;
3756         case EWOULDBLOCK:
3757                 (void) ixa_check_drain_insert(connp, ixa);
3758                 error = 0;
3759                 break;
3760         case EADDRNOTAVAIL:
3761                 /*
3762                  * IXAF_VERIFY_SOURCE tells us to pick a better source.
3763                  * Don't have the application see that errno
3764                  */
3765                 error = ENETUNREACH;
3766                 break;
3767         }
3768         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3769         ixa->ixa_cred = connp->conn_cred; /* Restore */
3770         ixa->ixa_cpid = connp->conn_cpid;
3771         ixa_refrele(ixa);
3772         return (error);
3773 }
3774 
3775 /*
3776  * Handle sending an M_DATA to the last destination.
3777  * Handles both IPv4 and IPv6.
3778  *
3779  * NOTE: The caller must hold conn_lock and we drop it here.
3780  */
3781 int
3782 icmp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid,
3783     ip_xmit_attr_t *ixa)
3784 {
3785         icmp_t          *icmp = connp->conn_icmp;
3786         icmp_stack_t    *is = icmp->icmp_is;
3787         int             error;
3788         boolean_t       do_ipsec;
3789 
3790         ASSERT(MUTEX_HELD(&connp->conn_lock));
3791         ASSERT(ixa != NULL);
3792 
3793         ASSERT(cr != NULL);
3794         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3795         ixa->ixa_cred = cr;
3796         ixa->ixa_cpid = pid;
3797 
3798         /* Defer IPsec if it might need to look at ICMP type/code */
3799         switch (ixa->ixa_protocol) {
3800         case IPPROTO_ICMP:
3801         case IPPROTO_ICMPV6:
3802                 do_ipsec = B_FALSE;
3803                 break;
3804         default:
3805                 do_ipsec = B_TRUE;
3806         }
3807 
3808 
3809         mp = icmp_prepend_header_template(connp, ixa, mp,
3810             &connp->conn_v6lastsrc, connp->conn_lastflowinfo, &error);
3811 
3812         if (mp == NULL) {
3813                 ASSERT(error != 0);
3814                 mutex_exit(&connp->conn_lock);
3815                 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3816                 ixa->ixa_cred = connp->conn_cred; /* Restore */
3817                 ixa->ixa_cpid = connp->conn_cpid;
3818                 ixa_refrele(ixa);
3819                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3820                 freemsg(mp);
3821                 return (error);
3822         }
3823 
3824         if (!do_ipsec) {
3825                 /* Policy might differ for different ICMP type/code */
3826                 mp = icmp_output_attach_policy(mp, connp, ixa);
3827                 if (mp == NULL) {
3828                         mutex_exit(&connp->conn_lock);
3829                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3830                         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3831                         ixa->ixa_cred = connp->conn_cred; /* Restore */
3832                         ixa->ixa_cpid = connp->conn_cpid;
3833                         ixa_refrele(ixa);
3834                         return (EHOSTUNREACH);  /* IPsec policy failure */
3835                 }
3836         }
3837 
3838         /*
3839          * In case we got a safe copy of conn_ixa, or if opt_set made us a new
3840          * safe copy, then we need to fill in any pointers in it.
3841          */
3842         if (ixa->ixa_ire == NULL) {
3843                 in6_addr_t      lastdst, lastsrc;
3844                 in6_addr_t      nexthop;
3845                 in_port_t       lastport;
3846 
3847                 lastsrc = connp->conn_v6lastsrc;
3848                 lastdst = connp->conn_v6lastdst;
3849                 lastport = connp->conn_lastdstport;
3850                 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop);
3851                 mutex_exit(&connp->conn_lock);
3852 
3853                 error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst,
3854                     &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC |
3855                     IPDF_VERIFY_DST | (do_ipsec ? IPDF_IPSEC : 0));
3856                 switch (error) {
3857                 case 0:
3858                         break;
3859                 case EADDRNOTAVAIL:
3860                         /*
3861                          * IXAF_VERIFY_SOURCE tells us to pick a better source.
3862                          * Don't have the application see that errno
3863                          */
3864                         error = ENETUNREACH;
3865                         goto failed;
3866                 case ENETDOWN:
3867                         /*
3868                          * Have !ipif_addr_ready address; drop packet silently
3869                          * until we can get applications to not send until we
3870                          * are ready.
3871                          */
3872                         error = 0;
3873                         goto failed;
3874                 case EHOSTUNREACH:
3875                 case ENETUNREACH:
3876                         if (ixa->ixa_ire != NULL) {
3877                                 /*
3878                                  * Let conn_ip_output/ire_send_noroute return
3879                                  * the error and send any local ICMP error.
3880                                  */
3881                                 error = 0;
3882                                 break;
3883                         }
3884                         /* FALLTHRU */
3885                 default:
3886                 failed:
3887                         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3888                         ixa->ixa_cred = connp->conn_cred; /* Restore */
3889                         ixa->ixa_cpid = connp->conn_cpid;
3890                         ixa_refrele(ixa);
3891                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3892                         freemsg(mp);
3893                         return (error);
3894                 }
3895         } else {
3896                 /* Done with conn_t */
3897                 mutex_exit(&connp->conn_lock);
3898         }
3899 
3900         /* We're done.  Pass the packet to ip. */
3901         BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3902         error = conn_ip_output(mp, ixa);
3903         /* No rawipOutErrors if an error since IP increases its error counter */
3904         switch (error) {
3905         case 0:
3906                 break;
3907         case EWOULDBLOCK:
3908                 (void) ixa_check_drain_insert(connp, ixa);
3909                 error = 0;
3910                 break;
3911         case EADDRNOTAVAIL:
3912                 /*
3913                  * IXAF_VERIFY_SOURCE tells us to pick a better source.
3914                  * Don't have the application see that errno
3915                  */
3916                 error = ENETUNREACH;
3917                 /* FALLTHRU */
3918         default:
3919                 mutex_enter(&connp->conn_lock);
3920                 /*
3921                  * Clear the source and v6lastdst so we call ip_attr_connect
3922                  * for the next packet and try to pick a better source.
3923                  */
3924                 if (connp->conn_mcbc_bind)
3925                         connp->conn_saddr_v6 = ipv6_all_zeros;
3926                 else
3927                         connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
3928                 connp->conn_v6lastdst = ipv6_all_zeros;
3929                 mutex_exit(&connp->conn_lock);
3930                 break;
3931         }
3932         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3933         ixa->ixa_cred = connp->conn_cred; /* Restore */
3934         ixa->ixa_cpid = connp->conn_cpid;
3935         ixa_refrele(ixa);
3936         return (error);
3937 }
3938 
3939 
3940 /*
3941  * Prepend the header template and then fill in the source and
3942  * flowinfo. The caller needs to handle the destination address since
3943  * it's setting is different if rthdr or source route.
3944  *
3945  * Returns NULL is allocation failed or if the packet would exceed IP_MAXPACKET.
3946  * When it returns NULL it sets errorp.
3947  */
3948 static mblk_t *
3949 icmp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
3950     const in6_addr_t *v6src, uint32_t flowinfo, int *errorp)
3951 {
3952         icmp_t          *icmp = connp->conn_icmp;
3953         icmp_stack_t    *is = icmp->icmp_is;
3954         uint_t          pktlen;
3955         uint_t          copylen;
3956         uint8_t         *iph;
3957         uint_t          ip_hdr_length;
3958         uint32_t        cksum;
3959         ip_pkt_t        *ipp;
3960 
3961         ASSERT(MUTEX_HELD(&connp->conn_lock));
3962 
3963         /*
3964          * Copy the header template.
3965          */
3966         copylen = connp->conn_ht_iphc_len;
3967         pktlen = copylen + msgdsize(mp);
3968         if (pktlen > IP_MAXPACKET) {
3969                 freemsg(mp);
3970                 *errorp = EMSGSIZE;
3971                 return (NULL);
3972         }
3973         ixa->ixa_pktlen = pktlen;
3974 
3975         /* check/fix buffer config, setup pointers into it */
3976         iph = mp->b_rptr - copylen;
3977         if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) {
3978                 mblk_t *mp1;
3979 
3980                 mp1 = allocb(copylen + is->is_wroff_extra, BPRI_MED);
3981                 if (mp1 == NULL) {
3982                         freemsg(mp);
3983                         *errorp = ENOMEM;
3984                         return (NULL);
3985                 }
3986                 mp1->b_wptr = DB_LIM(mp1);
3987                 mp1->b_cont = mp;
3988                 mp = mp1;
3989                 iph = (mp->b_wptr - copylen);
3990         }
3991         mp->b_rptr = iph;
3992         bcopy(connp->conn_ht_iphc, iph, copylen);
3993         ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc);
3994 
3995         ixa->ixa_ip_hdr_length = ip_hdr_length;
3996 
3997         /*
3998          * Prepare for ICMPv6 checksum done in IP.
3999          *
4000          * icmp_build_hdr_template has already massaged any routing header
4001          * and placed the result in conn_sum.
4002          *
4003          * We make it easy for IP to include our pseudo header
4004          * by putting our length (and any routing header adjustment)
4005          * in the ICMPv6 checksum field.
4006          */
4007         cksum = pktlen - ip_hdr_length;
4008 
4009         cksum += connp->conn_sum;
4010         cksum = (cksum >> 16) + (cksum & 0xFFFF);
4011         ASSERT(cksum < 0x10000);
4012 
4013         ipp = &connp->conn_xmit_ipp;
4014         if (ixa->ixa_flags & IXAF_IS_IPV4) {
4015                 ipha_t  *ipha = (ipha_t *)iph;
4016 
4017                 ipha->ipha_length = htons((uint16_t)pktlen);
4018 
4019                 /* if IP_PKTINFO specified an addres it wins over bind() */
4020                 if ((ipp->ipp_fields & IPPF_ADDR) &&
4021                     IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
4022                         ASSERT(ipp->ipp_addr_v4 != INADDR_ANY);
4023                         ipha->ipha_src = ipp->ipp_addr_v4;
4024                 } else {
4025                         IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src);
4026                 }
4027         } else {
4028                 ip6_t *ip6h = (ip6_t *)iph;
4029                 uint_t  cksum_offset = 0;
4030 
4031                 ip6h->ip6_plen =  htons((uint16_t)(pktlen - IPV6_HDR_LEN));
4032 
4033                 /* if IP_PKTINFO specified an addres it wins over bind() */
4034                 if ((ipp->ipp_fields & IPPF_ADDR) &&
4035                     !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
4036                         ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr));
4037                         ip6h->ip6_src = ipp->ipp_addr;
4038                 } else {
4039                         ip6h->ip6_src = *v6src;
4040                 }
4041                 ip6h->ip6_vcf =
4042                     (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
4043                     (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
4044                 if (ipp->ipp_fields & IPPF_TCLASS) {
4045                         /* Overrides the class part of flowinfo */
4046                         ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
4047                             ipp->ipp_tclass);
4048                 }
4049 
4050                 if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) {
4051                         if (connp->conn_proto == IPPROTO_ICMPV6) {
4052                                 cksum_offset = ixa->ixa_ip_hdr_length +
4053                                     offsetof(icmp6_t, icmp6_cksum);
4054                         } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
4055                                 cksum_offset = ixa->ixa_ip_hdr_length +
4056                                     ixa->ixa_raw_cksum_offset;
4057                         }
4058                 }
4059                 if (cksum_offset != 0) {
4060                         uint16_t *ptr;
4061 
4062                         /* Make sure the checksum fits in the first mblk */
4063                         if (cksum_offset + sizeof (short) > MBLKL(mp)) {
4064                                 mblk_t *mp1;
4065 
4066                                 mp1 = msgpullup(mp,
4067                                     cksum_offset + sizeof (short));
4068                                 freemsg(mp);
4069                                 if (mp1 == NULL) {
4070                                         *errorp = ENOMEM;
4071                                         return (NULL);
4072                                 }
4073                                 mp = mp1;
4074                                 iph = mp->b_rptr;
4075                                 ip6h = (ip6_t *)iph;
4076                         }
4077                         ptr = (uint16_t *)(mp->b_rptr + cksum_offset);
4078                         *ptr = htons(cksum);
4079                 }
4080         }
4081 
4082         return (mp);
4083 }
4084 
4085 /*
4086  * This routine handles all messages passed downstream.  It either
4087  * consumes the message or passes it downstream; it never queues a
4088  * a message.
4089  */
4090 void
4091 icmp_wput(queue_t *q, mblk_t *mp)
4092 {
4093         sin6_t          *sin6;
4094         sin_t           *sin = NULL;
4095         uint_t          srcid;
4096         conn_t          *connp = Q_TO_CONN(q);
4097         icmp_t          *icmp = connp->conn_icmp;
4098         int             error = 0;
4099         struct sockaddr *addr = NULL;
4100         socklen_t       addrlen;
4101         icmp_stack_t    *is = icmp->icmp_is;
4102         struct T_unitdata_req *tudr;
4103         mblk_t          *data_mp;
4104         cred_t          *cr;
4105         pid_t           pid;
4106 
4107         /*
4108          * We directly handle several cases here: T_UNITDATA_REQ message
4109          * coming down as M_PROTO/M_PCPROTO and M_DATA messages for connected
4110          * socket.
4111          */
4112         switch (DB_TYPE(mp)) {
4113         case M_DATA:
4114                 /* sockfs never sends down M_DATA */
4115                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4116                 freemsg(mp);
4117                 return;
4118 
4119         case M_PROTO:
4120         case M_PCPROTO:
4121                 tudr = (struct T_unitdata_req *)mp->b_rptr;
4122                 if (MBLKL(mp) < sizeof (*tudr) ||
4123                     ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) {
4124                         icmp_wput_other(q, mp);
4125                         return;
4126                 }
4127                 break;
4128 
4129         default:
4130                 icmp_wput_other(q, mp);
4131                 return;
4132         }
4133 
4134         /* Handle valid T_UNITDATA_REQ here */
4135         data_mp = mp->b_cont;
4136         if (data_mp == NULL) {
4137                 error = EPROTO;
4138                 goto ud_error2;
4139         }
4140         mp->b_cont = NULL;
4141 
4142         if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) {
4143                 error = EADDRNOTAVAIL;
4144                 goto ud_error2;
4145         }
4146 
4147         /*
4148          * All Solaris components should pass a db_credp
4149          * for this message, hence we ASSERT.
4150          * On production kernels we return an error to be robust against
4151          * random streams modules sitting on top of us.
4152          */
4153         cr = msg_getcred(mp, &pid);
4154         ASSERT(cr != NULL);
4155         if (cr == NULL) {
4156                 error = EINVAL;
4157                 goto ud_error2;
4158         }
4159 
4160         /*
4161          * If a port has not been bound to the stream, fail.
4162          * This is not a problem when sockfs is directly
4163          * above us, because it will ensure that the socket
4164          * is first bound before allowing data to be sent.
4165          */
4166         if (icmp->icmp_state == TS_UNBND) {
4167                 error = EPROTO;
4168                 goto ud_error2;
4169         }
4170         addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset];
4171         addrlen = tudr->DEST_length;
4172 
4173         switch (connp->conn_family) {
4174         case AF_INET6:
4175                 sin6 = (sin6_t *)addr;
4176                 if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) ||
4177                     (sin6->sin6_family != AF_INET6)) {
4178                         error = EADDRNOTAVAIL;
4179                         goto ud_error2;
4180                 }
4181 
4182                 /* No support for mapped addresses on raw sockets */
4183                 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
4184                         error = EADDRNOTAVAIL;
4185                         goto ud_error2;
4186                 }
4187                 srcid = sin6->__sin6_src_id;
4188 
4189                 /*
4190                  * If the local address is a mapped address return
4191                  * an error.
4192                  * It would be possible to send an IPv6 packet but the
4193                  * response would never make it back to the application
4194                  * since it is bound to a mapped address.
4195                  */
4196                 if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
4197                         error = EADDRNOTAVAIL;
4198                         goto ud_error2;
4199                 }
4200 
4201                 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
4202                         sin6->sin6_addr = ipv6_loopback;
4203 
4204                 if (tudr->OPT_length != 0) {
4205                         /*
4206                          * If we are connected then the destination needs to be
4207                          * the same as the connected one.
4208                          */
4209                         if (icmp->icmp_state == TS_DATA_XFER &&
4210                             !conn_same_as_last_v6(connp, sin6)) {
4211                                 error = EISCONN;
4212                                 goto ud_error2;
4213                         }
4214                         error = icmp_output_ancillary(connp, NULL, sin6,
4215                             data_mp, mp, NULL, cr, pid);
4216                 } else {
4217                         ip_xmit_attr_t *ixa;
4218 
4219                         /*
4220                          * We have to allocate an ip_xmit_attr_t before we grab
4221                          * conn_lock and we need to hold conn_lock once we've
4222                          * checked conn_same_as_last_v6 to handle concurrent
4223                          * send* calls on a socket.
4224                          */
4225                         ixa = conn_get_ixa(connp, B_FALSE);
4226                         if (ixa == NULL) {
4227                                 error = ENOMEM;
4228                                 goto ud_error2;
4229                         }
4230                         mutex_enter(&connp->conn_lock);
4231 
4232                         if (conn_same_as_last_v6(connp, sin6) &&
4233                             connp->conn_lastsrcid == srcid &&
4234                             ipsec_outbound_policy_current(ixa)) {
4235                                 /* icmp_output_lastdst drops conn_lock */
4236                                 error = icmp_output_lastdst(connp, data_mp, cr,
4237                                     pid, ixa);
4238                         } else {
4239                                 /* icmp_output_newdst drops conn_lock */
4240                                 error = icmp_output_newdst(connp, data_mp, NULL,
4241                                     sin6, cr, pid, ixa);
4242                         }
4243                         ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
4244                 }
4245                 if (error == 0) {
4246                         freeb(mp);
4247                         return;
4248                 }
4249                 break;
4250 
4251         case AF_INET:
4252                 sin = (sin_t *)addr;
4253                 if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) ||
4254                     (sin->sin_family != AF_INET)) {
4255                         error = EADDRNOTAVAIL;
4256                         goto ud_error2;
4257                 }
4258                 if (sin->sin_addr.s_addr == INADDR_ANY)
4259                         sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
4260 
4261                 /* Protocol 255 contains full IP headers */
4262                 /* Read without holding lock */
4263                 if (icmp->icmp_hdrincl) {
4264                         if (MBLKL(data_mp) < IP_SIMPLE_HDR_LENGTH) {
4265                                 if (!pullupmsg(data_mp, IP_SIMPLE_HDR_LENGTH)) {
4266                                         error = EINVAL;
4267                                         goto ud_error2;
4268                                 }
4269                         }
4270                         error = icmp_output_hdrincl(connp, data_mp, cr, pid);
4271                         if (error == 0) {
4272                                 freeb(mp);
4273                                 return;
4274                         }
4275                         /* data_mp consumed above */
4276                         data_mp = NULL;
4277                         goto ud_error2;
4278                 }
4279 
4280                 if (tudr->OPT_length != 0) {
4281                         /*
4282                          * If we are connected then the destination needs to be
4283                          * the same as the connected one.
4284                          */
4285                         if (icmp->icmp_state == TS_DATA_XFER &&
4286                             !conn_same_as_last_v4(connp, sin)) {
4287                                 error = EISCONN;
4288                                 goto ud_error2;
4289                         }
4290                         error = icmp_output_ancillary(connp, sin, NULL,
4291                             data_mp, mp, NULL, cr, pid);
4292                 } else {
4293                         ip_xmit_attr_t *ixa;
4294 
4295                         /*
4296                          * We have to allocate an ip_xmit_attr_t before we grab
4297                          * conn_lock and we need to hold conn_lock once we've
4298                          * checked conn_same_as_last_v4 to handle concurrent
4299                          * send* calls on a socket.
4300                          */
4301                         ixa = conn_get_ixa(connp, B_FALSE);
4302                         if (ixa == NULL) {
4303                                 error = ENOMEM;
4304                                 goto ud_error2;
4305                         }
4306                         mutex_enter(&connp->conn_lock);
4307 
4308                         if (conn_same_as_last_v4(connp, sin) &&
4309                             ipsec_outbound_policy_current(ixa)) {
4310                                 /* icmp_output_lastdst drops conn_lock */
4311                                 error = icmp_output_lastdst(connp, data_mp, cr,
4312                                     pid, ixa);
4313                         } else {
4314                                 /* icmp_output_newdst drops conn_lock */
4315                                 error = icmp_output_newdst(connp, data_mp, sin,
4316                                     NULL, cr, pid, ixa);
4317                         }
4318                         ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
4319                 }
4320                 if (error == 0) {
4321                         freeb(mp);
4322                         return;
4323                 }
4324                 break;
4325         }
4326         ASSERT(mp != NULL);
4327         /* mp is freed by the following routine */
4328         icmp_ud_err(q, mp, (t_scalar_t)error);
4329         return;
4330 
4331 ud_error2:
4332         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4333         freemsg(data_mp);
4334         ASSERT(mp != NULL);
4335         /* mp is freed by the following routine */
4336         icmp_ud_err(q, mp, (t_scalar_t)error);
4337 }
4338 
4339 /*
4340  * Handle the case of the IP address or flow label being different
4341  * for both IPv4 and IPv6.
4342  *
4343  * NOTE: The caller must hold conn_lock and we drop it here.
4344  */
4345 static int
4346 icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6,
4347     cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa)
4348 {
4349         icmp_t          *icmp = connp->conn_icmp;
4350         icmp_stack_t    *is = icmp->icmp_is;
4351         int             error;
4352         ip_xmit_attr_t  *oldixa;
4353         boolean_t       do_ipsec;
4354         uint_t          srcid;
4355         uint32_t        flowinfo;
4356         in6_addr_t      v6src;
4357         in6_addr_t      v6dst;
4358         in6_addr_t      v6nexthop;
4359         in_port_t       dstport;
4360 
4361         ASSERT(MUTEX_HELD(&connp->conn_lock));
4362         ASSERT(ixa != NULL);
4363 
4364         /*
4365          * We hold conn_lock across all the use and modifications of
4366          * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they
4367          * stay consistent.
4368          */
4369 
4370         ASSERT(cr != NULL);
4371         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
4372         ixa->ixa_cred = cr;
4373         ixa->ixa_cpid = pid;
4374         if (is_system_labeled()) {
4375                 /* We need to restart with a label based on the cred */
4376                 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
4377         }
4378         /*
4379          * If we are connected then the destination needs to be the
4380          * same as the connected one, which is not the case here since we
4381          * checked for that above.
4382          */
4383         if (icmp->icmp_state == TS_DATA_XFER) {
4384                 mutex_exit(&connp->conn_lock);
4385                 error = EISCONN;
4386                 goto ud_error;
4387         }
4388 
4389         /* In case previous destination was multicast or multirt */
4390         ip_attr_newdst(ixa);
4391 
4392         /*
4393          * If laddr is unspecified then we look at sin6_src_id.
4394          * We will give precedence to a source address set with IPV6_PKTINFO
4395          * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
4396          * want ip_attr_connect to select a source (since it can fail) when
4397          * IPV6_PKTINFO is specified.
4398          * If this doesn't result in a source address then we get a source
4399          * from ip_attr_connect() below.
4400          */
4401         v6src = connp->conn_saddr_v6;
4402         if (sin != NULL) {
4403                 IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
4404                 dstport = sin->sin_port;
4405                 flowinfo = 0;
4406                 srcid = 0;
4407                 ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
4408                 if (srcid != 0 && V4_PART_OF_V6(&v6src) == INADDR_ANY) {
4409                         ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
4410                             connp->conn_netstack);
4411                 }
4412                 ixa->ixa_flags |= IXAF_IS_IPV4;
4413         } else {
4414                 v6dst = sin6->sin6_addr;
4415                 dstport = sin6->sin6_port;
4416                 flowinfo = sin6->sin6_flowinfo;
4417                 srcid = sin6->__sin6_src_id;
4418                 if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
4419                         ixa->ixa_scopeid = sin6->sin6_scope_id;
4420                         ixa->ixa_flags |= IXAF_SCOPEID_SET;
4421                 } else {
4422                         ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
4423                 }
4424                 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
4425                         ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
4426                             connp->conn_netstack);
4427                 }
4428                 if (IN6_IS_ADDR_V4MAPPED(&v6dst))
4429                         ixa->ixa_flags |= IXAF_IS_IPV4;
4430                 else
4431                         ixa->ixa_flags &= ~IXAF_IS_IPV4;
4432         }
4433         /* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. */
4434         if (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR) {
4435                 ip_pkt_t *ipp = &connp->conn_xmit_ipp;
4436 
4437                 if (ixa->ixa_flags & IXAF_IS_IPV4) {
4438                         if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
4439                                 v6src = ipp->ipp_addr;
4440                 } else {
4441                         if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
4442                                 v6src = ipp->ipp_addr;
4443                 }
4444         }
4445 
4446         /* Defer IPsec if it might need to look at ICMP type/code */
4447         switch (ixa->ixa_protocol) {
4448         case IPPROTO_ICMP:
4449         case IPPROTO_ICMPV6:
4450                 do_ipsec = B_FALSE;
4451                 break;
4452         default:
4453                 do_ipsec = B_TRUE;
4454         }
4455 
4456         ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop);
4457         mutex_exit(&connp->conn_lock);
4458 
4459         error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
4460             &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
4461             (do_ipsec ? IPDF_IPSEC : 0));
4462         switch (error) {
4463         case 0:
4464                 break;
4465         case EADDRNOTAVAIL:
4466                 /*
4467                  * IXAF_VERIFY_SOURCE tells us to pick a better source.
4468                  * Don't have the application see that errno
4469                  */
4470                 error = ENETUNREACH;
4471                 goto failed;
4472         case ENETDOWN:
4473                 /*
4474                  * Have !ipif_addr_ready address; drop packet silently
4475                  * until we can get applications to not send until we
4476                  * are ready.
4477                  */
4478                 error = 0;
4479                 goto failed;
4480         case EHOSTUNREACH:
4481         case ENETUNREACH:
4482                 if (ixa->ixa_ire != NULL) {
4483                         /*
4484                          * Let conn_ip_output/ire_send_noroute return
4485                          * the error and send any local ICMP error.
4486                          */
4487                         error = 0;
4488                         break;
4489                 }
4490                 /* FALLTHRU */
4491         default:
4492         failed:
4493                 goto ud_error;
4494         }
4495 
4496         mutex_enter(&connp->conn_lock);
4497         /*
4498          * While we dropped the lock some other thread might have connected
4499          * this socket. If so we bail out with EISCONN to ensure that the
4500          * connecting thread is the one that updates conn_ixa, conn_ht_*
4501          * and conn_*last*.
4502          */
4503         if (icmp->icmp_state == TS_DATA_XFER) {
4504                 mutex_exit(&connp->conn_lock);
4505                 error = EISCONN;
4506                 goto ud_error;
4507         }
4508 
4509         /*
4510          * We need to rebuild the headers if
4511          *  - we are labeling packets (could be different for different
4512          *    destinations)
4513          *  - we have a source route (or routing header) since we need to
4514          *    massage that to get the pseudo-header checksum
4515          *  - a socket option with COA_HEADER_CHANGED has been set which
4516          *    set conn_v6lastdst to zero.
4517          *
4518          * Otherwise the prepend function will just update the src, dst,
4519          * and flow label.
4520          */
4521         if (is_system_labeled()) {
4522                 /* TX MLP requires SCM_UCRED and don't have that here */
4523                 if (connp->conn_mlp_type != mlptSingle) {
4524                         mutex_exit(&connp->conn_lock);
4525                         error = ECONNREFUSED;
4526                         goto ud_error;
4527                 }
4528                 /*
4529                  * Check whether Trusted Solaris policy allows communication
4530                  * with this host, and pretend that the destination is
4531                  * unreachable if not.
4532                  * Compute any needed label and place it in ipp_label_v4/v6.
4533                  *
4534                  * Later conn_build_hdr_template/conn_prepend_hdr takes
4535                  * ipp_label_v4/v6 to form the packet.
4536                  *
4537                  * Tsol note: Since we hold conn_lock we know no other
4538                  * thread manipulates conn_xmit_ipp.
4539                  */
4540                 error = conn_update_label(connp, ixa, &v6dst,
4541                     &connp->conn_xmit_ipp);
4542                 if (error != 0) {
4543                         mutex_exit(&connp->conn_lock);
4544                         goto ud_error;
4545                 }
4546                 /* Rebuild the header template */
4547                 error = icmp_build_hdr_template(connp, &v6src, &v6dst,
4548                     flowinfo);
4549                 if (error != 0) {
4550                         mutex_exit(&connp->conn_lock);
4551                         goto ud_error;
4552                 }
4553         } else if (connp->conn_xmit_ipp.ipp_fields &
4554             (IPPF_IPV4_OPTIONS|IPPF_RTHDR) ||
4555             IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) {
4556                 /* Rebuild the header template */
4557                 error = icmp_build_hdr_template(connp, &v6src, &v6dst,
4558                     flowinfo);
4559                 if (error != 0) {
4560                         mutex_exit(&connp->conn_lock);
4561                         goto ud_error;
4562                 }
4563         } else {
4564                 /* Simply update the destination address if no source route */
4565                 if (ixa->ixa_flags & IXAF_IS_IPV4) {
4566                         ipha_t  *ipha = (ipha_t *)connp->conn_ht_iphc;
4567 
4568                         IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst);
4569                         if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
4570                                 ipha->ipha_fragment_offset_and_flags |=
4571                                     IPH_DF_HTONS;
4572                         } else {
4573                                 ipha->ipha_fragment_offset_and_flags &=
4574                                     ~IPH_DF_HTONS;
4575                         }
4576                 } else {
4577                         ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc;
4578                         ip6h->ip6_dst = v6dst;
4579                 }
4580         }
4581 
4582         /*
4583          * Remember the dst etc which corresponds to the built header
4584          * template and conn_ixa.
4585          */
4586         oldixa = conn_replace_ixa(connp, ixa);
4587         connp->conn_v6lastdst = v6dst;
4588         connp->conn_lastflowinfo = flowinfo;
4589         connp->conn_lastscopeid = ixa->ixa_scopeid;
4590         connp->conn_lastsrcid = srcid;
4591         /* Also remember a source to use together with lastdst */
4592         connp->conn_v6lastsrc = v6src;
4593 
4594         data_mp = icmp_prepend_header_template(connp, ixa, data_mp, &v6src,
4595             flowinfo, &error);
4596 
4597         /* Done with conn_t */
4598         mutex_exit(&connp->conn_lock);
4599         ixa_refrele(oldixa);
4600 
4601         if (data_mp == NULL) {
4602                 ASSERT(error != 0);
4603                 goto ud_error;
4604         }
4605 
4606         if (!do_ipsec) {
4607                 /* Policy might differ for different ICMP type/code */
4608                 data_mp = icmp_output_attach_policy(data_mp, connp, ixa);
4609                 if (data_mp == NULL) {
4610                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4611                         error = EHOSTUNREACH;   /* IPsec policy failure */
4612                         goto done;
4613                 }
4614         }
4615 
4616         /* We're done.  Pass the packet to ip. */
4617         BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
4618 
4619         error = conn_ip_output(data_mp, ixa);
4620         /* No rawipOutErrors if an error since IP increases its error counter */
4621         switch (error) {
4622         case 0:
4623                 break;
4624         case EWOULDBLOCK:
4625                 (void) ixa_check_drain_insert(connp, ixa);
4626                 error = 0;
4627                 break;
4628         case EADDRNOTAVAIL:
4629                 /*
4630                  * IXAF_VERIFY_SOURCE tells us to pick a better source.
4631                  * Don't have the application see that errno
4632                  */
4633                 error = ENETUNREACH;
4634                 /* FALLTHRU */
4635         default:
4636                 mutex_enter(&connp->conn_lock);
4637                 /*
4638                  * Clear the source and v6lastdst so we call ip_attr_connect
4639                  * for the next packet and try to pick a better source.
4640                  */
4641                 if (connp->conn_mcbc_bind)
4642                         connp->conn_saddr_v6 = ipv6_all_zeros;
4643                 else
4644                         connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
4645                 connp->conn_v6lastdst = ipv6_all_zeros;
4646                 mutex_exit(&connp->conn_lock);
4647                 break;
4648         }
4649 done:
4650         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
4651         ixa->ixa_cred = connp->conn_cred; /* Restore */
4652         ixa->ixa_cpid = connp->conn_cpid;
4653         ixa_refrele(ixa);
4654         return (error);
4655 
4656 ud_error:
4657         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
4658         ixa->ixa_cred = connp->conn_cred; /* Restore */
4659         ixa->ixa_cpid = connp->conn_cpid;
4660         ixa_refrele(ixa);
4661 
4662         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4663         freemsg(data_mp);
4664         return (error);
4665 }
4666 
4667 /* ARGSUSED */
4668 static void
4669 icmp_wput_fallback(queue_t *q, mblk_t *mp)
4670 {
4671 #ifdef DEBUG
4672         cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n");
4673 #endif
4674         freemsg(mp);
4675 }
4676 
4677 static void
4678 icmp_wput_other(queue_t *q, mblk_t *mp)
4679 {
4680         uchar_t *rptr = mp->b_rptr;
4681         struct iocblk *iocp;
4682         conn_t  *connp = Q_TO_CONN(q);
4683         icmp_t  *icmp = connp->conn_icmp;
4684         cred_t *cr;
4685 
4686         switch (mp->b_datap->db_type) {
4687         case M_PROTO:
4688         case M_PCPROTO:
4689                 if (mp->b_wptr - rptr < sizeof (t_scalar_t)) {
4690                         /*
4691                          * If the message does not contain a PRIM_type,
4692                          * throw it away.
4693                          */
4694                         freemsg(mp);
4695                         return;
4696                 }
4697                 switch (((t_primp_t)rptr)->type) {
4698                 case T_ADDR_REQ:
4699                         icmp_addr_req(q, mp);
4700                         return;
4701                 case O_T_BIND_REQ:
4702                 case T_BIND_REQ:
4703                         icmp_tpi_bind(q, mp);
4704                         return;
4705                 case T_CONN_REQ:
4706                         icmp_tpi_connect(q, mp);
4707                         return;
4708                 case T_CAPABILITY_REQ:
4709                         icmp_capability_req(q, mp);
4710                         return;
4711                 case T_INFO_REQ:
4712                         icmp_info_req(q, mp);
4713                         return;
4714                 case T_UNITDATA_REQ:
4715                         /*
4716                          * If a T_UNITDATA_REQ gets here, the address must
4717                          * be bad.  Valid T_UNITDATA_REQs are handled
4718                          * in icmp_wput.
4719                          */
4720                         icmp_ud_err(q, mp, EADDRNOTAVAIL);
4721                         return;
4722                 case T_UNBIND_REQ:
4723                         icmp_tpi_unbind(q, mp);
4724                         return;
4725                 case T_SVR4_OPTMGMT_REQ:
4726                         /*
4727                          * All Solaris components should pass a db_credp
4728                          * for this TPI message, hence we ASSERT.
4729                          * But in case there is some other M_PROTO that looks
4730                          * like a TPI message sent by some other kernel
4731                          * component, we check and return an error.
4732                          */
4733                         cr = msg_getcred(mp, NULL);
4734                         ASSERT(cr != NULL);
4735                         if (cr == NULL) {
4736                                 icmp_err_ack(q, mp, TSYSERR, EINVAL);
4737                                 return;
4738                         }
4739 
4740                         if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get,
4741                             cr)) {
4742                                 svr4_optcom_req(q, mp, cr, &icmp_opt_obj);
4743                         }
4744                         return;
4745 
4746                 case T_OPTMGMT_REQ:
4747                         /*
4748                          * All Solaris components should pass a db_credp
4749                          * for this TPI message, hence we ASSERT.
4750                          * But in case there is some other M_PROTO that looks
4751                          * like a TPI message sent by some other kernel
4752                          * component, we check and return an error.
4753                          */
4754                         cr = msg_getcred(mp, NULL);
4755                         ASSERT(cr != NULL);
4756                         if (cr == NULL) {
4757                                 icmp_err_ack(q, mp, TSYSERR, EINVAL);
4758                                 return;
4759                         }
4760                         tpi_optcom_req(q, mp, cr, &icmp_opt_obj);
4761                         return;
4762 
4763                 case T_DISCON_REQ:
4764                         icmp_tpi_disconnect(q, mp);
4765                         return;
4766 
4767                 /* The following TPI message is not supported by icmp. */
4768                 case O_T_CONN_RES:
4769                 case T_CONN_RES:
4770                         icmp_err_ack(q, mp, TNOTSUPPORT, 0);
4771                         return;
4772 
4773                 /* The following 3 TPI requests are illegal for icmp. */
4774                 case T_DATA_REQ:
4775                 case T_EXDATA_REQ:
4776                 case T_ORDREL_REQ:
4777                         icmp_err_ack(q, mp, TNOTSUPPORT, 0);
4778                         return;
4779                 default:
4780                         break;
4781                 }
4782                 break;
4783         case M_FLUSH:
4784                 if (*rptr & FLUSHW)
4785                         flushq(q, FLUSHDATA);
4786                 break;
4787         case M_IOCTL:
4788                 iocp = (struct iocblk *)mp->b_rptr;
4789                 switch (iocp->ioc_cmd) {
4790                 case TI_GETPEERNAME:
4791                         if (icmp->icmp_state != TS_DATA_XFER) {
4792                                 /*
4793                                  * If a default destination address has not
4794                                  * been associated with the stream, then we
4795                                  * don't know the peer's name.
4796                                  */
4797                                 iocp->ioc_error = ENOTCONN;
4798                                 iocp->ioc_count = 0;
4799                                 mp->b_datap->db_type = M_IOCACK;
4800                                 qreply(q, mp);
4801                                 return;
4802                         }
4803                         /* FALLTHRU */
4804                 case TI_GETMYNAME:
4805                         /*
4806                          * For TI_GETPEERNAME and TI_GETMYNAME, we first
4807                          * need to copyin the user's strbuf structure.
4808                          * Processing will continue in the M_IOCDATA case
4809                          * below.
4810                          */
4811                         mi_copyin(q, mp, NULL,
4812                             SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
4813                         return;
4814                 default:
4815                         break;
4816                 }
4817                 break;
4818         case M_IOCDATA:
4819                 icmp_wput_iocdata(q, mp);
4820                 return;
4821         default:
4822                 /* Unrecognized messages are passed through without change. */
4823                 break;
4824         }
4825         ip_wput_nondata(q, mp);
4826 }
4827 
4828 /*
4829  * icmp_wput_iocdata is called by icmp_wput_other to handle all M_IOCDATA
4830  * messages.
4831  */
4832 static void
4833 icmp_wput_iocdata(queue_t *q, mblk_t *mp)
4834 {
4835         mblk_t          *mp1;
4836         STRUCT_HANDLE(strbuf, sb);
4837         uint_t          addrlen;
4838         conn_t          *connp = Q_TO_CONN(q);
4839         icmp_t          *icmp = connp->conn_icmp;
4840 
4841         /* Make sure it is one of ours. */
4842         switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
4843         case TI_GETMYNAME:
4844         case TI_GETPEERNAME:
4845                 break;
4846         default:
4847                 ip_wput_nondata(q, mp);
4848                 return;
4849         }
4850 
4851         switch (mi_copy_state(q, mp, &mp1)) {
4852         case -1:
4853                 return;
4854         case MI_COPY_CASE(MI_COPY_IN, 1):
4855                 break;
4856         case MI_COPY_CASE(MI_COPY_OUT, 1):
4857                 /*
4858                  * The address has been copied out, so now
4859                  * copyout the strbuf.
4860                  */
4861                 mi_copyout(q, mp);
4862                 return;
4863         case MI_COPY_CASE(MI_COPY_OUT, 2):
4864                 /*
4865                  * The address and strbuf have been copied out.
4866                  * We're done, so just acknowledge the original
4867                  * M_IOCTL.
4868                  */
4869                 mi_copy_done(q, mp, 0);
4870                 return;
4871         default:
4872                 /*
4873                  * Something strange has happened, so acknowledge
4874                  * the original M_IOCTL with an EPROTO error.
4875                  */
4876                 mi_copy_done(q, mp, EPROTO);
4877                 return;
4878         }
4879 
4880         /*
4881          * Now we have the strbuf structure for TI_GETMYNAME
4882          * and TI_GETPEERNAME.  Next we copyout the requested
4883          * address and then we'll copyout the strbuf.
4884          */
4885         STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag,
4886             (void *)mp1->b_rptr);
4887 
4888         if (connp->conn_family == AF_INET)
4889                 addrlen = sizeof (sin_t);
4890         else
4891                 addrlen = sizeof (sin6_t);
4892 
4893         if (STRUCT_FGET(sb, maxlen) < addrlen) {
4894                 mi_copy_done(q, mp, EINVAL);
4895                 return;
4896         }
4897         switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
4898         case TI_GETMYNAME:
4899                 break;
4900         case TI_GETPEERNAME:
4901                 if (icmp->icmp_state != TS_DATA_XFER) {
4902                         mi_copy_done(q, mp, ENOTCONN);
4903                         return;
4904                 }
4905                 break;
4906         default:
4907                 mi_copy_done(q, mp, EPROTO);
4908                 return;
4909         }
4910         mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
4911         if (!mp1)
4912                 return;
4913 
4914         STRUCT_FSET(sb, len, addrlen);
4915         switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
4916         case TI_GETMYNAME:
4917                 (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr,
4918                     &addrlen);
4919                 break;
4920         case TI_GETPEERNAME:
4921                 (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr,
4922                     &addrlen);
4923                 break;
4924         }
4925         mp1->b_wptr += addrlen;
4926         /* Copy out the address */
4927         mi_copyout(q, mp);
4928 }
4929 
4930 void
4931 icmp_ddi_g_init(void)
4932 {
4933         icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr,
4934             icmp_opt_obj.odb_opt_arr_cnt);
4935 
4936         /*
4937          * We want to be informed each time a stack is created or
4938          * destroyed in the kernel, so we can maintain the
4939          * set of icmp_stack_t's.
4940          */
4941         netstack_register(NS_ICMP, rawip_stack_init, NULL, rawip_stack_fini);
4942 }
4943 
4944 void
4945 icmp_ddi_g_destroy(void)
4946 {
4947         netstack_unregister(NS_ICMP);
4948 }
4949 
4950 #define INET_NAME       "ip"
4951 
4952 /*
4953  * Initialize the ICMP stack instance.
4954  */
4955 static void *
4956 rawip_stack_init(netstackid_t stackid, netstack_t *ns)
4957 {
4958         icmp_stack_t    *is;
4959         int             error = 0;
4960         size_t          arrsz;
4961         major_t         major;
4962 
4963         is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP);
4964         is->is_netstack = ns;
4965 
4966         arrsz = sizeof (icmp_propinfo_tbl);
4967         is->is_propinfo_tbl = (mod_prop_info_t *)kmem_alloc(arrsz, KM_SLEEP);
4968         bcopy(icmp_propinfo_tbl, is->is_propinfo_tbl, arrsz);
4969 
4970         is->is_ksp = rawip_kstat_init(stackid);
4971 
4972         major = mod_name_to_major(INET_NAME);
4973         error = ldi_ident_from_major(major, &is->is_ldi_ident);
4974         ASSERT(error == 0);
4975         return (is);
4976 }
4977 
4978 /*
4979  * Free the ICMP stack instance.
4980  */
4981 static void
4982 rawip_stack_fini(netstackid_t stackid, void *arg)
4983 {
4984         icmp_stack_t *is = (icmp_stack_t *)arg;
4985 
4986         kmem_free(is->is_propinfo_tbl, sizeof (icmp_propinfo_tbl));
4987         is->is_propinfo_tbl = NULL;
4988 
4989         rawip_kstat_fini(stackid, is->is_ksp);
4990         is->is_ksp = NULL;
4991         ldi_ident_release(is->is_ldi_ident);
4992         kmem_free(is, sizeof (*is));
4993 }
4994 
4995 static void *
4996 rawip_kstat_init(netstackid_t stackid) {
4997         kstat_t *ksp;
4998 
4999         rawip_named_kstat_t template = {
5000                 { "inDatagrams",        KSTAT_DATA_UINT32, 0 },
5001                 { "inCksumErrs",        KSTAT_DATA_UINT32, 0 },
5002                 { "inErrors",           KSTAT_DATA_UINT32, 0 },
5003                 { "outDatagrams",       KSTAT_DATA_UINT32, 0 },
5004                 { "outErrors",          KSTAT_DATA_UINT32, 0 },
5005         };
5006 
5007         ksp = kstat_create_netstack("icmp", 0, "rawip", "mib2",
5008                                         KSTAT_TYPE_NAMED,
5009                                         NUM_OF_FIELDS(rawip_named_kstat_t),
5010                                         0, stackid);
5011         if (ksp == NULL || ksp->ks_data == NULL)
5012                 return (NULL);
5013 
5014         bcopy(&template, ksp->ks_data, sizeof (template));
5015         ksp->ks_update = rawip_kstat_update;
5016         ksp->ks_private = (void *)(uintptr_t)stackid;
5017 
5018         kstat_install(ksp);
5019         return (ksp);
5020 }
5021 
5022 static void
5023 rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp)
5024 {
5025         if (ksp != NULL) {
5026                 ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
5027                 kstat_delete_netstack(ksp, stackid);
5028         }
5029 }
5030 
5031 static int
5032 rawip_kstat_update(kstat_t *ksp, int rw)
5033 {
5034         rawip_named_kstat_t *rawipkp;
5035         netstackid_t    stackid = (netstackid_t)(uintptr_t)ksp->ks_private;
5036         netstack_t      *ns;
5037         icmp_stack_t    *is;
5038 
5039         if ((ksp == NULL) || (ksp->ks_data == NULL))
5040                 return (EIO);
5041 
5042         if (rw == KSTAT_WRITE)
5043                 return (EACCES);
5044 
5045         rawipkp = (rawip_named_kstat_t *)ksp->ks_data;
5046 
5047         ns = netstack_find_by_stackid(stackid);
5048         if (ns == NULL)
5049                 return (-1);
5050         is = ns->netstack_icmp;
5051         if (is == NULL) {
5052                 netstack_rele(ns);
5053                 return (-1);
5054         }
5055         rawipkp->inDatagrams.value.ui32 =  is->is_rawip_mib.rawipInDatagrams;
5056         rawipkp->inCksumErrs.value.ui32 =  is->is_rawip_mib.rawipInCksumErrs;
5057         rawipkp->inErrors.value.ui32 =          is->is_rawip_mib.rawipInErrors;
5058         rawipkp->outDatagrams.value.ui32 = is->is_rawip_mib.rawipOutDatagrams;
5059         rawipkp->outErrors.value.ui32 =         is->is_rawip_mib.rawipOutErrors;
5060         netstack_rele(ns);
5061         return (0);
5062 }
5063 
5064 /* ARGSUSED */
5065 int
5066 rawip_accept(sock_lower_handle_t lproto_handle,
5067     sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
5068     cred_t *cr)
5069 {
5070         return (EOPNOTSUPP);
5071 }
5072 
5073 /* ARGSUSED */
5074 int
5075 rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5076     socklen_t len, cred_t *cr)
5077 {
5078         conn_t  *connp = (conn_t *)proto_handle;
5079         int     error;
5080 
5081         /* All Solaris components should pass a cred for this operation. */
5082         ASSERT(cr != NULL);
5083 
5084         /* Binding to a NULL address really means unbind */
5085         if (sa == NULL)
5086                 error = rawip_do_unbind(connp);
5087         else
5088                 error = rawip_do_bind(connp, sa, len);
5089 
5090         if (error < 0) {
5091                 if (error == -TOUTSTATE)
5092                         error = EINVAL;
5093                 else
5094                         error = proto_tlitosyserr(-error);
5095         }
5096         return (error);
5097 }
5098 
5099 static int
5100 rawip_implicit_bind(conn_t *connp)
5101 {
5102         sin6_t sin6addr;
5103         sin_t *sin;
5104         sin6_t *sin6;
5105         socklen_t len;
5106         int error;
5107 
5108         if (connp->conn_family == AF_INET) {
5109                 len = sizeof (struct sockaddr_in);
5110                 sin = (sin_t *)&sin6addr;
5111                 *sin = sin_null;
5112                 sin->sin_family = AF_INET;
5113                 sin->sin_addr.s_addr = INADDR_ANY;
5114         } else {
5115                 ASSERT(connp->conn_family == AF_INET6);
5116                 len = sizeof (sin6_t);
5117                 sin6 = (sin6_t *)&sin6addr;
5118                 *sin6 = sin6_null;
5119                 sin6->sin6_family = AF_INET6;
5120                 V6_SET_ZERO(sin6->sin6_addr);
5121         }
5122 
5123         error = rawip_do_bind(connp, (struct sockaddr *)&sin6addr, len);
5124 
5125         return ((error < 0) ? proto_tlitosyserr(-error) : error);
5126 }
5127 
5128 static int
5129 rawip_unbind(conn_t *connp)
5130 {
5131         int error;
5132 
5133         error = rawip_do_unbind(connp);
5134         if (error < 0) {
5135                 error = proto_tlitosyserr(-error);
5136         }
5137         return (error);
5138 }
5139 
5140 /* ARGSUSED */
5141 int
5142 rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
5143 {
5144         return (EOPNOTSUPP);
5145 }
5146 
5147 int
5148 rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
5149     socklen_t len, sock_connid_t *id, cred_t *cr)
5150 {
5151         conn_t  *connp = (conn_t *)proto_handle;
5152         icmp_t *icmp = connp->conn_icmp;
5153         int     error;
5154         boolean_t did_bind = B_FALSE;
5155         pid_t   pid = curproc->p_pid;
5156 
5157         /* All Solaris components should pass a cred for this operation. */
5158         ASSERT(cr != NULL);
5159 
5160         if (sa == NULL) {
5161                 /*
5162                  * Disconnect
5163                  * Make sure we are connected
5164                  */
5165                 if (icmp->icmp_state != TS_DATA_XFER)
5166                         return (EINVAL);
5167 
5168                 error = icmp_disconnect(connp);
5169                 return (error);
5170         }
5171 
5172         error = proto_verify_ip_addr(connp->conn_family, sa, len);
5173         if (error != 0)
5174                 return (error);
5175 
5176         /* do an implicit bind if necessary */
5177         if (icmp->icmp_state == TS_UNBND) {
5178                 error = rawip_implicit_bind(connp);
5179                 /*
5180                  * We could be racing with an actual bind, in which case
5181                  * we would see EPROTO. We cross our fingers and try
5182                  * to connect.
5183                  */
5184                 if (!(error == 0 || error == EPROTO))
5185                         return (error);
5186                 did_bind = B_TRUE;
5187         }
5188 
5189         /*
5190          * set SO_DGRAM_ERRIND
5191          */
5192         connp->conn_dgram_errind = B_TRUE;
5193 
5194         error = rawip_do_connect(connp, sa, len, cr, pid);
5195         if (error != 0 && did_bind) {
5196                 int unbind_err;
5197 
5198                 unbind_err = rawip_unbind(connp);
5199                 ASSERT(unbind_err == 0);
5200         }
5201 
5202         if (error == 0) {
5203                 *id = 0;
5204                 (*connp->conn_upcalls->su_connected)(connp->conn_upper_handle,
5205                     0, NULL, -1);
5206         } else if (error < 0) {
5207                 error = proto_tlitosyserr(-error);
5208         }
5209         return (error);
5210 }
5211 
5212 /* ARGSUSED2 */
5213 int
5214 rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
5215     boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb,
5216     sock_quiesce_arg_t *arg)
5217 {
5218         conn_t  *connp = (conn_t *)proto_handle;
5219         icmp_t  *icmp;
5220         struct T_capability_ack tca;
5221         struct sockaddr_in6 laddr, faddr;
5222         socklen_t laddrlen, faddrlen;
5223         short opts;
5224         struct stroptions *stropt;
5225         mblk_t *mp, *stropt_mp;
5226         int error;
5227 
5228         icmp = connp->conn_icmp;
5229 
5230         stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL);
5231 
5232         /*
5233          * setup the fallback stream that was allocated
5234          */
5235         connp->conn_dev = (dev_t)RD(q)->q_ptr;
5236         connp->conn_minor_arena = WR(q)->q_ptr;
5237 
5238         RD(q)->q_ptr = WR(q)->q_ptr = connp;
5239 
5240         WR(q)->q_qinfo = &icmpwinit;
5241 
5242         connp->conn_rq = RD(q);
5243         connp->conn_wq = WR(q);
5244 
5245         /* Notify stream head about options before sending up data */
5246         stropt_mp->b_datap->db_type = M_SETOPTS;
5247         stropt_mp->b_wptr += sizeof (*stropt);
5248         stropt = (struct stroptions *)stropt_mp->b_rptr;
5249         stropt->so_flags = SO_WROFF | SO_HIWAT;
5250         stropt->so_wroff = connp->conn_wroff;
5251         stropt->so_hiwat = connp->conn_rcvbuf;
5252         putnext(RD(q), stropt_mp);
5253 
5254         /*
5255          * free helper stream
5256          */
5257         ip_free_helper_stream(connp);
5258 
5259         /*
5260          * Collect the information needed to sync with the sonode
5261          */
5262         icmp_do_capability_ack(icmp, &tca, TC1_INFO);
5263 
5264         laddrlen = faddrlen = sizeof (sin6_t);
5265         (void) rawip_getsockname((sock_lower_handle_t)connp,
5266             (struct sockaddr *)&laddr, &laddrlen, CRED());
5267         error = rawip_getpeername((sock_lower_handle_t)connp,
5268             (struct sockaddr *)&faddr, &faddrlen, CRED());
5269         if (error != 0)
5270                 faddrlen = 0;
5271         opts = 0;
5272         if (connp->conn_dgram_errind)
5273                 opts |= SO_DGRAM_ERRIND;
5274         if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
5275                 opts |= SO_DONTROUTE;
5276 
5277         mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
5278             (struct sockaddr *)&laddr, laddrlen,
5279             (struct sockaddr *)&faddr, faddrlen, opts);
5280 
5281         /*
5282          * Attempts to send data up during fallback will result in it being
5283          * queued in icmp_t. Now we push up any queued packets.
5284          */
5285         mutex_enter(&icmp->icmp_recv_lock);
5286         if (mp != NULL) {
5287                 mp->b_next = icmp->icmp_fallback_queue_head;
5288                 icmp->icmp_fallback_queue_head = mp;
5289         }
5290         while (icmp->icmp_fallback_queue_head != NULL) {
5291                 mp = icmp->icmp_fallback_queue_head;
5292                 icmp->icmp_fallback_queue_head = mp->b_next;
5293                 mp->b_next = NULL;
5294                 mutex_exit(&icmp->icmp_recv_lock);
5295                 putnext(RD(q), mp);
5296                 mutex_enter(&icmp->icmp_recv_lock);
5297         }
5298         icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head;
5299 
5300         /*
5301          * No longer a streams less socket
5302          */
5303         mutex_enter(&connp->conn_lock);
5304         connp->conn_flags &= ~IPCL_NONSTR;
5305         mutex_exit(&connp->conn_lock);
5306 
5307         mutex_exit(&icmp->icmp_recv_lock);
5308 
5309         ASSERT(icmp->icmp_fallback_queue_head == NULL &&
5310             icmp->icmp_fallback_queue_tail == NULL);
5311 
5312         ASSERT(connp->conn_ref >= 1);
5313 
5314         return (0);
5315 }
5316 
5317 /* ARGSUSED2 */
5318 sock_lower_handle_t
5319 rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
5320     uint_t *smodep, int *errorp, int flags, cred_t *credp)
5321 {
5322         conn_t *connp;
5323 
5324         if (type != SOCK_RAW || (family != AF_INET && family != AF_INET6)) {
5325                 *errorp = EPROTONOSUPPORT;
5326                 return (NULL);
5327         }
5328 
5329         connp = rawip_do_open(family, credp, errorp, flags);
5330         if (connp != NULL) {
5331                 connp->conn_flags |= IPCL_NONSTR;
5332 
5333                 mutex_enter(&connp->conn_lock);
5334                 connp->conn_state_flags &= ~CONN_INCIPIENT;
5335                 mutex_exit(&connp->conn_lock);
5336                 *sock_downcalls = &sock_rawip_downcalls;
5337                 *smodep = SM_ATOMIC;
5338         } else {
5339                 ASSERT(*errorp != 0);
5340         }
5341 
5342         return ((sock_lower_handle_t)connp);
5343 }
5344 
5345 /* ARGSUSED3 */
5346 void
5347 rawip_activate(sock_lower_handle_t proto_handle,
5348     sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags,
5349     cred_t *cr)
5350 {
5351         conn_t                  *connp = (conn_t *)proto_handle;
5352         struct sock_proto_props sopp;
5353 
5354         /* All Solaris components should pass a cred for this operation. */
5355         ASSERT(cr != NULL);
5356 
5357         connp->conn_upcalls = sock_upcalls;
5358         connp->conn_upper_handle = sock_handle;
5359 
5360         sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
5361             SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ;
5362         sopp.sopp_wroff = connp->conn_wroff;
5363         sopp.sopp_rxhiwat = connp->conn_rcvbuf;
5364         sopp.sopp_rxlowat = connp->conn_rcvlowat;
5365         sopp.sopp_maxblk = INFPSZ;
5366         sopp.sopp_maxpsz = IP_MAXPACKET;
5367         sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 0 :
5368             icmp_mod_info.mi_minpsz;
5369 
5370         (*connp->conn_upcalls->su_set_proto_props)
5371             (connp->conn_upper_handle, &sopp);
5372 
5373         icmp_bind_proto(connp->conn_icmp);
5374 }
5375 
5376 /* ARGSUSED3 */
5377 int
5378 rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5379     socklen_t *salenp, cred_t *cr)
5380 {
5381         conn_t  *connp = (conn_t *)proto_handle;
5382         icmp_t  *icmp = connp->conn_icmp;
5383         int     error;
5384 
5385         /* All Solaris components should pass a cred for this operation. */
5386         ASSERT(cr != NULL);
5387 
5388         mutex_enter(&connp->conn_lock);
5389         if (icmp->icmp_state != TS_DATA_XFER)
5390                 error = ENOTCONN;
5391         else
5392                 error = conn_getpeername(connp, sa, salenp);
5393         mutex_exit(&connp->conn_lock);
5394         return (error);
5395 }
5396 
5397 /* ARGSUSED3 */
5398 int
5399 rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5400     socklen_t *salenp, cred_t *cr)
5401 {
5402         conn_t  *connp = (conn_t *)proto_handle;
5403         int     error;
5404 
5405         /* All Solaris components should pass a cred for this operation. */
5406         ASSERT(cr != NULL);
5407 
5408         mutex_enter(&connp->conn_lock);
5409         error = conn_getsockname(connp, sa, salenp);
5410         mutex_exit(&connp->conn_lock);
5411         return (error);
5412 }
5413 
5414 int
5415 rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
5416     const void *optvalp, socklen_t optlen, cred_t *cr)
5417 {
5418         conn_t  *connp = (conn_t *)proto_handle;
5419         int error;
5420 
5421         /* All Solaris components should pass a cred for this operation. */
5422         ASSERT(cr != NULL);
5423 
5424         error = proto_opt_check(level, option_name, optlen, NULL,
5425             icmp_opt_obj.odb_opt_des_arr,
5426             icmp_opt_obj.odb_opt_arr_cnt,
5427             B_TRUE, B_FALSE, cr);
5428 
5429         if (error != 0) {
5430                 /*
5431                  * option not recognized
5432                  */
5433                 if (error < 0) {
5434                         error = proto_tlitosyserr(-error);
5435                 }
5436                 return (error);
5437         }
5438 
5439         error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level,
5440             option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen,
5441             (uchar_t *)optvalp, NULL, cr);
5442 
5443         ASSERT(error >= 0);
5444 
5445         return (error);
5446 }
5447 
5448 int
5449 rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
5450     void *optvalp, socklen_t *optlen, cred_t *cr)
5451 {
5452         int             error;
5453         conn_t          *connp = (conn_t *)proto_handle;
5454         t_uscalar_t     max_optbuf_len;
5455         void            *optvalp_buf;
5456         int             len;
5457 
5458         /* All Solaris components should pass a cred for this operation. */
5459         ASSERT(cr != NULL);
5460 
5461         error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
5462             icmp_opt_obj.odb_opt_des_arr,
5463             icmp_opt_obj.odb_opt_arr_cnt,
5464             B_FALSE, B_TRUE, cr);
5465 
5466         if (error != 0) {
5467                 if (error < 0) {
5468                         error = proto_tlitosyserr(-error);
5469                 }
5470                 return (error);
5471         }
5472 
5473         optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
5474         len = icmp_opt_get(connp, level, option_name, optvalp_buf);
5475         if (len == -1) {
5476                 kmem_free(optvalp_buf, max_optbuf_len);
5477                 return (EINVAL);
5478         }
5479 
5480         /*
5481          * update optlen and copy option value
5482          */
5483         t_uscalar_t size = MIN(len, *optlen);
5484 
5485         bcopy(optvalp_buf, optvalp, size);
5486         bcopy(&size, optlen, sizeof (size));
5487 
5488         kmem_free(optvalp_buf, max_optbuf_len);
5489         return (0);
5490 }
5491 
5492 /* ARGSUSED1 */
5493 int
5494 rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
5495 {
5496         conn_t  *connp = (conn_t *)proto_handle;
5497 
5498         /* All Solaris components should pass a cred for this operation. */
5499         ASSERT(cr != NULL);
5500 
5501         (void) rawip_do_close(connp);
5502         return (0);
5503 }
5504 
5505 /* ARGSUSED2 */
5506 int
5507 rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
5508 {
5509         conn_t  *connp = (conn_t *)proto_handle;
5510 
5511         /* All Solaris components should pass a cred for this operation. */
5512         ASSERT(cr != NULL);
5513 
5514         /* shut down the send side */
5515         if (how != SHUT_RD)
5516                 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
5517                     SOCK_OPCTL_SHUT_SEND, 0);
5518         /* shut down the recv side */
5519         if (how != SHUT_WR)
5520                 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
5521                     SOCK_OPCTL_SHUT_RECV, 0);
5522         return (0);
5523 }
5524 
5525 void
5526 rawip_clr_flowctrl(sock_lower_handle_t proto_handle)
5527 {
5528         conn_t  *connp = (conn_t *)proto_handle;
5529         icmp_t  *icmp = connp->conn_icmp;
5530 
5531         mutex_enter(&icmp->icmp_recv_lock);
5532         connp->conn_flow_cntrld = B_FALSE;
5533         mutex_exit(&icmp->icmp_recv_lock);
5534 }
5535 
5536 int
5537 rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
5538     int mode, int32_t *rvalp, cred_t *cr)
5539 {
5540         conn_t          *connp = (conn_t *)proto_handle;
5541         int             error;
5542 
5543         /* All Solaris components should pass a cred for this operation. */
5544         ASSERT(cr != NULL);
5545 
5546         /*
5547          * If we don't have a helper stream then create one.
5548          * ip_create_helper_stream takes care of locking the conn_t,
5549          * so this check for NULL is just a performance optimization.
5550          */
5551         if (connp->conn_helper_info == NULL) {
5552                 icmp_stack_t *is = connp->conn_icmp->icmp_is;
5553 
5554                 ASSERT(is->is_ldi_ident != NULL);
5555 
5556                 /*
5557                  * Create a helper stream for non-STREAMS socket.
5558                  */
5559                 error = ip_create_helper_stream(connp, is->is_ldi_ident);
5560                 if (error != 0) {
5561                         ip0dbg(("rawip_ioctl: create of IP helper stream "
5562                             "failed %d\n", error));
5563                         return (error);
5564                 }
5565         }
5566 
5567         switch (cmd) {
5568         case _SIOCSOCKFALLBACK:
5569         case TI_GETPEERNAME:
5570         case TI_GETMYNAME:
5571 #ifdef DEBUG
5572                 cmn_err(CE_CONT, "icmp_ioctl cmd 0x%x on non streams"
5573                     " socket", cmd);
5574 #endif
5575                 error = EINVAL;
5576                 break;
5577         default:
5578                 /*
5579                  * Pass on to IP using helper stream
5580                  */
5581                 error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
5582                     cmd, arg, mode, cr, rvalp);
5583                 break;
5584         }
5585         return (error);
5586 }
5587 
5588 int
5589 rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
5590     cred_t *cr)
5591 {
5592         sin6_t          *sin6;
5593         sin_t           *sin = NULL;
5594         uint_t          srcid;
5595         conn_t          *connp = (conn_t *)proto_handle;
5596         icmp_t          *icmp = connp->conn_icmp;
5597         int             error = 0;
5598         icmp_stack_t    *is = icmp->icmp_is;
5599         pid_t           pid = curproc->p_pid;
5600         ip_xmit_attr_t  *ixa;
5601 
5602         ASSERT(DB_TYPE(mp) == M_DATA);
5603 
5604         /* All Solaris components should pass a cred for this operation. */
5605         ASSERT(cr != NULL);
5606 
5607         /* do an implicit bind if necessary */
5608         if (icmp->icmp_state == TS_UNBND) {
5609                 error = rawip_implicit_bind(connp);
5610                 /*
5611                  * We could be racing with an actual bind, in which case
5612                  * we would see EPROTO. We cross our fingers and try
5613                  * to connect.
5614                  */
5615                 if (!(error == 0 || error == EPROTO)) {
5616                         freemsg(mp);
5617                         return (error);
5618                 }
5619         }
5620 
5621         /* Protocol 255 contains full IP headers */
5622         /* Read without holding lock */
5623         if (icmp->icmp_hdrincl) {
5624                 ASSERT(connp->conn_ipversion == IPV4_VERSION);
5625                 if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) {
5626                         if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
5627                                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5628                                 freemsg(mp);
5629                                 return (EINVAL);
5630                         }
5631                 }
5632                 error = icmp_output_hdrincl(connp, mp, cr, pid);
5633                 if (is->is_sendto_ignerr)
5634                         return (0);
5635                 else
5636                         return (error);
5637         }
5638 
5639         /* Connected? */
5640         if (msg->msg_name == NULL) {
5641                 if (icmp->icmp_state != TS_DATA_XFER) {
5642                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5643                         return (EDESTADDRREQ);
5644                 }
5645                 if (msg->msg_controllen != 0) {
5646                         error = icmp_output_ancillary(connp, NULL, NULL, mp,
5647                             NULL, msg, cr, pid);
5648                 } else {
5649                         error = icmp_output_connected(connp, mp, cr, pid);
5650                 }
5651                 if (is->is_sendto_ignerr)
5652                         return (0);
5653                 else
5654                         return (error);
5655         }
5656         if (icmp->icmp_state == TS_DATA_XFER) {
5657                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5658                 return (EISCONN);
5659         }
5660         error = proto_verify_ip_addr(connp->conn_family,
5661             (struct sockaddr *)msg->msg_name, msg->msg_namelen);
5662         if (error != 0) {
5663                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5664                 return (error);
5665         }
5666         switch (connp->conn_family) {
5667         case AF_INET6:
5668                 sin6 = (sin6_t *)msg->msg_name;
5669 
5670                 /* No support for mapped addresses on raw sockets */
5671                 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
5672                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5673                         return (EADDRNOTAVAIL);
5674                 }
5675                 srcid = sin6->__sin6_src_id;
5676 
5677                 /*
5678                  * If the local address is a mapped address return
5679                  * an error.
5680                  * It would be possible to send an IPv6 packet but the
5681                  * response would never make it back to the application
5682                  * since it is bound to a mapped address.
5683                  */
5684                 if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
5685                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5686                         return (EADDRNOTAVAIL);
5687                 }
5688 
5689                 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
5690                         sin6->sin6_addr = ipv6_loopback;
5691 
5692                 /*
5693                  * We have to allocate an ip_xmit_attr_t before we grab
5694                  * conn_lock and we need to hold conn_lock once we've check
5695                  * conn_same_as_last_v6 to handle concurrent send* calls on a
5696                  * socket.
5697                  */
5698                 if (msg->msg_controllen == 0) {
5699                         ixa = conn_get_ixa(connp, B_FALSE);
5700                         if (ixa == NULL) {
5701                                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5702                                 return (ENOMEM);
5703                         }
5704                 } else {
5705                         ixa = NULL;
5706                 }
5707                 mutex_enter(&connp->conn_lock);
5708                 if (icmp->icmp_delayed_error != 0) {
5709                         sin6_t  *sin2 = (sin6_t *)&icmp->icmp_delayed_addr;
5710 
5711                         error = icmp->icmp_delayed_error;
5712                         icmp->icmp_delayed_error = 0;
5713 
5714                         /* Compare IP address and family */
5715 
5716                         if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
5717                             &sin2->sin6_addr) &&
5718                             sin6->sin6_family == sin2->sin6_family) {
5719                                 mutex_exit(&connp->conn_lock);
5720                                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5721                                 if (ixa != NULL)
5722                                         ixa_refrele(ixa);
5723                                 return (error);
5724                         }
5725                 }
5726                 if (msg->msg_controllen != 0) {
5727                         mutex_exit(&connp->conn_lock);
5728                         ASSERT(ixa == NULL);
5729                         error = icmp_output_ancillary(connp, NULL, sin6, mp,
5730                             NULL, msg, cr, pid);
5731                 } else if (conn_same_as_last_v6(connp, sin6) &&
5732                     connp->conn_lastsrcid == srcid &&
5733                     ipsec_outbound_policy_current(ixa)) {
5734                         /* icmp_output_lastdst drops conn_lock */
5735                         error = icmp_output_lastdst(connp, mp, cr, pid, ixa);
5736                 } else {
5737                         /* icmp_output_newdst drops conn_lock */
5738                         error = icmp_output_newdst(connp, mp, NULL, sin6, cr,
5739                             pid, ixa);
5740                 }
5741                 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
5742                 if (is->is_sendto_ignerr)
5743                         return (0);
5744                 else
5745                         return (error);
5746         case AF_INET:
5747                 sin = (sin_t *)msg->msg_name;
5748 
5749                 if (sin->sin_addr.s_addr == INADDR_ANY)
5750                         sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
5751 
5752                 /*
5753                  * We have to allocate an ip_xmit_attr_t before we grab
5754                  * conn_lock and we need to hold conn_lock once we've check
5755                  * conn_same_as_last_v6 to handle concurrent send* on a socket.
5756                  */
5757                 if (msg->msg_controllen == 0) {
5758                         ixa = conn_get_ixa(connp, B_FALSE);
5759                         if (ixa == NULL) {
5760                                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5761                                 return (ENOMEM);
5762                         }
5763                 } else {
5764                         ixa = NULL;
5765                 }
5766                 mutex_enter(&connp->conn_lock);
5767                 if (icmp->icmp_delayed_error != 0) {
5768                         sin_t  *sin2 = (sin_t *)&icmp->icmp_delayed_addr;
5769 
5770                         error = icmp->icmp_delayed_error;
5771                         icmp->icmp_delayed_error = 0;
5772 
5773                         /* Compare IP address */
5774 
5775                         if (sin->sin_addr.s_addr == sin2->sin_addr.s_addr) {
5776                                 mutex_exit(&connp->conn_lock);
5777                                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5778                                 if (ixa != NULL)
5779                                         ixa_refrele(ixa);
5780                                 return (error);
5781                         }
5782                 }
5783 
5784                 if (msg->msg_controllen != 0) {
5785                         mutex_exit(&connp->conn_lock);
5786                         ASSERT(ixa == NULL);
5787                         error = icmp_output_ancillary(connp, sin, NULL, mp,
5788                             NULL, msg, cr, pid);
5789                 } else if (conn_same_as_last_v4(connp, sin) &&
5790                     ipsec_outbound_policy_current(ixa)) {
5791                         /* icmp_output_lastdst drops conn_lock */
5792                         error = icmp_output_lastdst(connp, mp, cr, pid, ixa);
5793                 } else {
5794                         /* icmp_output_newdst drops conn_lock */
5795                         error = icmp_output_newdst(connp, mp, sin, NULL, cr,
5796                             pid, ixa);
5797                 }
5798                 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
5799                 if (is->is_sendto_ignerr)
5800                         return (0);
5801                 else
5802                         return (error);
5803         default:
5804                 return (EINVAL);
5805         }
5806 }
5807 
5808 sock_downcalls_t sock_rawip_downcalls = {
5809         rawip_activate,
5810         rawip_accept,
5811         rawip_bind,
5812         rawip_listen,
5813         rawip_connect,
5814         rawip_getpeername,
5815         rawip_getsockname,
5816         rawip_getsockopt,
5817         rawip_setsockopt,
5818         rawip_send,
5819         NULL,
5820         NULL,
5821         NULL,
5822         rawip_shutdown,
5823         rawip_clr_flowctrl,
5824         rawip_ioctl,
5825         rawip_close
5826 };