1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2013 by Delphix. All rights reserved. 24 */ 25 /* Copyright (c) 1990 Mentat Inc. 
*/ 26 27 #include <sys/types.h> 28 #include <sys/stream.h> 29 #include <sys/stropts.h> 30 #include <sys/strlog.h> 31 #include <sys/strsun.h> 32 #define _SUN_TPI_VERSION 2 33 #include <sys/tihdr.h> 34 #include <sys/timod.h> 35 #include <sys/ddi.h> 36 #include <sys/sunddi.h> 37 #include <sys/strsubr.h> 38 #include <sys/suntpi.h> 39 #include <sys/xti_inet.h> 40 #include <sys/cmn_err.h> 41 #include <sys/kmem.h> 42 #include <sys/cred.h> 43 #include <sys/policy.h> 44 #include <sys/priv.h> 45 #include <sys/ucred.h> 46 #include <sys/zone.h> 47 48 #include <sys/sockio.h> 49 #include <sys/socket.h> 50 #include <sys/socketvar.h> 51 #include <sys/vtrace.h> 52 #include <sys/sdt.h> 53 #include <sys/debug.h> 54 #include <sys/isa_defs.h> 55 #include <sys/random.h> 56 #include <netinet/in.h> 57 #include <netinet/ip6.h> 58 #include <netinet/icmp6.h> 59 #include <netinet/udp.h> 60 61 #include <inet/common.h> 62 #include <inet/ip.h> 63 #include <inet/ip_impl.h> 64 #include <inet/ipsec_impl.h> 65 #include <inet/ip6.h> 66 #include <inet/ip_ire.h> 67 #include <inet/ip_if.h> 68 #include <inet/ip_multi.h> 69 #include <inet/ip_ndp.h> 70 #include <inet/proto_set.h> 71 #include <inet/mib2.h> 72 #include <inet/nd.h> 73 #include <inet/optcom.h> 74 #include <inet/snmpcom.h> 75 #include <inet/kstatcom.h> 76 #include <inet/ipclassifier.h> 77 78 #include <sys/tsol/label.h> 79 #include <sys/tsol/tnet.h> 80 81 #include <inet/rawip_impl.h> 82 83 #include <sys/disp.h> 84 85 /* 86 * Synchronization notes: 87 * 88 * RAWIP is MT and uses the usual kernel synchronization primitives. We use 89 * conn_lock to protect the icmp_t. 90 * 91 * Plumbing notes: 92 * ICMP is always a device driver. For compatibility with mibopen() code 93 * it is possible to I_PUSH "icmp", but that results in pushing a passthrough 94 * dummy module. 
 */

/*
 * Forward declarations for the file-local TPI/STREAMS handlers and helpers
 * that are referenced before their definitions.
 */
static void icmp_addr_req(queue_t *q, mblk_t *mp);
static void icmp_tpi_bind(queue_t *q, mblk_t *mp);
static void icmp_bind_proto(icmp_t *icmp);
static int icmp_build_hdr_template(conn_t *, const in6_addr_t *,
    const in6_addr_t *, uint32_t);
static void icmp_capability_req(queue_t *q, mblk_t *mp);
static int icmp_close(queue_t *q, int flags);
static void icmp_close_free(conn_t *);
static void icmp_tpi_connect(queue_t *q, mblk_t *mp);
static void icmp_tpi_disconnect(queue_t *q, mblk_t *mp);
static void icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
    int sys_error);
static void icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
    t_scalar_t tlierr, int sys_error);
static void icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2,
    ip_recv_attr_t *);
static void icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp,
    ip_recv_attr_t *);
static void icmp_info_req(queue_t *q, mblk_t *mp);
static void icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
static conn_t *icmp_open(int family, cred_t *credp, int *err, int flags);
static int icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
    cred_t *credp);
static int icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
    cred_t *credp);
static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
/* Option get/set entry points; not static, so visible outside this file. */
int icmp_opt_set(conn_t *connp, uint_t optset_context,
    int level, int name, uint_t inlen,
    uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
    void *thisdg_attrs, cred_t *cr);
int icmp_opt_get(conn_t *connp, int level, int name,
    uchar_t *ptr);
static int icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin,
    sin6_t *sin6, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa);
static mblk_t *icmp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *,
    const in6_addr_t *, const in6_addr_t *, uint32_t, mblk_t *, int *);
static mblk_t *icmp_prepend_header_template(conn_t *, ip_xmit_attr_t *,
    mblk_t *, const in6_addr_t *, uint32_t, int *);
static int icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
    uchar_t *ptr, int len);
static void icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
static void icmp_tpi_unbind(queue_t *q, mblk_t *mp);
static void icmp_wput(queue_t *q, mblk_t *mp);
static void icmp_wput_fallback(queue_t *q, mblk_t *mp);
static void icmp_wput_other(queue_t *q, mblk_t *mp);
static void icmp_wput_iocdata(queue_t *q, mblk_t *mp);
static void icmp_wput_restricted(queue_t *q, mblk_t *mp);
static void icmp_ulp_recv(conn_t *, mblk_t *, uint_t);

/* Per-netstack setup and teardown hooks. */
static void *rawip_stack_init(netstackid_t stackid, netstack_t *ns);
static void rawip_stack_fini(netstackid_t stackid, void *arg);

/* Per-netstack kstat hooks. */
static void *rawip_kstat_init(netstackid_t stackid);
static void rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp);
static int rawip_kstat_update(kstat_t *kp, int rw);
static void rawip_stack_shutdown(netstackid_t stackid, void *arg);

/* Common routines for TPI and socket module */
static conn_t *rawip_do_open(int, cred_t *, int *, int);
static void rawip_do_close(conn_t *);
static int rawip_do_bind(conn_t *, struct sockaddr *, socklen_t);
static int rawip_do_unbind(conn_t *);
static int rawip_do_connect(conn_t *, const struct sockaddr *, socklen_t,
    cred_t *, pid_t);

/* Socket-module address queries; not static, so visible outside this file. */
int rawip_getsockname(sock_lower_handle_t, struct sockaddr *,
    socklen_t *, cred_t *);
int rawip_getpeername(sock_lower_handle_t, struct sockaddr *,
    socklen_t *, cred_t *);

/*
 * STREAMS module information shared by every qinit structure in this file.
 * NOTE(review): the positional values presumably follow the module_info(9S)
 * field order (id number, name, min packet size, max packet size, high
 * water mark, low water mark) - confirm against <sys/stream.h>.
 */
static struct module_info icmp_mod_info = {
	5707, "icmp", 1, INFPSZ, 512, 128
};

/*
 * Entry points for ICMP as a device.
 * We have separate open functions for the /dev/icmp and /dev/icmp6 devices.
173 */ 174 static struct qinit icmprinitv4 = { 175 NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info 176 }; 177 178 static struct qinit icmprinitv6 = { 179 NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info 180 }; 181 182 static struct qinit icmpwinit = { 183 (pfi_t)icmp_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, &icmp_mod_info 184 }; 185 186 /* ICMP entry point during fallback */ 187 static struct qinit icmp_fallback_sock_winit = { 188 (pfi_t)icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info 189 }; 190 191 /* For AF_INET aka /dev/icmp */ 192 struct streamtab icmpinfov4 = { 193 &icmprinitv4, &icmpwinit 194 }; 195 196 /* For AF_INET6 aka /dev/icmp6 */ 197 struct streamtab icmpinfov6 = { 198 &icmprinitv6, &icmpwinit 199 }; 200 201 /* Default structure copied into T_INFO_ACK messages */ 202 static struct T_info_ack icmp_g_t_info_ack = { 203 T_INFO_ACK, 204 IP_MAXPACKET, /* TSDU_size. icmp allows maximum size messages. */ 205 T_INVALID, /* ETSDU_size. icmp does not support expedited data. */ 206 T_INVALID, /* CDATA_size. icmp does not support connect data. */ 207 T_INVALID, /* DDATA_size. icmp does not support disconnect data. */ 208 0, /* ADDR_size - filled in later. */ 209 0, /* OPT_size - not initialized here */ 210 IP_MAXPACKET, /* TIDU_size. icmp allows maximum size messages. */ 211 T_CLTS, /* SERV_type. icmp supports connection-less. */ 212 TS_UNBND, /* CURRENT_state. This is set from icmp_state. 
*/ 213 (XPG4_1|SENDZERO) /* PROVIDER_flag */ 214 }; 215 216 static int 217 icmp_set_buf_prop(netstack_t *stack, cred_t *cr, mod_prop_info_t *pinfo, 218 const char *ifname, const void *pval, uint_t flags) 219 { 220 return (mod_set_buf_prop(stack->netstack_icmp->is_propinfo_tbl, 221 stack, cr, pinfo, ifname, pval, flags)); 222 } 223 224 static int 225 icmp_get_buf_prop(netstack_t *stack, mod_prop_info_t *pinfo, const char *ifname, 226 void *val, uint_t psize, uint_t flags) 227 { 228 return (mod_get_buf_prop(stack->netstack_icmp->is_propinfo_tbl, stack, 229 pinfo, ifname, val, psize, flags)); 230 } 231 232 /* 233 * All of these are alterable, within the min/max values given, at run time. 234 * 235 * Note: All those tunables which do not start with "icmp_" are Committed and 236 * therefore are public. See PSARC 2010/080. 237 */ 238 static mod_prop_info_t icmp_propinfo_tbl[] = { 239 /* tunable - 0 */ 240 { "_wroff_extra", MOD_PROTO_RAWIP, 241 mod_set_uint32, mod_get_uint32, 242 {0, 128, 32}, {32} }, 243 244 { "_ipv4_ttl", MOD_PROTO_RAWIP, 245 mod_set_uint32, mod_get_uint32, 246 {1, 255, 255}, {255} }, 247 248 { "_ipv6_hoplimit", MOD_PROTO_RAWIP, 249 mod_set_uint32, mod_get_uint32, 250 {0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS}, 251 {IPV6_DEFAULT_HOPS} }, 252 253 { "_bsd_compat", MOD_PROTO_RAWIP, 254 mod_set_boolean, mod_get_boolean, 255 {B_TRUE}, {B_TRUE} }, 256 257 { "send_buf", MOD_PROTO_RAWIP, 258 icmp_set_buf_prop, icmp_get_buf_prop, 259 {4096, 65536, 8192}, {8192} }, 260 261 { "_xmit_lowat", MOD_PROTO_RAWIP, 262 mod_set_uint32, mod_get_uint32, 263 {0, 65536, 1024}, {1024} }, 264 265 { "recv_buf", MOD_PROTO_RAWIP, 266 icmp_set_buf_prop, icmp_get_buf_prop, 267 {4096, 65536, 8192}, {8192} }, 268 269 { "max_buf", MOD_PROTO_RAWIP, 270 mod_set_uint32, mod_get_uint32, 271 {65536, ULP_MAX_BUF, 256*1024}, {256*1024} }, 272 273 { "_pmtu_discovery", MOD_PROTO_RAWIP, 274 mod_set_boolean, mod_get_boolean, 275 {B_FALSE}, {B_FALSE} }, 276 277 { "_sendto_ignerr", MOD_PROTO_RAWIP, 278 
mod_set_boolean, mod_get_boolean, 279 {B_FALSE}, {B_FALSE} }, 280 281 { "?", MOD_PROTO_RAWIP, NULL, mod_get_allprop, {0}, {0} }, 282 283 { NULL, 0, NULL, NULL, {0}, {0} } 284 }; 285 286 #define is_wroff_extra is_propinfo_tbl[0].prop_cur_uval 287 #define is_ipv4_ttl is_propinfo_tbl[1].prop_cur_uval 288 #define is_ipv6_hoplimit is_propinfo_tbl[2].prop_cur_uval 289 #define is_bsd_compat is_propinfo_tbl[3].prop_cur_bval 290 #define is_xmit_hiwat is_propinfo_tbl[4].prop_cur_uval 291 #define is_xmit_lowat is_propinfo_tbl[5].prop_cur_uval 292 #define is_recv_hiwat is_propinfo_tbl[6].prop_cur_uval 293 #define is_max_buf is_propinfo_tbl[7].prop_cur_uval 294 #define is_pmtu_discovery is_propinfo_tbl[8].prop_cur_bval 295 #define is_sendto_ignerr is_propinfo_tbl[9].prop_cur_bval 296 297 typedef union T_primitives *t_primp_t; 298 299 /* 300 * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message 301 * passed to icmp_wput. 302 * It calls IP to verify the local IP address, and calls IP to insert 303 * the conn_t in the fanout table. 304 * If everything is ok it then sends the T_BIND_ACK back up. 305 */ 306 static void 307 icmp_tpi_bind(queue_t *q, mblk_t *mp) 308 { 309 int error; 310 struct sockaddr *sa; 311 struct T_bind_req *tbr; 312 socklen_t len; 313 sin_t *sin; 314 sin6_t *sin6; 315 icmp_t *icmp; 316 conn_t *connp = Q_TO_CONN(q); 317 mblk_t *mp1; 318 cred_t *cr; 319 320 /* 321 * All Solaris components should pass a db_credp 322 * for this TPI message, hence we ASSERT. 323 * But in case there is some other M_PROTO that looks 324 * like a TPI message sent by some other kernel 325 * component, we check and return an error. 
326 */ 327 cr = msg_getcred(mp, NULL); 328 ASSERT(cr != NULL); 329 if (cr == NULL) { 330 icmp_err_ack(q, mp, TSYSERR, EINVAL); 331 return; 332 } 333 334 icmp = connp->conn_icmp; 335 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { 336 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 337 "icmp_bind: bad req, len %u", 338 (uint_t)(mp->b_wptr - mp->b_rptr)); 339 icmp_err_ack(q, mp, TPROTO, 0); 340 return; 341 } 342 343 if (icmp->icmp_state != TS_UNBND) { 344 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 345 "icmp_bind: bad state, %u", icmp->icmp_state); 346 icmp_err_ack(q, mp, TOUTSTATE, 0); 347 return; 348 } 349 350 /* 351 * Reallocate the message to make sure we have enough room for an 352 * address. 353 */ 354 mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1); 355 if (mp1 == NULL) { 356 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 357 return; 358 } 359 mp = mp1; 360 361 /* Reset the message type in preparation for shipping it back. */ 362 DB_TYPE(mp) = M_PCPROTO; 363 tbr = (struct T_bind_req *)mp->b_rptr; 364 len = tbr->ADDR_length; 365 switch (len) { 366 case 0: /* request for a generic port */ 367 tbr->ADDR_offset = sizeof (struct T_bind_req); 368 if (connp->conn_family == AF_INET) { 369 tbr->ADDR_length = sizeof (sin_t); 370 sin = (sin_t *)&tbr[1]; 371 *sin = sin_null; 372 sin->sin_family = AF_INET; 373 mp->b_wptr = (uchar_t *)&sin[1]; 374 sa = (struct sockaddr *)sin; 375 len = sizeof (sin_t); 376 } else { 377 ASSERT(connp->conn_family == AF_INET6); 378 tbr->ADDR_length = sizeof (sin6_t); 379 sin6 = (sin6_t *)&tbr[1]; 380 *sin6 = sin6_null; 381 sin6->sin6_family = AF_INET6; 382 mp->b_wptr = (uchar_t *)&sin6[1]; 383 sa = (struct sockaddr *)sin6; 384 len = sizeof (sin6_t); 385 } 386 break; 387 388 case sizeof (sin_t): /* Complete IPv4 address */ 389 sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset, 390 sizeof (sin_t)); 391 break; 392 393 case sizeof (sin6_t): /* Complete IPv6 address */ 394 sa = (struct sockaddr *)mi_offset_param(mp, 395 
tbr->ADDR_offset, sizeof (sin6_t)); 396 break; 397 398 default: 399 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 400 "icmp_bind: bad ADDR_length %u", tbr->ADDR_length); 401 icmp_err_ack(q, mp, TBADADDR, 0); 402 return; 403 } 404 405 error = rawip_do_bind(connp, sa, len); 406 if (error != 0) { 407 if (error > 0) { 408 icmp_err_ack(q, mp, TSYSERR, error); 409 } else { 410 icmp_err_ack(q, mp, -error, 0); 411 } 412 } else { 413 tbr->PRIM_type = T_BIND_ACK; 414 qreply(q, mp); 415 } 416 } 417 418 static int 419 rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len) 420 { 421 sin_t *sin; 422 sin6_t *sin6; 423 icmp_t *icmp = connp->conn_icmp; 424 int error = 0; 425 ip_laddr_t laddr_type = IPVL_UNICAST_UP; /* INADDR_ANY */ 426 in_port_t lport; /* Network byte order */ 427 ipaddr_t v4src; /* Set if AF_INET */ 428 in6_addr_t v6src; 429 uint_t scopeid = 0; 430 zoneid_t zoneid = IPCL_ZONEID(connp); 431 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 432 433 if (sa == NULL || !OK_32PTR((char *)sa)) { 434 return (EINVAL); 435 } 436 437 switch (len) { 438 case sizeof (sin_t): /* Complete IPv4 address */ 439 sin = (sin_t *)sa; 440 if (sin->sin_family != AF_INET || 441 connp->conn_family != AF_INET) { 442 /* TSYSERR, EAFNOSUPPORT */ 443 return (EAFNOSUPPORT); 444 } 445 v4src = sin->sin_addr.s_addr; 446 IN6_IPADDR_TO_V4MAPPED(v4src, &v6src); 447 if (v4src != INADDR_ANY) { 448 laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst, 449 B_TRUE); 450 } 451 lport = sin->sin_port; 452 break; 453 case sizeof (sin6_t): /* Complete IPv6 address */ 454 sin6 = (sin6_t *)sa; 455 if (sin6->sin6_family != AF_INET6 || 456 connp->conn_family != AF_INET6) { 457 /* TSYSERR, EAFNOSUPPORT */ 458 return (EAFNOSUPPORT); 459 } 460 /* No support for mapped addresses on raw sockets */ 461 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 462 /* TSYSERR, EADDRNOTAVAIL */ 463 return (EADDRNOTAVAIL); 464 } 465 v6src = sin6->sin6_addr; 466 if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 467 if 
(IN6_IS_ADDR_LINKSCOPE(&v6src)) 468 scopeid = sin6->sin6_scope_id; 469 laddr_type = ip_laddr_verify_v6(&v6src, zoneid, ipst, 470 B_TRUE, scopeid); 471 } 472 lport = sin6->sin6_port; 473 break; 474 475 default: 476 /* TBADADDR */ 477 return (EADDRNOTAVAIL); 478 } 479 480 /* Is the local address a valid unicast, multicast, or broadcast? */ 481 if (laddr_type == IPVL_BAD) 482 return (EADDRNOTAVAIL); 483 484 /* 485 * The state must be TS_UNBND. 486 */ 487 mutex_enter(&connp->conn_lock); 488 if (icmp->icmp_state != TS_UNBND) { 489 mutex_exit(&connp->conn_lock); 490 return (-TOUTSTATE); 491 } 492 493 /* 494 * Copy the source address into our icmp structure. This address 495 * may still be zero; if so, ip will fill in the correct address 496 * each time an outbound packet is passed to it. 497 * If we are binding to a broadcast or multicast address then 498 * we just set the conn_bound_addr since we don't want to use 499 * that as the source address when sending. 500 */ 501 connp->conn_bound_addr_v6 = v6src; 502 connp->conn_laddr_v6 = v6src; 503 if (scopeid != 0) { 504 connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; 505 connp->conn_ixa->ixa_scopeid = scopeid; 506 connp->conn_incoming_ifindex = scopeid; 507 } else { 508 connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 509 connp->conn_incoming_ifindex = connp->conn_bound_if; 510 } 511 512 switch (laddr_type) { 513 case IPVL_UNICAST_UP: 514 case IPVL_UNICAST_DOWN: 515 connp->conn_saddr_v6 = v6src; 516 connp->conn_mcbc_bind = B_FALSE; 517 break; 518 case IPVL_MCAST: 519 case IPVL_BCAST: 520 /* ip_set_destination will pick a source address later */ 521 connp->conn_saddr_v6 = ipv6_all_zeros; 522 connp->conn_mcbc_bind = B_TRUE; 523 break; 524 } 525 526 /* Any errors after this point should use late_error */ 527 528 /* 529 * Use sin_port/sin6_port since applications like psh use SOCK_RAW 530 * with IPPROTO_TCP. 
531 */ 532 connp->conn_lport = lport; 533 connp->conn_fport = 0; 534 535 if (connp->conn_family == AF_INET) { 536 ASSERT(connp->conn_ipversion == IPV4_VERSION); 537 } else { 538 ASSERT(connp->conn_ipversion == IPV6_VERSION); 539 } 540 541 icmp->icmp_state = TS_IDLE; 542 543 /* 544 * We create an initial header template here to make a subsequent 545 * sendto have a starting point. Since conn_last_dst is zero the 546 * first sendto will always follow the 'dst changed' code path. 547 * Note that we defer massaging options and the related checksum 548 * adjustment until we have a destination address. 549 */ 550 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 551 &connp->conn_faddr_v6, connp->conn_flowinfo); 552 if (error != 0) { 553 mutex_exit(&connp->conn_lock); 554 goto late_error; 555 } 556 /* Just in case */ 557 connp->conn_faddr_v6 = ipv6_all_zeros; 558 connp->conn_v6lastdst = ipv6_all_zeros; 559 mutex_exit(&connp->conn_lock); 560 561 error = ip_laddr_fanout_insert(connp); 562 if (error != 0) 563 goto late_error; 564 565 /* Bind succeeded */ 566 return (0); 567 568 late_error: 569 mutex_enter(&connp->conn_lock); 570 connp->conn_saddr_v6 = ipv6_all_zeros; 571 connp->conn_bound_addr_v6 = ipv6_all_zeros; 572 connp->conn_laddr_v6 = ipv6_all_zeros; 573 if (scopeid != 0) { 574 connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 575 connp->conn_incoming_ifindex = connp->conn_bound_if; 576 } 577 icmp->icmp_state = TS_UNBND; 578 connp->conn_v6lastdst = ipv6_all_zeros; 579 connp->conn_lport = 0; 580 581 /* Restore the header that was built above - different source address */ 582 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 583 &connp->conn_faddr_v6, connp->conn_flowinfo); 584 mutex_exit(&connp->conn_lock); 585 return (error); 586 } 587 588 /* 589 * Tell IP to just bind to the protocol. 
590 */ 591 static void 592 icmp_bind_proto(icmp_t *icmp) 593 { 594 conn_t *connp = icmp->icmp_connp; 595 596 mutex_enter(&connp->conn_lock); 597 connp->conn_saddr_v6 = ipv6_all_zeros; 598 connp->conn_laddr_v6 = ipv6_all_zeros; 599 connp->conn_faddr_v6 = ipv6_all_zeros; 600 connp->conn_v6lastdst = ipv6_all_zeros; 601 mutex_exit(&connp->conn_lock); 602 603 (void) ip_laddr_fanout_insert(connp); 604 } 605 606 /* 607 * This routine handles each T_CONN_REQ message passed to icmp. It 608 * associates a default destination address with the stream. 609 * 610 * After various error checks are completed, icmp_connect() lays 611 * the target address and port into the composite header template. 612 * Then we ask IP for information, including a source address if we didn't 613 * already have one. Finally we send up the T_OK_ACK reply message. 614 */ 615 static void 616 icmp_tpi_connect(queue_t *q, mblk_t *mp) 617 { 618 conn_t *connp = Q_TO_CONN(q); 619 struct T_conn_req *tcr; 620 struct sockaddr *sa; 621 socklen_t len; 622 int error; 623 cred_t *cr; 624 pid_t pid; 625 /* 626 * All Solaris components should pass a db_credp 627 * for this TPI message, hence we ASSERT. 628 * But in case there is some other M_PROTO that looks 629 * like a TPI message sent by some other kernel 630 * component, we check and return an error. 
631 */ 632 cr = msg_getcred(mp, &pid); 633 ASSERT(cr != NULL); 634 if (cr == NULL) { 635 icmp_err_ack(q, mp, TSYSERR, EINVAL); 636 return; 637 } 638 639 tcr = (struct T_conn_req *)mp->b_rptr; 640 /* Sanity checks */ 641 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) { 642 icmp_err_ack(q, mp, TPROTO, 0); 643 return; 644 } 645 646 if (tcr->OPT_length != 0) { 647 icmp_err_ack(q, mp, TBADOPT, 0); 648 return; 649 } 650 651 len = tcr->DEST_length; 652 653 switch (len) { 654 default: 655 icmp_err_ack(q, mp, TBADADDR, 0); 656 return; 657 case sizeof (sin_t): 658 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, 659 sizeof (sin_t)); 660 break; 661 case sizeof (sin6_t): 662 sa = (struct sockaddr *)mi_offset_param(mp, 663 tcr->DEST_offset, sizeof (sin6_t)); 664 break; 665 } 666 667 error = proto_verify_ip_addr(connp->conn_family, sa, len); 668 if (error != 0) { 669 icmp_err_ack(q, mp, TSYSERR, error); 670 return; 671 } 672 673 error = rawip_do_connect(connp, sa, len, cr, pid); 674 if (error != 0) { 675 if (error < 0) { 676 icmp_err_ack(q, mp, -error, 0); 677 } else { 678 icmp_err_ack(q, mp, 0, error); 679 } 680 } else { 681 mblk_t *mp1; 682 683 /* 684 * We have to send a connection confirmation to 685 * keep TLI happy. 686 */ 687 if (connp->conn_family == AF_INET) { 688 mp1 = mi_tpi_conn_con(NULL, (char *)sa, 689 sizeof (sin_t), NULL, 0); 690 } else { 691 ASSERT(connp->conn_family == AF_INET6); 692 mp1 = mi_tpi_conn_con(NULL, (char *)sa, 693 sizeof (sin6_t), NULL, 0); 694 } 695 if (mp1 == NULL) { 696 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 697 return; 698 } 699 700 /* 701 * Send ok_ack for T_CONN_REQ 702 */ 703 mp = mi_tpi_ok_ack_alloc(mp); 704 if (mp == NULL) { 705 /* Unable to reuse the T_CONN_REQ for the ack. 
*/ 706 icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM); 707 return; 708 } 709 putnext(connp->conn_rq, mp); 710 putnext(connp->conn_rq, mp1); 711 } 712 } 713 714 static int 715 rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, 716 cred_t *cr, pid_t pid) 717 { 718 icmp_t *icmp; 719 sin_t *sin; 720 sin6_t *sin6; 721 int error; 722 uint16_t dstport; 723 ipaddr_t v4dst; 724 in6_addr_t v6dst; 725 uint32_t flowinfo; 726 ip_xmit_attr_t *ixa; 727 ip_xmit_attr_t *oldixa; 728 uint_t scopeid = 0; 729 uint_t srcid = 0; 730 in6_addr_t v6src = connp->conn_saddr_v6; 731 732 icmp = connp->conn_icmp; 733 734 if (sa == NULL || !OK_32PTR((char *)sa)) { 735 return (EINVAL); 736 } 737 738 ASSERT(sa != NULL && len != 0); 739 740 /* 741 * Determine packet type based on type of address passed in 742 * the request should contain an IPv4 or IPv6 address. 743 * Make sure that address family matches the type of 744 * family of the address passed down. 745 */ 746 switch (len) { 747 case sizeof (sin_t): 748 sin = (sin_t *)sa; 749 750 v4dst = sin->sin_addr.s_addr; 751 dstport = sin->sin_port; 752 IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); 753 ASSERT(connp->conn_ipversion == IPV4_VERSION); 754 break; 755 756 case sizeof (sin6_t): 757 sin6 = (sin6_t *)sa; 758 759 /* No support for mapped addresses on raw sockets */ 760 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 761 return (EADDRNOTAVAIL); 762 } 763 v6dst = sin6->sin6_addr; 764 dstport = sin6->sin6_port; 765 ASSERT(connp->conn_ipversion == IPV6_VERSION); 766 flowinfo = sin6->sin6_flowinfo; 767 if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) 768 scopeid = sin6->sin6_scope_id; 769 srcid = sin6->__sin6_src_id; 770 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 771 ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 772 connp->conn_netstack); 773 } 774 break; 775 } 776 777 /* 778 * If there is a different thread using conn_ixa then we get a new 779 * copy and cut the old one loose from conn_ixa. 
Otherwise we use 780 * conn_ixa and prevent any other thread from using/changing it. 781 * Once connect() is done other threads can use conn_ixa since the 782 * refcnt will be back at one. 783 * We defer updating conn_ixa until later to handle any concurrent 784 * conn_ixa_cleanup thread. 785 */ 786 ixa = conn_get_ixa(connp, B_FALSE); 787 if (ixa == NULL) 788 return (ENOMEM); 789 790 mutex_enter(&connp->conn_lock); 791 /* 792 * This icmp_t must have bound already before doing a connect. 793 * Reject if a connect is in progress (we drop conn_lock during 794 * rawip_do_connect). 795 */ 796 if (icmp->icmp_state == TS_UNBND || icmp->icmp_state == TS_WCON_CREQ) { 797 mutex_exit(&connp->conn_lock); 798 ixa_refrele(ixa); 799 return (-TOUTSTATE); 800 } 801 802 if (icmp->icmp_state == TS_DATA_XFER) { 803 /* Already connected - clear out state */ 804 if (connp->conn_mcbc_bind) 805 connp->conn_saddr_v6 = ipv6_all_zeros; 806 else 807 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 808 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 809 connp->conn_faddr_v6 = ipv6_all_zeros; 810 icmp->icmp_state = TS_IDLE; 811 } 812 813 /* 814 * Use sin_port/sin6_port since applications like psh use SOCK_RAW 815 * with IPPROTO_TCP. 816 */ 817 connp->conn_fport = dstport; 818 if (connp->conn_ipversion == IPV4_VERSION) { 819 /* 820 * Interpret a zero destination to mean loopback. 821 * Update the T_CONN_REQ (sin/sin6) since it is used to 822 * generate the T_CONN_CON. 823 */ 824 if (v4dst == INADDR_ANY) { 825 v4dst = htonl(INADDR_LOOPBACK); 826 IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); 827 ASSERT(connp->conn_family == AF_INET); 828 sin->sin_addr.s_addr = v4dst; 829 } 830 connp->conn_faddr_v6 = v6dst; 831 connp->conn_flowinfo = 0; 832 } else { 833 ASSERT(connp->conn_ipversion == IPV6_VERSION); 834 /* 835 * Interpret a zero destination to mean loopback. 836 * Update the T_CONN_REQ (sin/sin6) since it is used to 837 * generate the T_CONN_CON. 
838 */ 839 if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) { 840 v6dst = ipv6_loopback; 841 sin6->sin6_addr = v6dst; 842 } 843 connp->conn_faddr_v6 = v6dst; 844 connp->conn_flowinfo = flowinfo; 845 } 846 847 /* 848 * We update our cred/cpid based on the caller of connect 849 */ 850 if (connp->conn_cred != cr) { 851 crhold(cr); 852 crfree(connp->conn_cred); 853 connp->conn_cred = cr; 854 } 855 connp->conn_cpid = pid; 856 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 857 ixa->ixa_cred = cr; 858 ixa->ixa_cpid = pid; 859 if (is_system_labeled()) { 860 /* We need to restart with a label based on the cred */ 861 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 862 } 863 864 if (scopeid != 0) { 865 ixa->ixa_flags |= IXAF_SCOPEID_SET; 866 ixa->ixa_scopeid = scopeid; 867 connp->conn_incoming_ifindex = scopeid; 868 } else { 869 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 870 connp->conn_incoming_ifindex = connp->conn_bound_if; 871 } 872 873 /* 874 * conn_connect will drop conn_lock and reacquire it. 875 * To prevent a send* from messing with this icmp_t while the lock 876 * is dropped we set icmp_state and clear conn_v6lastdst. 877 * That will make all send* fail with EISCONN. 878 */ 879 connp->conn_v6lastdst = ipv6_all_zeros; 880 icmp->icmp_state = TS_WCON_CREQ; 881 882 error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC); 883 mutex_exit(&connp->conn_lock); 884 if (error != 0) 885 goto connect_failed; 886 887 /* 888 * The addresses have been verified. Time to insert in 889 * the correct fanout list. 
890 */ 891 error = ipcl_conn_insert(connp); 892 if (error != 0) 893 goto connect_failed; 894 895 mutex_enter(&connp->conn_lock); 896 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 897 &connp->conn_faddr_v6, connp->conn_flowinfo); 898 if (error != 0) { 899 mutex_exit(&connp->conn_lock); 900 goto connect_failed; 901 } 902 903 icmp->icmp_state = TS_DATA_XFER; 904 /* Record this as the "last" send even though we haven't sent any */ 905 connp->conn_v6lastdst = connp->conn_faddr_v6; 906 connp->conn_lastipversion = connp->conn_ipversion; 907 connp->conn_lastdstport = connp->conn_fport; 908 connp->conn_lastflowinfo = connp->conn_flowinfo; 909 connp->conn_lastscopeid = scopeid; 910 connp->conn_lastsrcid = srcid; 911 /* Also remember a source to use together with lastdst */ 912 connp->conn_v6lastsrc = v6src; 913 914 oldixa = conn_replace_ixa(connp, ixa); 915 mutex_exit(&connp->conn_lock); 916 ixa_refrele(oldixa); 917 918 ixa_refrele(ixa); 919 return (0); 920 921 connect_failed: 922 if (ixa != NULL) 923 ixa_refrele(ixa); 924 mutex_enter(&connp->conn_lock); 925 icmp->icmp_state = TS_IDLE; 926 /* In case the source address was set above */ 927 if (connp->conn_mcbc_bind) 928 connp->conn_saddr_v6 = ipv6_all_zeros; 929 else 930 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 931 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 932 connp->conn_faddr_v6 = ipv6_all_zeros; 933 connp->conn_v6lastdst = ipv6_all_zeros; 934 connp->conn_flowinfo = 0; 935 936 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 937 &connp->conn_faddr_v6, connp->conn_flowinfo); 938 mutex_exit(&connp->conn_lock); 939 return (error); 940 } 941 942 static void 943 rawip_do_close(conn_t *connp) 944 { 945 ASSERT(connp != NULL && IPCL_IS_RAWIP(connp)); 946 947 ip_quiesce_conn(connp); 948 949 if (!IPCL_IS_NONSTR(connp)) { 950 qprocsoff(connp->conn_rq); 951 } 952 953 icmp_close_free(connp); 954 955 /* 956 * Now we are truly single threaded on this stream, and can 957 * delete the things 
hanging off the connp, and finally the connp. 958 * We removed this connp from the fanout list, it cannot be 959 * accessed thru the fanouts, and we already waited for the 960 * conn_ref to drop to 0. We are already in close, so 961 * there cannot be any other thread from the top. qprocsoff 962 * has completed, and service has completed or won't run in 963 * future. 964 */ 965 ASSERT(connp->conn_ref == 1); 966 967 if (!IPCL_IS_NONSTR(connp)) { 968 inet_minor_free(connp->conn_minor_arena, connp->conn_dev); 969 } else { 970 ip_free_helper_stream(connp); 971 } 972 973 connp->conn_ref--; 974 ipcl_conn_destroy(connp); 975 } 976 977 static int 978 icmp_close(queue_t *q, int flags) 979 { 980 conn_t *connp; 981 982 if (flags & SO_FALLBACK) { 983 /* 984 * stream is being closed while in fallback 985 * simply free the resources that were allocated 986 */ 987 inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr)); 988 qprocsoff(q); 989 goto done; 990 } 991 992 connp = Q_TO_CONN(q); 993 (void) rawip_do_close(connp); 994 done: 995 q->q_ptr = WR(q)->q_ptr = NULL; 996 return (0); 997 } 998 999 static void 1000 icmp_close_free(conn_t *connp) 1001 { 1002 icmp_t *icmp = connp->conn_icmp; 1003 1004 if (icmp->icmp_filter != NULL) { 1005 kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t)); 1006 icmp->icmp_filter = NULL; 1007 } 1008 1009 /* 1010 * Clear any fields which the kmem_cache constructor clears. 1011 * Only icmp_connp needs to be preserved. 1012 * TBD: We should make this more efficient to avoid clearing 1013 * everything. 1014 */ 1015 ASSERT(icmp->icmp_connp == connp); 1016 bzero(icmp, sizeof (icmp_t)); 1017 icmp->icmp_connp = connp; 1018 } 1019 1020 /* 1021 * This routine handles each T_DISCON_REQ message passed to icmp 1022 * as an indicating that ICMP is no longer connected. This results 1023 * in telling IP to restore the binding to just the local address. 
1024 */ 1025 static int 1026 icmp_do_disconnect(conn_t *connp) 1027 { 1028 icmp_t *icmp = connp->conn_icmp; 1029 int error; 1030 1031 mutex_enter(&connp->conn_lock); 1032 if (icmp->icmp_state != TS_DATA_XFER) { 1033 mutex_exit(&connp->conn_lock); 1034 return (-TOUTSTATE); 1035 } 1036 if (connp->conn_mcbc_bind) 1037 connp->conn_saddr_v6 = ipv6_all_zeros; 1038 else 1039 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 1040 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 1041 connp->conn_faddr_v6 = ipv6_all_zeros; 1042 icmp->icmp_state = TS_IDLE; 1043 1044 connp->conn_v6lastdst = ipv6_all_zeros; 1045 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 1046 &connp->conn_faddr_v6, connp->conn_flowinfo); 1047 mutex_exit(&connp->conn_lock); 1048 if (error != 0) 1049 return (error); 1050 1051 /* 1052 * Tell IP to remove the full binding and revert 1053 * to the local address binding. 1054 */ 1055 return (ip_laddr_fanout_insert(connp)); 1056 } 1057 1058 static void 1059 icmp_tpi_disconnect(queue_t *q, mblk_t *mp) 1060 { 1061 conn_t *connp = Q_TO_CONN(q); 1062 int error; 1063 1064 /* 1065 * Allocate the largest primitive we need to send back 1066 * T_error_ack is > than T_ok_ack 1067 */ 1068 mp = reallocb(mp, sizeof (struct T_error_ack), 1); 1069 if (mp == NULL) { 1070 /* Unable to reuse the T_DISCON_REQ for the ack. 
*/ 1071 icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM); 1072 return; 1073 } 1074 1075 error = icmp_do_disconnect(connp); 1076 1077 if (error != 0) { 1078 if (error > 0) { 1079 icmp_err_ack(q, mp, 0, error); 1080 } else { 1081 icmp_err_ack(q, mp, -error, 0); 1082 } 1083 } else { 1084 mp = mi_tpi_ok_ack_alloc(mp); 1085 ASSERT(mp != NULL); 1086 qreply(q, mp); 1087 } 1088 } 1089 1090 static int 1091 icmp_disconnect(conn_t *connp) 1092 { 1093 int error; 1094 1095 connp->conn_dgram_errind = B_FALSE; 1096 1097 error = icmp_do_disconnect(connp); 1098 1099 if (error < 0) 1100 error = proto_tlitosyserr(-error); 1101 return (error); 1102 } 1103 1104 /* This routine creates a T_ERROR_ACK message and passes it upstream. */ 1105 static void 1106 icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error) 1107 { 1108 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) 1109 qreply(q, mp); 1110 } 1111 1112 /* Shorthand to generate and send TPI error acks to our client */ 1113 static void 1114 icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive, 1115 t_scalar_t t_error, int sys_error) 1116 { 1117 struct T_error_ack *teackp; 1118 1119 if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack), 1120 M_PCPROTO, T_ERROR_ACK)) != NULL) { 1121 teackp = (struct T_error_ack *)mp->b_rptr; 1122 teackp->ERROR_prim = primitive; 1123 teackp->TLI_error = t_error; 1124 teackp->UNIX_error = sys_error; 1125 qreply(q, mp); 1126 } 1127 } 1128 1129 /* 1130 * icmp_icmp_input is called as conn_recvicmp to process ICMP messages. 1131 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors. 1132 * Assumes that IP has pulled up everything up to and including the ICMP header. 
1133 */ 1134 /* ARGSUSED2 */ 1135 static void 1136 icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) 1137 { 1138 conn_t *connp = (conn_t *)arg1; 1139 icmp_t *icmp = connp->conn_icmp; 1140 icmph_t *icmph; 1141 ipha_t *ipha; 1142 int iph_hdr_length; 1143 sin_t sin; 1144 mblk_t *mp1; 1145 int error = 0; 1146 1147 ipha = (ipha_t *)mp->b_rptr; 1148 1149 ASSERT(OK_32PTR(mp->b_rptr)); 1150 1151 if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) { 1152 ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION); 1153 icmp_icmp_error_ipv6(connp, mp, ira); 1154 return; 1155 } 1156 ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); 1157 1158 /* Skip past the outer IP and ICMP headers */ 1159 ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length); 1160 iph_hdr_length = ira->ira_ip_hdr_length; 1161 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1162 ipha = (ipha_t *)&icmph[1]; /* Inner IP header */ 1163 1164 iph_hdr_length = IPH_HDR_LENGTH(ipha); 1165 1166 switch (icmph->icmph_type) { 1167 case ICMP_DEST_UNREACHABLE: 1168 switch (icmph->icmph_code) { 1169 case ICMP_FRAGMENTATION_NEEDED: { 1170 ipha_t *ipha; 1171 ip_xmit_attr_t *ixa; 1172 /* 1173 * IP has already adjusted the path MTU. 1174 * But we need to adjust DF for IPv4. 1175 */ 1176 if (connp->conn_ipversion != IPV4_VERSION) 1177 break; 1178 1179 ixa = conn_get_ixa(connp, B_FALSE); 1180 if (ixa == NULL || ixa->ixa_ire == NULL) { 1181 /* 1182 * Some other thread holds conn_ixa. We will 1183 * redo this on the next ICMP too big. 
1184 */ 1185 if (ixa != NULL) 1186 ixa_refrele(ixa); 1187 break; 1188 } 1189 (void) ip_get_pmtu(ixa); 1190 1191 mutex_enter(&connp->conn_lock); 1192 ipha = (ipha_t *)connp->conn_ht_iphc; 1193 if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) { 1194 ipha->ipha_fragment_offset_and_flags |= 1195 IPH_DF_HTONS; 1196 } else { 1197 ipha->ipha_fragment_offset_and_flags &= 1198 ~IPH_DF_HTONS; 1199 } 1200 mutex_exit(&connp->conn_lock); 1201 ixa_refrele(ixa); 1202 break; 1203 } 1204 case ICMP_PORT_UNREACHABLE: 1205 case ICMP_PROTOCOL_UNREACHABLE: 1206 error = ECONNREFUSED; 1207 break; 1208 default: 1209 /* Transient errors */ 1210 break; 1211 } 1212 break; 1213 default: 1214 /* Transient errors */ 1215 break; 1216 } 1217 if (error == 0) { 1218 freemsg(mp); 1219 return; 1220 } 1221 1222 /* 1223 * Deliver T_UDERROR_IND when the application has asked for it. 1224 * The socket layer enables this automatically when connected. 1225 */ 1226 if (!connp->conn_dgram_errind) { 1227 freemsg(mp); 1228 return; 1229 } 1230 1231 sin = sin_null; 1232 sin.sin_family = AF_INET; 1233 sin.sin_addr.s_addr = ipha->ipha_dst; 1234 1235 if (IPCL_IS_NONSTR(connp)) { 1236 mutex_enter(&connp->conn_lock); 1237 if (icmp->icmp_state == TS_DATA_XFER) { 1238 if (sin.sin_addr.s_addr == connp->conn_faddr_v4) { 1239 mutex_exit(&connp->conn_lock); 1240 (*connp->conn_upcalls->su_set_error) 1241 (connp->conn_upper_handle, error); 1242 goto done; 1243 } 1244 } else { 1245 icmp->icmp_delayed_error = error; 1246 *((sin_t *)&icmp->icmp_delayed_addr) = sin; 1247 } 1248 mutex_exit(&connp->conn_lock); 1249 } else { 1250 mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0, 1251 error); 1252 if (mp1 != NULL) 1253 putnext(connp->conn_rq, mp1); 1254 } 1255 done: 1256 freemsg(mp); 1257 } 1258 1259 /* 1260 * icmp_icmp_error_ipv6 is called by icmp_icmp_error to process ICMP for IPv6. 1261 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors. 
1262 * Assumes that IP has pulled up all the extension headers as well as the 1263 * ICMPv6 header. 1264 */ 1265 static void 1266 icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira) 1267 { 1268 icmp6_t *icmp6; 1269 ip6_t *ip6h, *outer_ip6h; 1270 uint16_t iph_hdr_length; 1271 uint8_t *nexthdrp; 1272 sin6_t sin6; 1273 mblk_t *mp1; 1274 int error = 0; 1275 icmp_t *icmp = connp->conn_icmp; 1276 1277 outer_ip6h = (ip6_t *)mp->b_rptr; 1278 #ifdef DEBUG 1279 if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6) 1280 iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h); 1281 else 1282 iph_hdr_length = IPV6_HDR_LEN; 1283 ASSERT(iph_hdr_length == ira->ira_ip_hdr_length); 1284 #endif 1285 /* Skip past the outer IP and ICMP headers */ 1286 iph_hdr_length = ira->ira_ip_hdr_length; 1287 icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length]; 1288 1289 ip6h = (ip6_t *)&icmp6[1]; /* Inner IP header */ 1290 if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) { 1291 freemsg(mp); 1292 return; 1293 } 1294 1295 switch (icmp6->icmp6_type) { 1296 case ICMP6_DST_UNREACH: 1297 switch (icmp6->icmp6_code) { 1298 case ICMP6_DST_UNREACH_NOPORT: 1299 error = ECONNREFUSED; 1300 break; 1301 case ICMP6_DST_UNREACH_ADMIN: 1302 case ICMP6_DST_UNREACH_NOROUTE: 1303 case ICMP6_DST_UNREACH_BEYONDSCOPE: 1304 case ICMP6_DST_UNREACH_ADDR: 1305 /* Transient errors */ 1306 break; 1307 default: 1308 break; 1309 } 1310 break; 1311 case ICMP6_PACKET_TOO_BIG: { 1312 struct T_unitdata_ind *tudi; 1313 struct T_opthdr *toh; 1314 size_t udi_size; 1315 mblk_t *newmp; 1316 t_scalar_t opt_length = sizeof (struct T_opthdr) + 1317 sizeof (struct ip6_mtuinfo); 1318 sin6_t *sin6; 1319 struct ip6_mtuinfo *mtuinfo; 1320 1321 /* 1322 * If the application has requested to receive path mtu 1323 * information, send up an empty message containing an 1324 * IPV6_PATHMTU ancillary data item. 
1325 */ 1326 if (!connp->conn_ipv6_recvpathmtu) 1327 break; 1328 1329 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) + 1330 opt_length; 1331 if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) { 1332 BUMP_MIB(&icmp->icmp_is->is_rawip_mib, rawipInErrors); 1333 break; 1334 } 1335 1336 /* 1337 * newmp->b_cont is left to NULL on purpose. This is an 1338 * empty message containing only ancillary data. 1339 */ 1340 newmp->b_datap->db_type = M_PROTO; 1341 tudi = (struct T_unitdata_ind *)newmp->b_rptr; 1342 newmp->b_wptr = (uchar_t *)tudi + udi_size; 1343 tudi->PRIM_type = T_UNITDATA_IND; 1344 tudi->SRC_length = sizeof (sin6_t); 1345 tudi->SRC_offset = sizeof (struct T_unitdata_ind); 1346 tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t); 1347 tudi->OPT_length = opt_length; 1348 1349 sin6 = (sin6_t *)&tudi[1]; 1350 bzero(sin6, sizeof (sin6_t)); 1351 sin6->sin6_family = AF_INET6; 1352 sin6->sin6_addr = connp->conn_faddr_v6; 1353 1354 toh = (struct T_opthdr *)&sin6[1]; 1355 toh->level = IPPROTO_IPV6; 1356 toh->name = IPV6_PATHMTU; 1357 toh->len = opt_length; 1358 toh->status = 0; 1359 1360 mtuinfo = (struct ip6_mtuinfo *)&toh[1]; 1361 bzero(mtuinfo, sizeof (struct ip6_mtuinfo)); 1362 mtuinfo->ip6m_addr.sin6_family = AF_INET6; 1363 mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst; 1364 mtuinfo->ip6m_mtu = icmp6->icmp6_mtu; 1365 /* 1366 * We've consumed everything we need from the original 1367 * message. Free it, then send our empty message. 
1368 */ 1369 freemsg(mp); 1370 icmp_ulp_recv(connp, newmp, msgdsize(newmp)); 1371 return; 1372 } 1373 case ICMP6_TIME_EXCEEDED: 1374 /* Transient errors */ 1375 break; 1376 case ICMP6_PARAM_PROB: 1377 /* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */ 1378 if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER && 1379 (uchar_t *)ip6h + icmp6->icmp6_pptr == 1380 (uchar_t *)nexthdrp) { 1381 error = ECONNREFUSED; 1382 break; 1383 } 1384 break; 1385 } 1386 if (error == 0) { 1387 freemsg(mp); 1388 return; 1389 } 1390 1391 /* 1392 * Deliver T_UDERROR_IND when the application has asked for it. 1393 * The socket layer enables this automatically when connected. 1394 */ 1395 if (!connp->conn_dgram_errind) { 1396 freemsg(mp); 1397 return; 1398 } 1399 1400 sin6 = sin6_null; 1401 sin6.sin6_family = AF_INET6; 1402 sin6.sin6_addr = ip6h->ip6_dst; 1403 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; 1404 if (IPCL_IS_NONSTR(connp)) { 1405 mutex_enter(&connp->conn_lock); 1406 if (icmp->icmp_state == TS_DATA_XFER) { 1407 if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr, 1408 &connp->conn_faddr_v6)) { 1409 mutex_exit(&connp->conn_lock); 1410 (*connp->conn_upcalls->su_set_error) 1411 (connp->conn_upper_handle, error); 1412 goto done; 1413 } 1414 } else { 1415 icmp->icmp_delayed_error = error; 1416 *((sin6_t *)&icmp->icmp_delayed_addr) = sin6; 1417 } 1418 mutex_exit(&connp->conn_lock); 1419 } else { 1420 mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), 1421 NULL, 0, error); 1422 if (mp1 != NULL) 1423 putnext(connp->conn_rq, mp1); 1424 } 1425 done: 1426 freemsg(mp); 1427 } 1428 1429 /* 1430 * This routine responds to T_ADDR_REQ messages. It is called by icmp_wput. 1431 * The local address is filled in if endpoint is bound. The remote address 1432 * is filled in if remote address has been precified ("connected endpoint") 1433 * (The concept of connected CLTS sockets is alien to published TPI 1434 * but we support it anyway). 
1435 */ 1436 static void 1437 icmp_addr_req(queue_t *q, mblk_t *mp) 1438 { 1439 struct sockaddr *sa; 1440 mblk_t *ackmp; 1441 struct T_addr_ack *taa; 1442 icmp_t *icmp = Q_TO_ICMP(q); 1443 conn_t *connp = icmp->icmp_connp; 1444 uint_t addrlen; 1445 1446 /* Make it large enough for worst case */ 1447 ackmp = reallocb(mp, sizeof (struct T_addr_ack) + 1448 2 * sizeof (sin6_t), 1); 1449 if (ackmp == NULL) { 1450 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 1451 return; 1452 } 1453 taa = (struct T_addr_ack *)ackmp->b_rptr; 1454 1455 bzero(taa, sizeof (struct T_addr_ack)); 1456 ackmp->b_wptr = (uchar_t *)&taa[1]; 1457 1458 taa->PRIM_type = T_ADDR_ACK; 1459 ackmp->b_datap->db_type = M_PCPROTO; 1460 1461 if (connp->conn_family == AF_INET) 1462 addrlen = sizeof (sin_t); 1463 else 1464 addrlen = sizeof (sin6_t); 1465 1466 mutex_enter(&connp->conn_lock); 1467 /* 1468 * Note: Following code assumes 32 bit alignment of basic 1469 * data structures like sin_t and struct T_addr_ack. 1470 */ 1471 if (icmp->icmp_state != TS_UNBND) { 1472 /* 1473 * Fill in local address first 1474 */ 1475 taa->LOCADDR_offset = sizeof (*taa); 1476 taa->LOCADDR_length = addrlen; 1477 sa = (struct sockaddr *)&taa[1]; 1478 (void) conn_getsockname(connp, sa, &addrlen); 1479 ackmp->b_wptr += addrlen; 1480 } 1481 if (icmp->icmp_state == TS_DATA_XFER) { 1482 /* 1483 * connected, fill remote address too 1484 */ 1485 taa->REMADDR_length = addrlen; 1486 /* assumed 32-bit alignment */ 1487 taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length; 1488 sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset); 1489 (void) conn_getpeername(connp, sa, &addrlen); 1490 ackmp->b_wptr += addrlen; 1491 } 1492 mutex_exit(&connp->conn_lock); 1493 ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim); 1494 qreply(q, ackmp); 1495 } 1496 1497 static void 1498 icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp) 1499 { 1500 conn_t *connp = icmp->icmp_connp; 1501 1502 *tap = icmp_g_t_info_ack; 1503 1504 if (connp->conn_family 
== AF_INET6) 1505 tap->ADDR_size = sizeof (sin6_t); 1506 else 1507 tap->ADDR_size = sizeof (sin_t); 1508 tap->CURRENT_state = icmp->icmp_state; 1509 tap->OPT_size = icmp_max_optsize; 1510 } 1511 1512 static void 1513 icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap, 1514 t_uscalar_t cap_bits1) 1515 { 1516 tcap->CAP_bits1 = 0; 1517 1518 if (cap_bits1 & TC1_INFO) { 1519 icmp_copy_info(&tcap->INFO_ack, icmp); 1520 tcap->CAP_bits1 |= TC1_INFO; 1521 } 1522 } 1523 1524 /* 1525 * This routine responds to T_CAPABILITY_REQ messages. It is called by 1526 * icmp_wput. Much of the T_CAPABILITY_ACK information is copied from 1527 * icmp_g_t_info_ack. The current state of the stream is copied from 1528 * icmp_state. 1529 */ 1530 static void 1531 icmp_capability_req(queue_t *q, mblk_t *mp) 1532 { 1533 icmp_t *icmp = Q_TO_ICMP(q); 1534 t_uscalar_t cap_bits1; 1535 struct T_capability_ack *tcap; 1536 1537 cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1; 1538 1539 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack), 1540 mp->b_datap->db_type, T_CAPABILITY_ACK); 1541 if (!mp) 1542 return; 1543 1544 tcap = (struct T_capability_ack *)mp->b_rptr; 1545 1546 icmp_do_capability_ack(icmp, tcap, cap_bits1); 1547 1548 qreply(q, mp); 1549 } 1550 1551 /* 1552 * This routine responds to T_INFO_REQ messages. It is called by icmp_wput. 1553 * Most of the T_INFO_ACK information is copied from icmp_g_t_info_ack. 1554 * The current state of the stream is copied from icmp_state. 1555 */ 1556 static void 1557 icmp_info_req(queue_t *q, mblk_t *mp) 1558 { 1559 icmp_t *icmp = Q_TO_ICMP(q); 1560 1561 /* Create a T_INFO_ACK message. 
*/ 1562 mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO, 1563 T_INFO_ACK); 1564 if (!mp) 1565 return; 1566 icmp_copy_info((struct T_info_ack *)mp->b_rptr, icmp); 1567 qreply(q, mp); 1568 } 1569 1570 static int 1571 icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, 1572 int family) 1573 { 1574 conn_t *connp; 1575 dev_t conn_dev; 1576 int error; 1577 1578 /* If the stream is already open, return immediately. */ 1579 if (q->q_ptr != NULL) 1580 return (0); 1581 1582 if (sflag == MODOPEN) 1583 return (EINVAL); 1584 1585 /* 1586 * Since ICMP is not used so heavily, allocating from the small 1587 * arena should be sufficient. 1588 */ 1589 if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) { 1590 return (EBUSY); 1591 } 1592 1593 if (flag & SO_FALLBACK) { 1594 /* 1595 * Non streams socket needs a stream to fallback to 1596 */ 1597 RD(q)->q_ptr = (void *)conn_dev; 1598 WR(q)->q_qinfo = &icmp_fallback_sock_winit; 1599 WR(q)->q_ptr = (void *)ip_minor_arena_sa; 1600 qprocson(q); 1601 return (0); 1602 } 1603 1604 connp = rawip_do_open(family, credp, &error, KM_SLEEP); 1605 if (connp == NULL) { 1606 ASSERT(error != 0); 1607 inet_minor_free(ip_minor_arena_sa, conn_dev); 1608 return (error); 1609 } 1610 1611 *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); 1612 connp->conn_dev = conn_dev; 1613 connp->conn_minor_arena = ip_minor_arena_sa; 1614 1615 /* 1616 * Initialize the icmp_t structure for this stream. 1617 */ 1618 q->q_ptr = connp; 1619 WR(q)->q_ptr = connp; 1620 connp->conn_rq = q; 1621 connp->conn_wq = WR(q); 1622 1623 WR(q)->q_hiwat = connp->conn_sndbuf; 1624 WR(q)->q_lowat = connp->conn_sndlowat; 1625 1626 qprocson(q); 1627 1628 /* Set the Stream head write offset. 
*/ 1629 (void) proto_set_tx_wroff(q, connp, connp->conn_wroff); 1630 (void) proto_set_rx_hiwat(connp->conn_rq, connp, connp->conn_rcvbuf); 1631 1632 mutex_enter(&connp->conn_lock); 1633 connp->conn_state_flags &= ~CONN_INCIPIENT; 1634 mutex_exit(&connp->conn_lock); 1635 1636 icmp_bind_proto(connp->conn_icmp); 1637 1638 return (0); 1639 } 1640 1641 /* For /dev/icmp aka AF_INET open */ 1642 static int 1643 icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 1644 { 1645 return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET)); 1646 } 1647 1648 /* For /dev/icmp6 aka AF_INET6 open */ 1649 static int 1650 icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 1651 { 1652 return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6)); 1653 } 1654 1655 /* 1656 * This is the open routine for icmp. It allocates a icmp_t structure for 1657 * the stream and, on the first open of the module, creates an ND table. 1658 */ 1659 static conn_t * 1660 rawip_do_open(int family, cred_t *credp, int *err, int flags) 1661 { 1662 icmp_t *icmp; 1663 conn_t *connp; 1664 zoneid_t zoneid; 1665 netstack_t *ns; 1666 icmp_stack_t *is; 1667 int len; 1668 boolean_t isv6 = B_FALSE; 1669 1670 *err = secpolicy_net_icmpaccess(credp); 1671 if (*err != 0) 1672 return (NULL); 1673 1674 if (family == AF_INET6) 1675 isv6 = B_TRUE; 1676 1677 ns = netstack_find_by_cred(credp); 1678 ASSERT(ns != NULL); 1679 is = ns->netstack_icmp; 1680 ASSERT(is != NULL); 1681 1682 /* 1683 * For exclusive stacks we set the zoneid to zero 1684 * to make ICMP operate as if in the global zone. 1685 */ 1686 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 1687 zoneid = GLOBAL_ZONEID; 1688 else 1689 zoneid = crgetzoneid(credp); 1690 1691 ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP); 1692 1693 connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns); 1694 icmp = connp->conn_icmp; 1695 1696 /* 1697 * ipcl_conn_create did a netstack_hold. 
Undo the hold that was 1698 * done by netstack_find_by_cred() 1699 */ 1700 netstack_rele(ns); 1701 1702 /* 1703 * Since this conn_t/icmp_t is not yet visible to anybody else we don't 1704 * need to lock anything. 1705 */ 1706 ASSERT(connp->conn_proto == IPPROTO_ICMP); 1707 ASSERT(connp->conn_icmp == icmp); 1708 ASSERT(icmp->icmp_connp == connp); 1709 1710 /* Set the initial state of the stream and the privilege status. */ 1711 icmp->icmp_state = TS_UNBND; 1712 connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE; 1713 if (isv6) { 1714 connp->conn_family = AF_INET6; 1715 connp->conn_ipversion = IPV6_VERSION; 1716 connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4; 1717 connp->conn_proto = IPPROTO_ICMPV6; 1718 /* May be changed by a SO_PROTOTYPE socket option. */ 1719 connp->conn_proto = IPPROTO_ICMPV6; 1720 connp->conn_ixa->ixa_protocol = connp->conn_proto; 1721 connp->conn_ixa->ixa_raw_cksum_offset = 2; 1722 connp->conn_default_ttl = is->is_ipv6_hoplimit; 1723 len = sizeof (ip6_t); 1724 } else { 1725 connp->conn_family = AF_INET; 1726 connp->conn_ipversion = IPV4_VERSION; 1727 connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4; 1728 /* May be changed by a SO_PROTOTYPE socket option. */ 1729 connp->conn_proto = IPPROTO_ICMP; 1730 connp->conn_ixa->ixa_protocol = connp->conn_proto; 1731 connp->conn_default_ttl = is->is_ipv4_ttl; 1732 len = sizeof (ipha_t); 1733 } 1734 connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl; 1735 1736 connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 1737 1738 /* 1739 * For the socket of protocol IPPROTO_RAW or when IP_HDRINCL is set, 1740 * the checksum is provided in the pre-built packet. We clear 1741 * IXAF_SET_ULP_CKSUM to tell IP that the application has sent a 1742 * complete IP header and not to compute the transport checksum. 
1743 */ 1744 connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM; 1745 /* conn_allzones can not be set this early, hence no IPCL_ZONEID */ 1746 connp->conn_ixa->ixa_zoneid = zoneid; 1747 1748 connp->conn_zoneid = zoneid; 1749 1750 /* 1751 * If the caller has the process-wide flag set, then default to MAC 1752 * exempt mode. This allows read-down to unlabeled hosts. 1753 */ 1754 if (getpflags(NET_MAC_AWARE, credp) != 0) 1755 connp->conn_mac_mode = CONN_MAC_AWARE; 1756 1757 connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID); 1758 1759 icmp->icmp_is = is; 1760 1761 connp->conn_rcvbuf = is->is_recv_hiwat; 1762 connp->conn_sndbuf = is->is_xmit_hiwat; 1763 connp->conn_sndlowat = is->is_xmit_lowat; 1764 connp->conn_rcvlowat = icmp_mod_info.mi_lowat; 1765 1766 connp->conn_wroff = len + is->is_wroff_extra; 1767 connp->conn_so_type = SOCK_RAW; 1768 1769 connp->conn_recv = icmp_input; 1770 connp->conn_recvicmp = icmp_icmp_input; 1771 crhold(credp); 1772 connp->conn_cred = credp; 1773 connp->conn_cpid = curproc->p_pid; 1774 connp->conn_open_time = ddi_get_lbolt64(); 1775 /* Cache things in ixa without an extra refhold */ 1776 ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED)); 1777 connp->conn_ixa->ixa_cred = connp->conn_cred; 1778 connp->conn_ixa->ixa_cpid = connp->conn_cpid; 1779 if (is_system_labeled()) 1780 connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred); 1781 1782 connp->conn_flow_cntrld = B_FALSE; 1783 1784 if (is->is_pmtu_discovery) 1785 connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; 1786 1787 return (connp); 1788 } 1789 1790 /* 1791 * Which ICMP options OK to set through T_UNITDATA_REQ... 
1792 */ 1793 /* ARGSUSED */ 1794 static boolean_t 1795 icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name) 1796 { 1797 return (B_TRUE); 1798 } 1799 1800 /* 1801 * This routine gets default values of certain options whose default 1802 * values are maintained by protcol specific code 1803 */ 1804 int 1805 icmp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) 1806 { 1807 icmp_t *icmp = Q_TO_ICMP(q); 1808 icmp_stack_t *is = icmp->icmp_is; 1809 int *i1 = (int *)ptr; 1810 1811 switch (level) { 1812 case IPPROTO_IP: 1813 switch (name) { 1814 case IP_MULTICAST_TTL: 1815 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL; 1816 return (sizeof (uchar_t)); 1817 case IP_MULTICAST_LOOP: 1818 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP; 1819 return (sizeof (uchar_t)); 1820 } 1821 break; 1822 case IPPROTO_IPV6: 1823 switch (name) { 1824 case IPV6_MULTICAST_HOPS: 1825 *i1 = IP_DEFAULT_MULTICAST_TTL; 1826 return (sizeof (int)); 1827 case IPV6_MULTICAST_LOOP: 1828 *i1 = IP_DEFAULT_MULTICAST_LOOP; 1829 return (sizeof (int)); 1830 case IPV6_UNICAST_HOPS: 1831 *i1 = is->is_ipv6_hoplimit; 1832 return (sizeof (int)); 1833 } 1834 break; 1835 case IPPROTO_ICMPV6: 1836 switch (name) { 1837 case ICMP6_FILTER: 1838 /* Make it look like "pass all" */ 1839 ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr); 1840 return (sizeof (icmp6_filter_t)); 1841 } 1842 break; 1843 } 1844 return (-1); 1845 } 1846 1847 /* 1848 * This routine retrieves the current status of socket options. 1849 * It returns the size of the option retrieved, or -1. 
1850 */ 1851 int 1852 icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) 1853 { 1854 icmp_t *icmp = connp->conn_icmp; 1855 int *i1 = (int *)ptr; 1856 conn_opt_arg_t coas; 1857 int retval; 1858 1859 coas.coa_connp = connp; 1860 coas.coa_ixa = connp->conn_ixa; 1861 coas.coa_ipp = &connp->conn_xmit_ipp; 1862 coas.coa_ancillary = B_FALSE; 1863 coas.coa_changed = 0; 1864 1865 /* 1866 * We assume that the optcom framework has checked for the set 1867 * of levels and names that are supported, hence we don't worry 1868 * about rejecting based on that. 1869 * First check for ICMP specific handling, then pass to common routine. 1870 */ 1871 switch (level) { 1872 case IPPROTO_IP: 1873 /* 1874 * Only allow IPv4 option processing on IPv4 sockets. 1875 */ 1876 if (connp->conn_family != AF_INET) 1877 return (-1); 1878 1879 switch (name) { 1880 case IP_OPTIONS: 1881 case T_IP_OPTIONS: 1882 /* Options are passed up with each packet */ 1883 return (0); 1884 case IP_HDRINCL: 1885 mutex_enter(&connp->conn_lock); 1886 *i1 = (int)icmp->icmp_hdrincl; 1887 mutex_exit(&connp->conn_lock); 1888 return (sizeof (int)); 1889 } 1890 break; 1891 1892 case IPPROTO_IPV6: 1893 /* 1894 * Only allow IPv6 option processing on native IPv6 sockets. 1895 */ 1896 if (connp->conn_family != AF_INET6) 1897 return (-1); 1898 1899 switch (name) { 1900 case IPV6_CHECKSUM: 1901 /* 1902 * Return offset or -1 if no checksum offset. 1903 * Does not apply to IPPROTO_ICMPV6 1904 */ 1905 if (connp->conn_proto == IPPROTO_ICMPV6) 1906 return (-1); 1907 1908 mutex_enter(&connp->conn_lock); 1909 if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) 1910 *i1 = connp->conn_ixa->ixa_raw_cksum_offset; 1911 else 1912 *i1 = -1; 1913 mutex_exit(&connp->conn_lock); 1914 return (sizeof (int)); 1915 } 1916 break; 1917 1918 case IPPROTO_ICMPV6: 1919 /* 1920 * Only allow IPv6 option processing on native IPv6 sockets. 
1921 */ 1922 if (connp->conn_family != AF_INET6) 1923 return (-1); 1924 1925 if (connp->conn_proto != IPPROTO_ICMPV6) 1926 return (-1); 1927 1928 switch (name) { 1929 case ICMP6_FILTER: 1930 mutex_enter(&connp->conn_lock); 1931 if (icmp->icmp_filter == NULL) { 1932 /* Make it look like "pass all" */ 1933 ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr); 1934 } else { 1935 (void) bcopy(icmp->icmp_filter, ptr, 1936 sizeof (icmp6_filter_t)); 1937 } 1938 mutex_exit(&connp->conn_lock); 1939 return (sizeof (icmp6_filter_t)); 1940 } 1941 } 1942 mutex_enter(&connp->conn_lock); 1943 retval = conn_opt_get(&coas, level, name, ptr); 1944 mutex_exit(&connp->conn_lock); 1945 return (retval); 1946 } 1947 1948 /* 1949 * This routine retrieves the current status of socket options. 1950 * It returns the size of the option retrieved, or -1. 1951 */ 1952 int 1953 icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr) 1954 { 1955 conn_t *connp = Q_TO_CONN(q); 1956 int err; 1957 1958 err = icmp_opt_get(connp, level, name, ptr); 1959 return (err); 1960 } 1961 1962 /* 1963 * This routine sets socket options. 1964 */ 1965 int 1966 icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name, 1967 uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly) 1968 { 1969 conn_t *connp = coa->coa_connp; 1970 ip_xmit_attr_t *ixa = coa->coa_ixa; 1971 icmp_t *icmp = connp->conn_icmp; 1972 icmp_stack_t *is = icmp->icmp_is; 1973 int *i1 = (int *)invalp; 1974 boolean_t onoff = (*i1 == 0) ? 0 : 1; 1975 int error; 1976 1977 ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock)); 1978 1979 /* 1980 * For fixed length options, no sanity check 1981 * of passed in length is done. It is assumed *_optcom_req() 1982 * routines do the right thing. 
1983 */ 1984 1985 switch (level) { 1986 case SOL_SOCKET: 1987 switch (name) { 1988 case SO_PROTOTYPE: 1989 if ((*i1 & 0xFF) != IPPROTO_ICMP && 1990 (*i1 & 0xFF) != IPPROTO_ICMPV6 && 1991 secpolicy_net_rawaccess(cr) != 0) { 1992 return (EACCES); 1993 } 1994 if (checkonly) 1995 break; 1996 1997 mutex_enter(&connp->conn_lock); 1998 connp->conn_proto = *i1 & 0xFF; 1999 ixa->ixa_protocol = connp->conn_proto; 2000 if ((connp->conn_proto == IPPROTO_RAW || 2001 connp->conn_proto == IPPROTO_IGMP) && 2002 connp->conn_family == AF_INET) { 2003 icmp->icmp_hdrincl = 1; 2004 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; 2005 } else if (connp->conn_proto == IPPROTO_UDP || 2006 connp->conn_proto == IPPROTO_TCP || 2007 connp->conn_proto == IPPROTO_SCTP) { 2008 /* Used by test applications like psh */ 2009 icmp->icmp_hdrincl = 0; 2010 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; 2011 } else { 2012 icmp->icmp_hdrincl = 0; 2013 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; 2014 } 2015 2016 if (connp->conn_family == AF_INET6 && 2017 connp->conn_proto == IPPROTO_ICMPV6) { 2018 /* Set offset for icmp6_cksum */ 2019 ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM; 2020 ixa->ixa_raw_cksum_offset = 2; 2021 } 2022 if (icmp->icmp_filter != NULL && 2023 connp->conn_proto != IPPROTO_ICMPV6) { 2024 kmem_free(icmp->icmp_filter, 2025 sizeof (icmp6_filter_t)); 2026 icmp->icmp_filter = NULL; 2027 } 2028 mutex_exit(&connp->conn_lock); 2029 2030 coa->coa_changed |= COA_HEADER_CHANGED; 2031 /* 2032 * For SCTP, we don't use icmp_bind_proto() for 2033 * raw socket binding. 2034 */ 2035 if (connp->conn_proto == IPPROTO_SCTP) 2036 return (0); 2037 2038 coa->coa_changed |= COA_ICMP_BIND_NEEDED; 2039 return (0); 2040 2041 case SO_SNDBUF: 2042 if (*i1 > is->is_max_buf) { 2043 return (ENOBUFS); 2044 } 2045 break; 2046 case SO_RCVBUF: 2047 if (*i1 > is->is_max_buf) { 2048 return (ENOBUFS); 2049 } 2050 break; 2051 } 2052 break; 2053 2054 case IPPROTO_IP: 2055 /* 2056 * Only allow IPv4 option processing on IPv4 sockets. 
2057 */ 2058 if (connp->conn_family != AF_INET) 2059 return (EINVAL); 2060 2061 switch (name) { 2062 case IP_HDRINCL: 2063 if (!checkonly) { 2064 mutex_enter(&connp->conn_lock); 2065 icmp->icmp_hdrincl = onoff; 2066 if (onoff) 2067 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; 2068 else 2069 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; 2070 mutex_exit(&connp->conn_lock); 2071 } 2072 break; 2073 } 2074 break; 2075 2076 case IPPROTO_IPV6: 2077 if (connp->conn_family != AF_INET6) 2078 return (EINVAL); 2079 2080 switch (name) { 2081 case IPV6_CHECKSUM: 2082 /* 2083 * Integer offset into the user data of where the 2084 * checksum is located. 2085 * Offset of -1 disables option. 2086 * Does not apply to IPPROTO_ICMPV6. 2087 */ 2088 if (connp->conn_proto == IPPROTO_ICMPV6 || 2089 coa->coa_ancillary) { 2090 return (EINVAL); 2091 } 2092 if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) { 2093 /* Negative or not 16 bit aligned offset */ 2094 return (EINVAL); 2095 } 2096 if (checkonly) 2097 break; 2098 2099 mutex_enter(&connp->conn_lock); 2100 if (*i1 == -1) { 2101 ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM; 2102 ixa->ixa_raw_cksum_offset = 0; 2103 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; 2104 } else { 2105 ixa->ixa_flags |= IXAF_SET_RAW_CKSUM; 2106 ixa->ixa_raw_cksum_offset = *i1; 2107 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; 2108 } 2109 mutex_exit(&connp->conn_lock); 2110 break; 2111 } 2112 break; 2113 2114 case IPPROTO_ICMPV6: 2115 /* 2116 * Only allow IPv6 option processing on IPv6 sockets. 
*/
		if (connp->conn_family != AF_INET6)
			return (EINVAL);
		if (connp->conn_proto != IPPROTO_ICMPV6)
			return (EINVAL);

		switch (name) {
		case ICMP6_FILTER:
			/* A check-only request does not touch the filter */
			if (checkonly)
				break;

			/* Only "no value" or a whole icmp6_filter_t is valid */
			if ((inlen != 0) &&
			    (inlen != sizeof (icmp6_filter_t)))
				return (EINVAL);

			mutex_enter(&connp->conn_lock);
			if (inlen == 0) {
				/* Zero length removes any installed filter */
				if (icmp->icmp_filter != NULL) {
					kmem_free(icmp->icmp_filter,
					    sizeof (icmp6_filter_t));
					icmp->icmp_filter = NULL;
				}
			} else {
				/* Allocate on first use, then overwrite */
				if (icmp->icmp_filter == NULL) {
					icmp->icmp_filter = kmem_alloc(
					    sizeof (icmp6_filter_t),
					    KM_NOSLEEP);
					if (icmp->icmp_filter == NULL) {
						mutex_exit(&connp->conn_lock);
						return (ENOBUFS);
					}
				}
				(void) bcopy(invalp, icmp->icmp_filter, inlen);
			}
			mutex_exit(&connp->conn_lock);
			break;
		}
		break;
	}
	error = conn_opt_set(coa, level, name, inlen, invalp,
	    checkonly, cr);
	return (error);
}

/*
 * This routine sets socket options.
 *
 * optset_context selects the mode:
 *   SETFN_OPTCOM_CHECKONLY	validate only; returns 0 immediately when
 *				inlen is 0.
 *   SETFN_OPTCOM_NEGOTIATE	apply the option.
 *   SETFN_UD_NEGOTIATE,
 *   SETFN_CONN_NEGOTIATE	apply the option, but only if
 *				icmp_opt_allow_udr_set() accepts level/name.
 * Any other context fails with EINVAL.
 *
 * When thisdg_attrs is non-NULL it is used as the conn_opt_arg_t for the
 * request and is expected to be marked ancillary. Otherwise a local
 * conn_opt_arg_t is built around a reference from conn_get_ixa(), and that
 * reference is released before the function returns.
 *
 * On the early failure paths *outlenp is set to 0 and an errno value is
 * returned. Once icmp_do_opt_set() succeeds the value is copied from invalp
 * to outvalp (when the two pointers differ) and *outlenp is set to inlen;
 * a later failure of icmp_build_hdr_template() is returned without resetting
 * *outlenp.
 */
int
icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
    void *thisdg_attrs, cred_t *cr)
{
	icmp_t		*icmp = connp->conn_icmp;
	int		err;
	conn_opt_arg_t	coas, *coa;
	boolean_t	checkonly;
	icmp_stack_t	*is = icmp->icmp_is;

	switch (optset_context) {
	case SETFN_OPTCOM_CHECKONLY:
		checkonly = B_TRUE;
		/*
		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
		 * inlen != 0 implies value supplied and
		 *	we have to "pretend" to set it.
		 * inlen == 0 implies that there is no
		 *	value part in T_CHECK request and just validation
		 * done elsewhere should be enough, we just return here.
		 */
		if (inlen == 0) {
			*outlenp = 0;
			return (0);
		}
		break;
	case SETFN_OPTCOM_NEGOTIATE:
		checkonly = B_FALSE;
		break;
	case SETFN_UD_NEGOTIATE:
	case SETFN_CONN_NEGOTIATE:
		checkonly = B_FALSE;
		/*
		 * Negotiating local and "association-related" options
		 * through T_UNITDATA_REQ.
		 *
		 * Following routine can filter out ones we do not
		 * want to be "set" this way.
		 */
		if (!icmp_opt_allow_udr_set(level, name)) {
			*outlenp = 0;
			return (EINVAL);
		}
		break;
	default:
		/*
		 * We should never get here
		 */
		*outlenp = 0;
		return (EINVAL);
	}

	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));

	if (thisdg_attrs != NULL) {
		/* Options from T_UNITDATA_REQ */
		coa = (conn_opt_arg_t *)thisdg_attrs;
		ASSERT(coa->coa_connp == connp);
		ASSERT(coa->coa_ixa != NULL);
		ASSERT(coa->coa_ipp != NULL);
		ASSERT(coa->coa_ancillary);
	} else {
		coa = &coas;
		coas.coa_connp = connp;
		/* Get a reference on conn_ixa to prevent concurrent mods */
		coas.coa_ixa = conn_get_ixa(connp, B_TRUE);
		if (coas.coa_ixa == NULL) {
			*outlenp = 0;
			return (ENOMEM);
		}
		coas.coa_ipp = &connp->conn_xmit_ipp;
		coas.coa_ancillary = B_FALSE;
		coas.coa_changed = 0;
	}

	err = icmp_do_opt_set(coa, level, name, inlen, invalp,
	    cr, checkonly);
	if (err != 0) {
errout:
		/* Only the locally obtained ixa reference is ours to drop */
		if (!coa->coa_ancillary)
			ixa_refrele(coa->coa_ixa);
		*outlenp = 0;
		return (err);
	}

	/*
	 * Common case of OK return with outval same as inval.
	 */
	if (invalp != outvalp) {
		/* don't trust bcopy for identical src/dst */
		(void) bcopy(invalp, outvalp, inlen);
	}
	*outlenp = inlen;

	/*
	 * If this was not ancillary data, then we rebuild the headers,
	 * update the IRE/NCE, and IPsec as needed.
	 * Since the label depends on the destination we go through
	 * ip_set_destination first.
	 */
	if (coa->coa_ancillary) {
		return (0);
	}

	if (coa->coa_changed & COA_ROUTE_CHANGED) {
		in6_addr_t saddr, faddr, nexthop;
		in_port_t fport;

		/*
		 * We clear lastdst to make sure we pick up the change
		 * next time sending.
		 * If we are connected we re-cache the information.
		 * We ignore errors to preserve BSD behavior.
		 * Note that we don't redo IPsec policy lookup here
		 * since the final destination (or source) didn't change.
		 */
		mutex_enter(&connp->conn_lock);
		connp->conn_v6lastdst = ipv6_all_zeros;

		ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa,
		    &connp->conn_faddr_v6, &nexthop);
		/* Snapshot the addresses so conn_lock can be dropped */
		saddr = connp->conn_saddr_v6;
		faddr = connp->conn_faddr_v6;
		fport = connp->conn_fport;
		mutex_exit(&connp->conn_lock);

		if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) &&
		    !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) {
			(void) ip_attr_connect(connp, coa->coa_ixa,
			    &saddr, &faddr, &nexthop, fport, NULL, NULL,
			    IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);
		}
	}

	/* coa_ixa is not used below; only coa_changed is still consulted */
	ixa_refrele(coa->coa_ixa);

	if (coa->coa_changed & COA_HEADER_CHANGED) {
		/*
		 * Rebuild the header template if we are connected.
		 * Otherwise clear conn_v6lastdst so we rebuild the header
		 * in the data path.
		 */
		mutex_enter(&connp->conn_lock);
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
			err = icmp_build_hdr_template(connp,
			    &connp->conn_saddr_v6, &connp->conn_faddr_v6,
			    connp->conn_flowinfo);
			if (err != 0) {
				mutex_exit(&connp->conn_lock);
				return (err);
			}
		} else {
			connp->conn_v6lastdst = ipv6_all_zeros;
		}
		mutex_exit(&connp->conn_lock);
	}
	if (coa->coa_changed & COA_RCVBUF_CHANGED) {
		(void) proto_set_rx_hiwat(connp->conn_rq, connp,
		    connp->conn_rcvbuf);
	}
	if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
		connp->conn_wq->q_hiwat = connp->conn_sndbuf;
	}
	if (coa->coa_changed & COA_WROFF_CHANGED) {
		/* Increase wroff if needed */
		uint_t wroff;

		mutex_enter(&connp->conn_lock);
		wroff = connp->conn_ht_iphc_allocated + is->is_wroff_extra;
		if (wroff > connp->conn_wroff) {
			connp->conn_wroff = wroff;
			/* Drop conn_lock before calling out */
			mutex_exit(&connp->conn_lock);
			(void) proto_set_tx_wroff(connp->conn_rq, connp, wroff);
		} else {
			mutex_exit(&connp->conn_lock);
		}
	}
	if (coa->coa_changed & COA_ICMP_BIND_NEEDED) {
		icmp_bind_proto(icmp);
	}
	return (err);
}

/*
 * This routine sets socket options.
 * TPI entry point: maps the queue to its conn_t and defers to icmp_opt_set().
 */
int
icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
    void *thisdg_attrs, cred_t *cr)
{
	conn_t	*connp = Q_TO_CONN(q);
	int error;

	error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp,
	    outlenp, outvalp, thisdg_attrs, cr);
	return (error);
}

/*
 * Setup IP headers.
 *
 * Note that IP_HDRINCL has ipha_protocol that is different than conn_proto,
 * but icmp_output_hdrincl restores ipha_protocol once we return.
*/
/*
 * icmp_prepend_hdr() calls conn_prepend_hdr() to build the IP header in front
 * of data_mp, records the total packet length in ixa->ixa_pktlen, and for
 * IPv6 stores the folded partial checksum at the ICMPv6 or raw checksum
 * offset when IXAF_SET_ULP_CKSUM asks for it.
 *
 * Returns the resulting message, or NULL with *errorp set to an errno value.
 */
mblk_t *
icmp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
    const in6_addr_t *v6src, const in6_addr_t *v6dst, uint32_t flowinfo,
    mblk_t *data_mp, int *errorp)
{
	mblk_t		*mp;
	icmp_stack_t	*is = connp->conn_netstack->netstack_icmp;
	uint_t		data_len;
	uint32_t	cksum;

	data_len = msgdsize(data_mp);
	mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, connp->conn_proto,
	    flowinfo, 0, data_mp, data_len, is->is_wroff_extra, &cksum, errorp);
	if (mp == NULL) {
		ASSERT(*errorp != 0);
		return (NULL);
	}

	ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length;

	/*
	 * If there was a routing option/header then conn_prepend_hdr
	 * has massaged it and placed the pseudo-header checksum difference
	 * in the cksum argument.
	 *
	 * Prepare for ICMPv6 checksum done in IP.
	 *
	 * We make it easy for IP to include our pseudo header
	 * by putting our length (and any routing header adjustment)
	 * in the ICMPv6 checksum field.
	 * The IP source, destination, and length have already been set by
	 * conn_prepend_hdr.
	 */
	cksum += data_len;
	/* Fold the carry back into the low 16 bits */
	cksum = (cksum >> 16) + (cksum & 0xFFFF);
	ASSERT(cksum < 0x10000);

	if (ixa->ixa_flags & IXAF_IS_IPV4) {
		/* IPv4: nothing to store; the header is only sanity checked */
		ipha_t	*ipha = (ipha_t *)mp->b_rptr;

		ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen);
	} else {
		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
		uint_t	cksum_offset = 0;

		ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen);

		/* An offset of 0 means "no checksum field to fill in" */
		if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) {
			if (connp->conn_proto == IPPROTO_ICMPV6) {
				cksum_offset = ixa->ixa_ip_hdr_length +
				    offsetof(icmp6_t, icmp6_cksum);
			} else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
				cksum_offset = ixa->ixa_ip_hdr_length +
				    ixa->ixa_raw_cksum_offset;
			}
		}
		if (cksum_offset != 0) {
			uint16_t *ptr;

			/* Make sure the checksum fits in the first mblk */
			if (cksum_offset + sizeof (short) > MBLKL(mp)) {
				mblk_t *mp1;

				/* msgpullup returns a new message */
				mp1 = msgpullup(mp,
				    cksum_offset + sizeof (short));
				freemsg(mp);
				if (mp1 == NULL) {
					*errorp = ENOMEM;
					return (NULL);
				}
				mp = mp1;
				ip6h = (ip6_t *)mp->b_rptr;
			}
			ptr = (uint16_t *)(mp->b_rptr + cksum_offset);
			*ptr = htons(cksum);
		}
	}

	/* Note that we don't try to update wroff due to ancillary data */
	return (mp);
}

/*
 * Rebuild the connection's header template for the given source, destination
 * and flowinfo by calling conn_build_hdr_template(). The caller must hold
 * conn_lock. Returns 0 or the error from conn_build_hdr_template().
 */
static int
icmp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src,
    const in6_addr_t *v6dst, uint32_t flowinfo)
{
	int		error;

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	/*
	 * We clear lastdst to make sure we don't use the lastdst path
	 * next time sending since we might not have set v6dst yet.
	 */
	connp->conn_v6lastdst = ipv6_all_zeros;

	error = conn_build_hdr_template(connp, 0, 0, v6src, v6dst, flowinfo);
	if (error != 0)
		return (error);

	/*
	 * Any routing header/option has been massaged. The checksum difference
	 * is stored in conn_sum.
*/
	return (0);
}

/*
 * Called with icmp_recv_lock held when the socket upcall could not take mp.
 * While the conn is still a non-STREAMS socket the message is appended to
 * the fallback queue (linked through b_next) and NULL is returned; otherwise
 * mp is handed back so the caller can putnext() it.
 */
static mblk_t *
icmp_queue_fallback(icmp_t *icmp, mblk_t *mp)
{
	ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock));
	if (IPCL_IS_NONSTR(icmp->icmp_connp)) {
		/*
		 * fallback has started but messages have not been moved yet
		 */
		if (icmp->icmp_fallback_queue_head == NULL) {
			ASSERT(icmp->icmp_fallback_queue_tail == NULL);
			icmp->icmp_fallback_queue_head = mp;
			icmp->icmp_fallback_queue_tail = mp;
		} else {
			ASSERT(icmp->icmp_fallback_queue_tail != NULL);
			icmp->icmp_fallback_queue_tail->b_next = mp;
			icmp->icmp_fallback_queue_tail = mp;
		}
		return (NULL);
	} else {
		/*
		 * Fallback completed, let the caller putnext() the mblk.
		 */
		return (mp);
	}
}

/*
 * Deliver data to ULP. In case we have a socket, and it's falling back to
 * TPI, then we'll queue the mp for later processing.
 *
 * For a STREAMS conn the message is simply passed up with putnext(). For a
 * socket the su_recv upcall is used; len must equal msgdsize(mp).
 */
static void
icmp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len)
{
	if (IPCL_IS_NONSTR(connp)) {
		icmp_t *icmp = connp->conn_icmp;
		int error;

		ASSERT(len == msgdsize(mp));
		if ((*connp->conn_upcalls->su_recv)
		    (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) {
			mutex_enter(&icmp->icmp_recv_lock);
			if (error == ENOSPC) {
				/*
				 * let's confirm while holding the lock
				 */
				if ((*connp->conn_upcalls->su_recv)
				    (connp->conn_upper_handle, NULL, 0, 0,
				    &error, NULL) < 0) {
					ASSERT(error == ENOSPC);
					if (error == ENOSPC) {
						connp->conn_flow_cntrld =
						    B_TRUE;
					}
				}
				mutex_exit(&icmp->icmp_recv_lock);
			} else {
				/* The socket is falling back to TPI */
				ASSERT(error == EOPNOTSUPP);
				mp = icmp_queue_fallback(icmp, mp);
				/* Drop the lock before any putnext() */
				mutex_exit(&icmp->icmp_recv_lock);
				if (mp != NULL)
					putnext(connp->conn_rq, mp);
			}
		}
		ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock));
	} else {
		putnext(connp->conn_rq, mp);
	}
}

/*
 * This is
the inbound data path.
 * IP has already pulled up the IP headers and verified alignment
 * etc.
 *
 * arg1 is the conn_t; arg2 is unused. The packet is wrapped in a
 * T_UNITDATA_IND carrying the source address (sin_t for IPv4, sin6_t for
 * IPv6) plus any requested ancillary data, and handed to icmp_ulp_recv().
 * IPv4 packets go up with their IP header attached; for IPv6 the headers
 * are skipped. On allocation failure, a blocking ICMPv6 filter, or a raw
 * checksum failure the packet is freed and the function returns.
 */
/* ARGSUSED2 */
static void
icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
{
	conn_t			*connp = (conn_t *)arg1;
	struct T_unitdata_ind	*tudi;
	uchar_t			*rptr;		/* Pointer to IP header */
	int			ip_hdr_length;
	int			udi_size;	/* Size of T_unitdata_ind */
	int			pkt_len;
	icmp_t			*icmp;
	ip_pkt_t		ipps;
	ip6_t			*ip6h;
	mblk_t			*mp1;
	crb_t			recv_ancillary;
	icmp_stack_t		*is;
	sin_t			*sin;
	sin6_t			*sin6;
	ipha_t			*ipha;

	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);

	icmp = connp->conn_icmp;
	is = icmp->icmp_is;
	rptr = mp->b_rptr;

	ASSERT(DB_TYPE(mp) == M_DATA);
	ASSERT(OK_32PTR(rptr));
	ASSERT(ira->ira_pktlen == msgdsize(mp));
	pkt_len = ira->ira_pktlen;

	/*
	 * Get a snapshot of these and allow other threads to change
	 * them after that. We need the same recv_ancillary when determining
	 * the size as when adding the ancillary data items.
	 */
	mutex_enter(&connp->conn_lock);
	recv_ancillary = connp->conn_recv_ancillary;
	mutex_exit(&connp->conn_lock);

	ip_hdr_length = ira->ira_ip_hdr_length;
	ASSERT(MBLKL(mp) >= ip_hdr_length);	/* IP did a pullup */

	/* Initialize regardless of IP version */
	ipps.ipp_fields = 0;

	if (ira->ira_flags & IRAF_IS_IPV4) {
		ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION);
		ASSERT(MBLKL(mp) >= sizeof (ipha_t));
		ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr));

		ipha = (ipha_t *)mp->b_rptr;
		if (recv_ancillary.crb_all != 0)
			(void) ip_find_hdr_v4(ipha, &ipps, B_FALSE);

		/*
		 * BSD for some reason adjusts ipha_length to exclude the
		 * IP header length. We do the same.
		 */
		if (is->is_bsd_compat) {
			ushort_t len;

			len = ntohs(ipha->ipha_length);
			if (mp->b_datap->db_ref > 1) {
				/*
				 * Allocate a new IP header so that we can
				 * modify ipha_length.
				 */
				/* NOTE: this mp1 shadows the outer mp1 */
				mblk_t	*mp1;

				mp1 = allocb(ip_hdr_length, BPRI_MED);
				if (mp1 == NULL) {
					freemsg(mp);
					BUMP_MIB(&is->is_rawip_mib,
					    rawipInErrors);
					return;
				}
				/*
				 * Copy the header into the new block and
				 * chain the original (now header-less) data
				 * behind it.
				 */
				bcopy(rptr, mp1->b_rptr, ip_hdr_length);
				mp->b_rptr = rptr + ip_hdr_length;
				rptr = mp1->b_rptr;
				ipha = (ipha_t *)rptr;
				mp1->b_cont = mp;
				mp1->b_wptr = rptr + ip_hdr_length;
				mp = mp1;
			}
			len -= ip_hdr_length;
			ipha->ipha_length = htons(len);
		}

		/*
		 * For RAW sockets we do not pass ICMP/IPv4 packets to AF_INET6
		 * sockets. This is ensured by icmp_bind and the IP fanout code.
		 */
		ASSERT(connp->conn_family == AF_INET);

		/*
		 * This is the inbound data path. Packets are passed upstream
		 * as T_UNITDATA_IND messages with full IPv4 headers still
		 * attached.
		 */

		/*
		 * Normally only send up the source address.
		 * If any ancillary data items are wanted we add those.
		 */
		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
		if (recv_ancillary.crb_all != 0) {
			udi_size += conn_recvancillary_size(connp,
			    recv_ancillary, ira, mp, &ipps);
		}

		/* Allocate a message block for the T_UNITDATA_IND structure. */
		mp1 = allocb(udi_size, BPRI_MED);
		if (mp1 == NULL) {
			freemsg(mp);
			BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
			return;
		}
		mp1->b_cont = mp;
		tudi = (struct T_unitdata_ind *)mp1->b_rptr;
		mp1->b_datap->db_type = M_PROTO;
		mp1->b_wptr = (uchar_t *)tudi + udi_size;
		tudi->PRIM_type = T_UNITDATA_IND;
		tudi->SRC_length = sizeof (sin_t);
		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
		/* The source address immediately follows the TPI header */
		sin = (sin_t *)&tudi[1];
		*sin = sin_null;
		sin->sin_family = AF_INET;
		sin->sin_addr.s_addr = ipha->ipha_src;
		*(uint32_t *)&sin->sin_zero[0] = 0;
		*(uint32_t *)&sin->sin_zero[4] = 0;
		tudi->OPT_offset = sizeof (struct T_unitdata_ind) +
		    sizeof (sin_t);
		/* From here on udi_size is the length of the options only */
		udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t));
		tudi->OPT_length = udi_size;

		/*
		 * Add options if IP_RECVIF etc is set
		 */
		if (udi_size != 0) {
			conn_recvancillary_add(connp, recv_ancillary, ira,
			    &ipps, (uchar_t *)&sin[1], udi_size);
		}
		goto deliver;
	}

	ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION);
	/*
	 * IPv6 packets can only be received by applications
	 * that are prepared to receive IPv6 addresses.
	 * The IP fanout must ensure this.
	 */
	ASSERT(connp->conn_family == AF_INET6);

	/*
	 * Handle IPv6 packets. We don't pass up the IP headers with the
	 * payload for IPv6.
	 */

	ip6h = (ip6_t *)rptr;
	if (recv_ancillary.crb_all != 0) {
		/*
		 * Call on ip_find_hdr_v6 which gets individual lengths of
		 * extension headers (and pointers to them).
		 */
		uint8_t		nexthdr;

		/* We don't care about the length or nextheader. */
		(void) ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps, &nexthdr);

		/*
		 * We do not pass up hop-by-hop options or any other
		 * extension header as part of the packet. Applications
		 * that want to see them have to specify IPV6_RECV* socket
		 * options. And conn_recvancillary_size/add explicitly
		 * drops the TX option from IPV6_HOPOPTS as it does for UDP.
		 *
		 * If we had multilevel ICMP sockets, then we'd want to
		 * modify conn_recvancillary_size/add to
		 * allow the user to see the label.
		 */
	}

	/*
	 * Check a filter for ICMPv6 types if needed.
	 * Verify raw checksums if needed.
	 */
	mutex_enter(&connp->conn_lock);
	if (icmp->icmp_filter != NULL) {
		int type;

		/* Assumes that IP has done the pullupmsg */
		type = mp->b_rptr[ip_hdr_length];

		/*
		 * NOTE(review): the byte above is read before this check,
		 * and "<=" still admits b_rptr + ip_hdr_length == b_wptr,
		 * i.e. no byte to read. Confirm that IP guarantees at least
		 * one byte beyond the headers here.
		 */
		ASSERT(mp->b_rptr + ip_hdr_length <= mp->b_wptr);
		if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) {
			mutex_exit(&connp->conn_lock);
			freemsg(mp);
			return;
		}
	}
	if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
		/* Checksum */
		uint16_t	*up;
		uint32_t	sum;
		int		remlen;

		/* up[0..15] spans ip6_src and the ip6_dst that follows it */
		up = (uint16_t *)&ip6h->ip6_src;

		remlen = msgdsize(mp) - ip_hdr_length;
		sum = htons(connp->conn_proto + remlen)
		    + up[0] + up[1] + up[2] + up[3]
		    + up[4] + up[5] + up[6] + up[7]
		    + up[8] + up[9] + up[10] + up[11]
		    + up[12] + up[13] + up[14] + up[15];
		sum = (sum & 0xffff) + (sum >> 16);
		sum = IP_CSUM(mp, ip_hdr_length, sum);
		if (sum != 0) {
			/* IPv6 RAW checksum failed */
			ip0dbg(("icmp_rput: RAW checksum failed %x\n", sum));
			mutex_exit(&connp->conn_lock);
			freemsg(mp);
			BUMP_MIB(&is->is_rawip_mib, rawipInCksumErrs);
			return;
		}
	}
	mutex_exit(&connp->conn_lock);

	udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);

	if (recv_ancillary.crb_all != 0) {
		udi_size += conn_recvancillary_size(connp,
		    recv_ancillary, ira, mp, &ipps);
	}

	mp1 = allocb(udi_size, BPRI_MED);
	if (mp1 == NULL) {
		freemsg(mp);
		BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
		return;
	}
	mp1->b_cont = mp;
	mp1->b_datap->db_type = M_PROTO;
	tudi = (struct T_unitdata_ind *)mp1->b_rptr;
	mp1->b_wptr = (uchar_t *)tudi + udi_size;
	tudi->PRIM_type = T_UNITDATA_IND;
	tudi->SRC_length = sizeof (sin6_t);
	tudi->SRC_offset = sizeof (struct T_unitdata_ind);
	tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
	/* From here on udi_size is the length of the options only */
	udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t));
	tudi->OPT_length = udi_size;
	sin6 = (sin6_t *)&tudi[1];
	*sin6 = sin6_null;
	sin6->sin6_port = 0;
	sin6->sin6_family = AF_INET6;

	sin6->sin6_addr = ip6h->ip6_src;
	/* No sin6_flowinfo per API */
	sin6->sin6_flowinfo = 0;
	/* For link-scope pass up scope id */
	if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
		sin6->sin6_scope_id = ira->ira_ruifindex;
	else
		sin6->sin6_scope_id = 0;
	sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst,
	    IPCL_ZONEID(connp), is->is_netstack);

	if (udi_size != 0) {
		conn_recvancillary_add(connp, recv_ancillary, ira,
		    &ipps, (uchar_t *)&sin6[1], udi_size);
	}

	/* Skip all the IPv6 headers per API */
	mp->b_rptr += ip_hdr_length;
	pkt_len -= ip_hdr_length;

deliver:
	BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
	icmp_ulp_recv(connp, mp1, pkt_len);
}

/*
 * return SNMP stuff in buffer in mpdata. We don't hold any lock and report
 * information that can be changing beneath us.
2834 */ 2835 mblk_t * 2836 icmp_snmp_get(queue_t *q, mblk_t *mpctl) 2837 { 2838 mblk_t *mpdata; 2839 struct opthdr *optp; 2840 conn_t *connp = Q_TO_CONN(q); 2841 icmp_stack_t *is = connp->conn_netstack->netstack_icmp; 2842 mblk_t *mp2ctl; 2843 2844 /* 2845 * make a copy of the original message 2846 */ 2847 mp2ctl = copymsg(mpctl); 2848 2849 if (mpctl == NULL || 2850 (mpdata = mpctl->b_cont) == NULL) { 2851 freemsg(mpctl); 2852 freemsg(mp2ctl); 2853 return (0); 2854 } 2855 2856 /* fixed length structure for IPv4 and IPv6 counters */ 2857 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 2858 optp->level = EXPER_RAWIP; 2859 optp->name = 0; 2860 (void) snmp_append_data(mpdata, (char *)&is->is_rawip_mib, 2861 sizeof (is->is_rawip_mib)); 2862 optp->len = msgdsize(mpdata); 2863 qreply(q, mpctl); 2864 2865 return (mp2ctl); 2866 } 2867 2868 /* 2869 * Return 0 if invalid set request, 1 otherwise, including non-rawip requests. 2870 * TODO: If this ever actually tries to set anything, it needs to be 2871 * to do the appropriate locking. 2872 */ 2873 /* ARGSUSED */ 2874 int 2875 icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name, 2876 uchar_t *ptr, int len) 2877 { 2878 switch (level) { 2879 case EXPER_RAWIP: 2880 return (0); 2881 default: 2882 return (1); 2883 } 2884 } 2885 2886 /* 2887 * This routine creates a T_UDERROR_IND message and passes it upstream. 2888 * The address and options are copied from the T_UNITDATA_REQ message 2889 * passed in mp. This message is freed. 
*/
/*
 * The destination address and option ranges named by the T_UNITDATA_REQ are
 * bounds-checked against the message block first; if the header is short or
 * either range falls outside [b_rptr, b_wptr] no indication is sent and mp
 * is just freed. err is passed through to mi_tpi_uderror_ind().
 */
static void
icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err)
{
	struct T_unitdata_req *tudr;
	mblk_t	*mp1;
	uchar_t *destaddr;
	t_scalar_t destlen;
	uchar_t	*optaddr;
	t_scalar_t optlen;

	/* The block must at least hold the fixed T_unitdata_req header */
	if ((mp->b_wptr < mp->b_rptr) ||
	    (MBLKL(mp)) < sizeof (struct T_unitdata_req)) {
		goto done;
	}
	tudr = (struct T_unitdata_req *)mp->b_rptr;
	/* Destination address must lie entirely inside the block */
	destaddr = mp->b_rptr + tudr->DEST_offset;
	if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr ||
	    destaddr + tudr->DEST_length < mp->b_rptr ||
	    destaddr + tudr->DEST_length > mp->b_wptr) {
		goto done;
	}
	/* Same check for the options */
	optaddr = mp->b_rptr + tudr->OPT_offset;
	if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr ||
	    optaddr + tudr->OPT_length < mp->b_rptr ||
	    optaddr + tudr->OPT_length > mp->b_wptr) {
		goto done;
	}
	destlen = tudr->DEST_length;
	optlen = tudr->OPT_length;

	mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen,
	    (char *)optaddr, optlen, err);
	if (mp1 != NULL)
		qreply(q, mp1);

done:
	freemsg(mp);
}

/*
 * Undo a bind (and any connect) on connp: under conn_lock the local and
 * foreign addresses and ports are cleared, the state is set to TS_UNBND and
 * the header template is rebuilt; ip_unbind() is then called with the lock
 * dropped. Returns 0, or -TOUTSTATE if the endpoint was not bound.
 */
static int
rawip_do_unbind(conn_t *connp)
{
	icmp_t	*icmp = connp->conn_icmp;

	mutex_enter(&connp->conn_lock);
	/* If a bind has not been done, we can't unbind.
*/
	if (icmp->icmp_state == TS_UNBND) {
		mutex_exit(&connp->conn_lock);
		/* A negative return value carries a TLI error code */
		return (-TOUTSTATE);
	}
	connp->conn_saddr_v6 = ipv6_all_zeros;
	connp->conn_bound_addr_v6 = ipv6_all_zeros;
	connp->conn_laddr_v6 = ipv6_all_zeros;
	connp->conn_mcbc_bind = B_FALSE;
	connp->conn_lport = 0;
	connp->conn_fport = 0;
	/* In case we were also connected */
	connp->conn_faddr_v6 = ipv6_all_zeros;
	connp->conn_v6lastdst = ipv6_all_zeros;

	icmp->icmp_state = TS_UNBND;

	/* The result of rebuilding the template is deliberately ignored */
	(void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
	    &connp->conn_faddr_v6, connp->conn_flowinfo);
	mutex_exit(&connp->conn_lock);

	ip_unbind(connp);
	return (0);
}

/*
 * This routine is called by icmp_wput to handle T_UNBIND_REQ messages.
 * After some error checking, the message is passed downstream to ip.
 *
 * A negative result from rawip_do_unbind() is reported as a TLI error and a
 * positive one as a UNIX error via icmp_err_ack(); on success mp is turned
 * into a T_OK_ACK and sent back with qreply().
 */
static void
icmp_tpi_unbind(queue_t *q, mblk_t *mp)
{
	conn_t	*connp = Q_TO_CONN(q);
	int	error;

	ASSERT(mp->b_cont == NULL);
	error = rawip_do_unbind(connp);
	if (error) {
		if (error < 0) {
			icmp_err_ack(q, mp, -error, 0);
		} else {
			icmp_err_ack(q, mp, 0, error);
		}
		return;
	}

	/*
	 * Convert mp into a T_OK_ACK
	 */

	mp = mi_tpi_ok_ack_alloc(mp);

	/*
	 * should not happen in practice... T_OK_ACK is smaller than the
	 * original message.
	 */
	/*
	 * NOTE(review): a NULL return is only caught by the ASSERT below;
	 * where ASSERT is compiled out mp would reach qreply() unchecked.
	 * Confirm that mi_tpi_ok_ack_alloc() cannot fail here.
	 */
	ASSERT(mp != NULL);
	ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK);
	qreply(q, mp);
}

/*
 * Process IPv4 packets that already include an IP header.
 * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and
 * IPPROTO_IGMP).
 * In this case we ignore the address and any options in the T_UNITDATA_REQ.
 *
 * The packet is assumed to have a base (20 byte) IP header followed
 * by the upper-layer protocol.
We include any IP_OPTIONS including a 3005 * CIPSO label but otherwise preserve the base IP header. 3006 */ 3007 static int 3008 icmp_output_hdrincl(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid) 3009 { 3010 icmp_t *icmp = connp->conn_icmp; 3011 icmp_stack_t *is = icmp->icmp_is; 3012 ipha_t iphas; 3013 ipha_t *ipha; 3014 int ip_hdr_length; 3015 int tp_hdr_len; 3016 ip_xmit_attr_t *ixa; 3017 ip_pkt_t *ipp; 3018 in6_addr_t v6src; 3019 in6_addr_t v6dst; 3020 in6_addr_t v6nexthop; 3021 int error; 3022 boolean_t do_ipsec; 3023 3024 /* 3025 * We need an exclusive copy of conn_ixa since the included IP 3026 * header could have any destination. 3027 * That copy has no pointers hence we 3028 * need to set them up once we've parsed the ancillary data. 3029 */ 3030 ixa = conn_get_ixa_exclusive(connp); 3031 if (ixa == NULL) { 3032 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3033 freemsg(mp); 3034 return (ENOMEM); 3035 } 3036 ASSERT(cr != NULL); 3037 /* 3038 * Caller has a reference on cr; from db_credp or because we 3039 * are running in process context. 
3040 */ 3041 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3042 ixa->ixa_cred = cr; 3043 ixa->ixa_cpid = pid; 3044 if (is_system_labeled()) { 3045 /* We need to restart with a label based on the cred */ 3046 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 3047 } 3048 3049 /* In case previous destination was multicast or multirt */ 3050 ip_attr_newdst(ixa); 3051 3052 /* Get a copy of conn_xmit_ipp since the TX label might change it */ 3053 ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP); 3054 if (ipp == NULL) { 3055 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3056 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3057 ixa->ixa_cpid = connp->conn_cpid; 3058 ixa_refrele(ixa); 3059 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3060 freemsg(mp); 3061 return (ENOMEM); 3062 } 3063 mutex_enter(&connp->conn_lock); 3064 error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP); 3065 mutex_exit(&connp->conn_lock); 3066 if (error != 0) { 3067 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3068 freemsg(mp); 3069 goto done; 3070 } 3071 3072 /* Sanity check length of packet */ 3073 ipha = (ipha_t *)mp->b_rptr; 3074 3075 ip_hdr_length = IP_SIMPLE_HDR_LENGTH; 3076 if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) { 3077 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 3078 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3079 freemsg(mp); 3080 goto done; 3081 } 3082 ipha = (ipha_t *)mp->b_rptr; 3083 } 3084 ipha->ipha_version_and_hdr_length = 3085 (IP_VERSION<<4) | (ip_hdr_length>>2); 3086 3087 /* 3088 * We set IXAF_DONTFRAG if the application set DF which makes 3089 * IP not fragment. 
3090 */ 3091 ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF); 3092 if (ipha->ipha_fragment_offset_and_flags & htons(IPH_DF)) 3093 ixa->ixa_flags |= (IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); 3094 else 3095 ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); 3096 3097 /* Even for multicast and broadcast we honor the apps ttl */ 3098 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; 3099 3100 /* 3101 * No source verification for non-local addresses 3102 */ 3103 if (ipha->ipha_src != INADDR_ANY && 3104 ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid, 3105 is->is_netstack->netstack_ip, B_FALSE) 3106 != IPVL_UNICAST_UP) { 3107 ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; 3108 } 3109 3110 if (ipha->ipha_dst == INADDR_ANY) 3111 ipha->ipha_dst = htonl(INADDR_LOOPBACK); 3112 3113 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src); 3114 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); 3115 3116 /* Defer IPsec if it might need to look at ICMP type/code */ 3117 do_ipsec = ipha->ipha_protocol != IPPROTO_ICMP; 3118 ixa->ixa_flags |= IXAF_IS_IPV4; 3119 3120 ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); 3121 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, 3122 connp->conn_fport, &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 3123 (do_ipsec ? IPDF_IPSEC : 0)); 3124 switch (error) { 3125 case 0: 3126 break; 3127 case EADDRNOTAVAIL: 3128 /* 3129 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3130 * Don't have the application see that errno 3131 */ 3132 error = ENETUNREACH; 3133 goto failed; 3134 case ENETDOWN: 3135 /* 3136 * Have !ipif_addr_ready address; drop packet silently 3137 * until we can get applications to not send until we 3138 * are ready. 3139 */ 3140 error = 0; 3141 goto failed; 3142 case EHOSTUNREACH: 3143 case ENETUNREACH: 3144 if (ixa->ixa_ire != NULL) { 3145 /* 3146 * Let conn_ip_output/ire_send_noroute return 3147 * the error and send any local ICMP error. 
3148 */ 3149 error = 0; 3150 break; 3151 } 3152 /* FALLTHRU */ 3153 default: 3154 failed: 3155 freemsg(mp); 3156 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3157 goto done; 3158 } 3159 if (ipha->ipha_src == INADDR_ANY) 3160 IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src); 3161 3162 /* 3163 * We might be going to a different destination than last time, 3164 * thus check that TX allows the communication and compute any 3165 * needed label. 3166 * 3167 * TSOL Note: We have an exclusive ipp and ixa for this thread so we 3168 * don't have to worry about concurrent threads. 3169 */ 3170 if (is_system_labeled()) { 3171 /* 3172 * Check whether Trusted Solaris policy allows communication 3173 * with this host, and pretend that the destination is 3174 * unreachable if not. 3175 * Compute any needed label and place it in ipp_label_v4/v6. 3176 * 3177 * Later conn_build_hdr_template/conn_prepend_hdr takes 3178 * ipp_label_v4/v6 to form the packet. 3179 * 3180 * Tsol note: We have ipp structure local to this thread so 3181 * no locking is needed. 3182 */ 3183 error = conn_update_label(connp, ixa, &v6dst, ipp); 3184 if (error != 0) { 3185 freemsg(mp); 3186 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3187 goto done; 3188 } 3189 } 3190 3191 /* 3192 * Save away a copy of the IPv4 header the application passed down 3193 * and then prepend an IPv4 header complete with any IP options 3194 * including label. 3195 * We need a struct copy since icmp_prepend_hdr will reuse the available 3196 * space in the mblk. 
3197 */ 3198 iphas = *ipha; 3199 mp->b_rptr += IP_SIMPLE_HDR_LENGTH; 3200 3201 mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, 0, mp, &error); 3202 if (mp == NULL) { 3203 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3204 ASSERT(error != 0); 3205 goto done; 3206 } 3207 if (ixa->ixa_pktlen > IP_MAXPACKET) { 3208 error = EMSGSIZE; 3209 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3210 freemsg(mp); 3211 goto done; 3212 } 3213 /* Restore key parts of the header that the application passed down */ 3214 ipha = (ipha_t *)mp->b_rptr; 3215 ipha->ipha_type_of_service = iphas.ipha_type_of_service; 3216 ipha->ipha_ident = iphas.ipha_ident; 3217 ipha->ipha_fragment_offset_and_flags = 3218 iphas.ipha_fragment_offset_and_flags; 3219 ipha->ipha_ttl = iphas.ipha_ttl; 3220 ipha->ipha_protocol = iphas.ipha_protocol; 3221 ipha->ipha_src = iphas.ipha_src; 3222 ipha->ipha_dst = iphas.ipha_dst; 3223 3224 ixa->ixa_protocol = ipha->ipha_protocol; 3225 3226 /* 3227 * Make sure that the IP header plus any transport header that is 3228 * checksumed by ip_output is in the first mblk. (ip_output assumes 3229 * that at least the checksum field is in the first mblk.) 
3230 */ 3231 switch (ipha->ipha_protocol) { 3232 case IPPROTO_UDP: 3233 tp_hdr_len = 8; 3234 break; 3235 case IPPROTO_TCP: 3236 tp_hdr_len = 20; 3237 break; 3238 default: 3239 tp_hdr_len = 0; 3240 break; 3241 } 3242 ip_hdr_length = IPH_HDR_LENGTH(ipha); 3243 if (mp->b_wptr - mp->b_rptr < ip_hdr_length + tp_hdr_len) { 3244 if (!pullupmsg(mp, ip_hdr_length + tp_hdr_len)) { 3245 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3246 if (mp->b_cont == NULL) 3247 error = EINVAL; 3248 else 3249 error = ENOMEM; 3250 freemsg(mp); 3251 goto done; 3252 } 3253 } 3254 3255 if (!do_ipsec) { 3256 /* Policy might differ for different ICMP type/code */ 3257 if (ixa->ixa_ipsec_policy != NULL) { 3258 IPPOL_REFRELE(ixa->ixa_ipsec_policy); 3259 ixa->ixa_ipsec_policy = NULL; 3260 ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; 3261 } 3262 mp = ip_output_attach_policy(mp, ipha, NULL, connp, ixa); 3263 if (mp == NULL) { 3264 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3265 error = EHOSTUNREACH; /* IPsec policy failure */ 3266 goto done; 3267 } 3268 } 3269 3270 /* We're done. Pass the packet to ip. */ 3271 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3272 3273 error = conn_ip_output(mp, ixa); 3274 /* No rawipOutErrors if an error since IP increases its error counter */ 3275 switch (error) { 3276 case 0: 3277 break; 3278 case EWOULDBLOCK: 3279 (void) ixa_check_drain_insert(connp, ixa); 3280 error = 0; 3281 break; 3282 case EADDRNOTAVAIL: 3283 /* 3284 * IXAF_VERIFY_SOURCE tells us to pick a better source. 
3285 * Don't have the application see that errno 3286 */ 3287 error = ENETUNREACH; 3288 break; 3289 } 3290 done: 3291 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3292 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3293 ixa->ixa_cpid = connp->conn_cpid; 3294 ixa_refrele(ixa); 3295 ip_pkt_free(ipp); 3296 kmem_free(ipp, sizeof (*ipp)); 3297 return (error); 3298 } 3299 3300 static mblk_t * 3301 icmp_output_attach_policy(mblk_t *mp, conn_t *connp, ip_xmit_attr_t *ixa) 3302 { 3303 ipha_t *ipha = NULL; 3304 ip6_t *ip6h = NULL; 3305 3306 if (ixa->ixa_flags & IXAF_IS_IPV4) 3307 ipha = (ipha_t *)mp->b_rptr; 3308 else 3309 ip6h = (ip6_t *)mp->b_rptr; 3310 3311 if (ixa->ixa_ipsec_policy != NULL) { 3312 IPPOL_REFRELE(ixa->ixa_ipsec_policy); 3313 ixa->ixa_ipsec_policy = NULL; 3314 ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; 3315 } 3316 return (ip_output_attach_policy(mp, ipha, ip6h, connp, ixa)); 3317 } 3318 3319 /* 3320 * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6 3321 * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from 3322 * the TPI options, otherwise we take them from msg_control. 3323 * If both sin and sin6 is set it is a connected socket and we use conn_faddr. 3324 * Always consumes mp; never consumes tudr_mp. 3325 */ 3326 static int 3327 icmp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp, 3328 mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid) 3329 { 3330 icmp_t *icmp = connp->conn_icmp; 3331 icmp_stack_t *is = icmp->icmp_is; 3332 int error; 3333 ip_xmit_attr_t *ixa; 3334 ip_pkt_t *ipp; 3335 in6_addr_t v6src; 3336 in6_addr_t v6dst; 3337 in6_addr_t v6nexthop; 3338 in_port_t dstport; 3339 uint32_t flowinfo; 3340 uint_t srcid; 3341 int is_absreq_failure = 0; 3342 conn_opt_arg_t coas, *coa; 3343 3344 ASSERT(tudr_mp != NULL || msg != NULL); 3345 3346 /* 3347 * Get ixa before checking state to handle a disconnect race. 
3348 * 3349 * We need an exclusive copy of conn_ixa since the ancillary data 3350 * options might modify it. That copy has no pointers hence we 3351 * need to set them up once we've parsed the ancillary data. 3352 */ 3353 ixa = conn_get_ixa_exclusive(connp); 3354 if (ixa == NULL) { 3355 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3356 freemsg(mp); 3357 return (ENOMEM); 3358 } 3359 ASSERT(cr != NULL); 3360 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3361 ixa->ixa_cred = cr; 3362 ixa->ixa_cpid = pid; 3363 if (is_system_labeled()) { 3364 /* We need to restart with a label based on the cred */ 3365 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 3366 } 3367 3368 /* In case previous destination was multicast or multirt */ 3369 ip_attr_newdst(ixa); 3370 3371 /* Get a copy of conn_xmit_ipp since the options might change it */ 3372 ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP); 3373 if (ipp == NULL) { 3374 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3375 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3376 ixa->ixa_cpid = connp->conn_cpid; 3377 ixa_refrele(ixa); 3378 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3379 freemsg(mp); 3380 return (ENOMEM); 3381 } 3382 mutex_enter(&connp->conn_lock); 3383 error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP); 3384 mutex_exit(&connp->conn_lock); 3385 if (error != 0) { 3386 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3387 freemsg(mp); 3388 goto done; 3389 } 3390 3391 /* 3392 * Parse the options and update ixa and ipp as a result. 
3393 */ 3394 3395 coa = &coas; 3396 coa->coa_connp = connp; 3397 coa->coa_ixa = ixa; 3398 coa->coa_ipp = ipp; 3399 coa->coa_ancillary = B_TRUE; 3400 coa->coa_changed = 0; 3401 3402 if (msg != NULL) { 3403 error = process_auxiliary_options(connp, msg->msg_control, 3404 msg->msg_controllen, coa, &icmp_opt_obj, icmp_opt_set, cr); 3405 } else { 3406 struct T_unitdata_req *tudr; 3407 3408 tudr = (struct T_unitdata_req *)tudr_mp->b_rptr; 3409 ASSERT(tudr->PRIM_type == T_UNITDATA_REQ); 3410 error = tpi_optcom_buf(connp->conn_wq, tudr_mp, 3411 &tudr->OPT_length, tudr->OPT_offset, cr, &icmp_opt_obj, 3412 coa, &is_absreq_failure); 3413 } 3414 if (error != 0) { 3415 /* 3416 * Note: No special action needed in this 3417 * module for "is_absreq_failure" 3418 */ 3419 freemsg(mp); 3420 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3421 goto done; 3422 } 3423 ASSERT(is_absreq_failure == 0); 3424 3425 mutex_enter(&connp->conn_lock); 3426 /* 3427 * If laddr is unspecified then we look at sin6_src_id. 3428 * We will give precedence to a source address set with IPV6_PKTINFO 3429 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't 3430 * want ip_attr_connect to select a source (since it can fail) when 3431 * IPV6_PKTINFO is specified. 3432 * If this doesn't result in a source address then we get a source 3433 * from ip_attr_connect() below. 
3434 */ 3435 v6src = connp->conn_saddr_v6; 3436 if (sin != NULL) { 3437 IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); 3438 dstport = sin->sin_port; 3439 flowinfo = 0; 3440 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 3441 ixa->ixa_flags |= IXAF_IS_IPV4; 3442 } else if (sin6 != NULL) { 3443 v6dst = sin6->sin6_addr; 3444 dstport = sin6->sin6_port; 3445 flowinfo = sin6->sin6_flowinfo; 3446 srcid = sin6->__sin6_src_id; 3447 if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { 3448 ixa->ixa_scopeid = sin6->sin6_scope_id; 3449 ixa->ixa_flags |= IXAF_SCOPEID_SET; 3450 } else { 3451 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 3452 } 3453 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 3454 ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 3455 connp->conn_netstack); 3456 } 3457 if (IN6_IS_ADDR_V4MAPPED(&v6dst)) 3458 ixa->ixa_flags |= IXAF_IS_IPV4; 3459 else 3460 ixa->ixa_flags &= ~IXAF_IS_IPV4; 3461 } else { 3462 /* Connected case */ 3463 v6dst = connp->conn_faddr_v6; 3464 flowinfo = connp->conn_flowinfo; 3465 } 3466 mutex_exit(&connp->conn_lock); 3467 /* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. 
*/ 3468 if (ipp->ipp_fields & IPPF_ADDR) { 3469 if (ixa->ixa_flags & IXAF_IS_IPV4) { 3470 if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 3471 v6src = ipp->ipp_addr; 3472 } else { 3473 if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 3474 v6src = ipp->ipp_addr; 3475 } 3476 } 3477 /* 3478 * Allow source not assigned to the system 3479 * only if it is not a local addresses 3480 */ 3481 if (!V6_OR_V4_INADDR_ANY(v6src)) { 3482 ip_laddr_t laddr_type; 3483 3484 if (ixa->ixa_flags & IXAF_IS_IPV4) { 3485 ipaddr_t v4src; 3486 3487 IN6_V4MAPPED_TO_IPADDR(&v6src, v4src); 3488 laddr_type = ip_laddr_verify_v4(v4src, ixa->ixa_zoneid, 3489 is->is_netstack->netstack_ip, B_FALSE); 3490 } else { 3491 laddr_type = ip_laddr_verify_v6(&v6src, ixa->ixa_zoneid, 3492 is->is_netstack->netstack_ip, B_FALSE, B_FALSE); 3493 } 3494 if (laddr_type != IPVL_UNICAST_UP) 3495 ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; 3496 } 3497 3498 ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); 3499 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, 3500 &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST); 3501 3502 switch (error) { 3503 case 0: 3504 break; 3505 case EADDRNOTAVAIL: 3506 /* 3507 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3508 * Don't have the application see that errno 3509 */ 3510 error = ENETUNREACH; 3511 goto failed; 3512 case ENETDOWN: 3513 /* 3514 * Have !ipif_addr_ready address; drop packet silently 3515 * until we can get applications to not send until we 3516 * are ready. 3517 */ 3518 error = 0; 3519 goto failed; 3520 case EHOSTUNREACH: 3521 case ENETUNREACH: 3522 if (ixa->ixa_ire != NULL) { 3523 /* 3524 * Let conn_ip_output/ire_send_noroute return 3525 * the error and send any local ICMP error. 
3526 */ 3527 error = 0; 3528 break; 3529 } 3530 /* FALLTHRU */ 3531 default: 3532 failed: 3533 freemsg(mp); 3534 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3535 goto done; 3536 } 3537 3538 /* 3539 * We might be going to a different destination than last time, 3540 * thus check that TX allows the communication and compute any 3541 * needed label. 3542 * 3543 * TSOL Note: We have an exclusive ipp and ixa for this thread so we 3544 * don't have to worry about concurrent threads. 3545 */ 3546 if (is_system_labeled()) { 3547 /* 3548 * Check whether Trusted Solaris policy allows communication 3549 * with this host, and pretend that the destination is 3550 * unreachable if not. 3551 * Compute any needed label and place it in ipp_label_v4/v6. 3552 * 3553 * Later conn_build_hdr_template/conn_prepend_hdr takes 3554 * ipp_label_v4/v6 to form the packet. 3555 * 3556 * Tsol note: We have ipp structure local to this thread so 3557 * no locking is needed. 3558 */ 3559 error = conn_update_label(connp, ixa, &v6dst, ipp); 3560 if (error != 0) { 3561 freemsg(mp); 3562 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3563 goto done; 3564 } 3565 } 3566 mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, flowinfo, mp, 3567 &error); 3568 if (mp == NULL) { 3569 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3570 ASSERT(error != 0); 3571 goto done; 3572 } 3573 if (ixa->ixa_pktlen > IP_MAXPACKET) { 3574 error = EMSGSIZE; 3575 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3576 freemsg(mp); 3577 goto done; 3578 } 3579 3580 /* Policy might differ for different ICMP type/code */ 3581 mp = icmp_output_attach_policy(mp, connp, ixa); 3582 if (mp == NULL) { 3583 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3584 error = EHOSTUNREACH; /* IPsec policy failure */ 3585 goto done; 3586 } 3587 3588 /* We're done. Pass the packet to ip. 
*/ 3589 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3590 3591 error = conn_ip_output(mp, ixa); 3592 if (!connp->conn_unspec_src) 3593 ixa->ixa_flags |= IXAF_VERIFY_SOURCE; 3594 /* No rawipOutErrors if an error since IP increases its error counter */ 3595 switch (error) { 3596 case 0: 3597 break; 3598 case EWOULDBLOCK: 3599 (void) ixa_check_drain_insert(connp, ixa); 3600 error = 0; 3601 break; 3602 case EADDRNOTAVAIL: 3603 /* 3604 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3605 * Don't have the application see that errno 3606 */ 3607 error = ENETUNREACH; 3608 /* FALLTHRU */ 3609 default: 3610 mutex_enter(&connp->conn_lock); 3611 /* 3612 * Clear the source and v6lastdst so we call ip_attr_connect 3613 * for the next packet and try to pick a better source. 3614 */ 3615 if (connp->conn_mcbc_bind) 3616 connp->conn_saddr_v6 = ipv6_all_zeros; 3617 else 3618 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 3619 connp->conn_v6lastdst = ipv6_all_zeros; 3620 mutex_exit(&connp->conn_lock); 3621 break; 3622 } 3623 done: 3624 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3625 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3626 ixa->ixa_cpid = connp->conn_cpid; 3627 ixa_refrele(ixa); 3628 ip_pkt_free(ipp); 3629 kmem_free(ipp, sizeof (*ipp)); 3630 return (error); 3631 } 3632 3633 /* 3634 * Handle sending an M_DATA for a connected socket. 3635 * Handles both IPv4 and IPv6. 3636 */ 3637 int 3638 icmp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid) 3639 { 3640 icmp_t *icmp = connp->conn_icmp; 3641 icmp_stack_t *is = icmp->icmp_is; 3642 int error; 3643 ip_xmit_attr_t *ixa; 3644 boolean_t do_ipsec; 3645 3646 /* 3647 * If no other thread is using conn_ixa this just gets a reference to 3648 * conn_ixa. Otherwise we get a safe copy of conn_ixa. 
3649 */ 3650 ixa = conn_get_ixa(connp, B_FALSE); 3651 if (ixa == NULL) { 3652 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3653 freemsg(mp); 3654 return (ENOMEM); 3655 } 3656 3657 ASSERT(cr != NULL); 3658 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3659 ixa->ixa_cred = cr; 3660 ixa->ixa_cpid = pid; 3661 3662 /* Defer IPsec if it might need to look at ICMP type/code */ 3663 switch (ixa->ixa_protocol) { 3664 case IPPROTO_ICMP: 3665 case IPPROTO_ICMPV6: 3666 do_ipsec = B_FALSE; 3667 break; 3668 default: 3669 do_ipsec = B_TRUE; 3670 } 3671 3672 mutex_enter(&connp->conn_lock); 3673 mp = icmp_prepend_header_template(connp, ixa, mp, 3674 &connp->conn_saddr_v6, connp->conn_flowinfo, &error); 3675 3676 if (mp == NULL) { 3677 ASSERT(error != 0); 3678 mutex_exit(&connp->conn_lock); 3679 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3680 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3681 ixa->ixa_cpid = connp->conn_cpid; 3682 ixa_refrele(ixa); 3683 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3684 freemsg(mp); 3685 return (error); 3686 } 3687 3688 if (!do_ipsec) { 3689 /* Policy might differ for different ICMP type/code */ 3690 mp = icmp_output_attach_policy(mp, connp, ixa); 3691 if (mp == NULL) { 3692 mutex_exit(&connp->conn_lock); 3693 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3694 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3695 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3696 ixa->ixa_cpid = connp->conn_cpid; 3697 ixa_refrele(ixa); 3698 return (EHOSTUNREACH); /* IPsec policy failure */ 3699 } 3700 } 3701 3702 /* 3703 * In case we got a safe copy of conn_ixa, or if opt_set made us a new 3704 * safe copy, then we need to fill in any pointers in it. 
3705 */ 3706 if (ixa->ixa_ire == NULL) { 3707 in6_addr_t faddr, saddr; 3708 in6_addr_t nexthop; 3709 in_port_t fport; 3710 3711 saddr = connp->conn_saddr_v6; 3712 faddr = connp->conn_faddr_v6; 3713 fport = connp->conn_fport; 3714 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop); 3715 mutex_exit(&connp->conn_lock); 3716 3717 error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop, 3718 fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 3719 (do_ipsec ? IPDF_IPSEC : 0)); 3720 switch (error) { 3721 case 0: 3722 break; 3723 case EADDRNOTAVAIL: 3724 /* 3725 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3726 * Don't have the application see that errno 3727 */ 3728 error = ENETUNREACH; 3729 goto failed; 3730 case ENETDOWN: 3731 /* 3732 * Have !ipif_addr_ready address; drop packet silently 3733 * until we can get applications to not send until we 3734 * are ready. 3735 */ 3736 error = 0; 3737 goto failed; 3738 case EHOSTUNREACH: 3739 case ENETUNREACH: 3740 if (ixa->ixa_ire != NULL) { 3741 /* 3742 * Let conn_ip_output/ire_send_noroute return 3743 * the error and send any local ICMP error. 3744 */ 3745 error = 0; 3746 break; 3747 } 3748 /* FALLTHRU */ 3749 default: 3750 failed: 3751 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3752 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3753 ixa->ixa_cpid = connp->conn_cpid; 3754 ixa_refrele(ixa); 3755 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3756 freemsg(mp); 3757 return (error); 3758 } 3759 } else { 3760 /* Done with conn_t */ 3761 mutex_exit(&connp->conn_lock); 3762 } 3763 3764 /* We're done. Pass the packet to ip. 
*/ 3765 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3766 3767 error = conn_ip_output(mp, ixa); 3768 /* No rawipOutErrors if an error since IP increases its error counter */ 3769 switch (error) { 3770 case 0: 3771 break; 3772 case EWOULDBLOCK: 3773 (void) ixa_check_drain_insert(connp, ixa); 3774 error = 0; 3775 break; 3776 case EADDRNOTAVAIL: 3777 /* 3778 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3779 * Don't have the application see that errno 3780 */ 3781 error = ENETUNREACH; 3782 break; 3783 } 3784 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3785 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3786 ixa->ixa_cpid = connp->conn_cpid; 3787 ixa_refrele(ixa); 3788 return (error); 3789 } 3790 3791 /* 3792 * Handle sending an M_DATA to the last destination. 3793 * Handles both IPv4 and IPv6. 3794 * 3795 * NOTE: The caller must hold conn_lock and we drop it here. 3796 */ 3797 int 3798 icmp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid, 3799 ip_xmit_attr_t *ixa) 3800 { 3801 icmp_t *icmp = connp->conn_icmp; 3802 icmp_stack_t *is = icmp->icmp_is; 3803 int error; 3804 boolean_t do_ipsec; 3805 3806 ASSERT(MUTEX_HELD(&connp->conn_lock)); 3807 ASSERT(ixa != NULL); 3808 3809 ASSERT(cr != NULL); 3810 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3811 ixa->ixa_cred = cr; 3812 ixa->ixa_cpid = pid; 3813 3814 /* Defer IPsec if it might need to look at ICMP type/code */ 3815 switch (ixa->ixa_protocol) { 3816 case IPPROTO_ICMP: 3817 case IPPROTO_ICMPV6: 3818 do_ipsec = B_FALSE; 3819 break; 3820 default: 3821 do_ipsec = B_TRUE; 3822 } 3823 3824 3825 mp = icmp_prepend_header_template(connp, ixa, mp, 3826 &connp->conn_v6lastsrc, connp->conn_lastflowinfo, &error); 3827 3828 if (mp == NULL) { 3829 ASSERT(error != 0); 3830 mutex_exit(&connp->conn_lock); 3831 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3832 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3833 ixa->ixa_cpid = connp->conn_cpid; 3834 ixa_refrele(ixa); 3835 BUMP_MIB(&is->is_rawip_mib, 
rawipOutErrors); 3836 freemsg(mp); 3837 return (error); 3838 } 3839 3840 if (!do_ipsec) { 3841 /* Policy might differ for different ICMP type/code */ 3842 mp = icmp_output_attach_policy(mp, connp, ixa); 3843 if (mp == NULL) { 3844 mutex_exit(&connp->conn_lock); 3845 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3846 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3847 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3848 ixa->ixa_cpid = connp->conn_cpid; 3849 ixa_refrele(ixa); 3850 return (EHOSTUNREACH); /* IPsec policy failure */ 3851 } 3852 } 3853 3854 /* 3855 * In case we got a safe copy of conn_ixa, or if opt_set made us a new 3856 * safe copy, then we need to fill in any pointers in it. 3857 */ 3858 if (ixa->ixa_ire == NULL) { 3859 in6_addr_t lastdst, lastsrc; 3860 in6_addr_t nexthop; 3861 in_port_t lastport; 3862 3863 lastsrc = connp->conn_v6lastsrc; 3864 lastdst = connp->conn_v6lastdst; 3865 lastport = connp->conn_lastdstport; 3866 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop); 3867 mutex_exit(&connp->conn_lock); 3868 3869 error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst, 3870 &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC | 3871 IPDF_VERIFY_DST | (do_ipsec ? IPDF_IPSEC : 0)); 3872 switch (error) { 3873 case 0: 3874 break; 3875 case EADDRNOTAVAIL: 3876 /* 3877 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3878 * Don't have the application see that errno 3879 */ 3880 error = ENETUNREACH; 3881 goto failed; 3882 case ENETDOWN: 3883 /* 3884 * Have !ipif_addr_ready address; drop packet silently 3885 * until we can get applications to not send until we 3886 * are ready. 3887 */ 3888 error = 0; 3889 goto failed; 3890 case EHOSTUNREACH: 3891 case ENETUNREACH: 3892 if (ixa->ixa_ire != NULL) { 3893 /* 3894 * Let conn_ip_output/ire_send_noroute return 3895 * the error and send any local ICMP error. 
3896 */ 3897 error = 0; 3898 break; 3899 } 3900 /* FALLTHRU */ 3901 default: 3902 failed: 3903 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3904 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3905 ixa->ixa_cpid = connp->conn_cpid; 3906 ixa_refrele(ixa); 3907 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3908 freemsg(mp); 3909 return (error); 3910 } 3911 } else { 3912 /* Done with conn_t */ 3913 mutex_exit(&connp->conn_lock); 3914 } 3915 3916 /* We're done. Pass the packet to ip. */ 3917 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3918 error = conn_ip_output(mp, ixa); 3919 /* No rawipOutErrors if an error since IP increases its error counter */ 3920 switch (error) { 3921 case 0: 3922 break; 3923 case EWOULDBLOCK: 3924 (void) ixa_check_drain_insert(connp, ixa); 3925 error = 0; 3926 break; 3927 case EADDRNOTAVAIL: 3928 /* 3929 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3930 * Don't have the application see that errno 3931 */ 3932 error = ENETUNREACH; 3933 /* FALLTHRU */ 3934 default: 3935 mutex_enter(&connp->conn_lock); 3936 /* 3937 * Clear the source and v6lastdst so we call ip_attr_connect 3938 * for the next packet and try to pick a better source. 3939 */ 3940 if (connp->conn_mcbc_bind) 3941 connp->conn_saddr_v6 = ipv6_all_zeros; 3942 else 3943 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 3944 connp->conn_v6lastdst = ipv6_all_zeros; 3945 mutex_exit(&connp->conn_lock); 3946 break; 3947 } 3948 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3949 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3950 ixa->ixa_cpid = connp->conn_cpid; 3951 ixa_refrele(ixa); 3952 return (error); 3953 } 3954 3955 3956 /* 3957 * Prepend the header template and then fill in the source and 3958 * flowinfo. The caller needs to handle the destination address since 3959 * it's setting is different if rthdr or source route. 3960 * 3961 * Returns NULL is allocation failed or if the packet would exceed IP_MAXPACKET. 3962 * When it returns NULL it sets errorp. 
 */
static mblk_t *
icmp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
    const in6_addr_t *v6src, uint32_t flowinfo, int *errorp)
{
	icmp_t		*icmp = connp->conn_icmp;
	icmp_stack_t	*is = icmp->icmp_is;
	uint_t		pktlen;
	uint_t		copylen;
	uint8_t		*iph;
	uint_t		ip_hdr_length;
	uint32_t	cksum;
	ip_pkt_t	*ipp;

	/*
	 * The template (conn_ht_*), conn_sum and conn_xmit_ipp are read
	 * below, so the caller must already hold conn_lock.
	 * On every NULL return mp has been freed here.
	 */
	ASSERT(MUTEX_HELD(&connp->conn_lock));

	/*
	 * Copy the header template.
	 */
	copylen = connp->conn_ht_iphc_len;
	pktlen = copylen + msgdsize(mp);
	if (pktlen > IP_MAXPACKET) {
		freemsg(mp);
		*errorp = EMSGSIZE;
		return (NULL);
	}
	ixa->ixa_pktlen = pktlen;

	/* check/fix buffer config, setup pointers into it */
	iph = mp->b_rptr - copylen;
	if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) {
		mblk_t *mp1;

		/*
		 * The headers cannot be written in front of the data in
		 * place (shared dblk, not enough headroom, or misaligned),
		 * so put them at the tail of a fresh mblk linked ahead of mp.
		 */
		mp1 = allocb(copylen + is->is_wroff_extra, BPRI_MED);
		if (mp1 == NULL) {
			freemsg(mp);
			*errorp = ENOMEM;
			return (NULL);
		}
		mp1->b_wptr = DB_LIM(mp1);
		mp1->b_cont = mp;
		mp = mp1;
		iph = (mp->b_wptr - copylen);
	}
	mp->b_rptr = iph;
	bcopy(connp->conn_ht_iphc, iph, copylen);
	ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc);

	ixa->ixa_ip_hdr_length = ip_hdr_length;

	/*
	 * Prepare for ICMPv6 checksum done in IP.
	 *
	 * icmp_build_hdr_template has already massaged any routing header
	 * and placed the result in conn_sum.
	 *
	 * We make it easy for IP to include our pseudo header
	 * by putting our length (and any routing header adjustment)
	 * in the ICMPv6 checksum field.
	 */
	cksum = pktlen - ip_hdr_length;

	cksum += connp->conn_sum;
	/* Fold the carry back in so the value fits in 16 bits */
	cksum = (cksum >> 16) + (cksum & 0xFFFF);
	ASSERT(cksum < 0x10000);

	ipp = &connp->conn_xmit_ipp;
	if (ixa->ixa_flags & IXAF_IS_IPV4) {
		ipha_t	*ipha = (ipha_t *)iph;

		ipha->ipha_length = htons((uint16_t)pktlen);

		/* if IP_PKTINFO specified an address it wins over bind() */
		if ((ipp->ipp_fields & IPPF_ADDR) &&
		    IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
			ASSERT(ipp->ipp_addr_v4 != INADDR_ANY);
			ipha->ipha_src = ipp->ipp_addr_v4;
		} else {
			IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src);
		}
	} else {
		ip6_t *ip6h = (ip6_t *)iph;
		/* Zero means "no checksum field to seed" */
		uint_t	cksum_offset = 0;

		ip6h->ip6_plen = htons((uint16_t)(pktlen - IPV6_HDR_LEN));

		/* if IP_PKTINFO specified an address it wins over bind() */
		if ((ipp->ipp_fields & IPPF_ADDR) &&
		    !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr));
			ip6h->ip6_src = ipp->ipp_addr;
		} else {
			ip6h->ip6_src = *v6src;
		}
		ip6h->ip6_vcf =
		    (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
		    (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
		if (ipp->ipp_fields & IPPF_TCLASS) {
			/* Overrides the class part of flowinfo */
			ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
			    ipp->ipp_tclass);
		}

		if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) {
			if (connp->conn_proto == IPPROTO_ICMPV6) {
				cksum_offset = ixa->ixa_ip_hdr_length +
				    offsetof(icmp6_t, icmp6_cksum);
			} else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
				cksum_offset = ixa->ixa_ip_hdr_length +
				    ixa->ixa_raw_cksum_offset;
			}
		}
		if (cksum_offset != 0) {
			uint16_t *ptr;

			/* Make sure the checksum fits in the first mblk */
			if (cksum_offset + sizeof (short) > MBLKL(mp)) {
				mblk_t *mp1;

				/*
				 * msgpullup builds a new message; the old
				 * one is released whether or not it worked.
				 */
				mp1 = msgpullup(mp,
				    cksum_offset + sizeof (short));
				freemsg(mp);
				if (mp1 == NULL) {
					*errorp = ENOMEM;
					return (NULL);
				}
				mp = mp1;
				iph = mp->b_rptr;
				ip6h = (ip6_t *)iph;
			}
			ptr = (uint16_t *)(mp->b_rptr + cksum_offset);
			*ptr = htons(cksum);
		}
	}

	return (mp);
}

/*
 * This routine handles all messages passed downstream.  It either
 * consumes the message or passes it downstream; it never queues
 * a message.
 *
 * Only T_UNITDATA_REQ (M_PROTO/M_PCPROTO) is processed here; M_DATA is
 * dropped and everything else is handed to icmp_wput_other().  Send
 * failures are reported upstream through icmp_ud_err().
 */
void
icmp_wput(queue_t *q, mblk_t *mp)
{
	sin6_t		*sin6;
	sin_t		*sin = NULL;
	uint_t		srcid;
	conn_t		*connp = Q_TO_CONN(q);
	icmp_t		*icmp = connp->conn_icmp;
	int		error = 0;
	struct sockaddr	*addr = NULL;
	socklen_t	addrlen;
	icmp_stack_t	*is = icmp->icmp_is;
	struct T_unitdata_req *tudr;
	mblk_t		*data_mp;
	cred_t		*cr;
	pid_t		pid;

	/*
	 * We directly handle several cases here: T_UNITDATA_REQ message
	 * coming down as M_PROTO/M_PCPROTO and M_DATA messages for connected
	 * socket.
	 */
	switch (DB_TYPE(mp)) {
	case M_DATA:
		/* sockfs never sends down M_DATA */
		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
		freemsg(mp);
		return;

	case M_PROTO:
	case M_PCPROTO:
		tudr = (struct T_unitdata_req *)mp->b_rptr;
		/* Too short to be, or simply not, a T_UNITDATA_REQ */
		if (MBLKL(mp) < sizeof (*tudr) ||
		    ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) {
			icmp_wput_other(q, mp);
			return;
		}
		break;

	default:
		icmp_wput_other(q, mp);
		return;
	}

	/* Handle valid T_UNITDATA_REQ here */
	data_mp = mp->b_cont;
	if (data_mp == NULL) {
		error = EPROTO;
		goto ud_error2;
	}
	/* From here on the TPI header (mp) and payload (data_mp) are separate */
	mp->b_cont = NULL;

	if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) {
		error = EADDRNOTAVAIL;
		goto ud_error2;
	}

	/*
	 * All Solaris components should pass a db_credp
	 * for this message, hence we ASSERT.
	 * On production kernels we return an error to be robust against
	 * random streams modules sitting on top of us.
	 */
	cr = msg_getcred(mp, &pid);
	ASSERT(cr != NULL);
	if (cr == NULL) {
		error = EINVAL;
		goto ud_error2;
	}

	/*
	 * If a port has not been bound to the stream, fail.
	 * This is not a problem when sockfs is directly
	 * above us, because it will ensure that the socket
	 * is first bound before allowing data to be sent.
	 */
	if (icmp->icmp_state == TS_UNBND) {
		error = EPROTO;
		goto ud_error2;
	}
	addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset];
	addrlen = tudr->DEST_length;

	switch (connp->conn_family) {
	case AF_INET6:
		sin6 = (sin6_t *)addr;
		if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) ||
		    (sin6->sin6_family != AF_INET6)) {
			error = EADDRNOTAVAIL;
			goto ud_error2;
		}

		/* No support for mapped addresses on raw sockets */
		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
			error = EADDRNOTAVAIL;
			goto ud_error2;
		}
		srcid = sin6->__sin6_src_id;

		/*
		 * If the local address is a mapped address return
		 * an error.
		 * It would be possible to send an IPv6 packet but the
		 * response would never make it back to the application
		 * since it is bound to a mapped address.
		 */
		if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
			error = EADDRNOTAVAIL;
			goto ud_error2;
		}

		/* An unspecified destination is sent to loopback */
		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
			sin6->sin6_addr = ipv6_loopback;

		if (tudr->OPT_length != 0) {
			/*
			 * If we are connected then the destination needs to be
			 * the same as the connected one.
			 */
			if (icmp->icmp_state == TS_DATA_XFER &&
			    !conn_same_as_last_v6(connp, sin6)) {
				error = EISCONN;
				goto ud_error2;
			}
			error = icmp_output_ancillary(connp, NULL, sin6,
			    data_mp, mp, NULL, cr, pid);
		} else {
			ip_xmit_attr_t *ixa;

			/*
			 * We have to allocate an ip_xmit_attr_t before we grab
			 * conn_lock and we need to hold conn_lock once we've
			 * checked conn_same_as_last_v6 to handle concurrent
			 * send* calls on a socket.
			 */
			ixa = conn_get_ixa(connp, B_FALSE);
			if (ixa == NULL) {
				error = ENOMEM;
				goto ud_error2;
			}
			mutex_enter(&connp->conn_lock);

			if (conn_same_as_last_v6(connp, sin6) &&
			    connp->conn_lastsrcid == srcid &&
			    ipsec_outbound_policy_current(ixa)) {
				/* icmp_output_lastdst drops conn_lock */
				error = icmp_output_lastdst(connp, data_mp, cr,
				    pid, ixa);
			} else {
				/* icmp_output_newdst drops conn_lock */
				error = icmp_output_newdst(connp, data_mp, NULL,
				    sin6, cr, pid, ixa);
			}
			ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
		}
		if (error == 0) {
			/* Payload was handed off; only the TPI header is left */
			freeb(mp);
			return;
		}
		break;

	case AF_INET:
		sin = (sin_t *)addr;
		if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) ||
		    (sin->sin_family != AF_INET)) {
			error = EADDRNOTAVAIL;
			goto ud_error2;
		}
		/* An unspecified destination is sent to loopback */
		if (sin->sin_addr.s_addr == INADDR_ANY)
			sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);

		/* Protocol 255 contains full IP headers */
		/* Read without holding lock */
		if (icmp->icmp_hdrincl) {
			if (MBLKL(data_mp) < IP_SIMPLE_HDR_LENGTH) {
				if (!pullupmsg(data_mp, IP_SIMPLE_HDR_LENGTH)) {
					error = EINVAL;
					goto ud_error2;
				}
			}
			error = icmp_output_hdrincl(connp, data_mp, cr, pid);
			if (error == 0) {
				freeb(mp);
				return;
			}
			/* data_mp consumed above */
			data_mp = NULL;
			goto ud_error2;
		}

		if (tudr->OPT_length != 0) {
			/*
			 * If we are connected then the destination needs to be
			 * the same as the connected one.
			 */
			if (icmp->icmp_state == TS_DATA_XFER &&
			    !conn_same_as_last_v4(connp, sin)) {
				error = EISCONN;
				goto ud_error2;
			}
			error = icmp_output_ancillary(connp, sin, NULL,
			    data_mp, mp, NULL, cr, pid);
		} else {
			ip_xmit_attr_t *ixa;

			/*
			 * We have to allocate an ip_xmit_attr_t before we grab
			 * conn_lock and we need to hold conn_lock once we've
			 * checked conn_same_as_last_v4 to handle concurrent
			 * send* calls on a socket.
			 */
			ixa = conn_get_ixa(connp, B_FALSE);
			if (ixa == NULL) {
				error = ENOMEM;
				goto ud_error2;
			}
			mutex_enter(&connp->conn_lock);

			if (conn_same_as_last_v4(connp, sin) &&
			    ipsec_outbound_policy_current(ixa)) {
				/* icmp_output_lastdst drops conn_lock */
				error = icmp_output_lastdst(connp, data_mp, cr,
				    pid, ixa);
			} else {
				/* icmp_output_newdst drops conn_lock */
				error = icmp_output_newdst(connp, data_mp, sin,
				    NULL, cr, pid, ixa);
			}
			ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
		}
		if (error == 0) {
			/* Payload was handed off; only the TPI header is left */
			freeb(mp);
			return;
		}
		break;
	}
	/*
	 * Reached after a failed send from the ancillary, lastdst or newdst
	 * paths.  data_mp is not freed on this path, unlike ud_error2 below.
	 */
	ASSERT(mp != NULL);
	/* mp is freed by the following routine */
	icmp_ud_err(q, mp, (t_scalar_t)error);
	return;

ud_error2:
	/* Failure paths that still own data_mp (or have set it to NULL) */
	BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
	freemsg(data_mp);
	ASSERT(mp != NULL);
	/* mp is freed by the following routine */
	icmp_ud_err(q, mp, (t_scalar_t)error);
}

/*
 * Handle the case of the IP address or flow label being different
 * for both IPv4 and IPv6.
 *
 * NOTE: The caller must hold conn_lock and we drop it here.
4360 */ 4361 static int 4362 icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6, 4363 cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa) 4364 { 4365 icmp_t *icmp = connp->conn_icmp; 4366 icmp_stack_t *is = icmp->icmp_is; 4367 int error; 4368 ip_xmit_attr_t *oldixa; 4369 boolean_t do_ipsec; 4370 uint_t srcid; 4371 uint32_t flowinfo; 4372 in6_addr_t v6src; 4373 in6_addr_t v6dst; 4374 in6_addr_t v6nexthop; 4375 in_port_t dstport; 4376 4377 ASSERT(MUTEX_HELD(&connp->conn_lock)); 4378 ASSERT(ixa != NULL); 4379 4380 /* 4381 * We hold conn_lock across all the use and modifications of 4382 * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they 4383 * stay consistent. 4384 */ 4385 4386 ASSERT(cr != NULL); 4387 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4388 ixa->ixa_cred = cr; 4389 ixa->ixa_cpid = pid; 4390 if (is_system_labeled()) { 4391 /* We need to restart with a label based on the cred */ 4392 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 4393 } 4394 /* 4395 * If we are connected then the destination needs to be the 4396 * same as the connected one, which is not the case here since we 4397 * checked for that above. 4398 */ 4399 if (icmp->icmp_state == TS_DATA_XFER) { 4400 mutex_exit(&connp->conn_lock); 4401 error = EISCONN; 4402 goto ud_error; 4403 } 4404 4405 /* In case previous destination was multicast or multirt */ 4406 ip_attr_newdst(ixa); 4407 4408 /* 4409 * If laddr is unspecified then we look at sin6_src_id. 4410 * We will give precedence to a source address set with IPV6_PKTINFO 4411 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't 4412 * want ip_attr_connect to select a source (since it can fail) when 4413 * IPV6_PKTINFO is specified. 4414 * If this doesn't result in a source address then we get a source 4415 * from ip_attr_connect() below. 
4416 */ 4417 v6src = connp->conn_saddr_v6; 4418 if (sin != NULL) { 4419 IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); 4420 dstport = sin->sin_port; 4421 flowinfo = 0; 4422 srcid = 0; 4423 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 4424 if (srcid != 0 && V4_PART_OF_V6(&v6src) == INADDR_ANY) { 4425 ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 4426 connp->conn_netstack); 4427 } 4428 ixa->ixa_flags |= IXAF_IS_IPV4; 4429 } else { 4430 v6dst = sin6->sin6_addr; 4431 dstport = sin6->sin6_port; 4432 flowinfo = sin6->sin6_flowinfo; 4433 srcid = sin6->__sin6_src_id; 4434 if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { 4435 ixa->ixa_scopeid = sin6->sin6_scope_id; 4436 ixa->ixa_flags |= IXAF_SCOPEID_SET; 4437 } else { 4438 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 4439 } 4440 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 4441 ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 4442 connp->conn_netstack); 4443 } 4444 if (IN6_IS_ADDR_V4MAPPED(&v6dst)) 4445 ixa->ixa_flags |= IXAF_IS_IPV4; 4446 else 4447 ixa->ixa_flags &= ~IXAF_IS_IPV4; 4448 } 4449 /* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. */ 4450 if (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR) { 4451 ip_pkt_t *ipp = &connp->conn_xmit_ipp; 4452 4453 if (ixa->ixa_flags & IXAF_IS_IPV4) { 4454 if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 4455 v6src = ipp->ipp_addr; 4456 } else { 4457 if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 4458 v6src = ipp->ipp_addr; 4459 } 4460 } 4461 4462 /* Defer IPsec if it might need to look at ICMP type/code */ 4463 switch (ixa->ixa_protocol) { 4464 case IPPROTO_ICMP: 4465 case IPPROTO_ICMPV6: 4466 do_ipsec = B_FALSE; 4467 break; 4468 default: 4469 do_ipsec = B_TRUE; 4470 } 4471 4472 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop); 4473 mutex_exit(&connp->conn_lock); 4474 4475 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, 4476 &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 4477 (do_ipsec ? 
IPDF_IPSEC : 0)); 4478 switch (error) { 4479 case 0: 4480 break; 4481 case EADDRNOTAVAIL: 4482 /* 4483 * IXAF_VERIFY_SOURCE tells us to pick a better source. 4484 * Don't have the application see that errno 4485 */ 4486 error = ENETUNREACH; 4487 goto failed; 4488 case ENETDOWN: 4489 /* 4490 * Have !ipif_addr_ready address; drop packet silently 4491 * until we can get applications to not send until we 4492 * are ready. 4493 */ 4494 error = 0; 4495 goto failed; 4496 case EHOSTUNREACH: 4497 case ENETUNREACH: 4498 if (ixa->ixa_ire != NULL) { 4499 /* 4500 * Let conn_ip_output/ire_send_noroute return 4501 * the error and send any local ICMP error. 4502 */ 4503 error = 0; 4504 break; 4505 } 4506 /* FALLTHRU */ 4507 default: 4508 failed: 4509 goto ud_error; 4510 } 4511 4512 mutex_enter(&connp->conn_lock); 4513 /* 4514 * While we dropped the lock some other thread might have connected 4515 * this socket. If so we bail out with EISCONN to ensure that the 4516 * connecting thread is the one that updates conn_ixa, conn_ht_* 4517 * and conn_*last*. 4518 */ 4519 if (icmp->icmp_state == TS_DATA_XFER) { 4520 mutex_exit(&connp->conn_lock); 4521 error = EISCONN; 4522 goto ud_error; 4523 } 4524 4525 /* 4526 * We need to rebuild the headers if 4527 * - we are labeling packets (could be different for different 4528 * destinations) 4529 * - we have a source route (or routing header) since we need to 4530 * massage that to get the pseudo-header checksum 4531 * - a socket option with COA_HEADER_CHANGED has been set which 4532 * set conn_v6lastdst to zero. 4533 * 4534 * Otherwise the prepend function will just update the src, dst, 4535 * and flow label. 
4536 */ 4537 if (is_system_labeled()) { 4538 /* TX MLP requires SCM_UCRED and don't have that here */ 4539 if (connp->conn_mlp_type != mlptSingle) { 4540 mutex_exit(&connp->conn_lock); 4541 error = ECONNREFUSED; 4542 goto ud_error; 4543 } 4544 /* 4545 * Check whether Trusted Solaris policy allows communication 4546 * with this host, and pretend that the destination is 4547 * unreachable if not. 4548 * Compute any needed label and place it in ipp_label_v4/v6. 4549 * 4550 * Later conn_build_hdr_template/conn_prepend_hdr takes 4551 * ipp_label_v4/v6 to form the packet. 4552 * 4553 * Tsol note: Since we hold conn_lock we know no other 4554 * thread manipulates conn_xmit_ipp. 4555 */ 4556 error = conn_update_label(connp, ixa, &v6dst, 4557 &connp->conn_xmit_ipp); 4558 if (error != 0) { 4559 mutex_exit(&connp->conn_lock); 4560 goto ud_error; 4561 } 4562 /* Rebuild the header template */ 4563 error = icmp_build_hdr_template(connp, &v6src, &v6dst, 4564 flowinfo); 4565 if (error != 0) { 4566 mutex_exit(&connp->conn_lock); 4567 goto ud_error; 4568 } 4569 } else if (connp->conn_xmit_ipp.ipp_fields & 4570 (IPPF_IPV4_OPTIONS|IPPF_RTHDR) || 4571 IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) { 4572 /* Rebuild the header template */ 4573 error = icmp_build_hdr_template(connp, &v6src, &v6dst, 4574 flowinfo); 4575 if (error != 0) { 4576 mutex_exit(&connp->conn_lock); 4577 goto ud_error; 4578 } 4579 } else { 4580 /* Simply update the destination address if no source route */ 4581 if (ixa->ixa_flags & IXAF_IS_IPV4) { 4582 ipha_t *ipha = (ipha_t *)connp->conn_ht_iphc; 4583 4584 IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst); 4585 if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) { 4586 ipha->ipha_fragment_offset_and_flags |= 4587 IPH_DF_HTONS; 4588 } else { 4589 ipha->ipha_fragment_offset_and_flags &= 4590 ~IPH_DF_HTONS; 4591 } 4592 } else { 4593 ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc; 4594 ip6h->ip6_dst = v6dst; 4595 } 4596 } 4597 4598 /* 4599 * Remember the dst etc which corresponds to 
the built header 4600 * template and conn_ixa. 4601 */ 4602 oldixa = conn_replace_ixa(connp, ixa); 4603 connp->conn_v6lastdst = v6dst; 4604 connp->conn_lastflowinfo = flowinfo; 4605 connp->conn_lastscopeid = ixa->ixa_scopeid; 4606 connp->conn_lastsrcid = srcid; 4607 /* Also remember a source to use together with lastdst */ 4608 connp->conn_v6lastsrc = v6src; 4609 4610 data_mp = icmp_prepend_header_template(connp, ixa, data_mp, &v6src, 4611 flowinfo, &error); 4612 4613 /* Done with conn_t */ 4614 mutex_exit(&connp->conn_lock); 4615 ixa_refrele(oldixa); 4616 4617 if (data_mp == NULL) { 4618 ASSERT(error != 0); 4619 goto ud_error; 4620 } 4621 4622 if (!do_ipsec) { 4623 /* Policy might differ for different ICMP type/code */ 4624 data_mp = icmp_output_attach_policy(data_mp, connp, ixa); 4625 if (data_mp == NULL) { 4626 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4627 error = EHOSTUNREACH; /* IPsec policy failure */ 4628 goto done; 4629 } 4630 } 4631 4632 /* We're done. Pass the packet to ip. */ 4633 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 4634 4635 error = conn_ip_output(data_mp, ixa); 4636 /* No rawipOutErrors if an error since IP increases its error counter */ 4637 switch (error) { 4638 case 0: 4639 break; 4640 case EWOULDBLOCK: 4641 (void) ixa_check_drain_insert(connp, ixa); 4642 error = 0; 4643 break; 4644 case EADDRNOTAVAIL: 4645 /* 4646 * IXAF_VERIFY_SOURCE tells us to pick a better source. 4647 * Don't have the application see that errno 4648 */ 4649 error = ENETUNREACH; 4650 /* FALLTHRU */ 4651 default: 4652 mutex_enter(&connp->conn_lock); 4653 /* 4654 * Clear the source and v6lastdst so we call ip_attr_connect 4655 * for the next packet and try to pick a better source. 
4656 */ 4657 if (connp->conn_mcbc_bind) 4658 connp->conn_saddr_v6 = ipv6_all_zeros; 4659 else 4660 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 4661 connp->conn_v6lastdst = ipv6_all_zeros; 4662 mutex_exit(&connp->conn_lock); 4663 break; 4664 } 4665 done: 4666 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4667 ixa->ixa_cred = connp->conn_cred; /* Restore */ 4668 ixa->ixa_cpid = connp->conn_cpid; 4669 ixa_refrele(ixa); 4670 return (error); 4671 4672 ud_error: 4673 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4674 ixa->ixa_cred = connp->conn_cred; /* Restore */ 4675 ixa->ixa_cpid = connp->conn_cpid; 4676 ixa_refrele(ixa); 4677 4678 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4679 freemsg(data_mp); 4680 return (error); 4681 } 4682 4683 /* ARGSUSED */ 4684 static void 4685 icmp_wput_fallback(queue_t *q, mblk_t *mp) 4686 { 4687 #ifdef DEBUG 4688 cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n"); 4689 #endif 4690 freemsg(mp); 4691 } 4692 4693 static void 4694 icmp_wput_other(queue_t *q, mblk_t *mp) 4695 { 4696 uchar_t *rptr = mp->b_rptr; 4697 struct iocblk *iocp; 4698 conn_t *connp = Q_TO_CONN(q); 4699 icmp_t *icmp = connp->conn_icmp; 4700 cred_t *cr; 4701 4702 switch (mp->b_datap->db_type) { 4703 case M_PROTO: 4704 case M_PCPROTO: 4705 if (mp->b_wptr - rptr < sizeof (t_scalar_t)) { 4706 /* 4707 * If the message does not contain a PRIM_type, 4708 * throw it away. 4709 */ 4710 freemsg(mp); 4711 return; 4712 } 4713 switch (((t_primp_t)rptr)->type) { 4714 case T_ADDR_REQ: 4715 icmp_addr_req(q, mp); 4716 return; 4717 case O_T_BIND_REQ: 4718 case T_BIND_REQ: 4719 icmp_tpi_bind(q, mp); 4720 return; 4721 case T_CONN_REQ: 4722 icmp_tpi_connect(q, mp); 4723 return; 4724 case T_CAPABILITY_REQ: 4725 icmp_capability_req(q, mp); 4726 return; 4727 case T_INFO_REQ: 4728 icmp_info_req(q, mp); 4729 return; 4730 case T_UNITDATA_REQ: 4731 /* 4732 * If a T_UNITDATA_REQ gets here, the address must 4733 * be bad. 
Valid T_UNITDATA_REQs are handled 4734 * in icmp_wput. 4735 */ 4736 icmp_ud_err(q, mp, EADDRNOTAVAIL); 4737 return; 4738 case T_UNBIND_REQ: 4739 icmp_tpi_unbind(q, mp); 4740 return; 4741 case T_SVR4_OPTMGMT_REQ: 4742 /* 4743 * All Solaris components should pass a db_credp 4744 * for this TPI message, hence we ASSERT. 4745 * But in case there is some other M_PROTO that looks 4746 * like a TPI message sent by some other kernel 4747 * component, we check and return an error. 4748 */ 4749 cr = msg_getcred(mp, NULL); 4750 ASSERT(cr != NULL); 4751 if (cr == NULL) { 4752 icmp_err_ack(q, mp, TSYSERR, EINVAL); 4753 return; 4754 } 4755 4756 if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get, 4757 cr)) { 4758 svr4_optcom_req(q, mp, cr, &icmp_opt_obj); 4759 } 4760 return; 4761 4762 case T_OPTMGMT_REQ: 4763 /* 4764 * All Solaris components should pass a db_credp 4765 * for this TPI message, hence we ASSERT. 4766 * But in case there is some other M_PROTO that looks 4767 * like a TPI message sent by some other kernel 4768 * component, we check and return an error. 4769 */ 4770 cr = msg_getcred(mp, NULL); 4771 ASSERT(cr != NULL); 4772 if (cr == NULL) { 4773 icmp_err_ack(q, mp, TSYSERR, EINVAL); 4774 return; 4775 } 4776 tpi_optcom_req(q, mp, cr, &icmp_opt_obj); 4777 return; 4778 4779 case T_DISCON_REQ: 4780 icmp_tpi_disconnect(q, mp); 4781 return; 4782 4783 /* The following TPI message is not supported by icmp. */ 4784 case O_T_CONN_RES: 4785 case T_CONN_RES: 4786 icmp_err_ack(q, mp, TNOTSUPPORT, 0); 4787 return; 4788 4789 /* The following 3 TPI requests are illegal for icmp. 
*/ 4790 case T_DATA_REQ: 4791 case T_EXDATA_REQ: 4792 case T_ORDREL_REQ: 4793 icmp_err_ack(q, mp, TNOTSUPPORT, 0); 4794 return; 4795 default: 4796 break; 4797 } 4798 break; 4799 case M_FLUSH: 4800 if (*rptr & FLUSHW) 4801 flushq(q, FLUSHDATA); 4802 break; 4803 case M_IOCTL: 4804 iocp = (struct iocblk *)mp->b_rptr; 4805 switch (iocp->ioc_cmd) { 4806 case TI_GETPEERNAME: 4807 if (icmp->icmp_state != TS_DATA_XFER) { 4808 /* 4809 * If a default destination address has not 4810 * been associated with the stream, then we 4811 * don't know the peer's name. 4812 */ 4813 iocp->ioc_error = ENOTCONN; 4814 iocp->ioc_count = 0; 4815 mp->b_datap->db_type = M_IOCACK; 4816 qreply(q, mp); 4817 return; 4818 } 4819 /* FALLTHRU */ 4820 case TI_GETMYNAME: 4821 /* 4822 * For TI_GETPEERNAME and TI_GETMYNAME, we first 4823 * need to copyin the user's strbuf structure. 4824 * Processing will continue in the M_IOCDATA case 4825 * below. 4826 */ 4827 mi_copyin(q, mp, NULL, 4828 SIZEOF_STRUCT(strbuf, iocp->ioc_flag)); 4829 return; 4830 default: 4831 break; 4832 } 4833 break; 4834 case M_IOCDATA: 4835 icmp_wput_iocdata(q, mp); 4836 return; 4837 default: 4838 /* Unrecognized messages are passed through without change. */ 4839 break; 4840 } 4841 ip_wput_nondata(q, mp); 4842 } 4843 4844 /* 4845 * icmp_wput_iocdata is called by icmp_wput_other to handle all M_IOCDATA 4846 * messages. 4847 */ 4848 static void 4849 icmp_wput_iocdata(queue_t *q, mblk_t *mp) 4850 { 4851 mblk_t *mp1; 4852 STRUCT_HANDLE(strbuf, sb); 4853 uint_t addrlen; 4854 conn_t *connp = Q_TO_CONN(q); 4855 icmp_t *icmp = connp->conn_icmp; 4856 4857 /* Make sure it is one of ours. 
*/ 4858 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4859 case TI_GETMYNAME: 4860 case TI_GETPEERNAME: 4861 break; 4862 default: 4863 ip_wput_nondata(q, mp); 4864 return; 4865 } 4866 4867 switch (mi_copy_state(q, mp, &mp1)) { 4868 case -1: 4869 return; 4870 case MI_COPY_CASE(MI_COPY_IN, 1): 4871 break; 4872 case MI_COPY_CASE(MI_COPY_OUT, 1): 4873 /* 4874 * The address has been copied out, so now 4875 * copyout the strbuf. 4876 */ 4877 mi_copyout(q, mp); 4878 return; 4879 case MI_COPY_CASE(MI_COPY_OUT, 2): 4880 /* 4881 * The address and strbuf have been copied out. 4882 * We're done, so just acknowledge the original 4883 * M_IOCTL. 4884 */ 4885 mi_copy_done(q, mp, 0); 4886 return; 4887 default: 4888 /* 4889 * Something strange has happened, so acknowledge 4890 * the original M_IOCTL with an EPROTO error. 4891 */ 4892 mi_copy_done(q, mp, EPROTO); 4893 return; 4894 } 4895 4896 /* 4897 * Now we have the strbuf structure for TI_GETMYNAME 4898 * and TI_GETPEERNAME. Next we copyout the requested 4899 * address and then we'll copyout the strbuf. 
4900 */ 4901 STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag, 4902 (void *)mp1->b_rptr); 4903 4904 if (connp->conn_family == AF_INET) 4905 addrlen = sizeof (sin_t); 4906 else 4907 addrlen = sizeof (sin6_t); 4908 4909 if (STRUCT_FGET(sb, maxlen) < addrlen) { 4910 mi_copy_done(q, mp, EINVAL); 4911 return; 4912 } 4913 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4914 case TI_GETMYNAME: 4915 break; 4916 case TI_GETPEERNAME: 4917 if (icmp->icmp_state != TS_DATA_XFER) { 4918 mi_copy_done(q, mp, ENOTCONN); 4919 return; 4920 } 4921 break; 4922 default: 4923 mi_copy_done(q, mp, EPROTO); 4924 return; 4925 } 4926 mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); 4927 if (!mp1) 4928 return; 4929 4930 STRUCT_FSET(sb, len, addrlen); 4931 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4932 case TI_GETMYNAME: 4933 (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr, 4934 &addrlen); 4935 break; 4936 case TI_GETPEERNAME: 4937 (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr, 4938 &addrlen); 4939 break; 4940 } 4941 mp1->b_wptr += addrlen; 4942 /* Copy out the address */ 4943 mi_copyout(q, mp); 4944 } 4945 4946 void 4947 icmp_ddi_g_init(void) 4948 { 4949 icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr, 4950 icmp_opt_obj.odb_opt_arr_cnt); 4951 4952 /* 4953 * We want to be informed each time a stack is created or 4954 * destroyed in the kernel, so we can maintain the 4955 * set of icmp_stack_t's. 4956 */ 4957 netstack_register(NS_ICMP, rawip_stack_init, NULL, rawip_stack_fini); 4958 } 4959 4960 void 4961 icmp_ddi_g_destroy(void) 4962 { 4963 netstack_unregister(NS_ICMP); 4964 } 4965 4966 #define INET_NAME "ip" 4967 4968 /* 4969 * Initialize the ICMP stack instance. 
4970 */ 4971 static void * 4972 rawip_stack_init(netstackid_t stackid, netstack_t *ns) 4973 { 4974 icmp_stack_t *is; 4975 int error = 0; 4976 size_t arrsz; 4977 major_t major; 4978 4979 is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP); 4980 is->is_netstack = ns; 4981 4982 arrsz = sizeof (icmp_propinfo_tbl); 4983 is->is_propinfo_tbl = (mod_prop_info_t *)kmem_alloc(arrsz, KM_SLEEP); 4984 bcopy(icmp_propinfo_tbl, is->is_propinfo_tbl, arrsz); 4985 4986 is->is_ksp = rawip_kstat_init(stackid); 4987 4988 major = mod_name_to_major(INET_NAME); 4989 error = ldi_ident_from_major(major, &is->is_ldi_ident); 4990 ASSERT(error == 0); 4991 return (is); 4992 } 4993 4994 /* 4995 * Free the ICMP stack instance. 4996 */ 4997 static void 4998 rawip_stack_fini(netstackid_t stackid, void *arg) 4999 { 5000 icmp_stack_t *is = (icmp_stack_t *)arg; 5001 5002 kmem_free(is->is_propinfo_tbl, sizeof (icmp_propinfo_tbl)); 5003 is->is_propinfo_tbl = NULL; 5004 5005 rawip_kstat_fini(stackid, is->is_ksp); 5006 is->is_ksp = NULL; 5007 ldi_ident_release(is->is_ldi_ident); 5008 kmem_free(is, sizeof (*is)); 5009 } 5010 5011 static void * 5012 rawip_kstat_init(netstackid_t stackid) { 5013 kstat_t *ksp; 5014 5015 rawip_named_kstat_t template = { 5016 { "inDatagrams", KSTAT_DATA_UINT32, 0 }, 5017 { "inCksumErrs", KSTAT_DATA_UINT32, 0 }, 5018 { "inErrors", KSTAT_DATA_UINT32, 0 }, 5019 { "outDatagrams", KSTAT_DATA_UINT32, 0 }, 5020 { "outErrors", KSTAT_DATA_UINT32, 0 }, 5021 }; 5022 5023 ksp = kstat_create_netstack("icmp", 0, "rawip", "mib2", 5024 KSTAT_TYPE_NAMED, 5025 NUM_OF_FIELDS(rawip_named_kstat_t), 5026 0, stackid); 5027 if (ksp == NULL || ksp->ks_data == NULL) 5028 return (NULL); 5029 5030 bcopy(&template, ksp->ks_data, sizeof (template)); 5031 ksp->ks_update = rawip_kstat_update; 5032 ksp->ks_private = (void *)(uintptr_t)stackid; 5033 5034 kstat_install(ksp); 5035 return (ksp); 5036 } 5037 5038 static void 5039 rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp) 5040 { 5041 if (ksp != NULL) 
{ 5042 ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private); 5043 kstat_delete_netstack(ksp, stackid); 5044 } 5045 } 5046 5047 static int 5048 rawip_kstat_update(kstat_t *ksp, int rw) 5049 { 5050 rawip_named_kstat_t *rawipkp; 5051 netstackid_t stackid = (netstackid_t)(uintptr_t)ksp->ks_private; 5052 netstack_t *ns; 5053 icmp_stack_t *is; 5054 5055 if ((ksp == NULL) || (ksp->ks_data == NULL)) 5056 return (EIO); 5057 5058 if (rw == KSTAT_WRITE) 5059 return (EACCES); 5060 5061 rawipkp = (rawip_named_kstat_t *)ksp->ks_data; 5062 5063 ns = netstack_find_by_stackid(stackid); 5064 if (ns == NULL) 5065 return (-1); 5066 is = ns->netstack_icmp; 5067 if (is == NULL) { 5068 netstack_rele(ns); 5069 return (-1); 5070 } 5071 rawipkp->inDatagrams.value.ui32 = is->is_rawip_mib.rawipInDatagrams; 5072 rawipkp->inCksumErrs.value.ui32 = is->is_rawip_mib.rawipInCksumErrs; 5073 rawipkp->inErrors.value.ui32 = is->is_rawip_mib.rawipInErrors; 5074 rawipkp->outDatagrams.value.ui32 = is->is_rawip_mib.rawipOutDatagrams; 5075 rawipkp->outErrors.value.ui32 = is->is_rawip_mib.rawipOutErrors; 5076 netstack_rele(ns); 5077 return (0); 5078 } 5079 5080 /* ARGSUSED */ 5081 int 5082 rawip_accept(sock_lower_handle_t lproto_handle, 5083 sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle, 5084 cred_t *cr) 5085 { 5086 return (EOPNOTSUPP); 5087 } 5088 5089 /* ARGSUSED */ 5090 int 5091 rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5092 socklen_t len, cred_t *cr) 5093 { 5094 conn_t *connp = (conn_t *)proto_handle; 5095 int error; 5096 5097 /* All Solaris components should pass a cred for this operation. 
*/ 5098 ASSERT(cr != NULL); 5099 5100 /* Binding to a NULL address really means unbind */ 5101 if (sa == NULL) 5102 error = rawip_do_unbind(connp); 5103 else 5104 error = rawip_do_bind(connp, sa, len); 5105 5106 if (error < 0) { 5107 if (error == -TOUTSTATE) 5108 error = EINVAL; 5109 else 5110 error = proto_tlitosyserr(-error); 5111 } 5112 return (error); 5113 } 5114 5115 static int 5116 rawip_implicit_bind(conn_t *connp) 5117 { 5118 sin6_t sin6addr; 5119 sin_t *sin; 5120 sin6_t *sin6; 5121 socklen_t len; 5122 int error; 5123 5124 if (connp->conn_family == AF_INET) { 5125 len = sizeof (struct sockaddr_in); 5126 sin = (sin_t *)&sin6addr; 5127 *sin = sin_null; 5128 sin->sin_family = AF_INET; 5129 sin->sin_addr.s_addr = INADDR_ANY; 5130 } else { 5131 ASSERT(connp->conn_family == AF_INET6); 5132 len = sizeof (sin6_t); 5133 sin6 = (sin6_t *)&sin6addr; 5134 *sin6 = sin6_null; 5135 sin6->sin6_family = AF_INET6; 5136 V6_SET_ZERO(sin6->sin6_addr); 5137 } 5138 5139 error = rawip_do_bind(connp, (struct sockaddr *)&sin6addr, len); 5140 5141 return ((error < 0) ? proto_tlitosyserr(-error) : error); 5142 } 5143 5144 static int 5145 rawip_unbind(conn_t *connp) 5146 { 5147 int error; 5148 5149 error = rawip_do_unbind(connp); 5150 if (error < 0) { 5151 error = proto_tlitosyserr(-error); 5152 } 5153 return (error); 5154 } 5155 5156 /* ARGSUSED */ 5157 int 5158 rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) 5159 { 5160 return (EOPNOTSUPP); 5161 } 5162 5163 int 5164 rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, 5165 socklen_t len, sock_connid_t *id, cred_t *cr) 5166 { 5167 conn_t *connp = (conn_t *)proto_handle; 5168 icmp_t *icmp = connp->conn_icmp; 5169 int error; 5170 boolean_t did_bind = B_FALSE; 5171 pid_t pid = curproc->p_pid; 5172 5173 /* All Solaris components should pass a cred for this operation. 
*/ 5174 ASSERT(cr != NULL); 5175 5176 if (sa == NULL) { 5177 /* 5178 * Disconnect 5179 * Make sure we are connected 5180 */ 5181 if (icmp->icmp_state != TS_DATA_XFER) 5182 return (EINVAL); 5183 5184 error = icmp_disconnect(connp); 5185 return (error); 5186 } 5187 5188 error = proto_verify_ip_addr(connp->conn_family, sa, len); 5189 if (error != 0) 5190 return (error); 5191 5192 /* do an implicit bind if necessary */ 5193 if (icmp->icmp_state == TS_UNBND) { 5194 error = rawip_implicit_bind(connp); 5195 /* 5196 * We could be racing with an actual bind, in which case 5197 * we would see EPROTO. We cross our fingers and try 5198 * to connect. 5199 */ 5200 if (!(error == 0 || error == EPROTO)) 5201 return (error); 5202 did_bind = B_TRUE; 5203 } 5204 5205 /* 5206 * set SO_DGRAM_ERRIND 5207 */ 5208 connp->conn_dgram_errind = B_TRUE; 5209 5210 error = rawip_do_connect(connp, sa, len, cr, pid); 5211 if (error != 0 && did_bind) { 5212 int unbind_err; 5213 5214 unbind_err = rawip_unbind(connp); 5215 ASSERT(unbind_err == 0); 5216 } 5217 5218 if (error == 0) { 5219 *id = 0; 5220 (*connp->conn_upcalls->su_connected)(connp->conn_upper_handle, 5221 0, NULL, -1); 5222 } else if (error < 0) { 5223 error = proto_tlitosyserr(-error); 5224 } 5225 return (error); 5226 } 5227 5228 /* ARGSUSED2 */ 5229 int 5230 rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, 5231 boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb, 5232 sock_quiesce_arg_t *arg) 5233 { 5234 conn_t *connp = (conn_t *)proto_handle; 5235 icmp_t *icmp; 5236 struct T_capability_ack tca; 5237 struct sockaddr_in6 laddr, faddr; 5238 socklen_t laddrlen, faddrlen; 5239 short opts; 5240 struct stroptions *stropt; 5241 mblk_t *mp, *stropt_mp; 5242 int error; 5243 5244 icmp = connp->conn_icmp; 5245 5246 stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL); 5247 5248 /* 5249 * setup the fallback stream that was allocated 5250 */ 5251 connp->conn_dev = (dev_t)RD(q)->q_ptr; 5252 connp->conn_minor_arena = 
WR(q)->q_ptr; 5253 5254 RD(q)->q_ptr = WR(q)->q_ptr = connp; 5255 5256 WR(q)->q_qinfo = &icmpwinit; 5257 5258 connp->conn_rq = RD(q); 5259 connp->conn_wq = WR(q); 5260 5261 /* Notify stream head about options before sending up data */ 5262 stropt_mp->b_datap->db_type = M_SETOPTS; 5263 stropt_mp->b_wptr += sizeof (*stropt); 5264 stropt = (struct stroptions *)stropt_mp->b_rptr; 5265 stropt->so_flags = SO_WROFF | SO_HIWAT; 5266 stropt->so_wroff = connp->conn_wroff; 5267 stropt->so_hiwat = connp->conn_rcvbuf; 5268 putnext(RD(q), stropt_mp); 5269 5270 /* 5271 * free helper stream 5272 */ 5273 ip_free_helper_stream(connp); 5274 5275 /* 5276 * Collect the information needed to sync with the sonode 5277 */ 5278 icmp_do_capability_ack(icmp, &tca, TC1_INFO); 5279 5280 laddrlen = faddrlen = sizeof (sin6_t); 5281 (void) rawip_getsockname((sock_lower_handle_t)connp, 5282 (struct sockaddr *)&laddr, &laddrlen, CRED()); 5283 error = rawip_getpeername((sock_lower_handle_t)connp, 5284 (struct sockaddr *)&faddr, &faddrlen, CRED()); 5285 if (error != 0) 5286 faddrlen = 0; 5287 opts = 0; 5288 if (connp->conn_dgram_errind) 5289 opts |= SO_DGRAM_ERRIND; 5290 if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE) 5291 opts |= SO_DONTROUTE; 5292 5293 mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca, 5294 (struct sockaddr *)&laddr, laddrlen, 5295 (struct sockaddr *)&faddr, faddrlen, opts); 5296 5297 /* 5298 * Attempts to send data up during fallback will result in it being 5299 * queued in icmp_t. Now we push up any queued packets. 
5300 */ 5301 mutex_enter(&icmp->icmp_recv_lock); 5302 if (mp != NULL) { 5303 mp->b_next = icmp->icmp_fallback_queue_head; 5304 icmp->icmp_fallback_queue_head = mp; 5305 } 5306 while (icmp->icmp_fallback_queue_head != NULL) { 5307 mp = icmp->icmp_fallback_queue_head; 5308 icmp->icmp_fallback_queue_head = mp->b_next; 5309 mp->b_next = NULL; 5310 mutex_exit(&icmp->icmp_recv_lock); 5311 putnext(RD(q), mp); 5312 mutex_enter(&icmp->icmp_recv_lock); 5313 } 5314 icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head; 5315 5316 /* 5317 * No longer a streams less socket 5318 */ 5319 mutex_enter(&connp->conn_lock); 5320 connp->conn_flags &= ~IPCL_NONSTR; 5321 mutex_exit(&connp->conn_lock); 5322 5323 mutex_exit(&icmp->icmp_recv_lock); 5324 5325 ASSERT(icmp->icmp_fallback_queue_head == NULL && 5326 icmp->icmp_fallback_queue_tail == NULL); 5327 5328 ASSERT(connp->conn_ref >= 1); 5329 5330 return (0); 5331 } 5332 5333 /* ARGSUSED2 */ 5334 sock_lower_handle_t 5335 rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, 5336 uint_t *smodep, int *errorp, int flags, cred_t *credp) 5337 { 5338 conn_t *connp; 5339 5340 if (type != SOCK_RAW || (family != AF_INET && family != AF_INET6)) { 5341 *errorp = EPROTONOSUPPORT; 5342 return (NULL); 5343 } 5344 5345 connp = rawip_do_open(family, credp, errorp, flags); 5346 if (connp != NULL) { 5347 connp->conn_flags |= IPCL_NONSTR; 5348 5349 mutex_enter(&connp->conn_lock); 5350 connp->conn_state_flags &= ~CONN_INCIPIENT; 5351 mutex_exit(&connp->conn_lock); 5352 *sock_downcalls = &sock_rawip_downcalls; 5353 *smodep = SM_ATOMIC; 5354 } else { 5355 ASSERT(*errorp != 0); 5356 } 5357 5358 return ((sock_lower_handle_t)connp); 5359 } 5360 5361 /* ARGSUSED3 */ 5362 void 5363 rawip_activate(sock_lower_handle_t proto_handle, 5364 sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags, 5365 cred_t *cr) 5366 { 5367 conn_t *connp = (conn_t *)proto_handle; 5368 struct sock_proto_props sopp; 5369 5370 /* All 
Solaris components should pass a cred for this operation. */ 5371 ASSERT(cr != NULL); 5372 5373 connp->conn_upcalls = sock_upcalls; 5374 connp->conn_upper_handle = sock_handle; 5375 5376 sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | 5377 SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ; 5378 sopp.sopp_wroff = connp->conn_wroff; 5379 sopp.sopp_rxhiwat = connp->conn_rcvbuf; 5380 sopp.sopp_rxlowat = connp->conn_rcvlowat; 5381 sopp.sopp_maxblk = INFPSZ; 5382 sopp.sopp_maxpsz = IP_MAXPACKET; 5383 sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 0 : 5384 icmp_mod_info.mi_minpsz; 5385 5386 (*connp->conn_upcalls->su_set_proto_props) 5387 (connp->conn_upper_handle, &sopp); 5388 5389 icmp_bind_proto(connp->conn_icmp); 5390 } 5391 5392 /* ARGSUSED3 */ 5393 int 5394 rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5395 socklen_t *salenp, cred_t *cr) 5396 { 5397 conn_t *connp = (conn_t *)proto_handle; 5398 icmp_t *icmp = connp->conn_icmp; 5399 int error; 5400 5401 /* All Solaris components should pass a cred for this operation. */ 5402 ASSERT(cr != NULL); 5403 5404 mutex_enter(&connp->conn_lock); 5405 if (icmp->icmp_state != TS_DATA_XFER) 5406 error = ENOTCONN; 5407 else 5408 error = conn_getpeername(connp, sa, salenp); 5409 mutex_exit(&connp->conn_lock); 5410 return (error); 5411 } 5412 5413 /* ARGSUSED3 */ 5414 int 5415 rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5416 socklen_t *salenp, cred_t *cr) 5417 { 5418 conn_t *connp = (conn_t *)proto_handle; 5419 int error; 5420 5421 /* All Solaris components should pass a cred for this operation. 
*/ 5422 ASSERT(cr != NULL); 5423 5424 mutex_enter(&connp->conn_lock); 5425 error = conn_getsockname(connp, sa, salenp); 5426 mutex_exit(&connp->conn_lock); 5427 return (error); 5428 } 5429 5430 int 5431 rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 5432 const void *optvalp, socklen_t optlen, cred_t *cr) 5433 { 5434 conn_t *connp = (conn_t *)proto_handle; 5435 int error; 5436 5437 /* All Solaris components should pass a cred for this operation. */ 5438 ASSERT(cr != NULL); 5439 5440 error = proto_opt_check(level, option_name, optlen, NULL, 5441 icmp_opt_obj.odb_opt_des_arr, 5442 icmp_opt_obj.odb_opt_arr_cnt, 5443 B_TRUE, B_FALSE, cr); 5444 5445 if (error != 0) { 5446 /* 5447 * option not recognized 5448 */ 5449 if (error < 0) { 5450 error = proto_tlitosyserr(-error); 5451 } 5452 return (error); 5453 } 5454 5455 error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, 5456 option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen, 5457 (uchar_t *)optvalp, NULL, cr); 5458 5459 ASSERT(error >= 0); 5460 5461 return (error); 5462 } 5463 5464 int 5465 rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 5466 void *optvalp, socklen_t *optlen, cred_t *cr) 5467 { 5468 int error; 5469 conn_t *connp = (conn_t *)proto_handle; 5470 t_uscalar_t max_optbuf_len; 5471 void *optvalp_buf; 5472 int len; 5473 5474 /* All Solaris components should pass a cred for this operation. 
*/ 5475 ASSERT(cr != NULL); 5476 5477 error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, 5478 icmp_opt_obj.odb_opt_des_arr, 5479 icmp_opt_obj.odb_opt_arr_cnt, 5480 B_FALSE, B_TRUE, cr); 5481 5482 if (error != 0) { 5483 if (error < 0) { 5484 error = proto_tlitosyserr(-error); 5485 } 5486 return (error); 5487 } 5488 5489 optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); 5490 len = icmp_opt_get(connp, level, option_name, optvalp_buf); 5491 if (len == -1) { 5492 kmem_free(optvalp_buf, max_optbuf_len); 5493 return (EINVAL); 5494 } 5495 5496 /* 5497 * update optlen and copy option value 5498 */ 5499 t_uscalar_t size = MIN(len, *optlen); 5500 5501 bcopy(optvalp_buf, optvalp, size); 5502 bcopy(&size, optlen, sizeof (size)); 5503 5504 kmem_free(optvalp_buf, max_optbuf_len); 5505 return (0); 5506 } 5507 5508 /* ARGSUSED1 */ 5509 int 5510 rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) 5511 { 5512 conn_t *connp = (conn_t *)proto_handle; 5513 5514 /* All Solaris components should pass a cred for this operation. */ 5515 ASSERT(cr != NULL); 5516 5517 (void) rawip_do_close(connp); 5518 return (0); 5519 } 5520 5521 /* ARGSUSED2 */ 5522 int 5523 rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) 5524 { 5525 conn_t *connp = (conn_t *)proto_handle; 5526 5527 /* All Solaris components should pass a cred for this operation. 
*/ 5528 ASSERT(cr != NULL); 5529 5530 /* shut down the send side */ 5531 if (how != SHUT_RD) 5532 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 5533 SOCK_OPCTL_SHUT_SEND, 0); 5534 /* shut down the recv side */ 5535 if (how != SHUT_WR) 5536 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 5537 SOCK_OPCTL_SHUT_RECV, 0); 5538 return (0); 5539 } 5540 5541 void 5542 rawip_clr_flowctrl(sock_lower_handle_t proto_handle) 5543 { 5544 conn_t *connp = (conn_t *)proto_handle; 5545 icmp_t *icmp = connp->conn_icmp; 5546 5547 mutex_enter(&icmp->icmp_recv_lock); 5548 connp->conn_flow_cntrld = B_FALSE; 5549 mutex_exit(&icmp->icmp_recv_lock); 5550 } 5551 5552 int 5553 rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, 5554 int mode, int32_t *rvalp, cred_t *cr) 5555 { 5556 conn_t *connp = (conn_t *)proto_handle; 5557 int error; 5558 5559 /* All Solaris components should pass a cred for this operation. */ 5560 ASSERT(cr != NULL); 5561 5562 /* 5563 * If we don't have a helper stream then create one. 5564 * ip_create_helper_stream takes care of locking the conn_t, 5565 * so this check for NULL is just a performance optimization. 5566 */ 5567 if (connp->conn_helper_info == NULL) { 5568 icmp_stack_t *is = connp->conn_icmp->icmp_is; 5569 5570 ASSERT(is->is_ldi_ident != NULL); 5571 5572 /* 5573 * Create a helper stream for non-STREAMS socket. 
5574 */ 5575 error = ip_create_helper_stream(connp, is->is_ldi_ident); 5576 if (error != 0) { 5577 ip0dbg(("rawip_ioctl: create of IP helper stream " 5578 "failed %d\n", error)); 5579 return (error); 5580 } 5581 } 5582 5583 switch (cmd) { 5584 case _SIOCSOCKFALLBACK: 5585 case TI_GETPEERNAME: 5586 case TI_GETMYNAME: 5587 #ifdef DEBUG 5588 cmn_err(CE_CONT, "icmp_ioctl cmd 0x%x on non streams" 5589 " socket", cmd); 5590 #endif 5591 error = EINVAL; 5592 break; 5593 default: 5594 /* 5595 * Pass on to IP using helper stream 5596 */ 5597 error = ldi_ioctl(connp->conn_helper_info->iphs_handle, 5598 cmd, arg, mode, cr, rvalp); 5599 break; 5600 } 5601 return (error); 5602 } 5603 5604 int 5605 rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, 5606 cred_t *cr) 5607 { 5608 sin6_t *sin6; 5609 sin_t *sin = NULL; 5610 uint_t srcid; 5611 conn_t *connp = (conn_t *)proto_handle; 5612 icmp_t *icmp = connp->conn_icmp; 5613 int error = 0; 5614 icmp_stack_t *is = icmp->icmp_is; 5615 pid_t pid = curproc->p_pid; 5616 ip_xmit_attr_t *ixa; 5617 5618 ASSERT(DB_TYPE(mp) == M_DATA); 5619 5620 /* All Solaris components should pass a cred for this operation. */ 5621 ASSERT(cr != NULL); 5622 5623 /* do an implicit bind if necessary */ 5624 if (icmp->icmp_state == TS_UNBND) { 5625 error = rawip_implicit_bind(connp); 5626 /* 5627 * We could be racing with an actual bind, in which case 5628 * we would see EPROTO. We cross our fingers and try 5629 * to connect. 
5630 */ 5631 if (!(error == 0 || error == EPROTO)) { 5632 freemsg(mp); 5633 return (error); 5634 } 5635 } 5636 5637 /* Protocol 255 contains full IP headers */ 5638 /* Read without holding lock */ 5639 if (icmp->icmp_hdrincl) { 5640 ASSERT(connp->conn_ipversion == IPV4_VERSION); 5641 if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) { 5642 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 5643 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5644 freemsg(mp); 5645 return (EINVAL); 5646 } 5647 } 5648 error = icmp_output_hdrincl(connp, mp, cr, pid); 5649 if (is->is_sendto_ignerr) 5650 return (0); 5651 else 5652 return (error); 5653 } 5654 5655 /* Connected? */ 5656 if (msg->msg_name == NULL) { 5657 if (icmp->icmp_state != TS_DATA_XFER) { 5658 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5659 return (EDESTADDRREQ); 5660 } 5661 if (msg->msg_controllen != 0) { 5662 error = icmp_output_ancillary(connp, NULL, NULL, mp, 5663 NULL, msg, cr, pid); 5664 } else { 5665 error = icmp_output_connected(connp, mp, cr, pid); 5666 } 5667 if (is->is_sendto_ignerr) 5668 return (0); 5669 else 5670 return (error); 5671 } 5672 if (icmp->icmp_state == TS_DATA_XFER) { 5673 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5674 return (EISCONN); 5675 } 5676 error = proto_verify_ip_addr(connp->conn_family, 5677 (struct sockaddr *)msg->msg_name, msg->msg_namelen); 5678 if (error != 0) { 5679 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5680 return (error); 5681 } 5682 switch (connp->conn_family) { 5683 case AF_INET6: 5684 sin6 = (sin6_t *)msg->msg_name; 5685 5686 /* No support for mapped addresses on raw sockets */ 5687 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 5688 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5689 return (EADDRNOTAVAIL); 5690 } 5691 srcid = sin6->__sin6_src_id; 5692 5693 /* 5694 * If the local address is a mapped address return 5695 * an error. 
5696 * It would be possible to send an IPv6 packet but the 5697 * response would never make it back to the application 5698 * since it is bound to a mapped address. 5699 */ 5700 if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) { 5701 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5702 return (EADDRNOTAVAIL); 5703 } 5704 5705 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) 5706 sin6->sin6_addr = ipv6_loopback; 5707 5708 /* 5709 * We have to allocate an ip_xmit_attr_t before we grab 5710 * conn_lock and we need to hold conn_lock once we've check 5711 * conn_same_as_last_v6 to handle concurrent send* calls on a 5712 * socket. 5713 */ 5714 if (msg->msg_controllen == 0) { 5715 ixa = conn_get_ixa(connp, B_FALSE); 5716 if (ixa == NULL) { 5717 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5718 return (ENOMEM); 5719 } 5720 } else { 5721 ixa = NULL; 5722 } 5723 mutex_enter(&connp->conn_lock); 5724 if (icmp->icmp_delayed_error != 0) { 5725 sin6_t *sin2 = (sin6_t *)&icmp->icmp_delayed_addr; 5726 5727 error = icmp->icmp_delayed_error; 5728 icmp->icmp_delayed_error = 0; 5729 5730 /* Compare IP address and family */ 5731 5732 if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, 5733 &sin2->sin6_addr) && 5734 sin6->sin6_family == sin2->sin6_family) { 5735 mutex_exit(&connp->conn_lock); 5736 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5737 if (ixa != NULL) 5738 ixa_refrele(ixa); 5739 return (error); 5740 } 5741 } 5742 if (msg->msg_controllen != 0) { 5743 mutex_exit(&connp->conn_lock); 5744 ASSERT(ixa == NULL); 5745 error = icmp_output_ancillary(connp, NULL, sin6, mp, 5746 NULL, msg, cr, pid); 5747 } else if (conn_same_as_last_v6(connp, sin6) && 5748 connp->conn_lastsrcid == srcid && 5749 ipsec_outbound_policy_current(ixa)) { 5750 /* icmp_output_lastdst drops conn_lock */ 5751 error = icmp_output_lastdst(connp, mp, cr, pid, ixa); 5752 } else { 5753 /* icmp_output_newdst drops conn_lock */ 5754 error = icmp_output_newdst(connp, mp, NULL, sin6, cr, 5755 pid, ixa); 5756 } 5757 
ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 5758 if (is->is_sendto_ignerr) 5759 return (0); 5760 else 5761 return (error); 5762 case AF_INET: 5763 sin = (sin_t *)msg->msg_name; 5764 5765 if (sin->sin_addr.s_addr == INADDR_ANY) 5766 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 5767 5768 /* 5769 * We have to allocate an ip_xmit_attr_t before we grab 5770 * conn_lock and we need to hold conn_lock once we've check 5771 * conn_same_as_last_v6 to handle concurrent send* on a socket. 5772 */ 5773 if (msg->msg_controllen == 0) { 5774 ixa = conn_get_ixa(connp, B_FALSE); 5775 if (ixa == NULL) { 5776 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5777 return (ENOMEM); 5778 } 5779 } else { 5780 ixa = NULL; 5781 } 5782 mutex_enter(&connp->conn_lock); 5783 if (icmp->icmp_delayed_error != 0) { 5784 sin_t *sin2 = (sin_t *)&icmp->icmp_delayed_addr; 5785 5786 error = icmp->icmp_delayed_error; 5787 icmp->icmp_delayed_error = 0; 5788 5789 /* Compare IP address */ 5790 5791 if (sin->sin_addr.s_addr == sin2->sin_addr.s_addr) { 5792 mutex_exit(&connp->conn_lock); 5793 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5794 if (ixa != NULL) 5795 ixa_refrele(ixa); 5796 return (error); 5797 } 5798 } 5799 5800 if (msg->msg_controllen != 0) { 5801 mutex_exit(&connp->conn_lock); 5802 ASSERT(ixa == NULL); 5803 error = icmp_output_ancillary(connp, sin, NULL, mp, 5804 NULL, msg, cr, pid); 5805 } else if (conn_same_as_last_v4(connp, sin) && 5806 ipsec_outbound_policy_current(ixa)) { 5807 /* icmp_output_lastdst drops conn_lock */ 5808 error = icmp_output_lastdst(connp, mp, cr, pid, ixa); 5809 } else { 5810 /* icmp_output_newdst drops conn_lock */ 5811 error = icmp_output_newdst(connp, mp, sin, NULL, cr, 5812 pid, ixa); 5813 } 5814 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 5815 if (is->is_sendto_ignerr) 5816 return (0); 5817 else 5818 return (error); 5819 default: 5820 return (EINVAL); 5821 } 5822 } 5823 5824 sock_downcalls_t sock_rawip_downcalls = { 5825 rawip_activate, 5826 rawip_accept, 5827 
rawip_bind, 5828 rawip_listen, 5829 rawip_connect, 5830 rawip_getpeername, 5831 rawip_getsockname, 5832 rawip_getsockopt, 5833 rawip_setsockopt, 5834 rawip_send, 5835 NULL, 5836 NULL, 5837 NULL, 5838 rawip_shutdown, 5839 rawip_clr_flowctrl, 5840 rawip_ioctl, 5841 rawip_close 5842 };