1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2013 by Delphix. All rights reserved. 24 * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. 25 */ 26 /* Copyright (c) 1990 Mentat Inc. */ 27 28 #include <sys/types.h> 29 #include <sys/stream.h> 30 #include <sys/stropts.h> 31 #include <sys/strlog.h> 32 #include <sys/strsun.h> 33 #define _SUN_TPI_VERSION 2 34 #include <sys/tihdr.h> 35 #include <sys/timod.h> 36 #include <sys/ddi.h> 37 #include <sys/sunddi.h> 38 #include <sys/strsubr.h> 39 #include <sys/suntpi.h> 40 #include <sys/xti_inet.h> 41 #include <sys/cmn_err.h> 42 #include <sys/kmem.h> 43 #include <sys/cred.h> 44 #include <sys/policy.h> 45 #include <sys/priv.h> 46 #include <sys/ucred.h> 47 #include <sys/zone.h> 48 49 #include <sys/sockio.h> 50 #include <sys/socket.h> 51 #include <sys/socketvar.h> 52 #include <sys/vtrace.h> 53 #include <sys/sdt.h> 54 #include <sys/debug.h> 55 #include <sys/isa_defs.h> 56 #include <sys/random.h> 57 #include <netinet/in.h> 58 #include <netinet/ip6.h> 59 #include <netinet/icmp6.h> 60 #include <netinet/udp.h> 61 62 #include <inet/common.h> 63 #include <inet/ip.h> 64 #include <inet/ip_impl.h> 65 #include <inet/ipsec_impl.h> 66 #include <inet/ip6.h> 67 #include <inet/ip_ire.h> 68 #include <inet/ip_if.h> 69 #include <inet/ip_multi.h> 70 #include <inet/ip_ndp.h> 71 #include <inet/proto_set.h> 72 #include <inet/mib2.h> 73 #include <inet/nd.h> 74 #include <inet/optcom.h> 75 #include <inet/snmpcom.h> 76 #include <inet/kstatcom.h> 77 #include <inet/ipclassifier.h> 78 79 #include <sys/tsol/label.h> 80 #include <sys/tsol/tnet.h> 81 82 #include <inet/rawip_impl.h> 83 84 #include <sys/disp.h> 85 86 /* 87 * Synchronization notes: 88 * 89 * RAWIP is MT and uses the usual kernel synchronization primitives. We use 90 * conn_lock to protect the icmp_t. 91 * 92 * Plumbing notes: 93 * ICMP is always a device driver. For compatibility with mibopen() code 94 * it is possible to I_PUSH "icmp", but that results in pushing a passthrough 95 * dummy module. 96 */ 97 static void icmp_addr_req(queue_t *q, mblk_t *mp); 98 static void icmp_tpi_bind(queue_t *q, mblk_t *mp); 99 static void icmp_bind_proto(icmp_t *icmp); 100 static int icmp_build_hdr_template(conn_t *, const in6_addr_t *, 101 const in6_addr_t *, uint32_t); 102 static void icmp_capability_req(queue_t *q, mblk_t *mp); 103 static int icmp_close(queue_t *q, int flags); 104 static void icmp_close_free(conn_t *); 105 static void icmp_tpi_connect(queue_t *q, mblk_t *mp); 106 static void icmp_tpi_disconnect(queue_t *q, mblk_t *mp); 107 static void icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, 108 int sys_error); 109 static void icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive, 110 t_scalar_t tlierr, int sys_error); 111 static void icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, 112 ip_recv_attr_t *); 113 static void icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, 114 ip_recv_attr_t *); 115 static void icmp_info_req(queue_t *q, mblk_t *mp); 116 static void icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *); 117 static conn_t *icmp_open(int family, cred_t *credp, int *err, int flags); 118 static int icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, 119 cred_t *credp); 120 static int icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, 121 cred_t *credp); 122 static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name); 123 int icmp_opt_set(conn_t *connp, uint_t optset_context, 124 int level, int name, uint_t inlen, 125 uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 126 void *thisdg_attrs, cred_t *cr); 127 int icmp_opt_get(conn_t *connp, int level, int name, 128 uchar_t *ptr); 129 static int icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, 130 sin6_t *sin6, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa); 131 static mblk_t *icmp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *, 132 const in6_addr_t *, const in6_addr_t *, uint32_t, mblk_t *, int *); 133 static mblk_t *icmp_prepend_header_template(conn_t *, ip_xmit_attr_t *, 134 mblk_t *, const in6_addr_t *, uint32_t, int *); 135 static int icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name, 136 uchar_t *ptr, int len); 137 static void icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err); 138 static void icmp_tpi_unbind(queue_t *q, mblk_t *mp); 139 static void icmp_wput(queue_t *q, mblk_t *mp); 140 static void icmp_wput_fallback(queue_t *q, mblk_t *mp); 141 static void icmp_wput_other(queue_t *q, mblk_t *mp); 142 static void icmp_wput_iocdata(queue_t *q, mblk_t *mp); 143 static void icmp_wput_restricted(queue_t *q, mblk_t *mp); 144 static void icmp_ulp_recv(conn_t *, mblk_t *, uint_t); 145 146 static void *rawip_stack_init(netstackid_t stackid, netstack_t *ns); 147 static void rawip_stack_fini(netstackid_t stackid, void *arg); 148 149 static void *rawip_kstat_init(netstackid_t stackid); 150 static void rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp); 151 static int rawip_kstat_update(kstat_t *kp, int rw); 152 static void rawip_stack_shutdown(netstackid_t stackid, void *arg); 153 154 /* Common routines for TPI and socket module */ 155 static conn_t *rawip_do_open(int, cred_t *, int *, int); 156 static void rawip_do_close(conn_t *); 157 static int rawip_do_bind(conn_t *, struct sockaddr *, socklen_t); 158 static int rawip_do_unbind(conn_t *); 159 static int rawip_do_connect(conn_t *, const struct sockaddr *, socklen_t, 160 cred_t *, pid_t); 161 162 int rawip_getsockname(sock_lower_handle_t, struct sockaddr *, 163 socklen_t *, cred_t *); 164 int rawip_getpeername(sock_lower_handle_t, struct sockaddr *, 165 socklen_t *, cred_t *); 166 167 static struct module_info icmp_mod_info = { 168 5707, "icmp", 1, INFPSZ, 512, 128 169 }; 170 171 /* 172 * Entry points for ICMP as a device. 173 * We have separate open functions for the /dev/icmp and /dev/icmp6 devices. 174 */ 175 static struct qinit icmprinitv4 = { 176 NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info 177 }; 178 179 static struct qinit icmprinitv6 = { 180 NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info 181 }; 182 183 static struct qinit icmpwinit = { 184 (pfi_t)icmp_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, &icmp_mod_info 185 }; 186 187 /* ICMP entry point during fallback */ 188 static struct qinit icmp_fallback_sock_winit = { 189 (pfi_t)icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info 190 }; 191 192 /* For AF_INET aka /dev/icmp */ 193 struct streamtab icmpinfov4 = { 194 &icmprinitv4, &icmpwinit 195 }; 196 197 /* For AF_INET6 aka /dev/icmp6 */ 198 struct streamtab icmpinfov6 = { 199 &icmprinitv6, &icmpwinit 200 }; 201 202 /* Default structure copied into T_INFO_ACK messages */ 203 static struct T_info_ack icmp_g_t_info_ack = { 204 T_INFO_ACK, 205 IP_MAXPACKET, /* TSDU_size. icmp allows maximum size messages. */ 206 T_INVALID, /* ETSDU_size. icmp does not support expedited data. */ 207 T_INVALID, /* CDATA_size. icmp does not support connect data. */ 208 T_INVALID, /* DDATA_size. icmp does not support disconnect data. */ 209 0, /* ADDR_size - filled in later. */ 210 0, /* OPT_size - not initialized here */ 211 IP_MAXPACKET, /* TIDU_size. icmp allows maximum size messages. */ 212 T_CLTS, /* SERV_type. icmp supports connection-less. */ 213 TS_UNBND, /* CURRENT_state. This is set from icmp_state. */ 214 (XPG4_1|SENDZERO) /* PROVIDER_flag */ 215 }; 216 217 static int 218 icmp_set_buf_prop(netstack_t *stack, cred_t *cr, mod_prop_info_t *pinfo, 219 const char *ifname, const void *pval, uint_t flags) 220 { 221 return (mod_set_buf_prop(stack->netstack_icmp->is_propinfo_tbl, 222 stack, cr, pinfo, ifname, pval, flags)); 223 } 224 225 static int 226 icmp_get_buf_prop(netstack_t *stack, mod_prop_info_t *pinfo, const char *ifname, 227 void *val, uint_t psize, uint_t flags) 228 { 229 return (mod_get_buf_prop(stack->netstack_icmp->is_propinfo_tbl, stack, 230 pinfo, ifname, val, psize, flags)); 231 } 232 233 /* 234 * All of these are alterable, within the min/max values given, at run time. 235 * 236 * Note: All those tunables which do not start with "icmp_" are Committed and 237 * therefore are public. See PSARC 2010/080. 238 */ 239 static mod_prop_info_t icmp_propinfo_tbl[] = { 240 /* tunable - 0 */ 241 { "_wroff_extra", MOD_PROTO_RAWIP, 242 mod_set_uint32, mod_get_uint32, 243 {0, 128, 32}, {32} }, 244 245 { "_ipv4_ttl", MOD_PROTO_RAWIP, 246 mod_set_uint32, mod_get_uint32, 247 {1, 255, 255}, {255} }, 248 249 { "_ipv6_hoplimit", MOD_PROTO_RAWIP, 250 mod_set_uint32, mod_get_uint32, 251 {0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS}, 252 {IPV6_DEFAULT_HOPS} }, 253 254 { "_bsd_compat", MOD_PROTO_RAWIP, 255 mod_set_boolean, mod_get_boolean, 256 {B_TRUE}, {B_TRUE} }, 257 258 { "send_buf", MOD_PROTO_RAWIP, 259 icmp_set_buf_prop, icmp_get_buf_prop, 260 {4096, 65536, 8192}, {8192} }, 261 262 { "_xmit_lowat", MOD_PROTO_RAWIP, 263 mod_set_uint32, mod_get_uint32, 264 {0, 65536, 1024}, {1024} }, 265 266 { "recv_buf", MOD_PROTO_RAWIP, 267 icmp_set_buf_prop, icmp_get_buf_prop, 268 {4096, 65536, 8192}, {8192} }, 269 270 { "max_buf", MOD_PROTO_RAWIP, 271 mod_set_uint32, mod_get_uint32, 272 {65536, ULP_MAX_BUF, 256*1024}, {256*1024} }, 273 274 { "_pmtu_discovery", MOD_PROTO_RAWIP, 275 mod_set_boolean, mod_get_boolean, 276 {B_FALSE}, {B_FALSE} }, 277 278 { "_sendto_ignerr", MOD_PROTO_RAWIP, 279 mod_set_boolean, mod_get_boolean, 280 {B_FALSE}, {B_FALSE} }, 281 282 { "?", MOD_PROTO_RAWIP, NULL, mod_get_allprop, {0}, {0} }, 283 284 { NULL, 0, NULL, NULL, {0}, {0} } 285 }; 286 287 #define is_wroff_extra is_propinfo_tbl[0].prop_cur_uval 288 #define is_ipv4_ttl is_propinfo_tbl[1].prop_cur_uval 289 #define is_ipv6_hoplimit is_propinfo_tbl[2].prop_cur_uval 290 #define is_bsd_compat is_propinfo_tbl[3].prop_cur_bval 291 #define is_xmit_hiwat is_propinfo_tbl[4].prop_cur_uval 292 #define is_xmit_lowat is_propinfo_tbl[5].prop_cur_uval 293 #define is_recv_hiwat is_propinfo_tbl[6].prop_cur_uval 294 #define is_max_buf is_propinfo_tbl[7].prop_cur_uval 295 #define is_pmtu_discovery is_propinfo_tbl[8].prop_cur_bval 296 #define is_sendto_ignerr is_propinfo_tbl[9].prop_cur_bval 297 298 typedef union T_primitives *t_primp_t; 299 300 /* 301 * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message 302 * passed to icmp_wput. 303 * It calls IP to verify the local IP address, and calls IP to insert 304 * the conn_t in the fanout table. 305 * If everything is ok it then sends the T_BIND_ACK back up. 306 */ 307 static void 308 icmp_tpi_bind(queue_t *q, mblk_t *mp) 309 { 310 int error; 311 struct sockaddr *sa; 312 struct T_bind_req *tbr; 313 socklen_t len; 314 sin_t *sin; 315 sin6_t *sin6; 316 icmp_t *icmp; 317 conn_t *connp = Q_TO_CONN(q); 318 mblk_t *mp1; 319 cred_t *cr; 320 321 /* 322 * All Solaris components should pass a db_credp 323 * for this TPI message, hence we ASSERT. 324 * But in case there is some other M_PROTO that looks 325 * like a TPI message sent by some other kernel 326 * component, we check and return an error. 327 */ 328 cr = msg_getcred(mp, NULL); 329 ASSERT(cr != NULL); 330 if (cr == NULL) { 331 icmp_err_ack(q, mp, TSYSERR, EINVAL); 332 return; 333 } 334 335 icmp = connp->conn_icmp; 336 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { 337 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 338 "icmp_bind: bad req, len %u", 339 (uint_t)(mp->b_wptr - mp->b_rptr)); 340 icmp_err_ack(q, mp, TPROTO, 0); 341 return; 342 } 343 344 if (icmp->icmp_state != TS_UNBND) { 345 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 346 "icmp_bind: bad state, %u", icmp->icmp_state); 347 icmp_err_ack(q, mp, TOUTSTATE, 0); 348 return; 349 } 350 351 /* 352 * Reallocate the message to make sure we have enough room for an 353 * address. 354 */ 355 mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1); 356 if (mp1 == NULL) { 357 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 358 return; 359 } 360 mp = mp1; 361 362 /* Reset the message type in preparation for shipping it back. */ 363 DB_TYPE(mp) = M_PCPROTO; 364 tbr = (struct T_bind_req *)mp->b_rptr; 365 len = tbr->ADDR_length; 366 switch (len) { 367 case 0: /* request for a generic port */ 368 tbr->ADDR_offset = sizeof (struct T_bind_req); 369 if (connp->conn_family == AF_INET) { 370 tbr->ADDR_length = sizeof (sin_t); 371 sin = (sin_t *)&tbr[1]; 372 *sin = sin_null; 373 sin->sin_family = AF_INET; 374 mp->b_wptr = (uchar_t *)&sin[1]; 375 sa = (struct sockaddr *)sin; 376 len = sizeof (sin_t); 377 } else { 378 ASSERT(connp->conn_family == AF_INET6); 379 tbr->ADDR_length = sizeof (sin6_t); 380 sin6 = (sin6_t *)&tbr[1]; 381 *sin6 = sin6_null; 382 sin6->sin6_family = AF_INET6; 383 mp->b_wptr = (uchar_t *)&sin6[1]; 384 sa = (struct sockaddr *)sin6; 385 len = sizeof (sin6_t); 386 } 387 break; 388 389 case sizeof (sin_t): /* Complete IPv4 address */ 390 sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset, 391 sizeof (sin_t)); 392 break; 393 394 case sizeof (sin6_t): /* Complete IPv6 address */ 395 sa = (struct sockaddr *)mi_offset_param(mp, 396 tbr->ADDR_offset, sizeof (sin6_t)); 397 break; 398 399 default: 400 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 401 "icmp_bind: bad ADDR_length %u", tbr->ADDR_length); 402 icmp_err_ack(q, mp, TBADADDR, 0); 403 return; 404 } 405 406 error = rawip_do_bind(connp, sa, len); 407 if (error != 0) { 408 if (error > 0) { 409 icmp_err_ack(q, mp, TSYSERR, error); 410 } else { 411 icmp_err_ack(q, mp, -error, 0); 412 } 413 } else { 414 tbr->PRIM_type = T_BIND_ACK; 415 qreply(q, mp); 416 } 417 } 418 419 static int 420 rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len) 421 { 422 sin_t *sin; 423 sin6_t *sin6; 424 icmp_t *icmp = connp->conn_icmp; 425 int error = 0; 426 ip_laddr_t laddr_type = IPVL_UNICAST_UP; /* INADDR_ANY */ 427 in_port_t lport; /* Network byte order */ 428 ipaddr_t v4src; /* Set if AF_INET */ 429 in6_addr_t v6src; 430 uint_t scopeid = 0; 431 zoneid_t zoneid = IPCL_ZONEID(connp); 432 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 433 434 if (sa == NULL || !OK_32PTR((char *)sa)) { 435 return (EINVAL); 436 } 437 438 switch (len) { 439 case sizeof (sin_t): /* Complete IPv4 address */ 440 sin = (sin_t *)sa; 441 if (sin->sin_family != AF_INET || 442 connp->conn_family != AF_INET) { 443 /* TSYSERR, EAFNOSUPPORT */ 444 return (EAFNOSUPPORT); 445 } 446 v4src = sin->sin_addr.s_addr; 447 IN6_IPADDR_TO_V4MAPPED(v4src, &v6src); 448 if (v4src != INADDR_ANY) { 449 laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst, 450 B_TRUE); 451 } 452 lport = sin->sin_port; 453 break; 454 case sizeof (sin6_t): /* Complete IPv6 address */ 455 sin6 = (sin6_t *)sa; 456 if (sin6->sin6_family != AF_INET6 || 457 connp->conn_family != AF_INET6) { 458 /* TSYSERR, EAFNOSUPPORT */ 459 return (EAFNOSUPPORT); 460 } 461 /* No support for mapped addresses on raw sockets */ 462 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 463 /* TSYSERR, EADDRNOTAVAIL */ 464 return (EADDRNOTAVAIL); 465 } 466 v6src = sin6->sin6_addr; 467 if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 468 if (IN6_IS_ADDR_LINKSCOPE(&v6src)) 469 scopeid = sin6->sin6_scope_id; 470 laddr_type = ip_laddr_verify_v6(&v6src, zoneid, ipst, 471 B_TRUE, scopeid); 472 } 473 lport = sin6->sin6_port; 474 break; 475 476 default: 477 /* TBADADDR */ 478 return (EADDRNOTAVAIL); 479 } 480 481 /* Is the local address a valid unicast, multicast, or broadcast? */ 482 if (laddr_type == IPVL_BAD) 483 return (EADDRNOTAVAIL); 484 485 /* 486 * The state must be TS_UNBND. 487 */ 488 mutex_enter(&connp->conn_lock); 489 if (icmp->icmp_state != TS_UNBND) { 490 mutex_exit(&connp->conn_lock); 491 return (-TOUTSTATE); 492 } 493 494 /* 495 * Copy the source address into our icmp structure. This address 496 * may still be zero; if so, ip will fill in the correct address 497 * each time an outbound packet is passed to it. 498 * If we are binding to a broadcast or multicast address then 499 * we just set the conn_bound_addr since we don't want to use 500 * that as the source address when sending. 501 */ 502 connp->conn_bound_addr_v6 = v6src; 503 connp->conn_laddr_v6 = v6src; 504 if (scopeid != 0) { 505 connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; 506 connp->conn_ixa->ixa_scopeid = scopeid; 507 connp->conn_incoming_ifindex = scopeid; 508 } else { 509 connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 510 connp->conn_incoming_ifindex = connp->conn_bound_if; 511 } 512 513 switch (laddr_type) { 514 case IPVL_UNICAST_UP: 515 case IPVL_UNICAST_DOWN: 516 connp->conn_saddr_v6 = v6src; 517 connp->conn_mcbc_bind = B_FALSE; 518 break; 519 case IPVL_MCAST: 520 case IPVL_BCAST: 521 /* ip_set_destination will pick a source address later */ 522 connp->conn_saddr_v6 = ipv6_all_zeros; 523 connp->conn_mcbc_bind = B_TRUE; 524 break; 525 } 526 527 /* Any errors after this point should use late_error */ 528 529 /* 530 * Use sin_port/sin6_port since applications like psh use SOCK_RAW 531 * with IPPROTO_TCP. 532 */ 533 connp->conn_lport = lport; 534 connp->conn_fport = 0; 535 536 if (connp->conn_family == AF_INET) { 537 ASSERT(connp->conn_ipversion == IPV4_VERSION); 538 } else { 539 ASSERT(connp->conn_ipversion == IPV6_VERSION); 540 } 541 542 icmp->icmp_state = TS_IDLE; 543 544 /* 545 * We create an initial header template here to make a subsequent 546 * sendto have a starting point. Since conn_last_dst is zero the 547 * first sendto will always follow the 'dst changed' code path. 548 * Note that we defer massaging options and the related checksum 549 * adjustment until we have a destination address. 550 */ 551 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 552 &connp->conn_faddr_v6, connp->conn_flowinfo); 553 if (error != 0) { 554 mutex_exit(&connp->conn_lock); 555 goto late_error; 556 } 557 /* Just in case */ 558 connp->conn_faddr_v6 = ipv6_all_zeros; 559 connp->conn_v6lastdst = ipv6_all_zeros; 560 mutex_exit(&connp->conn_lock); 561 562 error = ip_laddr_fanout_insert(connp); 563 if (error != 0) 564 goto late_error; 565 566 /* Bind succeeded */ 567 return (0); 568 569 late_error: 570 mutex_enter(&connp->conn_lock); 571 connp->conn_saddr_v6 = ipv6_all_zeros; 572 connp->conn_bound_addr_v6 = ipv6_all_zeros; 573 connp->conn_laddr_v6 = ipv6_all_zeros; 574 if (scopeid != 0) { 575 connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 576 connp->conn_incoming_ifindex = connp->conn_bound_if; 577 } 578 icmp->icmp_state = TS_UNBND; 579 connp->conn_v6lastdst = ipv6_all_zeros; 580 connp->conn_lport = 0; 581 582 /* Restore the header that was built above - different source address */ 583 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 584 &connp->conn_faddr_v6, connp->conn_flowinfo); 585 mutex_exit(&connp->conn_lock); 586 return (error); 587 } 588 589 /* 590 * Tell IP to just bind to the protocol. 591 */ 592 static void 593 icmp_bind_proto(icmp_t *icmp) 594 { 595 conn_t *connp = icmp->icmp_connp; 596 597 mutex_enter(&connp->conn_lock); 598 connp->conn_saddr_v6 = ipv6_all_zeros; 599 connp->conn_laddr_v6 = ipv6_all_zeros; 600 connp->conn_faddr_v6 = ipv6_all_zeros; 601 connp->conn_v6lastdst = ipv6_all_zeros; 602 mutex_exit(&connp->conn_lock); 603 604 (void) ip_laddr_fanout_insert(connp); 605 } 606 607 /* 608 * This routine handles each T_CONN_REQ message passed to icmp. It 609 * associates a default destination address with the stream. 610 * 611 * After various error checks are completed, icmp_connect() lays 612 * the target address and port into the composite header template. 613 * Then we ask IP for information, including a source address if we didn't 614 * already have one. Finally we send up the T_OK_ACK reply message. 615 */ 616 static void 617 icmp_tpi_connect(queue_t *q, mblk_t *mp) 618 { 619 conn_t *connp = Q_TO_CONN(q); 620 struct T_conn_req *tcr; 621 struct sockaddr *sa; 622 socklen_t len; 623 int error; 624 cred_t *cr; 625 pid_t pid; 626 /* 627 * All Solaris components should pass a db_credp 628 * for this TPI message, hence we ASSERT. 629 * But in case there is some other M_PROTO that looks 630 * like a TPI message sent by some other kernel 631 * component, we check and return an error. 632 */ 633 cr = msg_getcred(mp, &pid); 634 ASSERT(cr != NULL); 635 if (cr == NULL) { 636 icmp_err_ack(q, mp, TSYSERR, EINVAL); 637 return; 638 } 639 640 tcr = (struct T_conn_req *)mp->b_rptr; 641 /* Sanity checks */ 642 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) { 643 icmp_err_ack(q, mp, TPROTO, 0); 644 return; 645 } 646 647 if (tcr->OPT_length != 0) { 648 icmp_err_ack(q, mp, TBADOPT, 0); 649 return; 650 } 651 652 len = tcr->DEST_length; 653 654 switch (len) { 655 default: 656 icmp_err_ack(q, mp, TBADADDR, 0); 657 return; 658 case sizeof (sin_t): 659 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, 660 sizeof (sin_t)); 661 break; 662 case sizeof (sin6_t): 663 sa = (struct sockaddr *)mi_offset_param(mp, 664 tcr->DEST_offset, sizeof (sin6_t)); 665 break; 666 } 667 668 error = proto_verify_ip_addr(connp->conn_family, sa, len); 669 if (error != 0) { 670 icmp_err_ack(q, mp, TSYSERR, error); 671 return; 672 } 673 674 error = rawip_do_connect(connp, sa, len, cr, pid); 675 if (error != 0) { 676 if (error < 0) { 677 icmp_err_ack(q, mp, -error, 0); 678 } else { 679 icmp_err_ack(q, mp, 0, error); 680 } 681 } else { 682 mblk_t *mp1; 683 684 /* 685 * We have to send a connection confirmation to 686 * keep TLI happy. 687 */ 688 if (connp->conn_family == AF_INET) { 689 mp1 = mi_tpi_conn_con(NULL, (char *)sa, 690 sizeof (sin_t), NULL, 0); 691 } else { 692 ASSERT(connp->conn_family == AF_INET6); 693 mp1 = mi_tpi_conn_con(NULL, (char *)sa, 694 sizeof (sin6_t), NULL, 0); 695 } 696 if (mp1 == NULL) { 697 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 698 return; 699 } 700 701 /* 702 * Send ok_ack for T_CONN_REQ 703 */ 704 mp = mi_tpi_ok_ack_alloc(mp); 705 if (mp == NULL) { 706 /* Unable to reuse the T_CONN_REQ for the ack. */ 707 icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM); 708 return; 709 } 710 putnext(connp->conn_rq, mp); 711 putnext(connp->conn_rq, mp1); 712 } 713 } 714 715 static int 716 rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, 717 cred_t *cr, pid_t pid) 718 { 719 icmp_t *icmp; 720 sin_t *sin; 721 sin6_t *sin6; 722 int error; 723 uint16_t dstport; 724 ipaddr_t v4dst; 725 in6_addr_t v6dst; 726 uint32_t flowinfo; 727 ip_xmit_attr_t *ixa; 728 ip_xmit_attr_t *oldixa; 729 uint_t scopeid = 0; 730 uint_t srcid = 0; 731 in6_addr_t v6src = connp->conn_saddr_v6; 732 733 icmp = connp->conn_icmp; 734 735 if (sa == NULL || !OK_32PTR((char *)sa)) { 736 return (EINVAL); 737 } 738 739 ASSERT(sa != NULL && len != 0); 740 741 /* 742 * Determine packet type based on type of address passed in 743 * the request should contain an IPv4 or IPv6 address. 744 * Make sure that address family matches the type of 745 * family of the address passed down. 746 */ 747 switch (len) { 748 case sizeof (sin_t): 749 sin = (sin_t *)sa; 750 751 v4dst = sin->sin_addr.s_addr; 752 dstport = sin->sin_port; 753 IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); 754 ASSERT(connp->conn_ipversion == IPV4_VERSION); 755 break; 756 757 case sizeof (sin6_t): 758 sin6 = (sin6_t *)sa; 759 760 /* No support for mapped addresses on raw sockets */ 761 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 762 return (EADDRNOTAVAIL); 763 } 764 v6dst = sin6->sin6_addr; 765 dstport = sin6->sin6_port; 766 ASSERT(connp->conn_ipversion == IPV6_VERSION); 767 flowinfo = sin6->sin6_flowinfo; 768 if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) 769 scopeid = sin6->sin6_scope_id; 770 srcid = sin6->__sin6_src_id; 771 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 772 /* Due to check above, we know sin6_addr is v6-only. */ 773 if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 774 B_FALSE, connp->conn_netstack)) { 775 /* Mismatch - v6src would be v4mapped. */ 776 return (EADDRNOTAVAIL); 777 } 778 } 779 break; 780 } 781 782 /* 783 * If there is a different thread using conn_ixa then we get a new 784 * copy and cut the old one loose from conn_ixa. Otherwise we use 785 * conn_ixa and prevent any other thread from using/changing it. 786 * Once connect() is done other threads can use conn_ixa since the 787 * refcnt will be back at one. 788 * We defer updating conn_ixa until later to handle any concurrent 789 * conn_ixa_cleanup thread. 790 */ 791 ixa = conn_get_ixa(connp, B_FALSE); 792 if (ixa == NULL) 793 return (ENOMEM); 794 795 mutex_enter(&connp->conn_lock); 796 /* 797 * This icmp_t must have bound already before doing a connect. 798 * Reject if a connect is in progress (we drop conn_lock during 799 * rawip_do_connect). 800 */ 801 if (icmp->icmp_state == TS_UNBND || icmp->icmp_state == TS_WCON_CREQ) { 802 mutex_exit(&connp->conn_lock); 803 ixa_refrele(ixa); 804 return (-TOUTSTATE); 805 } 806 807 if (icmp->icmp_state == TS_DATA_XFER) { 808 /* Already connected - clear out state */ 809 if (connp->conn_mcbc_bind) 810 connp->conn_saddr_v6 = ipv6_all_zeros; 811 else 812 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 813 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 814 connp->conn_faddr_v6 = ipv6_all_zeros; 815 icmp->icmp_state = TS_IDLE; 816 } 817 818 /* 819 * Use sin_port/sin6_port since applications like psh use SOCK_RAW 820 * with IPPROTO_TCP. 821 */ 822 connp->conn_fport = dstport; 823 if (connp->conn_ipversion == IPV4_VERSION) { 824 /* 825 * Interpret a zero destination to mean loopback. 826 * Update the T_CONN_REQ (sin/sin6) since it is used to 827 * generate the T_CONN_CON. 828 */ 829 if (v4dst == INADDR_ANY) { 830 v4dst = htonl(INADDR_LOOPBACK); 831 IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); 832 ASSERT(connp->conn_family == AF_INET); 833 sin->sin_addr.s_addr = v4dst; 834 } 835 connp->conn_faddr_v6 = v6dst; 836 connp->conn_flowinfo = 0; 837 } else { 838 ASSERT(connp->conn_ipversion == IPV6_VERSION); 839 /* 840 * Interpret a zero destination to mean loopback. 841 * Update the T_CONN_REQ (sin/sin6) since it is used to 842 * generate the T_CONN_CON. 843 */ 844 if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) { 845 v6dst = ipv6_loopback; 846 sin6->sin6_addr = v6dst; 847 } 848 connp->conn_faddr_v6 = v6dst; 849 connp->conn_flowinfo = flowinfo; 850 } 851 852 /* 853 * We update our cred/cpid based on the caller of connect 854 */ 855 if (connp->conn_cred != cr) { 856 crhold(cr); 857 crfree(connp->conn_cred); 858 connp->conn_cred = cr; 859 } 860 connp->conn_cpid = pid; 861 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 862 ixa->ixa_cred = cr; 863 ixa->ixa_cpid = pid; 864 if (is_system_labeled()) { 865 /* We need to restart with a label based on the cred */ 866 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 867 } 868 869 if (scopeid != 0) { 870 ixa->ixa_flags |= IXAF_SCOPEID_SET; 871 ixa->ixa_scopeid = scopeid; 872 connp->conn_incoming_ifindex = scopeid; 873 } else { 874 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 875 connp->conn_incoming_ifindex = connp->conn_bound_if; 876 } 877 878 /* 879 * conn_connect will drop conn_lock and reacquire it. 880 * To prevent a send* from messing with this icmp_t while the lock 881 * is dropped we set icmp_state and clear conn_v6lastdst. 882 * That will make all send* fail with EISCONN. 883 */ 884 connp->conn_v6lastdst = ipv6_all_zeros; 885 icmp->icmp_state = TS_WCON_CREQ; 886 887 error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC); 888 mutex_exit(&connp->conn_lock); 889 if (error != 0) 890 goto connect_failed; 891 892 /* 893 * The addresses have been verified. Time to insert in 894 * the correct fanout list. 895 */ 896 error = ipcl_conn_insert(connp); 897 if (error != 0) 898 goto connect_failed; 899 900 mutex_enter(&connp->conn_lock); 901 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 902 &connp->conn_faddr_v6, connp->conn_flowinfo); 903 if (error != 0) { 904 mutex_exit(&connp->conn_lock); 905 goto connect_failed; 906 } 907 908 icmp->icmp_state = TS_DATA_XFER; 909 /* Record this as the "last" send even though we haven't sent any */ 910 connp->conn_v6lastdst = connp->conn_faddr_v6; 911 connp->conn_lastipversion = connp->conn_ipversion; 912 connp->conn_lastdstport = connp->conn_fport; 913 connp->conn_lastflowinfo = connp->conn_flowinfo; 914 connp->conn_lastscopeid = scopeid; 915 connp->conn_lastsrcid = srcid; 916 /* Also remember a source to use together with lastdst */ 917 connp->conn_v6lastsrc = v6src; 918 919 oldixa = conn_replace_ixa(connp, ixa); 920 mutex_exit(&connp->conn_lock); 921 ixa_refrele(oldixa); 922 923 ixa_refrele(ixa); 924 return (0); 925 926 connect_failed: 927 if (ixa != NULL) 928 ixa_refrele(ixa); 929 mutex_enter(&connp->conn_lock); 930 icmp->icmp_state = TS_IDLE; 931 /* In case the source address was set above */ 932 if (connp->conn_mcbc_bind) 933 connp->conn_saddr_v6 = ipv6_all_zeros; 934 else 935 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 936 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 937 connp->conn_faddr_v6 = ipv6_all_zeros; 938 connp->conn_v6lastdst = ipv6_all_zeros; 939 connp->conn_flowinfo = 0; 940 941 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 942 &connp->conn_faddr_v6, connp->conn_flowinfo); 943 mutex_exit(&connp->conn_lock); 944 return (error); 945 } 946 947 static void 948 rawip_do_close(conn_t *connp) 949 { 950 ASSERT(connp != NULL && IPCL_IS_RAWIP(connp)); 951 952 ip_quiesce_conn(connp); 953 954 if (!IPCL_IS_NONSTR(connp)) { 955 qprocsoff(connp->conn_rq); 956 } 957 958 icmp_close_free(connp); 959 960 /* 961 * Now we are truly single threaded on this stream, and can 962 * delete the things hanging off the connp, and finally the connp. 963 * We removed this connp from the fanout list, it cannot be 964 * accessed thru the fanouts, and we already waited for the 965 * conn_ref to drop to 0. We are already in close, so 966 * there cannot be any other thread from the top. qprocsoff 967 * has completed, and service has completed or won't run in 968 * future. 969 */ 970 ASSERT(connp->conn_ref == 1); 971 972 if (!IPCL_IS_NONSTR(connp)) { 973 inet_minor_free(connp->conn_minor_arena, connp->conn_dev); 974 } else { 975 ip_free_helper_stream(connp); 976 } 977 978 connp->conn_ref--; 979 ipcl_conn_destroy(connp); 980 } 981 982 static int 983 icmp_close(queue_t *q, int flags) 984 { 985 conn_t *connp; 986 987 if (flags & SO_FALLBACK) { 988 /* 989 * stream is being closed while in fallback 990 * simply free the resources that were allocated 991 */ 992 inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr)); 993 qprocsoff(q); 994 goto done; 995 } 996 997 connp = Q_TO_CONN(q); 998 (void) rawip_do_close(connp); 999 done: 1000 q->q_ptr = WR(q)->q_ptr = NULL; 1001 return (0); 1002 } 1003 1004 static void 1005 icmp_close_free(conn_t *connp) 1006 { 1007 icmp_t *icmp = connp->conn_icmp; 1008 1009 if (icmp->icmp_filter != NULL) { 1010 kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t)); 1011 icmp->icmp_filter = NULL; 1012 } 1013 1014 /* 1015 * Clear any fields which the kmem_cache constructor clears. 1016 * Only icmp_connp needs to be preserved. 1017 * TBD: We should make this more efficient to avoid clearing 1018 * everything. 1019 */ 1020 ASSERT(icmp->icmp_connp == connp); 1021 bzero(icmp, sizeof (icmp_t)); 1022 icmp->icmp_connp = connp; 1023 } 1024 1025 /* 1026 * This routine handles each T_DISCON_REQ message passed to icmp 1027 * as an indicating that ICMP is no longer connected. This results 1028 * in telling IP to restore the binding to just the local address. 1029 */ 1030 static int 1031 icmp_do_disconnect(conn_t *connp) 1032 { 1033 icmp_t *icmp = connp->conn_icmp; 1034 int error; 1035 1036 mutex_enter(&connp->conn_lock); 1037 if (icmp->icmp_state != TS_DATA_XFER) { 1038 mutex_exit(&connp->conn_lock); 1039 return (-TOUTSTATE); 1040 } 1041 if (connp->conn_mcbc_bind) 1042 connp->conn_saddr_v6 = ipv6_all_zeros; 1043 else 1044 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 1045 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 1046 connp->conn_faddr_v6 = ipv6_all_zeros; 1047 icmp->icmp_state = TS_IDLE; 1048 1049 connp->conn_v6lastdst = ipv6_all_zeros; 1050 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 1051 &connp->conn_faddr_v6, connp->conn_flowinfo); 1052 mutex_exit(&connp->conn_lock); 1053 if (error != 0) 1054 return (error); 1055 1056 /* 1057 * Tell IP to remove the full binding and revert 1058 * to the local address binding. 1059 */ 1060 return (ip_laddr_fanout_insert(connp)); 1061 } 1062 1063 static void 1064 icmp_tpi_disconnect(queue_t *q, mblk_t *mp) 1065 { 1066 conn_t *connp = Q_TO_CONN(q); 1067 int error; 1068 1069 /* 1070 * Allocate the largest primitive we need to send back 1071 * T_error_ack is > than T_ok_ack 1072 */ 1073 mp = reallocb(mp, sizeof (struct T_error_ack), 1); 1074 if (mp == NULL) { 1075 /* Unable to reuse the T_DISCON_REQ for the ack. */ 1076 icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM); 1077 return; 1078 } 1079 1080 error = icmp_do_disconnect(connp); 1081 1082 if (error != 0) { 1083 if (error > 0) { 1084 icmp_err_ack(q, mp, 0, error); 1085 } else { 1086 icmp_err_ack(q, mp, -error, 0); 1087 } 1088 } else { 1089 mp = mi_tpi_ok_ack_alloc(mp); 1090 ASSERT(mp != NULL); 1091 qreply(q, mp); 1092 } 1093 } 1094 1095 static int 1096 icmp_disconnect(conn_t *connp) 1097 { 1098 int error; 1099 1100 connp->conn_dgram_errind = B_FALSE; 1101 1102 error = icmp_do_disconnect(connp); 1103 1104 if (error < 0) 1105 error = proto_tlitosyserr(-error); 1106 return (error); 1107 } 1108 1109 /* This routine creates a T_ERROR_ACK message and passes it upstream. */ 1110 static void 1111 icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error) 1112 { 1113 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) 1114 qreply(q, mp); 1115 } 1116 1117 /* Shorthand to generate and send TPI error acks to our client */ 1118 static void 1119 icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive, 1120 t_scalar_t t_error, int sys_error) 1121 { 1122 struct T_error_ack *teackp; 1123 1124 if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack), 1125 M_PCPROTO, T_ERROR_ACK)) != NULL) { 1126 teackp = (struct T_error_ack *)mp->b_rptr; 1127 teackp->ERROR_prim = primitive; 1128 teackp->TLI_error = t_error; 1129 teackp->UNIX_error = sys_error; 1130 qreply(q, mp); 1131 } 1132 } 1133 1134 /* 1135 * icmp_icmp_input is called as conn_recvicmp to process ICMP messages. 1136 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors. 1137 * Assumes that IP has pulled up everything up to and including the ICMP header. 1138 */ 1139 /* ARGSUSED2 */ 1140 static void 1141 icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) 1142 { 1143 conn_t *connp = (conn_t *)arg1; 1144 icmp_t *icmp = connp->conn_icmp; 1145 icmph_t *icmph; 1146 ipha_t *ipha; 1147 int iph_hdr_length; 1148 sin_t sin; 1149 mblk_t *mp1; 1150 int error = 0; 1151 1152 ipha = (ipha_t *)mp->b_rptr; 1153 1154 ASSERT(OK_32PTR(mp->b_rptr)); 1155 1156 if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) { 1157 ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION); 1158 icmp_icmp_error_ipv6(connp, mp, ira); 1159 return; 1160 } 1161 ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); 1162 1163 /* Skip past the outer IP and ICMP headers */ 1164 ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length); 1165 iph_hdr_length = ira->ira_ip_hdr_length; 1166 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1167 ipha = (ipha_t *)&icmph[1]; /* Inner IP header */ 1168 1169 iph_hdr_length = IPH_HDR_LENGTH(ipha); 1170 1171 switch (icmph->icmph_type) { 1172 case ICMP_DEST_UNREACHABLE: 1173 switch (icmph->icmph_code) { 1174 case ICMP_FRAGMENTATION_NEEDED: { 1175 ipha_t *ipha; 1176 ip_xmit_attr_t *ixa; 1177 /* 1178 * IP has already adjusted the path MTU. 1179 * But we need to adjust DF for IPv4. 1180 */ 1181 if (connp->conn_ipversion != IPV4_VERSION) 1182 break; 1183 1184 ixa = conn_get_ixa(connp, B_FALSE); 1185 if (ixa == NULL || ixa->ixa_ire == NULL) { 1186 /* 1187 * Some other thread holds conn_ixa. We will 1188 * redo this on the next ICMP too big. 1189 */ 1190 if (ixa != NULL) 1191 ixa_refrele(ixa); 1192 break; 1193 } 1194 (void) ip_get_pmtu(ixa); 1195 1196 mutex_enter(&connp->conn_lock); 1197 ipha = (ipha_t *)connp->conn_ht_iphc; 1198 if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) { 1199 ipha->ipha_fragment_offset_and_flags |= 1200 IPH_DF_HTONS; 1201 } else { 1202 ipha->ipha_fragment_offset_and_flags &= 1203 ~IPH_DF_HTONS; 1204 } 1205 mutex_exit(&connp->conn_lock); 1206 ixa_refrele(ixa); 1207 break; 1208 } 1209 case ICMP_PORT_UNREACHABLE: 1210 case ICMP_PROTOCOL_UNREACHABLE: 1211 error = ECONNREFUSED; 1212 break; 1213 default: 1214 /* Transient errors */ 1215 break; 1216 } 1217 break; 1218 default: 1219 /* Transient errors */ 1220 break; 1221 } 1222 if (error == 0) { 1223 freemsg(mp); 1224 return; 1225 } 1226 1227 /* 1228 * Deliver T_UDERROR_IND when the application has asked for it. 1229 * The socket layer enables this automatically when connected. 1230 */ 1231 if (!connp->conn_dgram_errind) { 1232 freemsg(mp); 1233 return; 1234 } 1235 1236 sin = sin_null; 1237 sin.sin_family = AF_INET; 1238 sin.sin_addr.s_addr = ipha->ipha_dst; 1239 1240 if (IPCL_IS_NONSTR(connp)) { 1241 mutex_enter(&connp->conn_lock); 1242 if (icmp->icmp_state == TS_DATA_XFER) { 1243 if (sin.sin_addr.s_addr == connp->conn_faddr_v4) { 1244 mutex_exit(&connp->conn_lock); 1245 (*connp->conn_upcalls->su_set_error) 1246 (connp->conn_upper_handle, error); 1247 goto done; 1248 } 1249 } else { 1250 icmp->icmp_delayed_error = error; 1251 *((sin_t *)&icmp->icmp_delayed_addr) = sin; 1252 } 1253 mutex_exit(&connp->conn_lock); 1254 } else { 1255 mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0, 1256 error); 1257 if (mp1 != NULL) 1258 putnext(connp->conn_rq, mp1); 1259 } 1260 done: 1261 freemsg(mp); 1262 } 1263 1264 /* 1265 * icmp_icmp_error_ipv6 is called by icmp_icmp_error to process ICMP for IPv6. 1266 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors. 1267 * Assumes that IP has pulled up all the extension headers as well as the 1268 * ICMPv6 header. 1269 */ 1270 static void 1271 icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira) 1272 { 1273 icmp6_t *icmp6; 1274 ip6_t *ip6h, *outer_ip6h; 1275 uint16_t iph_hdr_length; 1276 uint8_t *nexthdrp; 1277 sin6_t sin6; 1278 mblk_t *mp1; 1279 int error = 0; 1280 icmp_t *icmp = connp->conn_icmp; 1281 1282 outer_ip6h = (ip6_t *)mp->b_rptr; 1283 #ifdef DEBUG 1284 if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6) 1285 iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h); 1286 else 1287 iph_hdr_length = IPV6_HDR_LEN; 1288 ASSERT(iph_hdr_length == ira->ira_ip_hdr_length); 1289 #endif 1290 /* Skip past the outer IP and ICMP headers */ 1291 iph_hdr_length = ira->ira_ip_hdr_length; 1292 icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length]; 1293 1294 ip6h = (ip6_t *)&icmp6[1]; /* Inner IP header */ 1295 if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) { 1296 freemsg(mp); 1297 return; 1298 } 1299 1300 switch (icmp6->icmp6_type) { 1301 case ICMP6_DST_UNREACH: 1302 switch (icmp6->icmp6_code) { 1303 case ICMP6_DST_UNREACH_NOPORT: 1304 error = ECONNREFUSED; 1305 break; 1306 case ICMP6_DST_UNREACH_ADMIN: 1307 case ICMP6_DST_UNREACH_NOROUTE: 1308 case ICMP6_DST_UNREACH_BEYONDSCOPE: 1309 case ICMP6_DST_UNREACH_ADDR: 1310 /* Transient errors */ 1311 break; 1312 default: 1313 break; 1314 } 1315 break; 1316 case ICMP6_PACKET_TOO_BIG: { 1317 struct T_unitdata_ind *tudi; 1318 struct T_opthdr *toh; 1319 size_t udi_size; 1320 mblk_t *newmp; 1321 t_scalar_t opt_length = sizeof (struct T_opthdr) + 1322 sizeof (struct ip6_mtuinfo); 1323 sin6_t *sin6; 1324 struct ip6_mtuinfo *mtuinfo; 1325 1326 /* 1327 * If the application has requested to receive path mtu 1328 * information, send up an empty message containing an 1329 * IPV6_PATHMTU ancillary data item. 1330 */ 1331 if (!connp->conn_ipv6_recvpathmtu) 1332 break; 1333 1334 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) + 1335 opt_length; 1336 if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) { 1337 BUMP_MIB(&icmp->icmp_is->is_rawip_mib, rawipInErrors); 1338 break; 1339 } 1340 1341 /* 1342 * newmp->b_cont is left to NULL on purpose. This is an 1343 * empty message containing only ancillary data. 1344 */ 1345 newmp->b_datap->db_type = M_PROTO; 1346 tudi = (struct T_unitdata_ind *)newmp->b_rptr; 1347 newmp->b_wptr = (uchar_t *)tudi + udi_size; 1348 tudi->PRIM_type = T_UNITDATA_IND; 1349 tudi->SRC_length = sizeof (sin6_t); 1350 tudi->SRC_offset = sizeof (struct T_unitdata_ind); 1351 tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t); 1352 tudi->OPT_length = opt_length; 1353 1354 sin6 = (sin6_t *)&tudi[1]; 1355 bzero(sin6, sizeof (sin6_t)); 1356 sin6->sin6_family = AF_INET6; 1357 sin6->sin6_addr = connp->conn_faddr_v6; 1358 1359 toh = (struct T_opthdr *)&sin6[1]; 1360 toh->level = IPPROTO_IPV6; 1361 toh->name = IPV6_PATHMTU; 1362 toh->len = opt_length; 1363 toh->status = 0; 1364 1365 mtuinfo = (struct ip6_mtuinfo *)&toh[1]; 1366 bzero(mtuinfo, sizeof (struct ip6_mtuinfo)); 1367 mtuinfo->ip6m_addr.sin6_family = AF_INET6; 1368 mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst; 1369 mtuinfo->ip6m_mtu = icmp6->icmp6_mtu; 1370 /* 1371 * We've consumed everything we need from the original 1372 * message. Free it, then send our empty message. 1373 */ 1374 freemsg(mp); 1375 icmp_ulp_recv(connp, newmp, msgdsize(newmp)); 1376 return; 1377 } 1378 case ICMP6_TIME_EXCEEDED: 1379 /* Transient errors */ 1380 break; 1381 case ICMP6_PARAM_PROB: 1382 /* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */ 1383 if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER && 1384 (uchar_t *)ip6h + icmp6->icmp6_pptr == 1385 (uchar_t *)nexthdrp) { 1386 error = ECONNREFUSED; 1387 break; 1388 } 1389 break; 1390 } 1391 if (error == 0) { 1392 freemsg(mp); 1393 return; 1394 } 1395 1396 /* 1397 * Deliver T_UDERROR_IND when the application has asked for it. 1398 * The socket layer enables this automatically when connected. 1399 */ 1400 if (!connp->conn_dgram_errind) { 1401 freemsg(mp); 1402 return; 1403 } 1404 1405 sin6 = sin6_null; 1406 sin6.sin6_family = AF_INET6; 1407 sin6.sin6_addr = ip6h->ip6_dst; 1408 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; 1409 if (IPCL_IS_NONSTR(connp)) { 1410 mutex_enter(&connp->conn_lock); 1411 if (icmp->icmp_state == TS_DATA_XFER) { 1412 if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr, 1413 &connp->conn_faddr_v6)) { 1414 mutex_exit(&connp->conn_lock); 1415 (*connp->conn_upcalls->su_set_error) 1416 (connp->conn_upper_handle, error); 1417 goto done; 1418 } 1419 } else { 1420 icmp->icmp_delayed_error = error; 1421 *((sin6_t *)&icmp->icmp_delayed_addr) = sin6; 1422 } 1423 mutex_exit(&connp->conn_lock); 1424 } else { 1425 mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), 1426 NULL, 0, error); 1427 if (mp1 != NULL) 1428 putnext(connp->conn_rq, mp1); 1429 } 1430 done: 1431 freemsg(mp); 1432 } 1433 1434 /* 1435 * This routine responds to T_ADDR_REQ messages. It is called by icmp_wput. 1436 * The local address is filled in if endpoint is bound. The remote address 1437 * is filled in if remote address has been precified ("connected endpoint") 1438 * (The concept of connected CLTS sockets is alien to published TPI 1439 * but we support it anyway). 1440 */ 1441 static void 1442 icmp_addr_req(queue_t *q, mblk_t *mp) 1443 { 1444 struct sockaddr *sa; 1445 mblk_t *ackmp; 1446 struct T_addr_ack *taa; 1447 icmp_t *icmp = Q_TO_ICMP(q); 1448 conn_t *connp = icmp->icmp_connp; 1449 uint_t addrlen; 1450 1451 /* Make it large enough for worst case */ 1452 ackmp = reallocb(mp, sizeof (struct T_addr_ack) + 1453 2 * sizeof (sin6_t), 1); 1454 if (ackmp == NULL) { 1455 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 1456 return; 1457 } 1458 taa = (struct T_addr_ack *)ackmp->b_rptr; 1459 1460 bzero(taa, sizeof (struct T_addr_ack)); 1461 ackmp->b_wptr = (uchar_t *)&taa[1]; 1462 1463 taa->PRIM_type = T_ADDR_ACK; 1464 ackmp->b_datap->db_type = M_PCPROTO; 1465 1466 if (connp->conn_family == AF_INET) 1467 addrlen = sizeof (sin_t); 1468 else 1469 addrlen = sizeof (sin6_t); 1470 1471 mutex_enter(&connp->conn_lock); 1472 /* 1473 * Note: Following code assumes 32 bit alignment of basic 1474 * data structures like sin_t and struct T_addr_ack. 1475 */ 1476 if (icmp->icmp_state != TS_UNBND) { 1477 /* 1478 * Fill in local address first 1479 */ 1480 taa->LOCADDR_offset = sizeof (*taa); 1481 taa->LOCADDR_length = addrlen; 1482 sa = (struct sockaddr *)&taa[1]; 1483 (void) conn_getsockname(connp, sa, &addrlen); 1484 ackmp->b_wptr += addrlen; 1485 } 1486 if (icmp->icmp_state == TS_DATA_XFER) { 1487 /* 1488 * connected, fill remote address too 1489 */ 1490 taa->REMADDR_length = addrlen; 1491 /* assumed 32-bit alignment */ 1492 taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length; 1493 sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset); 1494 (void) conn_getpeername(connp, sa, &addrlen); 1495 ackmp->b_wptr += addrlen; 1496 } 1497 mutex_exit(&connp->conn_lock); 1498 ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim); 1499 qreply(q, ackmp); 1500 } 1501 1502 static void 1503 icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp) 1504 { 1505 conn_t *connp = icmp->icmp_connp; 1506 1507 *tap = icmp_g_t_info_ack; 1508 1509 if (connp->conn_family == AF_INET6) 1510 tap->ADDR_size = sizeof (sin6_t); 1511 else 1512 tap->ADDR_size = sizeof (sin_t); 1513 tap->CURRENT_state = icmp->icmp_state; 1514 tap->OPT_size = icmp_max_optsize; 1515 } 1516 1517 static void 1518 icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap, 1519 t_uscalar_t cap_bits1) 1520 { 1521 tcap->CAP_bits1 = 0; 1522 1523 if (cap_bits1 & TC1_INFO) { 1524 icmp_copy_info(&tcap->INFO_ack, icmp); 1525 tcap->CAP_bits1 |= TC1_INFO; 1526 } 1527 } 1528 1529 /* 1530 * This routine responds to T_CAPABILITY_REQ messages. It is called by 1531 * icmp_wput. Much of the T_CAPABILITY_ACK information is copied from 1532 * icmp_g_t_info_ack. The current state of the stream is copied from 1533 * icmp_state. 1534 */ 1535 static void 1536 icmp_capability_req(queue_t *q, mblk_t *mp) 1537 { 1538 icmp_t *icmp = Q_TO_ICMP(q); 1539 t_uscalar_t cap_bits1; 1540 struct T_capability_ack *tcap; 1541 1542 cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1; 1543 1544 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack), 1545 mp->b_datap->db_type, T_CAPABILITY_ACK); 1546 if (!mp) 1547 return; 1548 1549 tcap = (struct T_capability_ack *)mp->b_rptr; 1550 1551 icmp_do_capability_ack(icmp, tcap, cap_bits1); 1552 1553 qreply(q, mp); 1554 } 1555 1556 /* 1557 * This routine responds to T_INFO_REQ messages. It is called by icmp_wput. 1558 * Most of the T_INFO_ACK information is copied from icmp_g_t_info_ack. 1559 * The current state of the stream is copied from icmp_state. 1560 */ 1561 static void 1562 icmp_info_req(queue_t *q, mblk_t *mp) 1563 { 1564 icmp_t *icmp = Q_TO_ICMP(q); 1565 1566 /* Create a T_INFO_ACK message. */ 1567 mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO, 1568 T_INFO_ACK); 1569 if (!mp) 1570 return; 1571 icmp_copy_info((struct T_info_ack *)mp->b_rptr, icmp); 1572 qreply(q, mp); 1573 } 1574 1575 static int 1576 icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, 1577 int family) 1578 { 1579 conn_t *connp; 1580 dev_t conn_dev; 1581 int error; 1582 1583 /* If the stream is already open, return immediately. */ 1584 if (q->q_ptr != NULL) 1585 return (0); 1586 1587 if (sflag == MODOPEN) 1588 return (EINVAL); 1589 1590 /* 1591 * Since ICMP is not used so heavily, allocating from the small 1592 * arena should be sufficient. 1593 */ 1594 if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) { 1595 return (EBUSY); 1596 } 1597 1598 if (flag & SO_FALLBACK) { 1599 /* 1600 * Non streams socket needs a stream to fallback to 1601 */ 1602 RD(q)->q_ptr = (void *)conn_dev; 1603 WR(q)->q_qinfo = &icmp_fallback_sock_winit; 1604 WR(q)->q_ptr = (void *)ip_minor_arena_sa; 1605 qprocson(q); 1606 return (0); 1607 } 1608 1609 connp = rawip_do_open(family, credp, &error, KM_SLEEP); 1610 if (connp == NULL) { 1611 ASSERT(error != 0); 1612 inet_minor_free(ip_minor_arena_sa, conn_dev); 1613 return (error); 1614 } 1615 1616 *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); 1617 connp->conn_dev = conn_dev; 1618 connp->conn_minor_arena = ip_minor_arena_sa; 1619 1620 /* 1621 * Initialize the icmp_t structure for this stream. 1622 */ 1623 q->q_ptr = connp; 1624 WR(q)->q_ptr = connp; 1625 connp->conn_rq = q; 1626 connp->conn_wq = WR(q); 1627 1628 WR(q)->q_hiwat = connp->conn_sndbuf; 1629 WR(q)->q_lowat = connp->conn_sndlowat; 1630 1631 qprocson(q); 1632 1633 /* Set the Stream head write offset. */ 1634 (void) proto_set_tx_wroff(q, connp, connp->conn_wroff); 1635 (void) proto_set_rx_hiwat(connp->conn_rq, connp, connp->conn_rcvbuf); 1636 1637 mutex_enter(&connp->conn_lock); 1638 connp->conn_state_flags &= ~CONN_INCIPIENT; 1639 mutex_exit(&connp->conn_lock); 1640 1641 icmp_bind_proto(connp->conn_icmp); 1642 1643 return (0); 1644 } 1645 1646 /* For /dev/icmp aka AF_INET open */ 1647 static int 1648 icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 1649 { 1650 return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET)); 1651 } 1652 1653 /* For /dev/icmp6 aka AF_INET6 open */ 1654 static int 1655 icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 1656 { 1657 return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6)); 1658 } 1659 1660 /* 1661 * This is the open routine for icmp. It allocates a icmp_t structure for 1662 * the stream and, on the first open of the module, creates an ND table. 1663 */ 1664 static conn_t * 1665 rawip_do_open(int family, cred_t *credp, int *err, int flags) 1666 { 1667 icmp_t *icmp; 1668 conn_t *connp; 1669 zoneid_t zoneid; 1670 netstack_t *ns; 1671 icmp_stack_t *is; 1672 int len; 1673 boolean_t isv6 = B_FALSE; 1674 1675 *err = secpolicy_net_icmpaccess(credp); 1676 if (*err != 0) 1677 return (NULL); 1678 1679 if (family == AF_INET6) 1680 isv6 = B_TRUE; 1681 1682 ns = netstack_find_by_cred(credp); 1683 ASSERT(ns != NULL); 1684 is = ns->netstack_icmp; 1685 ASSERT(is != NULL); 1686 1687 /* 1688 * For exclusive stacks we set the zoneid to zero 1689 * to make ICMP operate as if in the global zone. 1690 */ 1691 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 1692 zoneid = GLOBAL_ZONEID; 1693 else 1694 zoneid = crgetzoneid(credp); 1695 1696 ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP); 1697 1698 connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns); 1699 icmp = connp->conn_icmp; 1700 1701 /* 1702 * ipcl_conn_create did a netstack_hold. Undo the hold that was 1703 * done by netstack_find_by_cred() 1704 */ 1705 netstack_rele(ns); 1706 1707 /* 1708 * Since this conn_t/icmp_t is not yet visible to anybody else we don't 1709 * need to lock anything. 1710 */ 1711 ASSERT(connp->conn_proto == IPPROTO_ICMP); 1712 ASSERT(connp->conn_icmp == icmp); 1713 ASSERT(icmp->icmp_connp == connp); 1714 1715 /* Set the initial state of the stream and the privilege status. */ 1716 icmp->icmp_state = TS_UNBND; 1717 connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE; 1718 if (isv6) { 1719 connp->conn_family = AF_INET6; 1720 connp->conn_ipversion = IPV6_VERSION; 1721 connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4; 1722 connp->conn_proto = IPPROTO_ICMPV6; 1723 /* May be changed by a SO_PROTOTYPE socket option. */ 1724 connp->conn_proto = IPPROTO_ICMPV6; 1725 connp->conn_ixa->ixa_protocol = connp->conn_proto; 1726 connp->conn_ixa->ixa_raw_cksum_offset = 2; 1727 connp->conn_default_ttl = is->is_ipv6_hoplimit; 1728 len = sizeof (ip6_t); 1729 } else { 1730 connp->conn_family = AF_INET; 1731 connp->conn_ipversion = IPV4_VERSION; 1732 connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4; 1733 /* May be changed by a SO_PROTOTYPE socket option. */ 1734 connp->conn_proto = IPPROTO_ICMP; 1735 connp->conn_ixa->ixa_protocol = connp->conn_proto; 1736 connp->conn_default_ttl = is->is_ipv4_ttl; 1737 len = sizeof (ipha_t); 1738 } 1739 connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl; 1740 1741 connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 1742 1743 /* 1744 * For the socket of protocol IPPROTO_RAW or when IP_HDRINCL is set, 1745 * the checksum is provided in the pre-built packet. We clear 1746 * IXAF_SET_ULP_CKSUM to tell IP that the application has sent a 1747 * complete IP header and not to compute the transport checksum. 1748 */ 1749 connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM; 1750 /* conn_allzones can not be set this early, hence no IPCL_ZONEID */ 1751 connp->conn_ixa->ixa_zoneid = zoneid; 1752 1753 connp->conn_zoneid = zoneid; 1754 1755 /* 1756 * If the caller has the process-wide flag set, then default to MAC 1757 * exempt mode. This allows read-down to unlabeled hosts. 1758 */ 1759 if (getpflags(NET_MAC_AWARE, credp) != 0) 1760 connp->conn_mac_mode = CONN_MAC_AWARE; 1761 1762 connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID); 1763 1764 icmp->icmp_is = is; 1765 1766 connp->conn_rcvbuf = is->is_recv_hiwat; 1767 connp->conn_sndbuf = is->is_xmit_hiwat; 1768 connp->conn_sndlowat = is->is_xmit_lowat; 1769 connp->conn_rcvlowat = icmp_mod_info.mi_lowat; 1770 1771 connp->conn_wroff = len + is->is_wroff_extra; 1772 connp->conn_so_type = SOCK_RAW; 1773 1774 connp->conn_recv = icmp_input; 1775 connp->conn_recvicmp = icmp_icmp_input; 1776 crhold(credp); 1777 connp->conn_cred = credp; 1778 connp->conn_cpid = curproc->p_pid; 1779 connp->conn_open_time = ddi_get_lbolt64(); 1780 /* Cache things in ixa without an extra refhold */ 1781 ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED)); 1782 connp->conn_ixa->ixa_cred = connp->conn_cred; 1783 connp->conn_ixa->ixa_cpid = connp->conn_cpid; 1784 if (is_system_labeled()) 1785 connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred); 1786 1787 connp->conn_flow_cntrld = B_FALSE; 1788 1789 if (is->is_pmtu_discovery) 1790 connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; 1791 1792 return (connp); 1793 } 1794 1795 /* 1796 * Which ICMP options OK to set through T_UNITDATA_REQ... 1797 */ 1798 /* ARGSUSED */ 1799 static boolean_t 1800 icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name) 1801 { 1802 return (B_TRUE); 1803 } 1804 1805 /* 1806 * This routine gets default values of certain options whose default 1807 * values are maintained by protcol specific code 1808 */ 1809 int 1810 icmp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) 1811 { 1812 icmp_t *icmp = Q_TO_ICMP(q); 1813 icmp_stack_t *is = icmp->icmp_is; 1814 int *i1 = (int *)ptr; 1815 1816 switch (level) { 1817 case IPPROTO_IP: 1818 switch (name) { 1819 case IP_MULTICAST_TTL: 1820 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL; 1821 return (sizeof (uchar_t)); 1822 case IP_MULTICAST_LOOP: 1823 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP; 1824 return (sizeof (uchar_t)); 1825 } 1826 break; 1827 case IPPROTO_IPV6: 1828 switch (name) { 1829 case IPV6_MULTICAST_HOPS: 1830 *i1 = IP_DEFAULT_MULTICAST_TTL; 1831 return (sizeof (int)); 1832 case IPV6_MULTICAST_LOOP: 1833 *i1 = IP_DEFAULT_MULTICAST_LOOP; 1834 return (sizeof (int)); 1835 case IPV6_UNICAST_HOPS: 1836 *i1 = is->is_ipv6_hoplimit; 1837 return (sizeof (int)); 1838 } 1839 break; 1840 case IPPROTO_ICMPV6: 1841 switch (name) { 1842 case ICMP6_FILTER: 1843 /* Make it look like "pass all" */ 1844 ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr); 1845 return (sizeof (icmp6_filter_t)); 1846 } 1847 break; 1848 } 1849 return (-1); 1850 } 1851 1852 /* 1853 * This routine retrieves the current status of socket options. 1854 * It returns the size of the option retrieved, or -1. 1855 */ 1856 int 1857 icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) 1858 { 1859 icmp_t *icmp = connp->conn_icmp; 1860 int *i1 = (int *)ptr; 1861 conn_opt_arg_t coas; 1862 int retval; 1863 1864 coas.coa_connp = connp; 1865 coas.coa_ixa = connp->conn_ixa; 1866 coas.coa_ipp = &connp->conn_xmit_ipp; 1867 coas.coa_ancillary = B_FALSE; 1868 coas.coa_changed = 0; 1869 1870 /* 1871 * We assume that the optcom framework has checked for the set 1872 * of levels and names that are supported, hence we don't worry 1873 * about rejecting based on that. 1874 * First check for ICMP specific handling, then pass to common routine. 1875 */ 1876 switch (level) { 1877 case IPPROTO_IP: 1878 /* 1879 * Only allow IPv4 option processing on IPv4 sockets. 1880 */ 1881 if (connp->conn_family != AF_INET) 1882 return (-1); 1883 1884 switch (name) { 1885 case IP_OPTIONS: 1886 case T_IP_OPTIONS: 1887 /* Options are passed up with each packet */ 1888 return (0); 1889 case IP_HDRINCL: 1890 mutex_enter(&connp->conn_lock); 1891 *i1 = (int)icmp->icmp_hdrincl; 1892 mutex_exit(&connp->conn_lock); 1893 return (sizeof (int)); 1894 } 1895 break; 1896 1897 case IPPROTO_IPV6: 1898 /* 1899 * Only allow IPv6 option processing on native IPv6 sockets. 1900 */ 1901 if (connp->conn_family != AF_INET6) 1902 return (-1); 1903 1904 switch (name) { 1905 case IPV6_CHECKSUM: 1906 /* 1907 * Return offset or -1 if no checksum offset. 1908 * Does not apply to IPPROTO_ICMPV6 1909 */ 1910 if (connp->conn_proto == IPPROTO_ICMPV6) 1911 return (-1); 1912 1913 mutex_enter(&connp->conn_lock); 1914 if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) 1915 *i1 = connp->conn_ixa->ixa_raw_cksum_offset; 1916 else 1917 *i1 = -1; 1918 mutex_exit(&connp->conn_lock); 1919 return (sizeof (int)); 1920 } 1921 break; 1922 1923 case IPPROTO_ICMPV6: 1924 /* 1925 * Only allow IPv6 option processing on native IPv6 sockets. 1926 */ 1927 if (connp->conn_family != AF_INET6) 1928 return (-1); 1929 1930 if (connp->conn_proto != IPPROTO_ICMPV6) 1931 return (-1); 1932 1933 switch (name) { 1934 case ICMP6_FILTER: 1935 mutex_enter(&connp->conn_lock); 1936 if (icmp->icmp_filter == NULL) { 1937 /* Make it look like "pass all" */ 1938 ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr); 1939 } else { 1940 (void) bcopy(icmp->icmp_filter, ptr, 1941 sizeof (icmp6_filter_t)); 1942 } 1943 mutex_exit(&connp->conn_lock); 1944 return (sizeof (icmp6_filter_t)); 1945 } 1946 } 1947 mutex_enter(&connp->conn_lock); 1948 retval = conn_opt_get(&coas, level, name, ptr); 1949 mutex_exit(&connp->conn_lock); 1950 return (retval); 1951 } 1952 1953 /* 1954 * This routine retrieves the current status of socket options. 1955 * It returns the size of the option retrieved, or -1. 1956 */ 1957 int 1958 icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr) 1959 { 1960 conn_t *connp = Q_TO_CONN(q); 1961 int err; 1962 1963 err = icmp_opt_get(connp, level, name, ptr); 1964 return (err); 1965 } 1966 1967 /* 1968 * This routine sets socket options. 1969 */ 1970 int 1971 icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name, 1972 uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly) 1973 { 1974 conn_t *connp = coa->coa_connp; 1975 ip_xmit_attr_t *ixa = coa->coa_ixa; 1976 icmp_t *icmp = connp->conn_icmp; 1977 icmp_stack_t *is = icmp->icmp_is; 1978 int *i1 = (int *)invalp; 1979 boolean_t onoff = (*i1 == 0) ? 0 : 1; 1980 int error; 1981 1982 ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock)); 1983 1984 /* 1985 * For fixed length options, no sanity check 1986 * of passed in length is done. It is assumed *_optcom_req() 1987 * routines do the right thing. 1988 */ 1989 1990 switch (level) { 1991 case SOL_SOCKET: 1992 switch (name) { 1993 case SO_PROTOTYPE: 1994 if ((*i1 & 0xFF) != IPPROTO_ICMP && 1995 (*i1 & 0xFF) != IPPROTO_ICMPV6 && 1996 secpolicy_net_rawaccess(cr) != 0) { 1997 return (EACCES); 1998 } 1999 if (checkonly) 2000 break; 2001 2002 mutex_enter(&connp->conn_lock); 2003 connp->conn_proto = *i1 & 0xFF; 2004 ixa->ixa_protocol = connp->conn_proto; 2005 if ((connp->conn_proto == IPPROTO_RAW || 2006 connp->conn_proto == IPPROTO_IGMP) && 2007 connp->conn_family == AF_INET) { 2008 icmp->icmp_hdrincl = 1; 2009 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; 2010 } else if (connp->conn_proto == IPPROTO_UDP || 2011 connp->conn_proto == IPPROTO_TCP || 2012 connp->conn_proto == IPPROTO_SCTP) { 2013 /* Used by test applications like psh */ 2014 icmp->icmp_hdrincl = 0; 2015 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; 2016 } else { 2017 icmp->icmp_hdrincl = 0; 2018 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; 2019 } 2020 2021 if (connp->conn_family == AF_INET6 && 2022 connp->conn_proto == IPPROTO_ICMPV6) { 2023 /* Set offset for icmp6_cksum */ 2024 ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM; 2025 ixa->ixa_raw_cksum_offset = 2; 2026 } 2027 if (icmp->icmp_filter != NULL && 2028 connp->conn_proto != IPPROTO_ICMPV6) { 2029 kmem_free(icmp->icmp_filter, 2030 sizeof (icmp6_filter_t)); 2031 icmp->icmp_filter = NULL; 2032 } 2033 mutex_exit(&connp->conn_lock); 2034 2035 coa->coa_changed |= COA_HEADER_CHANGED; 2036 /* 2037 * For SCTP, we don't use icmp_bind_proto() for 2038 * raw socket binding. 2039 */ 2040 if (connp->conn_proto == IPPROTO_SCTP) 2041 return (0); 2042 2043 coa->coa_changed |= COA_ICMP_BIND_NEEDED; 2044 return (0); 2045 2046 case SO_SNDBUF: 2047 if (*i1 > is->is_max_buf) { 2048 return (ENOBUFS); 2049 } 2050 break; 2051 case SO_RCVBUF: 2052 if (*i1 > is->is_max_buf) { 2053 return (ENOBUFS); 2054 } 2055 break; 2056 } 2057 break; 2058 2059 case IPPROTO_IP: 2060 /* 2061 * Only allow IPv4 option processing on IPv4 sockets. 2062 */ 2063 if (connp->conn_family != AF_INET) 2064 return (EINVAL); 2065 2066 switch (name) { 2067 case IP_HDRINCL: 2068 if (!checkonly) { 2069 mutex_enter(&connp->conn_lock); 2070 icmp->icmp_hdrincl = onoff; 2071 if (onoff) 2072 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; 2073 else 2074 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; 2075 mutex_exit(&connp->conn_lock); 2076 } 2077 break; 2078 } 2079 break; 2080 2081 case IPPROTO_IPV6: 2082 if (connp->conn_family != AF_INET6) 2083 return (EINVAL); 2084 2085 switch (name) { 2086 case IPV6_CHECKSUM: 2087 /* 2088 * Integer offset into the user data of where the 2089 * checksum is located. 2090 * Offset of -1 disables option. 2091 * Does not apply to IPPROTO_ICMPV6. 2092 */ 2093 if (connp->conn_proto == IPPROTO_ICMPV6 || 2094 coa->coa_ancillary) { 2095 return (EINVAL); 2096 } 2097 if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) { 2098 /* Negative or not 16 bit aligned offset */ 2099 return (EINVAL); 2100 } 2101 if (checkonly) 2102 break; 2103 2104 mutex_enter(&connp->conn_lock); 2105 if (*i1 == -1) { 2106 ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM; 2107 ixa->ixa_raw_cksum_offset = 0; 2108 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; 2109 } else { 2110 ixa->ixa_flags |= IXAF_SET_RAW_CKSUM; 2111 ixa->ixa_raw_cksum_offset = *i1; 2112 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; 2113 } 2114 mutex_exit(&connp->conn_lock); 2115 break; 2116 } 2117 break; 2118 2119 case IPPROTO_ICMPV6: 2120 /* 2121 * Only allow IPv6 option processing on IPv6 sockets. 2122 */ 2123 if (connp->conn_family != AF_INET6) 2124 return (EINVAL); 2125 if (connp->conn_proto != IPPROTO_ICMPV6) 2126 return (EINVAL); 2127 2128 switch (name) { 2129 case ICMP6_FILTER: 2130 if (checkonly) 2131 break; 2132 2133 if ((inlen != 0) && 2134 (inlen != sizeof (icmp6_filter_t))) 2135 return (EINVAL); 2136 2137 mutex_enter(&connp->conn_lock); 2138 if (inlen == 0) { 2139 if (icmp->icmp_filter != NULL) { 2140 kmem_free(icmp->icmp_filter, 2141 sizeof (icmp6_filter_t)); 2142 icmp->icmp_filter = NULL; 2143 } 2144 } else { 2145 if (icmp->icmp_filter == NULL) { 2146 icmp->icmp_filter = kmem_alloc( 2147 sizeof (icmp6_filter_t), 2148 KM_NOSLEEP); 2149 if (icmp->icmp_filter == NULL) { 2150 mutex_exit(&connp->conn_lock); 2151 return (ENOBUFS); 2152 } 2153 } 2154 (void) bcopy(invalp, icmp->icmp_filter, inlen); 2155 } 2156 mutex_exit(&connp->conn_lock); 2157 break; 2158 } 2159 break; 2160 } 2161 error = conn_opt_set(coa, level, name, inlen, invalp, 2162 checkonly, cr); 2163 return (error); 2164 } 2165 2166 /* 2167 * This routine sets socket options. 2168 */ 2169 int 2170 icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, 2171 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 2172 void *thisdg_attrs, cred_t *cr) 2173 { 2174 icmp_t *icmp = connp->conn_icmp; 2175 int err; 2176 conn_opt_arg_t coas, *coa; 2177 boolean_t checkonly; 2178 icmp_stack_t *is = icmp->icmp_is; 2179 2180 switch (optset_context) { 2181 case SETFN_OPTCOM_CHECKONLY: 2182 checkonly = B_TRUE; 2183 /* 2184 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ 2185 * inlen != 0 implies value supplied and 2186 * we have to "pretend" to set it. 2187 * inlen == 0 implies that there is no 2188 * value part in T_CHECK request and just validation 2189 * done elsewhere should be enough, we just return here. 2190 */ 2191 if (inlen == 0) { 2192 *outlenp = 0; 2193 return (0); 2194 } 2195 break; 2196 case SETFN_OPTCOM_NEGOTIATE: 2197 checkonly = B_FALSE; 2198 break; 2199 case SETFN_UD_NEGOTIATE: 2200 case SETFN_CONN_NEGOTIATE: 2201 checkonly = B_FALSE; 2202 /* 2203 * Negotiating local and "association-related" options 2204 * through T_UNITDATA_REQ. 2205 * 2206 * Following routine can filter out ones we do not 2207 * want to be "set" this way. 2208 */ 2209 if (!icmp_opt_allow_udr_set(level, name)) { 2210 *outlenp = 0; 2211 return (EINVAL); 2212 } 2213 break; 2214 default: 2215 /* 2216 * We should never get here 2217 */ 2218 *outlenp = 0; 2219 return (EINVAL); 2220 } 2221 2222 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || 2223 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); 2224 2225 if (thisdg_attrs != NULL) { 2226 /* Options from T_UNITDATA_REQ */ 2227 coa = (conn_opt_arg_t *)thisdg_attrs; 2228 ASSERT(coa->coa_connp == connp); 2229 ASSERT(coa->coa_ixa != NULL); 2230 ASSERT(coa->coa_ipp != NULL); 2231 ASSERT(coa->coa_ancillary); 2232 } else { 2233 coa = &coas; 2234 coas.coa_connp = connp; 2235 /* Get a reference on conn_ixa to prevent concurrent mods */ 2236 coas.coa_ixa = conn_get_ixa(connp, B_TRUE); 2237 if (coas.coa_ixa == NULL) { 2238 *outlenp = 0; 2239 return (ENOMEM); 2240 } 2241 coas.coa_ipp = &connp->conn_xmit_ipp; 2242 coas.coa_ancillary = B_FALSE; 2243 coas.coa_changed = 0; 2244 } 2245 2246 err = icmp_do_opt_set(coa, level, name, inlen, invalp, 2247 cr, checkonly); 2248 if (err != 0) { 2249 errout: 2250 if (!coa->coa_ancillary) 2251 ixa_refrele(coa->coa_ixa); 2252 *outlenp = 0; 2253 return (err); 2254 } 2255 2256 /* 2257 * Common case of OK return with outval same as inval. 2258 */ 2259 if (invalp != outvalp) { 2260 /* don't trust bcopy for identical src/dst */ 2261 (void) bcopy(invalp, outvalp, inlen); 2262 } 2263 *outlenp = inlen; 2264 2265 /* 2266 * If this was not ancillary data, then we rebuild the headers, 2267 * update the IRE/NCE, and IPsec as needed. 2268 * Since the label depends on the destination we go through 2269 * ip_set_destination first. 2270 */ 2271 if (coa->coa_ancillary) { 2272 return (0); 2273 } 2274 2275 if (coa->coa_changed & COA_ROUTE_CHANGED) { 2276 in6_addr_t saddr, faddr, nexthop; 2277 in_port_t fport; 2278 2279 /* 2280 * We clear lastdst to make sure we pick up the change 2281 * next time sending. 2282 * If we are connected we re-cache the information. 2283 * We ignore errors to preserve BSD behavior. 2284 * Note that we don't redo IPsec policy lookup here 2285 * since the final destination (or source) didn't change. 2286 */ 2287 mutex_enter(&connp->conn_lock); 2288 connp->conn_v6lastdst = ipv6_all_zeros; 2289 2290 ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa, 2291 &connp->conn_faddr_v6, &nexthop); 2292 saddr = connp->conn_saddr_v6; 2293 faddr = connp->conn_faddr_v6; 2294 fport = connp->conn_fport; 2295 mutex_exit(&connp->conn_lock); 2296 2297 if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) && 2298 !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) { 2299 (void) ip_attr_connect(connp, coa->coa_ixa, 2300 &saddr, &faddr, &nexthop, fport, NULL, NULL, 2301 IPDF_ALLOW_MCBC | IPDF_VERIFY_DST); 2302 } 2303 } 2304 2305 ixa_refrele(coa->coa_ixa); 2306 2307 if (coa->coa_changed & COA_HEADER_CHANGED) { 2308 /* 2309 * Rebuild the header template if we are connected. 2310 * Otherwise clear conn_v6lastdst so we rebuild the header 2311 * in the data path. 2312 */ 2313 mutex_enter(&connp->conn_lock); 2314 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && 2315 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 2316 err = icmp_build_hdr_template(connp, 2317 &connp->conn_saddr_v6, &connp->conn_faddr_v6, 2318 connp->conn_flowinfo); 2319 if (err != 0) { 2320 mutex_exit(&connp->conn_lock); 2321 return (err); 2322 } 2323 } else { 2324 connp->conn_v6lastdst = ipv6_all_zeros; 2325 } 2326 mutex_exit(&connp->conn_lock); 2327 } 2328 if (coa->coa_changed & COA_RCVBUF_CHANGED) { 2329 (void) proto_set_rx_hiwat(connp->conn_rq, connp, 2330 connp->conn_rcvbuf); 2331 } 2332 if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) { 2333 connp->conn_wq->q_hiwat = connp->conn_sndbuf; 2334 } 2335 if (coa->coa_changed & COA_WROFF_CHANGED) { 2336 /* Increase wroff if needed */ 2337 uint_t wroff; 2338 2339 mutex_enter(&connp->conn_lock); 2340 wroff = connp->conn_ht_iphc_allocated + is->is_wroff_extra; 2341 if (wroff > connp->conn_wroff) { 2342 connp->conn_wroff = wroff; 2343 mutex_exit(&connp->conn_lock); 2344 (void) proto_set_tx_wroff(connp->conn_rq, connp, wroff); 2345 } else { 2346 mutex_exit(&connp->conn_lock); 2347 } 2348 } 2349 if (coa->coa_changed & COA_ICMP_BIND_NEEDED) { 2350 icmp_bind_proto(icmp); 2351 } 2352 return (err); 2353 } 2354 2355 /* This routine sets socket options. */ 2356 int 2357 icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name, 2358 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 2359 void *thisdg_attrs, cred_t *cr) 2360 { 2361 conn_t *connp = Q_TO_CONN(q); 2362 int error; 2363 2364 error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp, 2365 outlenp, outvalp, thisdg_attrs, cr); 2366 return (error); 2367 } 2368 2369 /* 2370 * Setup IP headers. 2371 * 2372 * Note that IP_HDRINCL has ipha_protocol that is different than conn_proto, 2373 * but icmp_output_hdrincl restores ipha_protocol once we return. 2374 */ 2375 mblk_t * 2376 icmp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp, 2377 const in6_addr_t *v6src, const in6_addr_t *v6dst, uint32_t flowinfo, 2378 mblk_t *data_mp, int *errorp) 2379 { 2380 mblk_t *mp; 2381 icmp_stack_t *is = connp->conn_netstack->netstack_icmp; 2382 uint_t data_len; 2383 uint32_t cksum; 2384 2385 data_len = msgdsize(data_mp); 2386 mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, connp->conn_proto, 2387 flowinfo, 0, data_mp, data_len, is->is_wroff_extra, &cksum, errorp); 2388 if (mp == NULL) { 2389 ASSERT(*errorp != 0); 2390 return (NULL); 2391 } 2392 2393 ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length; 2394 2395 /* 2396 * If there was a routing option/header then conn_prepend_hdr 2397 * has massaged it and placed the pseudo-header checksum difference 2398 * in the cksum argument. 2399 * 2400 * Prepare for ICMPv6 checksum done in IP. 2401 * 2402 * We make it easy for IP to include our pseudo header 2403 * by putting our length (and any routing header adjustment) 2404 * in the ICMPv6 checksum field. 2405 * The IP source, destination, and length have already been set by 2406 * conn_prepend_hdr. 2407 */ 2408 cksum += data_len; 2409 cksum = (cksum >> 16) + (cksum & 0xFFFF); 2410 ASSERT(cksum < 0x10000); 2411 2412 if (ixa->ixa_flags & IXAF_IS_IPV4) { 2413 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2414 2415 ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen); 2416 } else { 2417 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2418 uint_t cksum_offset = 0; 2419 2420 ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen); 2421 2422 if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) { 2423 if (connp->conn_proto == IPPROTO_ICMPV6) { 2424 cksum_offset = ixa->ixa_ip_hdr_length + 2425 offsetof(icmp6_t, icmp6_cksum); 2426 } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { 2427 cksum_offset = ixa->ixa_ip_hdr_length + 2428 ixa->ixa_raw_cksum_offset; 2429 } 2430 } 2431 if (cksum_offset != 0) { 2432 uint16_t *ptr; 2433 2434 /* Make sure the checksum fits in the first mblk */ 2435 if (cksum_offset + sizeof (short) > MBLKL(mp)) { 2436 mblk_t *mp1; 2437 2438 mp1 = msgpullup(mp, 2439 cksum_offset + sizeof (short)); 2440 freemsg(mp); 2441 if (mp1 == NULL) { 2442 *errorp = ENOMEM; 2443 return (NULL); 2444 } 2445 mp = mp1; 2446 ip6h = (ip6_t *)mp->b_rptr; 2447 } 2448 ptr = (uint16_t *)(mp->b_rptr + cksum_offset); 2449 *ptr = htons(cksum); 2450 } 2451 } 2452 2453 /* Note that we don't try to update wroff due to ancillary data */ 2454 return (mp); 2455 } 2456 2457 static int 2458 icmp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src, 2459 const in6_addr_t *v6dst, uint32_t flowinfo) 2460 { 2461 int error; 2462 2463 ASSERT(MUTEX_HELD(&connp->conn_lock)); 2464 /* 2465 * We clear lastdst to make sure we don't use the lastdst path 2466 * next time sending since we might not have set v6dst yet. 2467 */ 2468 connp->conn_v6lastdst = ipv6_all_zeros; 2469 2470 error = conn_build_hdr_template(connp, 0, 0, v6src, v6dst, flowinfo); 2471 if (error != 0) 2472 return (error); 2473 2474 /* 2475 * Any routing header/option has been massaged. The checksum difference 2476 * is stored in conn_sum. 2477 */ 2478 return (0); 2479 } 2480 2481 static mblk_t * 2482 icmp_queue_fallback(icmp_t *icmp, mblk_t *mp) 2483 { 2484 ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock)); 2485 if (IPCL_IS_NONSTR(icmp->icmp_connp)) { 2486 /* 2487 * fallback has started but messages have not been moved yet 2488 */ 2489 if (icmp->icmp_fallback_queue_head == NULL) { 2490 ASSERT(icmp->icmp_fallback_queue_tail == NULL); 2491 icmp->icmp_fallback_queue_head = mp; 2492 icmp->icmp_fallback_queue_tail = mp; 2493 } else { 2494 ASSERT(icmp->icmp_fallback_queue_tail != NULL); 2495 icmp->icmp_fallback_queue_tail->b_next = mp; 2496 icmp->icmp_fallback_queue_tail = mp; 2497 } 2498 return (NULL); 2499 } else { 2500 /* 2501 * Fallback completed, let the caller putnext() the mblk. 2502 */ 2503 return (mp); 2504 } 2505 } 2506 2507 /* 2508 * Deliver data to ULP. In case we have a socket, and it's falling back to 2509 * TPI, then we'll queue the mp for later processing. 2510 */ 2511 static void 2512 icmp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len) 2513 { 2514 if (IPCL_IS_NONSTR(connp)) { 2515 icmp_t *icmp = connp->conn_icmp; 2516 int error; 2517 2518 ASSERT(len == msgdsize(mp)); 2519 if ((*connp->conn_upcalls->su_recv) 2520 (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) { 2521 mutex_enter(&icmp->icmp_recv_lock); 2522 if (error == ENOSPC) { 2523 /* 2524 * let's confirm while holding the lock 2525 */ 2526 if ((*connp->conn_upcalls->su_recv) 2527 (connp->conn_upper_handle, NULL, 0, 0, 2528 &error, NULL) < 0) { 2529 ASSERT(error == ENOSPC); 2530 if (error == ENOSPC) { 2531 connp->conn_flow_cntrld = 2532 B_TRUE; 2533 } 2534 } 2535 mutex_exit(&icmp->icmp_recv_lock); 2536 } else { 2537 ASSERT(error == EOPNOTSUPP); 2538 mp = icmp_queue_fallback(icmp, mp); 2539 mutex_exit(&icmp->icmp_recv_lock); 2540 if (mp != NULL) 2541 putnext(connp->conn_rq, mp); 2542 } 2543 } 2544 ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock)); 2545 } else { 2546 putnext(connp->conn_rq, mp); 2547 } 2548 } 2549 2550 /* 2551 * This is the inbound data path. 2552 * IP has already pulled up the IP headers and verified alignment 2553 * etc. 2554 */ 2555 /* ARGSUSED2 */ 2556 static void 2557 icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) 2558 { 2559 conn_t *connp = (conn_t *)arg1; 2560 struct T_unitdata_ind *tudi; 2561 uchar_t *rptr; /* Pointer to IP header */ 2562 int ip_hdr_length; 2563 int udi_size; /* Size of T_unitdata_ind */ 2564 int pkt_len; 2565 icmp_t *icmp; 2566 ip_pkt_t ipps; 2567 ip6_t *ip6h; 2568 mblk_t *mp1; 2569 crb_t recv_ancillary; 2570 icmp_stack_t *is; 2571 sin_t *sin; 2572 sin6_t *sin6; 2573 ipha_t *ipha; 2574 2575 ASSERT(connp->conn_flags & IPCL_RAWIPCONN); 2576 2577 icmp = connp->conn_icmp; 2578 is = icmp->icmp_is; 2579 rptr = mp->b_rptr; 2580 2581 ASSERT(DB_TYPE(mp) == M_DATA); 2582 ASSERT(OK_32PTR(rptr)); 2583 ASSERT(ira->ira_pktlen == msgdsize(mp)); 2584 pkt_len = ira->ira_pktlen; 2585 2586 /* 2587 * Get a snapshot of these and allow other threads to change 2588 * them after that. We need the same recv_ancillary when determining 2589 * the size as when adding the ancillary data items. 2590 */ 2591 mutex_enter(&connp->conn_lock); 2592 recv_ancillary = connp->conn_recv_ancillary; 2593 mutex_exit(&connp->conn_lock); 2594 2595 ip_hdr_length = ira->ira_ip_hdr_length; 2596 ASSERT(MBLKL(mp) >= ip_hdr_length); /* IP did a pullup */ 2597 2598 /* Initialize regardless of IP version */ 2599 ipps.ipp_fields = 0; 2600 2601 if (ira->ira_flags & IRAF_IS_IPV4) { 2602 ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION); 2603 ASSERT(MBLKL(mp) >= sizeof (ipha_t)); 2604 ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr)); 2605 2606 ipha = (ipha_t *)mp->b_rptr; 2607 if (recv_ancillary.crb_all != 0) 2608 (void) ip_find_hdr_v4(ipha, &ipps, B_FALSE); 2609 2610 /* 2611 * BSD for some reason adjusts ipha_length to exclude the 2612 * IP header length. We do the same. 2613 */ 2614 if (is->is_bsd_compat) { 2615 ushort_t len; 2616 2617 len = ntohs(ipha->ipha_length); 2618 if (mp->b_datap->db_ref > 1) { 2619 /* 2620 * Allocate a new IP header so that we can 2621 * modify ipha_length. 2622 */ 2623 mblk_t *mp1; 2624 2625 mp1 = allocb(ip_hdr_length, BPRI_MED); 2626 if (mp1 == NULL) { 2627 freemsg(mp); 2628 BUMP_MIB(&is->is_rawip_mib, 2629 rawipInErrors); 2630 return; 2631 } 2632 bcopy(rptr, mp1->b_rptr, ip_hdr_length); 2633 mp->b_rptr = rptr + ip_hdr_length; 2634 rptr = mp1->b_rptr; 2635 ipha = (ipha_t *)rptr; 2636 mp1->b_cont = mp; 2637 mp1->b_wptr = rptr + ip_hdr_length; 2638 mp = mp1; 2639 } 2640 len -= ip_hdr_length; 2641 ipha->ipha_length = htons(len); 2642 } 2643 2644 /* 2645 * For RAW sockets we not pass ICMP/IPv4 packets to AF_INET6 2646 * sockets. This is ensured by icmp_bind and the IP fanout code. 2647 */ 2648 ASSERT(connp->conn_family == AF_INET); 2649 2650 /* 2651 * This is the inbound data path. Packets are passed upstream 2652 * as T_UNITDATA_IND messages with full IPv4 headers still 2653 * attached. 2654 */ 2655 2656 /* 2657 * Normally only send up the source address. 2658 * If any ancillary data items are wanted we add those. 2659 */ 2660 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t); 2661 if (recv_ancillary.crb_all != 0) { 2662 udi_size += conn_recvancillary_size(connp, 2663 recv_ancillary, ira, mp, &ipps); 2664 } 2665 2666 /* Allocate a message block for the T_UNITDATA_IND structure. */ 2667 mp1 = allocb(udi_size, BPRI_MED); 2668 if (mp1 == NULL) { 2669 freemsg(mp); 2670 BUMP_MIB(&is->is_rawip_mib, rawipInErrors); 2671 return; 2672 } 2673 mp1->b_cont = mp; 2674 tudi = (struct T_unitdata_ind *)mp1->b_rptr; 2675 mp1->b_datap->db_type = M_PROTO; 2676 mp1->b_wptr = (uchar_t *)tudi + udi_size; 2677 tudi->PRIM_type = T_UNITDATA_IND; 2678 tudi->SRC_length = sizeof (sin_t); 2679 tudi->SRC_offset = sizeof (struct T_unitdata_ind); 2680 sin = (sin_t *)&tudi[1]; 2681 *sin = sin_null; 2682 sin->sin_family = AF_INET; 2683 sin->sin_addr.s_addr = ipha->ipha_src; 2684 *(uint32_t *)&sin->sin_zero[0] = 0; 2685 *(uint32_t *)&sin->sin_zero[4] = 0; 2686 tudi->OPT_offset = sizeof (struct T_unitdata_ind) + 2687 sizeof (sin_t); 2688 udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t)); 2689 tudi->OPT_length = udi_size; 2690 2691 /* 2692 * Add options if IP_RECVIF etc is set 2693 */ 2694 if (udi_size != 0) { 2695 conn_recvancillary_add(connp, recv_ancillary, ira, 2696 &ipps, (uchar_t *)&sin[1], udi_size); 2697 } 2698 goto deliver; 2699 } 2700 2701 ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION); 2702 /* 2703 * IPv6 packets can only be received by applications 2704 * that are prepared to receive IPv6 addresses. 2705 * The IP fanout must ensure this. 2706 */ 2707 ASSERT(connp->conn_family == AF_INET6); 2708 2709 /* 2710 * Handle IPv6 packets. We don't pass up the IP headers with the 2711 * payload for IPv6. 2712 */ 2713 2714 ip6h = (ip6_t *)rptr; 2715 if (recv_ancillary.crb_all != 0) { 2716 /* 2717 * Call on ip_find_hdr_v6 which gets individual lenghts of 2718 * extension headers (and pointers to them). 2719 */ 2720 uint8_t nexthdr; 2721 2722 /* We don't care about the length or nextheader. */ 2723 (void) ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps, &nexthdr); 2724 2725 /* 2726 * We do not pass up hop-by-hop options or any other 2727 * extension header as part of the packet. Applications 2728 * that want to see them have to specify IPV6_RECV* socket 2729 * options. And conn_recvancillary_size/add explicitly 2730 * drops the TX option from IPV6_HOPOPTS as it does for UDP. 2731 * 2732 * If we had multilevel ICMP sockets, then we'd want to 2733 * modify conn_recvancillary_size/add to 2734 * allow the user to see the label. 2735 */ 2736 } 2737 2738 /* 2739 * Check a filter for ICMPv6 types if needed. 2740 * Verify raw checksums if needed. 2741 */ 2742 mutex_enter(&connp->conn_lock); 2743 if (icmp->icmp_filter != NULL) { 2744 int type; 2745 2746 /* Assumes that IP has done the pullupmsg */ 2747 type = mp->b_rptr[ip_hdr_length]; 2748 2749 ASSERT(mp->b_rptr + ip_hdr_length <= mp->b_wptr); 2750 if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) { 2751 mutex_exit(&connp->conn_lock); 2752 freemsg(mp); 2753 return; 2754 } 2755 } 2756 if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { 2757 /* Checksum */ 2758 uint16_t *up; 2759 uint32_t sum; 2760 int remlen; 2761 2762 up = (uint16_t *)&ip6h->ip6_src; 2763 2764 remlen = msgdsize(mp) - ip_hdr_length; 2765 sum = htons(connp->conn_proto + remlen) 2766 + up[0] + up[1] + up[2] + up[3] 2767 + up[4] + up[5] + up[6] + up[7] 2768 + up[8] + up[9] + up[10] + up[11] 2769 + up[12] + up[13] + up[14] + up[15]; 2770 sum = (sum & 0xffff) + (sum >> 16); 2771 sum = IP_CSUM(mp, ip_hdr_length, sum); 2772 if (sum != 0) { 2773 /* IPv6 RAW checksum failed */ 2774 ip0dbg(("icmp_rput: RAW checksum failed %x\n", sum)); 2775 mutex_exit(&connp->conn_lock); 2776 freemsg(mp); 2777 BUMP_MIB(&is->is_rawip_mib, rawipInCksumErrs); 2778 return; 2779 } 2780 } 2781 mutex_exit(&connp->conn_lock); 2782 2783 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t); 2784 2785 if (recv_ancillary.crb_all != 0) { 2786 udi_size += conn_recvancillary_size(connp, 2787 recv_ancillary, ira, mp, &ipps); 2788 } 2789 2790 mp1 = allocb(udi_size, BPRI_MED); 2791 if (mp1 == NULL) { 2792 freemsg(mp); 2793 BUMP_MIB(&is->is_rawip_mib, rawipInErrors); 2794 return; 2795 } 2796 mp1->b_cont = mp; 2797 mp1->b_datap->db_type = M_PROTO; 2798 tudi = (struct T_unitdata_ind *)mp1->b_rptr; 2799 mp1->b_wptr = (uchar_t *)tudi + udi_size; 2800 tudi->PRIM_type = T_UNITDATA_IND; 2801 tudi->SRC_length = sizeof (sin6_t); 2802 tudi->SRC_offset = sizeof (struct T_unitdata_ind); 2803 tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin6_t); 2804 udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t)); 2805 tudi->OPT_length = udi_size; 2806 sin6 = (sin6_t *)&tudi[1]; 2807 *sin6 = sin6_null; 2808 sin6->sin6_port = 0; 2809 sin6->sin6_family = AF_INET6; 2810 2811 sin6->sin6_addr = ip6h->ip6_src; 2812 /* No sin6_flowinfo per API */ 2813 sin6->sin6_flowinfo = 0; 2814 /* For link-scope pass up scope id */ 2815 if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) 2816 sin6->sin6_scope_id = ira->ira_ruifindex; 2817 else 2818 sin6->sin6_scope_id = 0; 2819 sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst, 2820 IPCL_ZONEID(connp), is->is_netstack); 2821 2822 if (udi_size != 0) { 2823 conn_recvancillary_add(connp, recv_ancillary, ira, 2824 &ipps, (uchar_t *)&sin6[1], udi_size); 2825 } 2826 2827 /* Skip all the IPv6 headers per API */ 2828 mp->b_rptr += ip_hdr_length; 2829 pkt_len -= ip_hdr_length; 2830 2831 deliver: 2832 BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams); 2833 icmp_ulp_recv(connp, mp1, pkt_len); 2834 } 2835 2836 /* 2837 * return SNMP stuff in buffer in mpdata. We don't hold any lock and report 2838 * information that can be changing beneath us. 2839 */ 2840 mblk_t * 2841 icmp_snmp_get(queue_t *q, mblk_t *mpctl) 2842 { 2843 mblk_t *mpdata; 2844 struct opthdr *optp; 2845 conn_t *connp = Q_TO_CONN(q); 2846 icmp_stack_t *is = connp->conn_netstack->netstack_icmp; 2847 mblk_t *mp2ctl; 2848 2849 /* 2850 * make a copy of the original message 2851 */ 2852 mp2ctl = copymsg(mpctl); 2853 2854 if (mpctl == NULL || 2855 (mpdata = mpctl->b_cont) == NULL) { 2856 freemsg(mpctl); 2857 freemsg(mp2ctl); 2858 return (0); 2859 } 2860 2861 /* fixed length structure for IPv4 and IPv6 counters */ 2862 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 2863 optp->level = EXPER_RAWIP; 2864 optp->name = 0; 2865 (void) snmp_append_data(mpdata, (char *)&is->is_rawip_mib, 2866 sizeof (is->is_rawip_mib)); 2867 optp->len = msgdsize(mpdata); 2868 qreply(q, mpctl); 2869 2870 return (mp2ctl); 2871 } 2872 2873 /* 2874 * Return 0 if invalid set request, 1 otherwise, including non-rawip requests. 2875 * TODO: If this ever actually tries to set anything, it needs to be 2876 * to do the appropriate locking. 2877 */ 2878 /* ARGSUSED */ 2879 int 2880 icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name, 2881 uchar_t *ptr, int len) 2882 { 2883 switch (level) { 2884 case EXPER_RAWIP: 2885 return (0); 2886 default: 2887 return (1); 2888 } 2889 } 2890 2891 /* 2892 * This routine creates a T_UDERROR_IND message and passes it upstream. 2893 * The address and options are copied from the T_UNITDATA_REQ message 2894 * passed in mp. This message is freed. 2895 */ 2896 static void 2897 icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err) 2898 { 2899 struct T_unitdata_req *tudr; 2900 mblk_t *mp1; 2901 uchar_t *destaddr; 2902 t_scalar_t destlen; 2903 uchar_t *optaddr; 2904 t_scalar_t optlen; 2905 2906 if ((mp->b_wptr < mp->b_rptr) || 2907 (MBLKL(mp)) < sizeof (struct T_unitdata_req)) { 2908 goto done; 2909 } 2910 tudr = (struct T_unitdata_req *)mp->b_rptr; 2911 destaddr = mp->b_rptr + tudr->DEST_offset; 2912 if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr || 2913 destaddr + tudr->DEST_length < mp->b_rptr || 2914 destaddr + tudr->DEST_length > mp->b_wptr) { 2915 goto done; 2916 } 2917 optaddr = mp->b_rptr + tudr->OPT_offset; 2918 if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr || 2919 optaddr + tudr->OPT_length < mp->b_rptr || 2920 optaddr + tudr->OPT_length > mp->b_wptr) { 2921 goto done; 2922 } 2923 destlen = tudr->DEST_length; 2924 optlen = tudr->OPT_length; 2925 2926 mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen, 2927 (char *)optaddr, optlen, err); 2928 if (mp1 != NULL) 2929 qreply(q, mp1); 2930 2931 done: 2932 freemsg(mp); 2933 } 2934 2935 static int 2936 rawip_do_unbind(conn_t *connp) 2937 { 2938 icmp_t *icmp = connp->conn_icmp; 2939 2940 mutex_enter(&connp->conn_lock); 2941 /* If a bind has not been done, we can't unbind. */ 2942 if (icmp->icmp_state == TS_UNBND) { 2943 mutex_exit(&connp->conn_lock); 2944 return (-TOUTSTATE); 2945 } 2946 connp->conn_saddr_v6 = ipv6_all_zeros; 2947 connp->conn_bound_addr_v6 = ipv6_all_zeros; 2948 connp->conn_laddr_v6 = ipv6_all_zeros; 2949 connp->conn_mcbc_bind = B_FALSE; 2950 connp->conn_lport = 0; 2951 connp->conn_fport = 0; 2952 /* In case we were also connected */ 2953 connp->conn_faddr_v6 = ipv6_all_zeros; 2954 connp->conn_v6lastdst = ipv6_all_zeros; 2955 2956 icmp->icmp_state = TS_UNBND; 2957 2958 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 2959 &connp->conn_faddr_v6, connp->conn_flowinfo); 2960 mutex_exit(&connp->conn_lock); 2961 2962 ip_unbind(connp); 2963 return (0); 2964 } 2965 2966 /* 2967 * This routine is called by icmp_wput to handle T_UNBIND_REQ messages. 2968 * After some error checking, the message is passed downstream to ip. 2969 */ 2970 static void 2971 icmp_tpi_unbind(queue_t *q, mblk_t *mp) 2972 { 2973 conn_t *connp = Q_TO_CONN(q); 2974 int error; 2975 2976 ASSERT(mp->b_cont == NULL); 2977 error = rawip_do_unbind(connp); 2978 if (error) { 2979 if (error < 0) { 2980 icmp_err_ack(q, mp, -error, 0); 2981 } else { 2982 icmp_err_ack(q, mp, 0, error); 2983 } 2984 return; 2985 } 2986 2987 /* 2988 * Convert mp into a T_OK_ACK 2989 */ 2990 2991 mp = mi_tpi_ok_ack_alloc(mp); 2992 2993 /* 2994 * should not happen in practice... T_OK_ACK is smaller than the 2995 * original message. 2996 */ 2997 ASSERT(mp != NULL); 2998 ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK); 2999 qreply(q, mp); 3000 } 3001 3002 /* 3003 * Process IPv4 packets that already include an IP header. 3004 * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and 3005 * IPPROTO_IGMP). 3006 * In this case we ignore the address and any options in the T_UNITDATA_REQ. 3007 * 3008 * The packet is assumed to have a base (20 byte) IP header followed 3009 * by the upper-layer protocol. We include any IP_OPTIONS including a 3010 * CIPSO label but otherwise preserve the base IP header. 3011 */ 3012 static int 3013 icmp_output_hdrincl(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid) 3014 { 3015 icmp_t *icmp = connp->conn_icmp; 3016 icmp_stack_t *is = icmp->icmp_is; 3017 ipha_t iphas; 3018 ipha_t *ipha; 3019 int ip_hdr_length; 3020 int tp_hdr_len; 3021 ip_xmit_attr_t *ixa; 3022 ip_pkt_t *ipp; 3023 in6_addr_t v6src; 3024 in6_addr_t v6dst; 3025 in6_addr_t v6nexthop; 3026 int error; 3027 boolean_t do_ipsec; 3028 3029 /* 3030 * We need an exclusive copy of conn_ixa since the included IP 3031 * header could have any destination. 3032 * That copy has no pointers hence we 3033 * need to set them up once we've parsed the ancillary data. 3034 */ 3035 ixa = conn_get_ixa_exclusive(connp); 3036 if (ixa == NULL) { 3037 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3038 freemsg(mp); 3039 return (ENOMEM); 3040 } 3041 ASSERT(cr != NULL); 3042 /* 3043 * Caller has a reference on cr; from db_credp or because we 3044 * are running in process context. 3045 */ 3046 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3047 ixa->ixa_cred = cr; 3048 ixa->ixa_cpid = pid; 3049 if (is_system_labeled()) { 3050 /* We need to restart with a label based on the cred */ 3051 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 3052 } 3053 3054 /* In case previous destination was multicast or multirt */ 3055 ip_attr_newdst(ixa); 3056 3057 /* Get a copy of conn_xmit_ipp since the TX label might change it */ 3058 ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP); 3059 if (ipp == NULL) { 3060 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3061 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3062 ixa->ixa_cpid = connp->conn_cpid; 3063 ixa_refrele(ixa); 3064 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3065 freemsg(mp); 3066 return (ENOMEM); 3067 } 3068 mutex_enter(&connp->conn_lock); 3069 error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP); 3070 mutex_exit(&connp->conn_lock); 3071 if (error != 0) { 3072 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3073 freemsg(mp); 3074 goto done; 3075 } 3076 3077 /* Sanity check length of packet */ 3078 ipha = (ipha_t *)mp->b_rptr; 3079 3080 ip_hdr_length = IP_SIMPLE_HDR_LENGTH; 3081 if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) { 3082 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 3083 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3084 freemsg(mp); 3085 goto done; 3086 } 3087 ipha = (ipha_t *)mp->b_rptr; 3088 } 3089 ipha->ipha_version_and_hdr_length = 3090 (IP_VERSION<<4) | (ip_hdr_length>>2); 3091 3092 /* 3093 * We set IXAF_DONTFRAG if the application set DF which makes 3094 * IP not fragment. 3095 */ 3096 ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF); 3097 if (ipha->ipha_fragment_offset_and_flags & htons(IPH_DF)) 3098 ixa->ixa_flags |= (IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); 3099 else 3100 ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); 3101 3102 /* Even for multicast and broadcast we honor the apps ttl */ 3103 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; 3104 3105 /* 3106 * No source verification for non-local addresses 3107 */ 3108 if (ipha->ipha_src != INADDR_ANY && 3109 ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid, 3110 is->is_netstack->netstack_ip, B_FALSE) 3111 != IPVL_UNICAST_UP) { 3112 ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; 3113 } 3114 3115 if (ipha->ipha_dst == INADDR_ANY) 3116 ipha->ipha_dst = htonl(INADDR_LOOPBACK); 3117 3118 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src); 3119 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); 3120 3121 /* Defer IPsec if it might need to look at ICMP type/code */ 3122 do_ipsec = ipha->ipha_protocol != IPPROTO_ICMP; 3123 ixa->ixa_flags |= IXAF_IS_IPV4; 3124 3125 ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); 3126 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, 3127 connp->conn_fport, &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 3128 (do_ipsec ? IPDF_IPSEC : 0)); 3129 switch (error) { 3130 case 0: 3131 break; 3132 case EADDRNOTAVAIL: 3133 /* 3134 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3135 * Don't have the application see that errno 3136 */ 3137 error = ENETUNREACH; 3138 goto failed; 3139 case ENETDOWN: 3140 /* 3141 * Have !ipif_addr_ready address; drop packet silently 3142 * until we can get applications to not send until we 3143 * are ready. 3144 */ 3145 error = 0; 3146 goto failed; 3147 case EHOSTUNREACH: 3148 case ENETUNREACH: 3149 if (ixa->ixa_ire != NULL) { 3150 /* 3151 * Let conn_ip_output/ire_send_noroute return 3152 * the error and send any local ICMP error. 3153 */ 3154 error = 0; 3155 break; 3156 } 3157 /* FALLTHRU */ 3158 default: 3159 failed: 3160 freemsg(mp); 3161 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3162 goto done; 3163 } 3164 if (ipha->ipha_src == INADDR_ANY) 3165 IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src); 3166 3167 /* 3168 * We might be going to a different destination than last time, 3169 * thus check that TX allows the communication and compute any 3170 * needed label. 3171 * 3172 * TSOL Note: We have an exclusive ipp and ixa for this thread so we 3173 * don't have to worry about concurrent threads. 3174 */ 3175 if (is_system_labeled()) { 3176 /* 3177 * Check whether Trusted Solaris policy allows communication 3178 * with this host, and pretend that the destination is 3179 * unreachable if not. 3180 * Compute any needed label and place it in ipp_label_v4/v6. 3181 * 3182 * Later conn_build_hdr_template/conn_prepend_hdr takes 3183 * ipp_label_v4/v6 to form the packet. 3184 * 3185 * Tsol note: We have ipp structure local to this thread so 3186 * no locking is needed. 3187 */ 3188 error = conn_update_label(connp, ixa, &v6dst, ipp); 3189 if (error != 0) { 3190 freemsg(mp); 3191 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3192 goto done; 3193 } 3194 } 3195 3196 /* 3197 * Save away a copy of the IPv4 header the application passed down 3198 * and then prepend an IPv4 header complete with any IP options 3199 * including label. 3200 * We need a struct copy since icmp_prepend_hdr will reuse the available 3201 * space in the mblk. 3202 */ 3203 iphas = *ipha; 3204 mp->b_rptr += IP_SIMPLE_HDR_LENGTH; 3205 3206 mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, 0, mp, &error); 3207 if (mp == NULL) { 3208 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3209 ASSERT(error != 0); 3210 goto done; 3211 } 3212 if (ixa->ixa_pktlen > IP_MAXPACKET) { 3213 error = EMSGSIZE; 3214 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3215 freemsg(mp); 3216 goto done; 3217 } 3218 /* Restore key parts of the header that the application passed down */ 3219 ipha = (ipha_t *)mp->b_rptr; 3220 ipha->ipha_type_of_service = iphas.ipha_type_of_service; 3221 ipha->ipha_ident = iphas.ipha_ident; 3222 ipha->ipha_fragment_offset_and_flags = 3223 iphas.ipha_fragment_offset_and_flags; 3224 ipha->ipha_ttl = iphas.ipha_ttl; 3225 ipha->ipha_protocol = iphas.ipha_protocol; 3226 ipha->ipha_src = iphas.ipha_src; 3227 ipha->ipha_dst = iphas.ipha_dst; 3228 3229 ixa->ixa_protocol = ipha->ipha_protocol; 3230 3231 /* 3232 * Make sure that the IP header plus any transport header that is 3233 * checksumed by ip_output is in the first mblk. (ip_output assumes 3234 * that at least the checksum field is in the first mblk.) 3235 */ 3236 switch (ipha->ipha_protocol) { 3237 case IPPROTO_UDP: 3238 tp_hdr_len = 8; 3239 break; 3240 case IPPROTO_TCP: 3241 tp_hdr_len = 20; 3242 break; 3243 default: 3244 tp_hdr_len = 0; 3245 break; 3246 } 3247 ip_hdr_length = IPH_HDR_LENGTH(ipha); 3248 if (mp->b_wptr - mp->b_rptr < ip_hdr_length + tp_hdr_len) { 3249 if (!pullupmsg(mp, ip_hdr_length + tp_hdr_len)) { 3250 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3251 if (mp->b_cont == NULL) 3252 error = EINVAL; 3253 else 3254 error = ENOMEM; 3255 freemsg(mp); 3256 goto done; 3257 } 3258 } 3259 3260 if (!do_ipsec) { 3261 /* Policy might differ for different ICMP type/code */ 3262 if (ixa->ixa_ipsec_policy != NULL) { 3263 IPPOL_REFRELE(ixa->ixa_ipsec_policy); 3264 ixa->ixa_ipsec_policy = NULL; 3265 ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; 3266 } 3267 mp = ip_output_attach_policy(mp, ipha, NULL, connp, ixa); 3268 if (mp == NULL) { 3269 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3270 error = EHOSTUNREACH; /* IPsec policy failure */ 3271 goto done; 3272 } 3273 } 3274 3275 /* We're done. Pass the packet to ip. */ 3276 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3277 3278 error = conn_ip_output(mp, ixa); 3279 /* No rawipOutErrors if an error since IP increases its error counter */ 3280 switch (error) { 3281 case 0: 3282 break; 3283 case EWOULDBLOCK: 3284 (void) ixa_check_drain_insert(connp, ixa); 3285 error = 0; 3286 break; 3287 case EADDRNOTAVAIL: 3288 /* 3289 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3290 * Don't have the application see that errno 3291 */ 3292 error = ENETUNREACH; 3293 break; 3294 } 3295 done: 3296 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3297 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3298 ixa->ixa_cpid = connp->conn_cpid; 3299 ixa_refrele(ixa); 3300 ip_pkt_free(ipp); 3301 kmem_free(ipp, sizeof (*ipp)); 3302 return (error); 3303 } 3304 3305 static mblk_t * 3306 icmp_output_attach_policy(mblk_t *mp, conn_t *connp, ip_xmit_attr_t *ixa) 3307 { 3308 ipha_t *ipha = NULL; 3309 ip6_t *ip6h = NULL; 3310 3311 if (ixa->ixa_flags & IXAF_IS_IPV4) 3312 ipha = (ipha_t *)mp->b_rptr; 3313 else 3314 ip6h = (ip6_t *)mp->b_rptr; 3315 3316 if (ixa->ixa_ipsec_policy != NULL) { 3317 IPPOL_REFRELE(ixa->ixa_ipsec_policy); 3318 ixa->ixa_ipsec_policy = NULL; 3319 ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; 3320 } 3321 return (ip_output_attach_policy(mp, ipha, ip6h, connp, ixa)); 3322 } 3323 3324 /* 3325 * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6 3326 * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from 3327 * the TPI options, otherwise we take them from msg_control. 3328 * If both sin and sin6 is set it is a connected socket and we use conn_faddr. 3329 * Always consumes mp; never consumes tudr_mp. 3330 */ 3331 static int 3332 icmp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp, 3333 mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid) 3334 { 3335 icmp_t *icmp = connp->conn_icmp; 3336 icmp_stack_t *is = icmp->icmp_is; 3337 int error; 3338 ip_xmit_attr_t *ixa; 3339 ip_pkt_t *ipp; 3340 in6_addr_t v6src; 3341 in6_addr_t v6dst; 3342 in6_addr_t v6nexthop; 3343 in_port_t dstport; 3344 uint32_t flowinfo; 3345 int is_absreq_failure = 0; 3346 conn_opt_arg_t coas, *coa; 3347 3348 ASSERT(tudr_mp != NULL || msg != NULL); 3349 3350 /* 3351 * Get ixa before checking state to handle a disconnect race. 3352 * 3353 * We need an exclusive copy of conn_ixa since the ancillary data 3354 * options might modify it. That copy has no pointers hence we 3355 * need to set them up once we've parsed the ancillary data. 3356 */ 3357 ixa = conn_get_ixa_exclusive(connp); 3358 if (ixa == NULL) { 3359 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3360 freemsg(mp); 3361 return (ENOMEM); 3362 } 3363 ASSERT(cr != NULL); 3364 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3365 ixa->ixa_cred = cr; 3366 ixa->ixa_cpid = pid; 3367 if (is_system_labeled()) { 3368 /* We need to restart with a label based on the cred */ 3369 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 3370 } 3371 3372 /* In case previous destination was multicast or multirt */ 3373 ip_attr_newdst(ixa); 3374 3375 /* Get a copy of conn_xmit_ipp since the options might change it */ 3376 ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP); 3377 if (ipp == NULL) { 3378 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3379 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3380 ixa->ixa_cpid = connp->conn_cpid; 3381 ixa_refrele(ixa); 3382 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3383 freemsg(mp); 3384 return (ENOMEM); 3385 } 3386 mutex_enter(&connp->conn_lock); 3387 error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP); 3388 mutex_exit(&connp->conn_lock); 3389 if (error != 0) { 3390 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3391 freemsg(mp); 3392 goto done; 3393 } 3394 3395 /* 3396 * Parse the options and update ixa and ipp as a result. 3397 */ 3398 3399 coa = &coas; 3400 coa->coa_connp = connp; 3401 coa->coa_ixa = ixa; 3402 coa->coa_ipp = ipp; 3403 coa->coa_ancillary = B_TRUE; 3404 coa->coa_changed = 0; 3405 3406 if (msg != NULL) { 3407 error = process_auxiliary_options(connp, msg->msg_control, 3408 msg->msg_controllen, coa, &icmp_opt_obj, icmp_opt_set, cr); 3409 } else { 3410 struct T_unitdata_req *tudr; 3411 3412 tudr = (struct T_unitdata_req *)tudr_mp->b_rptr; 3413 ASSERT(tudr->PRIM_type == T_UNITDATA_REQ); 3414 error = tpi_optcom_buf(connp->conn_wq, tudr_mp, 3415 &tudr->OPT_length, tudr->OPT_offset, cr, &icmp_opt_obj, 3416 coa, &is_absreq_failure); 3417 } 3418 if (error != 0) { 3419 /* 3420 * Note: No special action needed in this 3421 * module for "is_absreq_failure" 3422 */ 3423 freemsg(mp); 3424 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3425 goto done; 3426 } 3427 ASSERT(is_absreq_failure == 0); 3428 3429 mutex_enter(&connp->conn_lock); 3430 /* 3431 * If laddr is unspecified then we look at sin6_src_id. 3432 * We will give precedence to a source address set with IPV6_PKTINFO 3433 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't 3434 * want ip_attr_connect to select a source (since it can fail) when 3435 * IPV6_PKTINFO is specified. 3436 * If this doesn't result in a source address then we get a source 3437 * from ip_attr_connect() below. 3438 */ 3439 v6src = connp->conn_saddr_v6; 3440 if (sin != NULL) { 3441 IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); 3442 dstport = sin->sin_port; 3443 flowinfo = 0; 3444 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 3445 ixa->ixa_flags |= IXAF_IS_IPV4; 3446 } else if (sin6 != NULL) { 3447 boolean_t v4mapped; 3448 uint_t srcid; 3449 3450 v6dst = sin6->sin6_addr; 3451 dstport = sin6->sin6_port; 3452 flowinfo = sin6->sin6_flowinfo; 3453 srcid = sin6->__sin6_src_id; 3454 if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { 3455 ixa->ixa_scopeid = sin6->sin6_scope_id; 3456 ixa->ixa_flags |= IXAF_SCOPEID_SET; 3457 } else { 3458 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 3459 } 3460 v4mapped = IN6_IS_ADDR_V4MAPPED(&v6dst); 3461 if (v4mapped) 3462 ixa->ixa_flags |= IXAF_IS_IPV4; 3463 else 3464 ixa->ixa_flags &= ~IXAF_IS_IPV4; 3465 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 3466 if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 3467 v4mapped, connp->conn_netstack)) { 3468 /* Mismatched v4mapped/v6 specified by srcid. */ 3469 mutex_exit(&connp->conn_lock); 3470 error = EADDRNOTAVAIL; 3471 goto failed; /* Does freemsg() and mib. */ 3472 } 3473 } 3474 } else { 3475 /* Connected case */ 3476 v6dst = connp->conn_faddr_v6; 3477 flowinfo = connp->conn_flowinfo; 3478 } 3479 mutex_exit(&connp->conn_lock); 3480 /* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. */ 3481 if (ipp->ipp_fields & IPPF_ADDR) { 3482 if (ixa->ixa_flags & IXAF_IS_IPV4) { 3483 if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 3484 v6src = ipp->ipp_addr; 3485 } else { 3486 if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 3487 v6src = ipp->ipp_addr; 3488 } 3489 } 3490 /* 3491 * Allow source not assigned to the system 3492 * only if it is not a local addresses 3493 */ 3494 if (!V6_OR_V4_INADDR_ANY(v6src)) { 3495 ip_laddr_t laddr_type; 3496 3497 if (ixa->ixa_flags & IXAF_IS_IPV4) { 3498 ipaddr_t v4src; 3499 3500 IN6_V4MAPPED_TO_IPADDR(&v6src, v4src); 3501 laddr_type = ip_laddr_verify_v4(v4src, ixa->ixa_zoneid, 3502 is->is_netstack->netstack_ip, B_FALSE); 3503 } else { 3504 laddr_type = ip_laddr_verify_v6(&v6src, ixa->ixa_zoneid, 3505 is->is_netstack->netstack_ip, B_FALSE, B_FALSE); 3506 } 3507 if (laddr_type != IPVL_UNICAST_UP) 3508 ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; 3509 } 3510 3511 ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); 3512 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, 3513 &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST); 3514 3515 switch (error) { 3516 case 0: 3517 break; 3518 case EADDRNOTAVAIL: 3519 /* 3520 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3521 * Don't have the application see that errno 3522 */ 3523 error = ENETUNREACH; 3524 goto failed; 3525 case ENETDOWN: 3526 /* 3527 * Have !ipif_addr_ready address; drop packet silently 3528 * until we can get applications to not send until we 3529 * are ready. 3530 */ 3531 error = 0; 3532 goto failed; 3533 case EHOSTUNREACH: 3534 case ENETUNREACH: 3535 if (ixa->ixa_ire != NULL) { 3536 /* 3537 * Let conn_ip_output/ire_send_noroute return 3538 * the error and send any local ICMP error. 3539 */ 3540 error = 0; 3541 break; 3542 } 3543 /* FALLTHRU */ 3544 default: 3545 failed: 3546 freemsg(mp); 3547 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3548 goto done; 3549 } 3550 3551 /* 3552 * We might be going to a different destination than last time, 3553 * thus check that TX allows the communication and compute any 3554 * needed label. 3555 * 3556 * TSOL Note: We have an exclusive ipp and ixa for this thread so we 3557 * don't have to worry about concurrent threads. 3558 */ 3559 if (is_system_labeled()) { 3560 /* 3561 * Check whether Trusted Solaris policy allows communication 3562 * with this host, and pretend that the destination is 3563 * unreachable if not. 3564 * Compute any needed label and place it in ipp_label_v4/v6. 3565 * 3566 * Later conn_build_hdr_template/conn_prepend_hdr takes 3567 * ipp_label_v4/v6 to form the packet. 3568 * 3569 * Tsol note: We have ipp structure local to this thread so 3570 * no locking is needed. 3571 */ 3572 error = conn_update_label(connp, ixa, &v6dst, ipp); 3573 if (error != 0) { 3574 freemsg(mp); 3575 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3576 goto done; 3577 } 3578 } 3579 mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, flowinfo, mp, 3580 &error); 3581 if (mp == NULL) { 3582 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3583 ASSERT(error != 0); 3584 goto done; 3585 } 3586 if (ixa->ixa_pktlen > IP_MAXPACKET) { 3587 error = EMSGSIZE; 3588 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3589 freemsg(mp); 3590 goto done; 3591 } 3592 3593 /* Policy might differ for different ICMP type/code */ 3594 mp = icmp_output_attach_policy(mp, connp, ixa); 3595 if (mp == NULL) { 3596 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3597 error = EHOSTUNREACH; /* IPsec policy failure */ 3598 goto done; 3599 } 3600 3601 /* We're done. Pass the packet to ip. */ 3602 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3603 3604 error = conn_ip_output(mp, ixa); 3605 if (!connp->conn_unspec_src) 3606 ixa->ixa_flags |= IXAF_VERIFY_SOURCE; 3607 /* No rawipOutErrors if an error since IP increases its error counter */ 3608 switch (error) { 3609 case 0: 3610 break; 3611 case EWOULDBLOCK: 3612 (void) ixa_check_drain_insert(connp, ixa); 3613 error = 0; 3614 break; 3615 case EADDRNOTAVAIL: 3616 /* 3617 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3618 * Don't have the application see that errno 3619 */ 3620 error = ENETUNREACH; 3621 /* FALLTHRU */ 3622 default: 3623 mutex_enter(&connp->conn_lock); 3624 /* 3625 * Clear the source and v6lastdst so we call ip_attr_connect 3626 * for the next packet and try to pick a better source. 3627 */ 3628 if (connp->conn_mcbc_bind) 3629 connp->conn_saddr_v6 = ipv6_all_zeros; 3630 else 3631 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 3632 connp->conn_v6lastdst = ipv6_all_zeros; 3633 mutex_exit(&connp->conn_lock); 3634 break; 3635 } 3636 done: 3637 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3638 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3639 ixa->ixa_cpid = connp->conn_cpid; 3640 ixa_refrele(ixa); 3641 ip_pkt_free(ipp); 3642 kmem_free(ipp, sizeof (*ipp)); 3643 return (error); 3644 } 3645 3646 /* 3647 * Handle sending an M_DATA for a connected socket. 3648 * Handles both IPv4 and IPv6. 3649 */ 3650 int 3651 icmp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid) 3652 { 3653 icmp_t *icmp = connp->conn_icmp; 3654 icmp_stack_t *is = icmp->icmp_is; 3655 int error; 3656 ip_xmit_attr_t *ixa; 3657 boolean_t do_ipsec; 3658 3659 /* 3660 * If no other thread is using conn_ixa this just gets a reference to 3661 * conn_ixa. Otherwise we get a safe copy of conn_ixa. 3662 */ 3663 ixa = conn_get_ixa(connp, B_FALSE); 3664 if (ixa == NULL) { 3665 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3666 freemsg(mp); 3667 return (ENOMEM); 3668 } 3669 3670 ASSERT(cr != NULL); 3671 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3672 ixa->ixa_cred = cr; 3673 ixa->ixa_cpid = pid; 3674 3675 /* Defer IPsec if it might need to look at ICMP type/code */ 3676 switch (ixa->ixa_protocol) { 3677 case IPPROTO_ICMP: 3678 case IPPROTO_ICMPV6: 3679 do_ipsec = B_FALSE; 3680 break; 3681 default: 3682 do_ipsec = B_TRUE; 3683 } 3684 3685 mutex_enter(&connp->conn_lock); 3686 mp = icmp_prepend_header_template(connp, ixa, mp, 3687 &connp->conn_saddr_v6, connp->conn_flowinfo, &error); 3688 3689 if (mp == NULL) { 3690 ASSERT(error != 0); 3691 mutex_exit(&connp->conn_lock); 3692 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3693 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3694 ixa->ixa_cpid = connp->conn_cpid; 3695 ixa_refrele(ixa); 3696 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3697 freemsg(mp); 3698 return (error); 3699 } 3700 3701 if (!do_ipsec) { 3702 /* Policy might differ for different ICMP type/code */ 3703 mp = icmp_output_attach_policy(mp, connp, ixa); 3704 if (mp == NULL) { 3705 mutex_exit(&connp->conn_lock); 3706 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3707 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3708 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3709 ixa->ixa_cpid = connp->conn_cpid; 3710 ixa_refrele(ixa); 3711 return (EHOSTUNREACH); /* IPsec policy failure */ 3712 } 3713 } 3714 3715 /* 3716 * In case we got a safe copy of conn_ixa, or if opt_set made us a new 3717 * safe copy, then we need to fill in any pointers in it. 3718 */ 3719 if (ixa->ixa_ire == NULL) { 3720 in6_addr_t faddr, saddr; 3721 in6_addr_t nexthop; 3722 in_port_t fport; 3723 3724 saddr = connp->conn_saddr_v6; 3725 faddr = connp->conn_faddr_v6; 3726 fport = connp->conn_fport; 3727 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop); 3728 mutex_exit(&connp->conn_lock); 3729 3730 error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop, 3731 fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 3732 (do_ipsec ? IPDF_IPSEC : 0)); 3733 switch (error) { 3734 case 0: 3735 break; 3736 case EADDRNOTAVAIL: 3737 /* 3738 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3739 * Don't have the application see that errno 3740 */ 3741 error = ENETUNREACH; 3742 goto failed; 3743 case ENETDOWN: 3744 /* 3745 * Have !ipif_addr_ready address; drop packet silently 3746 * until we can get applications to not send until we 3747 * are ready. 3748 */ 3749 error = 0; 3750 goto failed; 3751 case EHOSTUNREACH: 3752 case ENETUNREACH: 3753 if (ixa->ixa_ire != NULL) { 3754 /* 3755 * Let conn_ip_output/ire_send_noroute return 3756 * the error and send any local ICMP error. 3757 */ 3758 error = 0; 3759 break; 3760 } 3761 /* FALLTHRU */ 3762 default: 3763 failed: 3764 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3765 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3766 ixa->ixa_cpid = connp->conn_cpid; 3767 ixa_refrele(ixa); 3768 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3769 freemsg(mp); 3770 return (error); 3771 } 3772 } else { 3773 /* Done with conn_t */ 3774 mutex_exit(&connp->conn_lock); 3775 } 3776 3777 /* We're done. Pass the packet to ip. */ 3778 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3779 3780 error = conn_ip_output(mp, ixa); 3781 /* No rawipOutErrors if an error since IP increases its error counter */ 3782 switch (error) { 3783 case 0: 3784 break; 3785 case EWOULDBLOCK: 3786 (void) ixa_check_drain_insert(connp, ixa); 3787 error = 0; 3788 break; 3789 case EADDRNOTAVAIL: 3790 /* 3791 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3792 * Don't have the application see that errno 3793 */ 3794 error = ENETUNREACH; 3795 break; 3796 } 3797 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3798 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3799 ixa->ixa_cpid = connp->conn_cpid; 3800 ixa_refrele(ixa); 3801 return (error); 3802 } 3803 3804 /* 3805 * Handle sending an M_DATA to the last destination. 3806 * Handles both IPv4 and IPv6. 3807 * 3808 * NOTE: The caller must hold conn_lock and we drop it here. 3809 */ 3810 int 3811 icmp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid, 3812 ip_xmit_attr_t *ixa) 3813 { 3814 icmp_t *icmp = connp->conn_icmp; 3815 icmp_stack_t *is = icmp->icmp_is; 3816 int error; 3817 boolean_t do_ipsec; 3818 3819 ASSERT(MUTEX_HELD(&connp->conn_lock)); 3820 ASSERT(ixa != NULL); 3821 3822 ASSERT(cr != NULL); 3823 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3824 ixa->ixa_cred = cr; 3825 ixa->ixa_cpid = pid; 3826 3827 /* Defer IPsec if it might need to look at ICMP type/code */ 3828 switch (ixa->ixa_protocol) { 3829 case IPPROTO_ICMP: 3830 case IPPROTO_ICMPV6: 3831 do_ipsec = B_FALSE; 3832 break; 3833 default: 3834 do_ipsec = B_TRUE; 3835 } 3836 3837 3838 mp = icmp_prepend_header_template(connp, ixa, mp, 3839 &connp->conn_v6lastsrc, connp->conn_lastflowinfo, &error); 3840 3841 if (mp == NULL) { 3842 ASSERT(error != 0); 3843 mutex_exit(&connp->conn_lock); 3844 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3845 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3846 ixa->ixa_cpid = connp->conn_cpid; 3847 ixa_refrele(ixa); 3848 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3849 freemsg(mp); 3850 return (error); 3851 } 3852 3853 if (!do_ipsec) { 3854 /* Policy might differ for different ICMP type/code */ 3855 mp = icmp_output_attach_policy(mp, connp, ixa); 3856 if (mp == NULL) { 3857 mutex_exit(&connp->conn_lock); 3858 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3859 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3860 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3861 ixa->ixa_cpid = connp->conn_cpid; 3862 ixa_refrele(ixa); 3863 return (EHOSTUNREACH); /* IPsec policy failure */ 3864 } 3865 } 3866 3867 /* 3868 * In case we got a safe copy of conn_ixa, or if opt_set made us a new 3869 * safe copy, then we need to fill in any pointers in it. 3870 */ 3871 if (ixa->ixa_ire == NULL) { 3872 in6_addr_t lastdst, lastsrc; 3873 in6_addr_t nexthop; 3874 in_port_t lastport; 3875 3876 lastsrc = connp->conn_v6lastsrc; 3877 lastdst = connp->conn_v6lastdst; 3878 lastport = connp->conn_lastdstport; 3879 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop); 3880 mutex_exit(&connp->conn_lock); 3881 3882 error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst, 3883 &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC | 3884 IPDF_VERIFY_DST | (do_ipsec ? IPDF_IPSEC : 0)); 3885 switch (error) { 3886 case 0: 3887 break; 3888 case EADDRNOTAVAIL: 3889 /* 3890 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3891 * Don't have the application see that errno 3892 */ 3893 error = ENETUNREACH; 3894 goto failed; 3895 case ENETDOWN: 3896 /* 3897 * Have !ipif_addr_ready address; drop packet silently 3898 * until we can get applications to not send until we 3899 * are ready. 3900 */ 3901 error = 0; 3902 goto failed; 3903 case EHOSTUNREACH: 3904 case ENETUNREACH: 3905 if (ixa->ixa_ire != NULL) { 3906 /* 3907 * Let conn_ip_output/ire_send_noroute return 3908 * the error and send any local ICMP error. 3909 */ 3910 error = 0; 3911 break; 3912 } 3913 /* FALLTHRU */ 3914 default: 3915 failed: 3916 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3917 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3918 ixa->ixa_cpid = connp->conn_cpid; 3919 ixa_refrele(ixa); 3920 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3921 freemsg(mp); 3922 return (error); 3923 } 3924 } else { 3925 /* Done with conn_t */ 3926 mutex_exit(&connp->conn_lock); 3927 } 3928 3929 /* We're done. Pass the packet to ip. */ 3930 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3931 error = conn_ip_output(mp, ixa); 3932 /* No rawipOutErrors if an error since IP increases its error counter */ 3933 switch (error) { 3934 case 0: 3935 break; 3936 case EWOULDBLOCK: 3937 (void) ixa_check_drain_insert(connp, ixa); 3938 error = 0; 3939 break; 3940 case EADDRNOTAVAIL: 3941 /* 3942 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3943 * Don't have the application see that errno 3944 */ 3945 error = ENETUNREACH; 3946 /* FALLTHRU */ 3947 default: 3948 mutex_enter(&connp->conn_lock); 3949 /* 3950 * Clear the source and v6lastdst so we call ip_attr_connect 3951 * for the next packet and try to pick a better source. 3952 */ 3953 if (connp->conn_mcbc_bind) 3954 connp->conn_saddr_v6 = ipv6_all_zeros; 3955 else 3956 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 3957 connp->conn_v6lastdst = ipv6_all_zeros; 3958 mutex_exit(&connp->conn_lock); 3959 break; 3960 } 3961 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3962 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3963 ixa->ixa_cpid = connp->conn_cpid; 3964 ixa_refrele(ixa); 3965 return (error); 3966 } 3967 3968 3969 /* 3970 * Prepend the header template and then fill in the source and 3971 * flowinfo. The caller needs to handle the destination address since 3972 * it's setting is different if rthdr or source route. 3973 * 3974 * Returns NULL is allocation failed or if the packet would exceed IP_MAXPACKET. 3975 * When it returns NULL it sets errorp. 3976 */ 3977 static mblk_t * 3978 icmp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp, 3979 const in6_addr_t *v6src, uint32_t flowinfo, int *errorp) 3980 { 3981 icmp_t *icmp = connp->conn_icmp; 3982 icmp_stack_t *is = icmp->icmp_is; 3983 uint_t pktlen; 3984 uint_t copylen; 3985 uint8_t *iph; 3986 uint_t ip_hdr_length; 3987 uint32_t cksum; 3988 ip_pkt_t *ipp; 3989 3990 ASSERT(MUTEX_HELD(&connp->conn_lock)); 3991 3992 /* 3993 * Copy the header template. 3994 */ 3995 copylen = connp->conn_ht_iphc_len; 3996 pktlen = copylen + msgdsize(mp); 3997 if (pktlen > IP_MAXPACKET) { 3998 freemsg(mp); 3999 *errorp = EMSGSIZE; 4000 return (NULL); 4001 } 4002 ixa->ixa_pktlen = pktlen; 4003 4004 /* check/fix buffer config, setup pointers into it */ 4005 iph = mp->b_rptr - copylen; 4006 if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) { 4007 mblk_t *mp1; 4008 4009 mp1 = allocb(copylen + is->is_wroff_extra, BPRI_MED); 4010 if (mp1 == NULL) { 4011 freemsg(mp); 4012 *errorp = ENOMEM; 4013 return (NULL); 4014 } 4015 mp1->b_wptr = DB_LIM(mp1); 4016 mp1->b_cont = mp; 4017 mp = mp1; 4018 iph = (mp->b_wptr - copylen); 4019 } 4020 mp->b_rptr = iph; 4021 bcopy(connp->conn_ht_iphc, iph, copylen); 4022 ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc); 4023 4024 ixa->ixa_ip_hdr_length = ip_hdr_length; 4025 4026 /* 4027 * Prepare for ICMPv6 checksum done in IP. 4028 * 4029 * icmp_build_hdr_template has already massaged any routing header 4030 * and placed the result in conn_sum. 4031 * 4032 * We make it easy for IP to include our pseudo header 4033 * by putting our length (and any routing header adjustment) 4034 * in the ICMPv6 checksum field. 4035 */ 4036 cksum = pktlen - ip_hdr_length; 4037 4038 cksum += connp->conn_sum; 4039 cksum = (cksum >> 16) + (cksum & 0xFFFF); 4040 ASSERT(cksum < 0x10000); 4041 4042 ipp = &connp->conn_xmit_ipp; 4043 if (ixa->ixa_flags & IXAF_IS_IPV4) { 4044 ipha_t *ipha = (ipha_t *)iph; 4045 4046 ipha->ipha_length = htons((uint16_t)pktlen); 4047 4048 /* if IP_PKTINFO specified an addres it wins over bind() */ 4049 if ((ipp->ipp_fields & IPPF_ADDR) && 4050 IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) { 4051 ASSERT(ipp->ipp_addr_v4 != INADDR_ANY); 4052 ipha->ipha_src = ipp->ipp_addr_v4; 4053 } else { 4054 IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src); 4055 } 4056 } else { 4057 ip6_t *ip6h = (ip6_t *)iph; 4058 uint_t cksum_offset = 0; 4059 4060 ip6h->ip6_plen = htons((uint16_t)(pktlen - IPV6_HDR_LEN)); 4061 4062 /* if IP_PKTINFO specified an addres it wins over bind() */ 4063 if ((ipp->ipp_fields & IPPF_ADDR) && 4064 !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) { 4065 ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr)); 4066 ip6h->ip6_src = ipp->ipp_addr; 4067 } else { 4068 ip6h->ip6_src = *v6src; 4069 } 4070 ip6h->ip6_vcf = 4071 (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) | 4072 (flowinfo & ~IPV6_VERS_AND_FLOW_MASK); 4073 if (ipp->ipp_fields & IPPF_TCLASS) { 4074 /* Overrides the class part of flowinfo */ 4075 ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf, 4076 ipp->ipp_tclass); 4077 } 4078 4079 if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) { 4080 if (connp->conn_proto == IPPROTO_ICMPV6) { 4081 cksum_offset = ixa->ixa_ip_hdr_length + 4082 offsetof(icmp6_t, icmp6_cksum); 4083 } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { 4084 cksum_offset = ixa->ixa_ip_hdr_length + 4085 ixa->ixa_raw_cksum_offset; 4086 } 4087 } 4088 if (cksum_offset != 0) { 4089 uint16_t *ptr; 4090 4091 /* Make sure the checksum fits in the first mblk */ 4092 if (cksum_offset + sizeof (short) > MBLKL(mp)) { 4093 mblk_t *mp1; 4094 4095 mp1 = msgpullup(mp, 4096 cksum_offset + sizeof (short)); 4097 freemsg(mp); 4098 if (mp1 == NULL) { 4099 *errorp = ENOMEM; 4100 return (NULL); 4101 } 4102 mp = mp1; 4103 iph = mp->b_rptr; 4104 ip6h = (ip6_t *)iph; 4105 } 4106 ptr = (uint16_t *)(mp->b_rptr + cksum_offset); 4107 *ptr = htons(cksum); 4108 } 4109 } 4110 4111 return (mp); 4112 } 4113 4114 /* 4115 * This routine handles all messages passed downstream. It either 4116 * consumes the message or passes it downstream; it never queues a 4117 * a message. 4118 */ 4119 void 4120 icmp_wput(queue_t *q, mblk_t *mp) 4121 { 4122 sin6_t *sin6; 4123 sin_t *sin = NULL; 4124 uint_t srcid; 4125 conn_t *connp = Q_TO_CONN(q); 4126 icmp_t *icmp = connp->conn_icmp; 4127 int error = 0; 4128 struct sockaddr *addr = NULL; 4129 socklen_t addrlen; 4130 icmp_stack_t *is = icmp->icmp_is; 4131 struct T_unitdata_req *tudr; 4132 mblk_t *data_mp; 4133 cred_t *cr; 4134 pid_t pid; 4135 4136 /* 4137 * We directly handle several cases here: T_UNITDATA_REQ message 4138 * coming down as M_PROTO/M_PCPROTO and M_DATA messages for connected 4139 * socket. 4140 */ 4141 switch (DB_TYPE(mp)) { 4142 case M_DATA: 4143 /* sockfs never sends down M_DATA */ 4144 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4145 freemsg(mp); 4146 return; 4147 4148 case M_PROTO: 4149 case M_PCPROTO: 4150 tudr = (struct T_unitdata_req *)mp->b_rptr; 4151 if (MBLKL(mp) < sizeof (*tudr) || 4152 ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) { 4153 icmp_wput_other(q, mp); 4154 return; 4155 } 4156 break; 4157 4158 default: 4159 icmp_wput_other(q, mp); 4160 return; 4161 } 4162 4163 /* Handle valid T_UNITDATA_REQ here */ 4164 data_mp = mp->b_cont; 4165 if (data_mp == NULL) { 4166 error = EPROTO; 4167 goto ud_error2; 4168 } 4169 mp->b_cont = NULL; 4170 4171 if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) { 4172 error = EADDRNOTAVAIL; 4173 goto ud_error2; 4174 } 4175 4176 /* 4177 * All Solaris components should pass a db_credp 4178 * for this message, hence we ASSERT. 4179 * On production kernels we return an error to be robust against 4180 * random streams modules sitting on top of us. 4181 */ 4182 cr = msg_getcred(mp, &pid); 4183 ASSERT(cr != NULL); 4184 if (cr == NULL) { 4185 error = EINVAL; 4186 goto ud_error2; 4187 } 4188 4189 /* 4190 * If a port has not been bound to the stream, fail. 4191 * This is not a problem when sockfs is directly 4192 * above us, because it will ensure that the socket 4193 * is first bound before allowing data to be sent. 4194 */ 4195 if (icmp->icmp_state == TS_UNBND) { 4196 error = EPROTO; 4197 goto ud_error2; 4198 } 4199 addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset]; 4200 addrlen = tudr->DEST_length; 4201 4202 switch (connp->conn_family) { 4203 case AF_INET6: 4204 sin6 = (sin6_t *)addr; 4205 if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) || 4206 (sin6->sin6_family != AF_INET6)) { 4207 error = EADDRNOTAVAIL; 4208 goto ud_error2; 4209 } 4210 4211 /* No support for mapped addresses on raw sockets */ 4212 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 4213 error = EADDRNOTAVAIL; 4214 goto ud_error2; 4215 } 4216 srcid = sin6->__sin6_src_id; 4217 4218 /* 4219 * If the local address is a mapped address return 4220 * an error. 4221 * It would be possible to send an IPv6 packet but the 4222 * response would never make it back to the application 4223 * since it is bound to a mapped address. 4224 */ 4225 if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) { 4226 error = EADDRNOTAVAIL; 4227 goto ud_error2; 4228 } 4229 4230 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) 4231 sin6->sin6_addr = ipv6_loopback; 4232 4233 if (tudr->OPT_length != 0) { 4234 /* 4235 * If we are connected then the destination needs to be 4236 * the same as the connected one. 4237 */ 4238 if (icmp->icmp_state == TS_DATA_XFER && 4239 !conn_same_as_last_v6(connp, sin6)) { 4240 error = EISCONN; 4241 goto ud_error2; 4242 } 4243 error = icmp_output_ancillary(connp, NULL, sin6, 4244 data_mp, mp, NULL, cr, pid); 4245 } else { 4246 ip_xmit_attr_t *ixa; 4247 4248 /* 4249 * We have to allocate an ip_xmit_attr_t before we grab 4250 * conn_lock and we need to hold conn_lock once we've 4251 * checked conn_same_as_last_v6 to handle concurrent 4252 * send* calls on a socket. 4253 */ 4254 ixa = conn_get_ixa(connp, B_FALSE); 4255 if (ixa == NULL) { 4256 error = ENOMEM; 4257 goto ud_error2; 4258 } 4259 mutex_enter(&connp->conn_lock); 4260 4261 if (conn_same_as_last_v6(connp, sin6) && 4262 connp->conn_lastsrcid == srcid && 4263 ipsec_outbound_policy_current(ixa)) { 4264 /* icmp_output_lastdst drops conn_lock */ 4265 error = icmp_output_lastdst(connp, data_mp, cr, 4266 pid, ixa); 4267 } else { 4268 /* icmp_output_newdst drops conn_lock */ 4269 error = icmp_output_newdst(connp, data_mp, NULL, 4270 sin6, cr, pid, ixa); 4271 } 4272 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 4273 } 4274 if (error == 0) { 4275 freeb(mp); 4276 return; 4277 } 4278 break; 4279 4280 case AF_INET: 4281 sin = (sin_t *)addr; 4282 if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) || 4283 (sin->sin_family != AF_INET)) { 4284 error = EADDRNOTAVAIL; 4285 goto ud_error2; 4286 } 4287 if (sin->sin_addr.s_addr == INADDR_ANY) 4288 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 4289 4290 /* Protocol 255 contains full IP headers */ 4291 /* Read without holding lock */ 4292 if (icmp->icmp_hdrincl) { 4293 if (MBLKL(data_mp) < IP_SIMPLE_HDR_LENGTH) { 4294 if (!pullupmsg(data_mp, IP_SIMPLE_HDR_LENGTH)) { 4295 error = EINVAL; 4296 goto ud_error2; 4297 } 4298 } 4299 error = icmp_output_hdrincl(connp, data_mp, cr, pid); 4300 if (error == 0) { 4301 freeb(mp); 4302 return; 4303 } 4304 /* data_mp consumed above */ 4305 data_mp = NULL; 4306 goto ud_error2; 4307 } 4308 4309 if (tudr->OPT_length != 0) { 4310 /* 4311 * If we are connected then the destination needs to be 4312 * the same as the connected one. 4313 */ 4314 if (icmp->icmp_state == TS_DATA_XFER && 4315 !conn_same_as_last_v4(connp, sin)) { 4316 error = EISCONN; 4317 goto ud_error2; 4318 } 4319 error = icmp_output_ancillary(connp, sin, NULL, 4320 data_mp, mp, NULL, cr, pid); 4321 } else { 4322 ip_xmit_attr_t *ixa; 4323 4324 /* 4325 * We have to allocate an ip_xmit_attr_t before we grab 4326 * conn_lock and we need to hold conn_lock once we've 4327 * checked conn_same_as_last_v4 to handle concurrent 4328 * send* calls on a socket. 4329 */ 4330 ixa = conn_get_ixa(connp, B_FALSE); 4331 if (ixa == NULL) { 4332 error = ENOMEM; 4333 goto ud_error2; 4334 } 4335 mutex_enter(&connp->conn_lock); 4336 4337 if (conn_same_as_last_v4(connp, sin) && 4338 ipsec_outbound_policy_current(ixa)) { 4339 /* icmp_output_lastdst drops conn_lock */ 4340 error = icmp_output_lastdst(connp, data_mp, cr, 4341 pid, ixa); 4342 } else { 4343 /* icmp_output_newdst drops conn_lock */ 4344 error = icmp_output_newdst(connp, data_mp, sin, 4345 NULL, cr, pid, ixa); 4346 } 4347 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 4348 } 4349 if (error == 0) { 4350 freeb(mp); 4351 return; 4352 } 4353 break; 4354 } 4355 ASSERT(mp != NULL); 4356 /* mp is freed by the following routine */ 4357 icmp_ud_err(q, mp, (t_scalar_t)error); 4358 return; 4359 4360 ud_error2: 4361 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4362 freemsg(data_mp); 4363 ASSERT(mp != NULL); 4364 /* mp is freed by the following routine */ 4365 icmp_ud_err(q, mp, (t_scalar_t)error); 4366 } 4367 4368 /* 4369 * Handle the case of the IP address or flow label being different 4370 * for both IPv4 and IPv6. 4371 * 4372 * NOTE: The caller must hold conn_lock and we drop it here. 4373 */ 4374 static int 4375 icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6, 4376 cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa) 4377 { 4378 icmp_t *icmp = connp->conn_icmp; 4379 icmp_stack_t *is = icmp->icmp_is; 4380 int error; 4381 ip_xmit_attr_t *oldixa; 4382 boolean_t do_ipsec; 4383 uint_t srcid; 4384 uint32_t flowinfo; 4385 in6_addr_t v6src; 4386 in6_addr_t v6dst; 4387 in6_addr_t v6nexthop; 4388 in_port_t dstport; 4389 4390 ASSERT(MUTEX_HELD(&connp->conn_lock)); 4391 ASSERT(ixa != NULL); 4392 4393 /* 4394 * We hold conn_lock across all the use and modifications of 4395 * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they 4396 * stay consistent. 4397 */ 4398 4399 ASSERT(cr != NULL); 4400 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4401 ixa->ixa_cred = cr; 4402 ixa->ixa_cpid = pid; 4403 if (is_system_labeled()) { 4404 /* We need to restart with a label based on the cred */ 4405 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 4406 } 4407 /* 4408 * If we are connected then the destination needs to be the 4409 * same as the connected one, which is not the case here since we 4410 * checked for that above. 4411 */ 4412 if (icmp->icmp_state == TS_DATA_XFER) { 4413 mutex_exit(&connp->conn_lock); 4414 error = EISCONN; 4415 goto ud_error; 4416 } 4417 4418 /* In case previous destination was multicast or multirt */ 4419 ip_attr_newdst(ixa); 4420 4421 /* 4422 * If laddr is unspecified then we look at sin6_src_id. 4423 * We will give precedence to a source address set with IPV6_PKTINFO 4424 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't 4425 * want ip_attr_connect to select a source (since it can fail) when 4426 * IPV6_PKTINFO is specified. 4427 * If this doesn't result in a source address then we get a source 4428 * from ip_attr_connect() below. 4429 */ 4430 v6src = connp->conn_saddr_v6; 4431 if (sin != NULL) { 4432 IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); 4433 dstport = sin->sin_port; 4434 flowinfo = 0; 4435 /* Don't bother with ip_srcid_find_id(), but indicate anyway. */ 4436 srcid = 0; 4437 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 4438 ixa->ixa_flags |= IXAF_IS_IPV4; 4439 } else { 4440 boolean_t v4mapped; 4441 4442 v6dst = sin6->sin6_addr; 4443 dstport = sin6->sin6_port; 4444 flowinfo = sin6->sin6_flowinfo; 4445 srcid = sin6->__sin6_src_id; 4446 if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { 4447 ixa->ixa_scopeid = sin6->sin6_scope_id; 4448 ixa->ixa_flags |= IXAF_SCOPEID_SET; 4449 } else { 4450 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 4451 } 4452 v4mapped = IN6_IS_ADDR_V4MAPPED(&v6dst); 4453 if (v4mapped) 4454 ixa->ixa_flags |= IXAF_IS_IPV4; 4455 else 4456 ixa->ixa_flags &= ~IXAF_IS_IPV4; 4457 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 4458 if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 4459 v4mapped, connp->conn_netstack)) { 4460 /* Mismatched v4mapped/v6 specified by srcid. */ 4461 mutex_exit(&connp->conn_lock); 4462 error = EADDRNOTAVAIL; 4463 goto ud_error; 4464 } 4465 } 4466 } 4467 /* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. */ 4468 if (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR) { 4469 ip_pkt_t *ipp = &connp->conn_xmit_ipp; 4470 4471 if (ixa->ixa_flags & IXAF_IS_IPV4) { 4472 if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 4473 v6src = ipp->ipp_addr; 4474 } else { 4475 if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 4476 v6src = ipp->ipp_addr; 4477 } 4478 } 4479 4480 /* Defer IPsec if it might need to look at ICMP type/code */ 4481 switch (ixa->ixa_protocol) { 4482 case IPPROTO_ICMP: 4483 case IPPROTO_ICMPV6: 4484 do_ipsec = B_FALSE; 4485 break; 4486 default: 4487 do_ipsec = B_TRUE; 4488 } 4489 4490 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop); 4491 mutex_exit(&connp->conn_lock); 4492 4493 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, 4494 &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 4495 (do_ipsec ? IPDF_IPSEC : 0)); 4496 switch (error) { 4497 case 0: 4498 break; 4499 case EADDRNOTAVAIL: 4500 /* 4501 * IXAF_VERIFY_SOURCE tells us to pick a better source. 4502 * Don't have the application see that errno 4503 */ 4504 error = ENETUNREACH; 4505 goto failed; 4506 case ENETDOWN: 4507 /* 4508 * Have !ipif_addr_ready address; drop packet silently 4509 * until we can get applications to not send until we 4510 * are ready. 4511 */ 4512 error = 0; 4513 goto failed; 4514 case EHOSTUNREACH: 4515 case ENETUNREACH: 4516 if (ixa->ixa_ire != NULL) { 4517 /* 4518 * Let conn_ip_output/ire_send_noroute return 4519 * the error and send any local ICMP error. 4520 */ 4521 error = 0; 4522 break; 4523 } 4524 /* FALLTHRU */ 4525 default: 4526 failed: 4527 goto ud_error; 4528 } 4529 4530 mutex_enter(&connp->conn_lock); 4531 /* 4532 * While we dropped the lock some other thread might have connected 4533 * this socket. If so we bail out with EISCONN to ensure that the 4534 * connecting thread is the one that updates conn_ixa, conn_ht_* 4535 * and conn_*last*. 4536 */ 4537 if (icmp->icmp_state == TS_DATA_XFER) { 4538 mutex_exit(&connp->conn_lock); 4539 error = EISCONN; 4540 goto ud_error; 4541 } 4542 4543 /* 4544 * We need to rebuild the headers if 4545 * - we are labeling packets (could be different for different 4546 * destinations) 4547 * - we have a source route (or routing header) since we need to 4548 * massage that to get the pseudo-header checksum 4549 * - a socket option with COA_HEADER_CHANGED has been set which 4550 * set conn_v6lastdst to zero. 4551 * 4552 * Otherwise the prepend function will just update the src, dst, 4553 * and flow label. 4554 */ 4555 if (is_system_labeled()) { 4556 /* TX MLP requires SCM_UCRED and don't have that here */ 4557 if (connp->conn_mlp_type != mlptSingle) { 4558 mutex_exit(&connp->conn_lock); 4559 error = ECONNREFUSED; 4560 goto ud_error; 4561 } 4562 /* 4563 * Check whether Trusted Solaris policy allows communication 4564 * with this host, and pretend that the destination is 4565 * unreachable if not. 4566 * Compute any needed label and place it in ipp_label_v4/v6. 4567 * 4568 * Later conn_build_hdr_template/conn_prepend_hdr takes 4569 * ipp_label_v4/v6 to form the packet. 4570 * 4571 * Tsol note: Since we hold conn_lock we know no other 4572 * thread manipulates conn_xmit_ipp. 4573 */ 4574 error = conn_update_label(connp, ixa, &v6dst, 4575 &connp->conn_xmit_ipp); 4576 if (error != 0) { 4577 mutex_exit(&connp->conn_lock); 4578 goto ud_error; 4579 } 4580 /* Rebuild the header template */ 4581 error = icmp_build_hdr_template(connp, &v6src, &v6dst, 4582 flowinfo); 4583 if (error != 0) { 4584 mutex_exit(&connp->conn_lock); 4585 goto ud_error; 4586 } 4587 } else if (connp->conn_xmit_ipp.ipp_fields & 4588 (IPPF_IPV4_OPTIONS|IPPF_RTHDR) || 4589 IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) { 4590 /* Rebuild the header template */ 4591 error = icmp_build_hdr_template(connp, &v6src, &v6dst, 4592 flowinfo); 4593 if (error != 0) { 4594 mutex_exit(&connp->conn_lock); 4595 goto ud_error; 4596 } 4597 } else { 4598 /* Simply update the destination address if no source route */ 4599 if (ixa->ixa_flags & IXAF_IS_IPV4) { 4600 ipha_t *ipha = (ipha_t *)connp->conn_ht_iphc; 4601 4602 IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst); 4603 if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) { 4604 ipha->ipha_fragment_offset_and_flags |= 4605 IPH_DF_HTONS; 4606 } else { 4607 ipha->ipha_fragment_offset_and_flags &= 4608 ~IPH_DF_HTONS; 4609 } 4610 } else { 4611 ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc; 4612 ip6h->ip6_dst = v6dst; 4613 } 4614 } 4615 4616 /* 4617 * Remember the dst etc which corresponds to the built header 4618 * template and conn_ixa. 4619 */ 4620 oldixa = conn_replace_ixa(connp, ixa); 4621 connp->conn_v6lastdst = v6dst; 4622 connp->conn_lastflowinfo = flowinfo; 4623 connp->conn_lastscopeid = ixa->ixa_scopeid; 4624 connp->conn_lastsrcid = srcid; 4625 /* Also remember a source to use together with lastdst */ 4626 connp->conn_v6lastsrc = v6src; 4627 4628 data_mp = icmp_prepend_header_template(connp, ixa, data_mp, &v6src, 4629 flowinfo, &error); 4630 4631 /* Done with conn_t */ 4632 mutex_exit(&connp->conn_lock); 4633 ixa_refrele(oldixa); 4634 4635 if (data_mp == NULL) { 4636 ASSERT(error != 0); 4637 goto ud_error; 4638 } 4639 4640 if (!do_ipsec) { 4641 /* Policy might differ for different ICMP type/code */ 4642 data_mp = icmp_output_attach_policy(data_mp, connp, ixa); 4643 if (data_mp == NULL) { 4644 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4645 error = EHOSTUNREACH; /* IPsec policy failure */ 4646 goto done; 4647 } 4648 } 4649 4650 /* We're done. Pass the packet to ip. */ 4651 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 4652 4653 error = conn_ip_output(data_mp, ixa); 4654 /* No rawipOutErrors if an error since IP increases its error counter */ 4655 switch (error) { 4656 case 0: 4657 break; 4658 case EWOULDBLOCK: 4659 (void) ixa_check_drain_insert(connp, ixa); 4660 error = 0; 4661 break; 4662 case EADDRNOTAVAIL: 4663 /* 4664 * IXAF_VERIFY_SOURCE tells us to pick a better source. 4665 * Don't have the application see that errno 4666 */ 4667 error = ENETUNREACH; 4668 /* FALLTHRU */ 4669 default: 4670 mutex_enter(&connp->conn_lock); 4671 /* 4672 * Clear the source and v6lastdst so we call ip_attr_connect 4673 * for the next packet and try to pick a better source. 4674 */ 4675 if (connp->conn_mcbc_bind) 4676 connp->conn_saddr_v6 = ipv6_all_zeros; 4677 else 4678 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 4679 connp->conn_v6lastdst = ipv6_all_zeros; 4680 mutex_exit(&connp->conn_lock); 4681 break; 4682 } 4683 done: 4684 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4685 ixa->ixa_cred = connp->conn_cred; /* Restore */ 4686 ixa->ixa_cpid = connp->conn_cpid; 4687 ixa_refrele(ixa); 4688 return (error); 4689 4690 ud_error: 4691 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4692 ixa->ixa_cred = connp->conn_cred; /* Restore */ 4693 ixa->ixa_cpid = connp->conn_cpid; 4694 ixa_refrele(ixa); 4695 4696 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4697 freemsg(data_mp); 4698 return (error); 4699 } 4700 4701 /* ARGSUSED */ 4702 static void 4703 icmp_wput_fallback(queue_t *q, mblk_t *mp) 4704 { 4705 #ifdef DEBUG 4706 cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n"); 4707 #endif 4708 freemsg(mp); 4709 } 4710 4711 static void 4712 icmp_wput_other(queue_t *q, mblk_t *mp) 4713 { 4714 uchar_t *rptr = mp->b_rptr; 4715 struct iocblk *iocp; 4716 conn_t *connp = Q_TO_CONN(q); 4717 icmp_t *icmp = connp->conn_icmp; 4718 cred_t *cr; 4719 4720 switch (mp->b_datap->db_type) { 4721 case M_PROTO: 4722 case M_PCPROTO: 4723 if (mp->b_wptr - rptr < sizeof (t_scalar_t)) { 4724 /* 4725 * If the message does not contain a PRIM_type, 4726 * throw it away. 4727 */ 4728 freemsg(mp); 4729 return; 4730 } 4731 switch (((t_primp_t)rptr)->type) { 4732 case T_ADDR_REQ: 4733 icmp_addr_req(q, mp); 4734 return; 4735 case O_T_BIND_REQ: 4736 case T_BIND_REQ: 4737 icmp_tpi_bind(q, mp); 4738 return; 4739 case T_CONN_REQ: 4740 icmp_tpi_connect(q, mp); 4741 return; 4742 case T_CAPABILITY_REQ: 4743 icmp_capability_req(q, mp); 4744 return; 4745 case T_INFO_REQ: 4746 icmp_info_req(q, mp); 4747 return; 4748 case T_UNITDATA_REQ: 4749 /* 4750 * If a T_UNITDATA_REQ gets here, the address must 4751 * be bad. Valid T_UNITDATA_REQs are handled 4752 * in icmp_wput. 4753 */ 4754 icmp_ud_err(q, mp, EADDRNOTAVAIL); 4755 return; 4756 case T_UNBIND_REQ: 4757 icmp_tpi_unbind(q, mp); 4758 return; 4759 case T_SVR4_OPTMGMT_REQ: 4760 /* 4761 * All Solaris components should pass a db_credp 4762 * for this TPI message, hence we ASSERT. 4763 * But in case there is some other M_PROTO that looks 4764 * like a TPI message sent by some other kernel 4765 * component, we check and return an error. 4766 */ 4767 cr = msg_getcred(mp, NULL); 4768 ASSERT(cr != NULL); 4769 if (cr == NULL) { 4770 icmp_err_ack(q, mp, TSYSERR, EINVAL); 4771 return; 4772 } 4773 4774 if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get, 4775 cr)) { 4776 svr4_optcom_req(q, mp, cr, &icmp_opt_obj); 4777 } 4778 return; 4779 4780 case T_OPTMGMT_REQ: 4781 /* 4782 * All Solaris components should pass a db_credp 4783 * for this TPI message, hence we ASSERT. 4784 * But in case there is some other M_PROTO that looks 4785 * like a TPI message sent by some other kernel 4786 * component, we check and return an error. 4787 */ 4788 cr = msg_getcred(mp, NULL); 4789 ASSERT(cr != NULL); 4790 if (cr == NULL) { 4791 icmp_err_ack(q, mp, TSYSERR, EINVAL); 4792 return; 4793 } 4794 tpi_optcom_req(q, mp, cr, &icmp_opt_obj); 4795 return; 4796 4797 case T_DISCON_REQ: 4798 icmp_tpi_disconnect(q, mp); 4799 return; 4800 4801 /* The following TPI message is not supported by icmp. */ 4802 case O_T_CONN_RES: 4803 case T_CONN_RES: 4804 icmp_err_ack(q, mp, TNOTSUPPORT, 0); 4805 return; 4806 4807 /* The following 3 TPI requests are illegal for icmp. */ 4808 case T_DATA_REQ: 4809 case T_EXDATA_REQ: 4810 case T_ORDREL_REQ: 4811 icmp_err_ack(q, mp, TNOTSUPPORT, 0); 4812 return; 4813 default: 4814 break; 4815 } 4816 break; 4817 case M_FLUSH: 4818 if (*rptr & FLUSHW) 4819 flushq(q, FLUSHDATA); 4820 break; 4821 case M_IOCTL: 4822 iocp = (struct iocblk *)mp->b_rptr; 4823 switch (iocp->ioc_cmd) { 4824 case TI_GETPEERNAME: 4825 if (icmp->icmp_state != TS_DATA_XFER) { 4826 /* 4827 * If a default destination address has not 4828 * been associated with the stream, then we 4829 * don't know the peer's name. 4830 */ 4831 iocp->ioc_error = ENOTCONN; 4832 iocp->ioc_count = 0; 4833 mp->b_datap->db_type = M_IOCACK; 4834 qreply(q, mp); 4835 return; 4836 } 4837 /* FALLTHRU */ 4838 case TI_GETMYNAME: 4839 /* 4840 * For TI_GETPEERNAME and TI_GETMYNAME, we first 4841 * need to copyin the user's strbuf structure. 4842 * Processing will continue in the M_IOCDATA case 4843 * below. 4844 */ 4845 mi_copyin(q, mp, NULL, 4846 SIZEOF_STRUCT(strbuf, iocp->ioc_flag)); 4847 return; 4848 default: 4849 break; 4850 } 4851 break; 4852 case M_IOCDATA: 4853 icmp_wput_iocdata(q, mp); 4854 return; 4855 default: 4856 /* Unrecognized messages are passed through without change. */ 4857 break; 4858 } 4859 ip_wput_nondata(q, mp); 4860 } 4861 4862 /* 4863 * icmp_wput_iocdata is called by icmp_wput_other to handle all M_IOCDATA 4864 * messages. 4865 */ 4866 static void 4867 icmp_wput_iocdata(queue_t *q, mblk_t *mp) 4868 { 4869 mblk_t *mp1; 4870 STRUCT_HANDLE(strbuf, sb); 4871 uint_t addrlen; 4872 conn_t *connp = Q_TO_CONN(q); 4873 icmp_t *icmp = connp->conn_icmp; 4874 4875 /* Make sure it is one of ours. */ 4876 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4877 case TI_GETMYNAME: 4878 case TI_GETPEERNAME: 4879 break; 4880 default: 4881 ip_wput_nondata(q, mp); 4882 return; 4883 } 4884 4885 switch (mi_copy_state(q, mp, &mp1)) { 4886 case -1: 4887 return; 4888 case MI_COPY_CASE(MI_COPY_IN, 1): 4889 break; 4890 case MI_COPY_CASE(MI_COPY_OUT, 1): 4891 /* 4892 * The address has been copied out, so now 4893 * copyout the strbuf. 4894 */ 4895 mi_copyout(q, mp); 4896 return; 4897 case MI_COPY_CASE(MI_COPY_OUT, 2): 4898 /* 4899 * The address and strbuf have been copied out. 4900 * We're done, so just acknowledge the original 4901 * M_IOCTL. 4902 */ 4903 mi_copy_done(q, mp, 0); 4904 return; 4905 default: 4906 /* 4907 * Something strange has happened, so acknowledge 4908 * the original M_IOCTL with an EPROTO error. 4909 */ 4910 mi_copy_done(q, mp, EPROTO); 4911 return; 4912 } 4913 4914 /* 4915 * Now we have the strbuf structure for TI_GETMYNAME 4916 * and TI_GETPEERNAME. Next we copyout the requested 4917 * address and then we'll copyout the strbuf. 4918 */ 4919 STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag, 4920 (void *)mp1->b_rptr); 4921 4922 if (connp->conn_family == AF_INET) 4923 addrlen = sizeof (sin_t); 4924 else 4925 addrlen = sizeof (sin6_t); 4926 4927 if (STRUCT_FGET(sb, maxlen) < addrlen) { 4928 mi_copy_done(q, mp, EINVAL); 4929 return; 4930 } 4931 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4932 case TI_GETMYNAME: 4933 break; 4934 case TI_GETPEERNAME: 4935 if (icmp->icmp_state != TS_DATA_XFER) { 4936 mi_copy_done(q, mp, ENOTCONN); 4937 return; 4938 } 4939 break; 4940 default: 4941 mi_copy_done(q, mp, EPROTO); 4942 return; 4943 } 4944 mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); 4945 if (!mp1) 4946 return; 4947 4948 STRUCT_FSET(sb, len, addrlen); 4949 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4950 case TI_GETMYNAME: 4951 (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr, 4952 &addrlen); 4953 break; 4954 case TI_GETPEERNAME: 4955 (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr, 4956 &addrlen); 4957 break; 4958 } 4959 mp1->b_wptr += addrlen; 4960 /* Copy out the address */ 4961 mi_copyout(q, mp); 4962 } 4963 4964 void 4965 icmp_ddi_g_init(void) 4966 { 4967 icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr, 4968 icmp_opt_obj.odb_opt_arr_cnt); 4969 4970 /* 4971 * We want to be informed each time a stack is created or 4972 * destroyed in the kernel, so we can maintain the 4973 * set of icmp_stack_t's. 4974 */ 4975 netstack_register(NS_ICMP, rawip_stack_init, NULL, rawip_stack_fini); 4976 } 4977 4978 void 4979 icmp_ddi_g_destroy(void) 4980 { 4981 netstack_unregister(NS_ICMP); 4982 } 4983 4984 #define INET_NAME "ip" 4985 4986 /* 4987 * Initialize the ICMP stack instance. 4988 */ 4989 static void * 4990 rawip_stack_init(netstackid_t stackid, netstack_t *ns) 4991 { 4992 icmp_stack_t *is; 4993 int error = 0; 4994 size_t arrsz; 4995 major_t major; 4996 4997 is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP); 4998 is->is_netstack = ns; 4999 5000 arrsz = sizeof (icmp_propinfo_tbl); 5001 is->is_propinfo_tbl = (mod_prop_info_t *)kmem_alloc(arrsz, KM_SLEEP); 5002 bcopy(icmp_propinfo_tbl, is->is_propinfo_tbl, arrsz); 5003 5004 is->is_ksp = rawip_kstat_init(stackid); 5005 5006 major = mod_name_to_major(INET_NAME); 5007 error = ldi_ident_from_major(major, &is->is_ldi_ident); 5008 ASSERT(error == 0); 5009 return (is); 5010 } 5011 5012 /* 5013 * Free the ICMP stack instance. 5014 */ 5015 static void 5016 rawip_stack_fini(netstackid_t stackid, void *arg) 5017 { 5018 icmp_stack_t *is = (icmp_stack_t *)arg; 5019 5020 kmem_free(is->is_propinfo_tbl, sizeof (icmp_propinfo_tbl)); 5021 is->is_propinfo_tbl = NULL; 5022 5023 rawip_kstat_fini(stackid, is->is_ksp); 5024 is->is_ksp = NULL; 5025 ldi_ident_release(is->is_ldi_ident); 5026 kmem_free(is, sizeof (*is)); 5027 } 5028 5029 static void * 5030 rawip_kstat_init(netstackid_t stackid) { 5031 kstat_t *ksp; 5032 5033 rawip_named_kstat_t template = { 5034 { "inDatagrams", KSTAT_DATA_UINT32, 0 }, 5035 { "inCksumErrs", KSTAT_DATA_UINT32, 0 }, 5036 { "inErrors", KSTAT_DATA_UINT32, 0 }, 5037 { "outDatagrams", KSTAT_DATA_UINT32, 0 }, 5038 { "outErrors", KSTAT_DATA_UINT32, 0 }, 5039 }; 5040 5041 ksp = kstat_create_netstack("icmp", 0, "rawip", "mib2", 5042 KSTAT_TYPE_NAMED, 5043 NUM_OF_FIELDS(rawip_named_kstat_t), 5044 0, stackid); 5045 if (ksp == NULL || ksp->ks_data == NULL) 5046 return (NULL); 5047 5048 bcopy(&template, ksp->ks_data, sizeof (template)); 5049 ksp->ks_update = rawip_kstat_update; 5050 ksp->ks_private = (void *)(uintptr_t)stackid; 5051 5052 kstat_install(ksp); 5053 return (ksp); 5054 } 5055 5056 static void 5057 rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp) 5058 { 5059 if (ksp != NULL) { 5060 ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private); 5061 kstat_delete_netstack(ksp, stackid); 5062 } 5063 } 5064 5065 static int 5066 rawip_kstat_update(kstat_t *ksp, int rw) 5067 { 5068 rawip_named_kstat_t *rawipkp; 5069 netstackid_t stackid = (netstackid_t)(uintptr_t)ksp->ks_private; 5070 netstack_t *ns; 5071 icmp_stack_t *is; 5072 5073 if ((ksp == NULL) || (ksp->ks_data == NULL)) 5074 return (EIO); 5075 5076 if (rw == KSTAT_WRITE) 5077 return (EACCES); 5078 5079 rawipkp = (rawip_named_kstat_t *)ksp->ks_data; 5080 5081 ns = netstack_find_by_stackid(stackid); 5082 if (ns == NULL) 5083 return (-1); 5084 is = ns->netstack_icmp; 5085 if (is == NULL) { 5086 netstack_rele(ns); 5087 return (-1); 5088 } 5089 rawipkp->inDatagrams.value.ui32 = is->is_rawip_mib.rawipInDatagrams; 5090 rawipkp->inCksumErrs.value.ui32 = is->is_rawip_mib.rawipInCksumErrs; 5091 rawipkp->inErrors.value.ui32 = is->is_rawip_mib.rawipInErrors; 5092 rawipkp->outDatagrams.value.ui32 = is->is_rawip_mib.rawipOutDatagrams; 5093 rawipkp->outErrors.value.ui32 = is->is_rawip_mib.rawipOutErrors; 5094 netstack_rele(ns); 5095 return (0); 5096 } 5097 5098 /* ARGSUSED */ 5099 int 5100 rawip_accept(sock_lower_handle_t lproto_handle, 5101 sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle, 5102 cred_t *cr) 5103 { 5104 return (EOPNOTSUPP); 5105 } 5106 5107 /* ARGSUSED */ 5108 int 5109 rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5110 socklen_t len, cred_t *cr) 5111 { 5112 conn_t *connp = (conn_t *)proto_handle; 5113 int error; 5114 5115 /* All Solaris components should pass a cred for this operation. */ 5116 ASSERT(cr != NULL); 5117 5118 /* Binding to a NULL address really means unbind */ 5119 if (sa == NULL) 5120 error = rawip_do_unbind(connp); 5121 else 5122 error = rawip_do_bind(connp, sa, len); 5123 5124 if (error < 0) { 5125 if (error == -TOUTSTATE) 5126 error = EINVAL; 5127 else 5128 error = proto_tlitosyserr(-error); 5129 } 5130 return (error); 5131 } 5132 5133 static int 5134 rawip_implicit_bind(conn_t *connp) 5135 { 5136 sin6_t sin6addr; 5137 sin_t *sin; 5138 sin6_t *sin6; 5139 socklen_t len; 5140 int error; 5141 5142 if (connp->conn_family == AF_INET) { 5143 len = sizeof (struct sockaddr_in); 5144 sin = (sin_t *)&sin6addr; 5145 *sin = sin_null; 5146 sin->sin_family = AF_INET; 5147 sin->sin_addr.s_addr = INADDR_ANY; 5148 } else { 5149 ASSERT(connp->conn_family == AF_INET6); 5150 len = sizeof (sin6_t); 5151 sin6 = (sin6_t *)&sin6addr; 5152 *sin6 = sin6_null; 5153 sin6->sin6_family = AF_INET6; 5154 V6_SET_ZERO(sin6->sin6_addr); 5155 } 5156 5157 error = rawip_do_bind(connp, (struct sockaddr *)&sin6addr, len); 5158 5159 return ((error < 0) ? proto_tlitosyserr(-error) : error); 5160 } 5161 5162 static int 5163 rawip_unbind(conn_t *connp) 5164 { 5165 int error; 5166 5167 error = rawip_do_unbind(connp); 5168 if (error < 0) { 5169 error = proto_tlitosyserr(-error); 5170 } 5171 return (error); 5172 } 5173 5174 /* ARGSUSED */ 5175 int 5176 rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) 5177 { 5178 return (EOPNOTSUPP); 5179 } 5180 5181 int 5182 rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, 5183 socklen_t len, sock_connid_t *id, cred_t *cr) 5184 { 5185 conn_t *connp = (conn_t *)proto_handle; 5186 icmp_t *icmp = connp->conn_icmp; 5187 int error; 5188 boolean_t did_bind = B_FALSE; 5189 pid_t pid = curproc->p_pid; 5190 5191 /* All Solaris components should pass a cred for this operation. */ 5192 ASSERT(cr != NULL); 5193 5194 if (sa == NULL) { 5195 /* 5196 * Disconnect 5197 * Make sure we are connected 5198 */ 5199 if (icmp->icmp_state != TS_DATA_XFER) 5200 return (EINVAL); 5201 5202 error = icmp_disconnect(connp); 5203 return (error); 5204 } 5205 5206 error = proto_verify_ip_addr(connp->conn_family, sa, len); 5207 if (error != 0) 5208 return (error); 5209 5210 /* do an implicit bind if necessary */ 5211 if (icmp->icmp_state == TS_UNBND) { 5212 error = rawip_implicit_bind(connp); 5213 /* 5214 * We could be racing with an actual bind, in which case 5215 * we would see EPROTO. We cross our fingers and try 5216 * to connect. 5217 */ 5218 if (!(error == 0 || error == EPROTO)) 5219 return (error); 5220 did_bind = B_TRUE; 5221 } 5222 5223 /* 5224 * set SO_DGRAM_ERRIND 5225 */ 5226 connp->conn_dgram_errind = B_TRUE; 5227 5228 error = rawip_do_connect(connp, sa, len, cr, pid); 5229 if (error != 0 && did_bind) { 5230 int unbind_err; 5231 5232 unbind_err = rawip_unbind(connp); 5233 ASSERT(unbind_err == 0); 5234 } 5235 5236 if (error == 0) { 5237 *id = 0; 5238 (*connp->conn_upcalls->su_connected)(connp->conn_upper_handle, 5239 0, NULL, -1); 5240 } else if (error < 0) { 5241 error = proto_tlitosyserr(-error); 5242 } 5243 return (error); 5244 } 5245 5246 /* ARGSUSED2 */ 5247 int 5248 rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, 5249 boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb, 5250 sock_quiesce_arg_t *arg) 5251 { 5252 conn_t *connp = (conn_t *)proto_handle; 5253 icmp_t *icmp; 5254 struct T_capability_ack tca; 5255 struct sockaddr_in6 laddr, faddr; 5256 socklen_t laddrlen, faddrlen; 5257 short opts; 5258 struct stroptions *stropt; 5259 mblk_t *mp, *stropt_mp; 5260 int error; 5261 5262 icmp = connp->conn_icmp; 5263 5264 stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL); 5265 5266 /* 5267 * setup the fallback stream that was allocated 5268 */ 5269 connp->conn_dev = (dev_t)RD(q)->q_ptr; 5270 connp->conn_minor_arena = WR(q)->q_ptr; 5271 5272 RD(q)->q_ptr = WR(q)->q_ptr = connp; 5273 5274 WR(q)->q_qinfo = &icmpwinit; 5275 5276 connp->conn_rq = RD(q); 5277 connp->conn_wq = WR(q); 5278 5279 /* Notify stream head about options before sending up data */ 5280 stropt_mp->b_datap->db_type = M_SETOPTS; 5281 stropt_mp->b_wptr += sizeof (*stropt); 5282 stropt = (struct stroptions *)stropt_mp->b_rptr; 5283 stropt->so_flags = SO_WROFF | SO_HIWAT; 5284 stropt->so_wroff = connp->conn_wroff; 5285 stropt->so_hiwat = connp->conn_rcvbuf; 5286 putnext(RD(q), stropt_mp); 5287 5288 /* 5289 * free helper stream 5290 */ 5291 ip_free_helper_stream(connp); 5292 5293 /* 5294 * Collect the information needed to sync with the sonode 5295 */ 5296 icmp_do_capability_ack(icmp, &tca, TC1_INFO); 5297 5298 laddrlen = faddrlen = sizeof (sin6_t); 5299 (void) rawip_getsockname((sock_lower_handle_t)connp, 5300 (struct sockaddr *)&laddr, &laddrlen, CRED()); 5301 error = rawip_getpeername((sock_lower_handle_t)connp, 5302 (struct sockaddr *)&faddr, &faddrlen, CRED()); 5303 if (error != 0) 5304 faddrlen = 0; 5305 opts = 0; 5306 if (connp->conn_dgram_errind) 5307 opts |= SO_DGRAM_ERRIND; 5308 if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE) 5309 opts |= SO_DONTROUTE; 5310 5311 mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca, 5312 (struct sockaddr *)&laddr, laddrlen, 5313 (struct sockaddr *)&faddr, faddrlen, opts); 5314 5315 /* 5316 * Attempts to send data up during fallback will result in it being 5317 * queued in icmp_t. Now we push up any queued packets. 5318 */ 5319 mutex_enter(&icmp->icmp_recv_lock); 5320 if (mp != NULL) { 5321 mp->b_next = icmp->icmp_fallback_queue_head; 5322 icmp->icmp_fallback_queue_head = mp; 5323 } 5324 while (icmp->icmp_fallback_queue_head != NULL) { 5325 mp = icmp->icmp_fallback_queue_head; 5326 icmp->icmp_fallback_queue_head = mp->b_next; 5327 mp->b_next = NULL; 5328 mutex_exit(&icmp->icmp_recv_lock); 5329 putnext(RD(q), mp); 5330 mutex_enter(&icmp->icmp_recv_lock); 5331 } 5332 icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head; 5333 5334 /* 5335 * No longer a streams less socket 5336 */ 5337 mutex_enter(&connp->conn_lock); 5338 connp->conn_flags &= ~IPCL_NONSTR; 5339 mutex_exit(&connp->conn_lock); 5340 5341 mutex_exit(&icmp->icmp_recv_lock); 5342 5343 ASSERT(icmp->icmp_fallback_queue_head == NULL && 5344 icmp->icmp_fallback_queue_tail == NULL); 5345 5346 ASSERT(connp->conn_ref >= 1); 5347 5348 return (0); 5349 } 5350 5351 /* ARGSUSED2 */ 5352 sock_lower_handle_t 5353 rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, 5354 uint_t *smodep, int *errorp, int flags, cred_t *credp) 5355 { 5356 conn_t *connp; 5357 5358 if (type != SOCK_RAW || (family != AF_INET && family != AF_INET6)) { 5359 *errorp = EPROTONOSUPPORT; 5360 return (NULL); 5361 } 5362 5363 connp = rawip_do_open(family, credp, errorp, flags); 5364 if (connp != NULL) { 5365 connp->conn_flags |= IPCL_NONSTR; 5366 5367 mutex_enter(&connp->conn_lock); 5368 connp->conn_state_flags &= ~CONN_INCIPIENT; 5369 mutex_exit(&connp->conn_lock); 5370 *sock_downcalls = &sock_rawip_downcalls; 5371 *smodep = SM_ATOMIC; 5372 } else { 5373 ASSERT(*errorp != 0); 5374 } 5375 5376 return ((sock_lower_handle_t)connp); 5377 } 5378 5379 /* ARGSUSED3 */ 5380 void 5381 rawip_activate(sock_lower_handle_t proto_handle, 5382 sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags, 5383 cred_t *cr) 5384 { 5385 conn_t *connp = (conn_t *)proto_handle; 5386 struct sock_proto_props sopp; 5387 5388 /* All Solaris components should pass a cred for this operation. */ 5389 ASSERT(cr != NULL); 5390 5391 connp->conn_upcalls = sock_upcalls; 5392 connp->conn_upper_handle = sock_handle; 5393 5394 sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | 5395 SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ; 5396 sopp.sopp_wroff = connp->conn_wroff; 5397 sopp.sopp_rxhiwat = connp->conn_rcvbuf; 5398 sopp.sopp_rxlowat = connp->conn_rcvlowat; 5399 sopp.sopp_maxblk = INFPSZ; 5400 sopp.sopp_maxpsz = IP_MAXPACKET; 5401 sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 0 : 5402 icmp_mod_info.mi_minpsz; 5403 5404 (*connp->conn_upcalls->su_set_proto_props) 5405 (connp->conn_upper_handle, &sopp); 5406 5407 icmp_bind_proto(connp->conn_icmp); 5408 } 5409 5410 /* ARGSUSED3 */ 5411 int 5412 rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5413 socklen_t *salenp, cred_t *cr) 5414 { 5415 conn_t *connp = (conn_t *)proto_handle; 5416 icmp_t *icmp = connp->conn_icmp; 5417 int error; 5418 5419 /* All Solaris components should pass a cred for this operation. */ 5420 ASSERT(cr != NULL); 5421 5422 mutex_enter(&connp->conn_lock); 5423 if (icmp->icmp_state != TS_DATA_XFER) 5424 error = ENOTCONN; 5425 else 5426 error = conn_getpeername(connp, sa, salenp); 5427 mutex_exit(&connp->conn_lock); 5428 return (error); 5429 } 5430 5431 /* ARGSUSED3 */ 5432 int 5433 rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5434 socklen_t *salenp, cred_t *cr) 5435 { 5436 conn_t *connp = (conn_t *)proto_handle; 5437 int error; 5438 5439 /* All Solaris components should pass a cred for this operation. */ 5440 ASSERT(cr != NULL); 5441 5442 mutex_enter(&connp->conn_lock); 5443 error = conn_getsockname(connp, sa, salenp); 5444 mutex_exit(&connp->conn_lock); 5445 return (error); 5446 } 5447 5448 int 5449 rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 5450 const void *optvalp, socklen_t optlen, cred_t *cr) 5451 { 5452 conn_t *connp = (conn_t *)proto_handle; 5453 int error; 5454 5455 /* All Solaris components should pass a cred for this operation. */ 5456 ASSERT(cr != NULL); 5457 5458 error = proto_opt_check(level, option_name, optlen, NULL, 5459 icmp_opt_obj.odb_opt_des_arr, 5460 icmp_opt_obj.odb_opt_arr_cnt, 5461 B_TRUE, B_FALSE, cr); 5462 5463 if (error != 0) { 5464 /* 5465 * option not recognized 5466 */ 5467 if (error < 0) { 5468 error = proto_tlitosyserr(-error); 5469 } 5470 return (error); 5471 } 5472 5473 error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, 5474 option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen, 5475 (uchar_t *)optvalp, NULL, cr); 5476 5477 ASSERT(error >= 0); 5478 5479 return (error); 5480 } 5481 5482 int 5483 rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 5484 void *optvalp, socklen_t *optlen, cred_t *cr) 5485 { 5486 int error; 5487 conn_t *connp = (conn_t *)proto_handle; 5488 t_uscalar_t max_optbuf_len; 5489 void *optvalp_buf; 5490 int len; 5491 5492 /* All Solaris components should pass a cred for this operation. */ 5493 ASSERT(cr != NULL); 5494 5495 error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, 5496 icmp_opt_obj.odb_opt_des_arr, 5497 icmp_opt_obj.odb_opt_arr_cnt, 5498 B_FALSE, B_TRUE, cr); 5499 5500 if (error != 0) { 5501 if (error < 0) { 5502 error = proto_tlitosyserr(-error); 5503 } 5504 return (error); 5505 } 5506 5507 optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); 5508 len = icmp_opt_get(connp, level, option_name, optvalp_buf); 5509 if (len == -1) { 5510 kmem_free(optvalp_buf, max_optbuf_len); 5511 return (EINVAL); 5512 } 5513 5514 /* 5515 * update optlen and copy option value 5516 */ 5517 t_uscalar_t size = MIN(len, *optlen); 5518 5519 bcopy(optvalp_buf, optvalp, size); 5520 bcopy(&size, optlen, sizeof (size)); 5521 5522 kmem_free(optvalp_buf, max_optbuf_len); 5523 return (0); 5524 } 5525 5526 /* ARGSUSED1 */ 5527 int 5528 rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) 5529 { 5530 conn_t *connp = (conn_t *)proto_handle; 5531 5532 /* All Solaris components should pass a cred for this operation. */ 5533 ASSERT(cr != NULL); 5534 5535 (void) rawip_do_close(connp); 5536 return (0); 5537 } 5538 5539 /* ARGSUSED2 */ 5540 int 5541 rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) 5542 { 5543 conn_t *connp = (conn_t *)proto_handle; 5544 5545 /* All Solaris components should pass a cred for this operation. */ 5546 ASSERT(cr != NULL); 5547 5548 /* shut down the send side */ 5549 if (how != SHUT_RD) 5550 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 5551 SOCK_OPCTL_SHUT_SEND, 0); 5552 /* shut down the recv side */ 5553 if (how != SHUT_WR) 5554 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 5555 SOCK_OPCTL_SHUT_RECV, 0); 5556 return (0); 5557 } 5558 5559 void 5560 rawip_clr_flowctrl(sock_lower_handle_t proto_handle) 5561 { 5562 conn_t *connp = (conn_t *)proto_handle; 5563 icmp_t *icmp = connp->conn_icmp; 5564 5565 mutex_enter(&icmp->icmp_recv_lock); 5566 connp->conn_flow_cntrld = B_FALSE; 5567 mutex_exit(&icmp->icmp_recv_lock); 5568 } 5569 5570 int 5571 rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, 5572 int mode, int32_t *rvalp, cred_t *cr) 5573 { 5574 conn_t *connp = (conn_t *)proto_handle; 5575 int error; 5576 5577 /* All Solaris components should pass a cred for this operation. */ 5578 ASSERT(cr != NULL); 5579 5580 /* 5581 * If we don't have a helper stream then create one. 5582 * ip_create_helper_stream takes care of locking the conn_t, 5583 * so this check for NULL is just a performance optimization. 5584 */ 5585 if (connp->conn_helper_info == NULL) { 5586 icmp_stack_t *is = connp->conn_icmp->icmp_is; 5587 5588 ASSERT(is->is_ldi_ident != NULL); 5589 5590 /* 5591 * Create a helper stream for non-STREAMS socket. 5592 */ 5593 error = ip_create_helper_stream(connp, is->is_ldi_ident); 5594 if (error != 0) { 5595 ip0dbg(("rawip_ioctl: create of IP helper stream " 5596 "failed %d\n", error)); 5597 return (error); 5598 } 5599 } 5600 5601 switch (cmd) { 5602 case _SIOCSOCKFALLBACK: 5603 case TI_GETPEERNAME: 5604 case TI_GETMYNAME: 5605 #ifdef DEBUG 5606 cmn_err(CE_CONT, "icmp_ioctl cmd 0x%x on non streams" 5607 " socket", cmd); 5608 #endif 5609 error = EINVAL; 5610 break; 5611 default: 5612 /* 5613 * Pass on to IP using helper stream 5614 */ 5615 error = ldi_ioctl(connp->conn_helper_info->iphs_handle, 5616 cmd, arg, mode, cr, rvalp); 5617 break; 5618 } 5619 return (error); 5620 } 5621 5622 int 5623 rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, 5624 cred_t *cr) 5625 { 5626 sin6_t *sin6; 5627 sin_t *sin = NULL; 5628 uint_t srcid; 5629 conn_t *connp = (conn_t *)proto_handle; 5630 icmp_t *icmp = connp->conn_icmp; 5631 int error = 0; 5632 icmp_stack_t *is = icmp->icmp_is; 5633 pid_t pid = curproc->p_pid; 5634 ip_xmit_attr_t *ixa; 5635 5636 ASSERT(DB_TYPE(mp) == M_DATA); 5637 5638 /* All Solaris components should pass a cred for this operation. */ 5639 ASSERT(cr != NULL); 5640 5641 /* do an implicit bind if necessary */ 5642 if (icmp->icmp_state == TS_UNBND) { 5643 error = rawip_implicit_bind(connp); 5644 /* 5645 * We could be racing with an actual bind, in which case 5646 * we would see EPROTO. We cross our fingers and try 5647 * to connect. 5648 */ 5649 if (!(error == 0 || error == EPROTO)) { 5650 freemsg(mp); 5651 return (error); 5652 } 5653 } 5654 5655 /* Protocol 255 contains full IP headers */ 5656 /* Read without holding lock */ 5657 if (icmp->icmp_hdrincl) { 5658 ASSERT(connp->conn_ipversion == IPV4_VERSION); 5659 if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) { 5660 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 5661 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5662 freemsg(mp); 5663 return (EINVAL); 5664 } 5665 } 5666 error = icmp_output_hdrincl(connp, mp, cr, pid); 5667 if (is->is_sendto_ignerr) 5668 return (0); 5669 else 5670 return (error); 5671 } 5672 5673 /* Connected? */ 5674 if (msg->msg_name == NULL) { 5675 if (icmp->icmp_state != TS_DATA_XFER) { 5676 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5677 return (EDESTADDRREQ); 5678 } 5679 if (msg->msg_controllen != 0) { 5680 error = icmp_output_ancillary(connp, NULL, NULL, mp, 5681 NULL, msg, cr, pid); 5682 } else { 5683 error = icmp_output_connected(connp, mp, cr, pid); 5684 } 5685 if (is->is_sendto_ignerr) 5686 return (0); 5687 else 5688 return (error); 5689 } 5690 if (icmp->icmp_state == TS_DATA_XFER) { 5691 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5692 return (EISCONN); 5693 } 5694 error = proto_verify_ip_addr(connp->conn_family, 5695 (struct sockaddr *)msg->msg_name, msg->msg_namelen); 5696 if (error != 0) { 5697 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5698 return (error); 5699 } 5700 switch (connp->conn_family) { 5701 case AF_INET6: 5702 sin6 = (sin6_t *)msg->msg_name; 5703 5704 /* No support for mapped addresses on raw sockets */ 5705 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 5706 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5707 return (EADDRNOTAVAIL); 5708 } 5709 srcid = sin6->__sin6_src_id; 5710 5711 /* 5712 * If the local address is a mapped address return 5713 * an error. 5714 * It would be possible to send an IPv6 packet but the 5715 * response would never make it back to the application 5716 * since it is bound to a mapped address. 5717 */ 5718 if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) { 5719 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5720 return (EADDRNOTAVAIL); 5721 } 5722 5723 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) 5724 sin6->sin6_addr = ipv6_loopback; 5725 5726 /* 5727 * We have to allocate an ip_xmit_attr_t before we grab 5728 * conn_lock and we need to hold conn_lock once we've check 5729 * conn_same_as_last_v6 to handle concurrent send* calls on a 5730 * socket. 5731 */ 5732 if (msg->msg_controllen == 0) { 5733 ixa = conn_get_ixa(connp, B_FALSE); 5734 if (ixa == NULL) { 5735 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5736 return (ENOMEM); 5737 } 5738 } else { 5739 ixa = NULL; 5740 } 5741 mutex_enter(&connp->conn_lock); 5742 if (icmp->icmp_delayed_error != 0) { 5743 sin6_t *sin2 = (sin6_t *)&icmp->icmp_delayed_addr; 5744 5745 error = icmp->icmp_delayed_error; 5746 icmp->icmp_delayed_error = 0; 5747 5748 /* Compare IP address and family */ 5749 5750 if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, 5751 &sin2->sin6_addr) && 5752 sin6->sin6_family == sin2->sin6_family) { 5753 mutex_exit(&connp->conn_lock); 5754 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5755 if (ixa != NULL) 5756 ixa_refrele(ixa); 5757 return (error); 5758 } 5759 } 5760 if (msg->msg_controllen != 0) { 5761 mutex_exit(&connp->conn_lock); 5762 ASSERT(ixa == NULL); 5763 error = icmp_output_ancillary(connp, NULL, sin6, mp, 5764 NULL, msg, cr, pid); 5765 } else if (conn_same_as_last_v6(connp, sin6) && 5766 connp->conn_lastsrcid == srcid && 5767 ipsec_outbound_policy_current(ixa)) { 5768 /* icmp_output_lastdst drops conn_lock */ 5769 error = icmp_output_lastdst(connp, mp, cr, pid, ixa); 5770 } else { 5771 /* icmp_output_newdst drops conn_lock */ 5772 error = icmp_output_newdst(connp, mp, NULL, sin6, cr, 5773 pid, ixa); 5774 } 5775 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 5776 if (is->is_sendto_ignerr) 5777 return (0); 5778 else 5779 return (error); 5780 case AF_INET: 5781 sin = (sin_t *)msg->msg_name; 5782 5783 if (sin->sin_addr.s_addr == INADDR_ANY) 5784 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 5785 5786 /* 5787 * We have to allocate an ip_xmit_attr_t before we grab 5788 * conn_lock and we need to hold conn_lock once we've check 5789 * conn_same_as_last_v6 to handle concurrent send* on a socket. 5790 */ 5791 if (msg->msg_controllen == 0) { 5792 ixa = conn_get_ixa(connp, B_FALSE); 5793 if (ixa == NULL) { 5794 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5795 return (ENOMEM); 5796 } 5797 } else { 5798 ixa = NULL; 5799 } 5800 mutex_enter(&connp->conn_lock); 5801 if (icmp->icmp_delayed_error != 0) { 5802 sin_t *sin2 = (sin_t *)&icmp->icmp_delayed_addr; 5803 5804 error = icmp->icmp_delayed_error; 5805 icmp->icmp_delayed_error = 0; 5806 5807 /* Compare IP address */ 5808 5809 if (sin->sin_addr.s_addr == sin2->sin_addr.s_addr) { 5810 mutex_exit(&connp->conn_lock); 5811 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5812 if (ixa != NULL) 5813 ixa_refrele(ixa); 5814 return (error); 5815 } 5816 } 5817 5818 if (msg->msg_controllen != 0) { 5819 mutex_exit(&connp->conn_lock); 5820 ASSERT(ixa == NULL); 5821 error = icmp_output_ancillary(connp, sin, NULL, mp, 5822 NULL, msg, cr, pid); 5823 } else if (conn_same_as_last_v4(connp, sin) && 5824 ipsec_outbound_policy_current(ixa)) { 5825 /* icmp_output_lastdst drops conn_lock */ 5826 error = icmp_output_lastdst(connp, mp, cr, pid, ixa); 5827 } else { 5828 /* icmp_output_newdst drops conn_lock */ 5829 error = icmp_output_newdst(connp, mp, sin, NULL, cr, 5830 pid, ixa); 5831 } 5832 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 5833 if (is->is_sendto_ignerr) 5834 return (0); 5835 else 5836 return (error); 5837 default: 5838 return (EINVAL); 5839 } 5840 } 5841 5842 sock_downcalls_t sock_rawip_downcalls = { 5843 rawip_activate, 5844 rawip_accept, 5845 rawip_bind, 5846 rawip_listen, 5847 rawip_connect, 5848 rawip_getpeername, 5849 rawip_getsockname, 5850 rawip_getsockopt, 5851 rawip_setsockopt, 5852 rawip_send, 5853 NULL, 5854 NULL, 5855 NULL, 5856 rawip_shutdown, 5857 rawip_clr_flowctrl, 5858 rawip_ioctl, 5859 rawip_close 5860 };