1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 24 * 25 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 26 */ 27 /* Copyright (c) 1990 Mentat Inc. */ 28 29 #include <sys/types.h> 30 #include <sys/stream.h> 31 #include <sys/dlpi.h> 32 #include <sys/stropts.h> 33 #include <sys/sysmacros.h> 34 #include <sys/strsubr.h> 35 #include <sys/strlog.h> 36 #include <sys/strsun.h> 37 #include <sys/zone.h> 38 #define _SUN_TPI_VERSION 2 39 #include <sys/tihdr.h> 40 #include <sys/xti_inet.h> 41 #include <sys/ddi.h> 42 #include <sys/sunddi.h> 43 #include <sys/cmn_err.h> 44 #include <sys/debug.h> 45 #include <sys/kobj.h> 46 #include <sys/modctl.h> 47 #include <sys/atomic.h> 48 #include <sys/policy.h> 49 #include <sys/priv.h> 50 51 #include <sys/systm.h> 52 #include <sys/param.h> 53 #include <sys/kmem.h> 54 #include <sys/sdt.h> 55 #include <sys/socket.h> 56 #include <sys/vtrace.h> 57 #include <sys/isa_defs.h> 58 #include <sys/mac.h> 59 #include <net/if.h> 60 #include <net/if_arp.h> 61 #include <net/route.h> 62 #include <sys/sockio.h> 63 #include <netinet/in.h> 64 #include <net/if_dl.h> 65 66 #include <inet/common.h> 67 #include <inet/mi.h> 68 #include <inet/mib2.h> 69 #include <inet/nd.h> 70 #include <inet/arp.h> 71 #include <inet/snmpcom.h> 72 #include <inet/kstatcom.h> 73 74 #include <netinet/igmp_var.h> 75 #include <netinet/ip6.h> 76 #include <netinet/icmp6.h> 77 #include <netinet/sctp.h> 78 79 #include <inet/ip.h> 80 #include <inet/ip_impl.h> 81 #include <inet/ip6.h> 82 #include <inet/ip6_asp.h> 83 #include <inet/optcom.h> 84 #include <inet/tcp.h> 85 #include <inet/tcp_impl.h> 86 #include <inet/ip_multi.h> 87 #include <inet/ip_if.h> 88 #include <inet/ip_ire.h> 89 #include <inet/ip_ftable.h> 90 #include <inet/ip_rts.h> 91 #include <inet/ip_ndp.h> 92 #include <inet/ip_listutils.h> 93 #include <netinet/igmp.h> 94 #include <netinet/ip_mroute.h> 95 #include <inet/ipp_common.h> 96 97 #include <net/pfkeyv2.h> 98 #include <inet/sadb.h> 99 #include <inet/ipsec_impl.h> 100 #include <inet/ipdrop.h> 101 #include <inet/ip_netinfo.h> 102 #include <inet/ilb_ip.h> 103 #include <sys/squeue_impl.h> 104 #include <sys/squeue.h> 105 106 #include <sys/ethernet.h> 107 #include <net/if_types.h> 108 #include <sys/cpuvar.h> 109 110 #include <ipp/ipp.h> 111 #include <ipp/ipp_impl.h> 112 #include <ipp/ipgpc/ipgpc.h> 113 114 #include <sys/pattr.h> 115 #include <inet/ipclassifier.h> 116 #include <inet/sctp_ip.h> 117 #include <inet/sctp/sctp_impl.h> 118 #include <inet/udp_impl.h> 119 #include <inet/dccp_impl.h> 120 #include <sys/sunddi.h> 121 122 #include <sys/tsol/label.h> 123 #include <sys/tsol/tnet.h> 124 125 #include <sys/clock_impl.h> /* For LBOLT_FASTPATH{,64} */ 126 127 #ifdef DEBUG 128 extern boolean_t skip_sctp_cksum; 129 #endif 130 131 static void ip_input_local_v4(ire_t *, mblk_t *, ipha_t *, 132 ip_recv_attr_t *); 133 134 static void ip_input_broadcast_v4(ire_t *, mblk_t *, ipha_t *, 135 ip_recv_attr_t *); 136 static void ip_input_multicast_v4(ire_t *, mblk_t *, ipha_t *, 137 ip_recv_attr_t *); 138 139 #pragma inline(ip_input_common_v4, ip_input_local_v4, ip_forward_xmit_v4) 140 141 /* 142 * Direct read side procedure capable of dealing with chains. GLDv3 based 143 * drivers call this function directly with mblk chains while STREAMS 144 * read side procedure ip_rput() calls this for single packet with ip_ring 145 * set to NULL to process one packet at a time. 146 * 147 * The ill will always be valid if this function is called directly from 148 * the driver. 149 * 150 * If ip_input() is called from GLDv3: 151 * 152 * - This must be a non-VLAN IP stream. 153 * - 'mp' is either an untagged or a special priority-tagged packet. 154 * - Any VLAN tag that was in the MAC header has been stripped. 155 * 156 * If the IP header in packet is not 32-bit aligned, every message in the 157 * chain will be aligned before further operations. This is required on SPARC 158 * platform. 159 */ 160 void 161 ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, 162 struct mac_header_info_s *mhip) 163 { 164 (void) ip_input_common_v4(ill, ip_ring, mp_chain, mhip, NULL, NULL, 165 NULL); 166 } 167 168 /* 169 * ip_accept_tcp() - This function is called by the squeue when it retrieves 170 * a chain of packets in the poll mode. The packets have gone through the 171 * data link processing but not IP processing. For performance and latency 172 * reasons, the squeue wants to process the chain in line instead of feeding 173 * it back via ip_input path. 174 * 175 * We set up the ip_recv_attr_t with IRAF_TARGET_SQP to that ip_fanout_v4 176 * will pass back any TCP packets matching the target sqp to 177 * ip_input_common_v4 using ira_target_sqp_mp. Other packets are handled by 178 * ip_input_v4 and ip_fanout_v4 as normal. 179 * The TCP packets that match the target squeue are returned to the caller 180 * as a b_next chain after each packet has been prepend with an mblk 181 * from ip_recv_attr_to_mblk. 182 */ 183 mblk_t * 184 ip_accept_tcp(ill_t *ill, ill_rx_ring_t *ip_ring, squeue_t *target_sqp, 185 mblk_t *mp_chain, mblk_t **last, uint_t *cnt) 186 { 187 return (ip_input_common_v4(ill, ip_ring, mp_chain, NULL, target_sqp, 188 last, cnt)); 189 } 190 191 /* 192 * Used by ip_input and ip_accept_tcp 193 * The last three arguments are only used by ip_accept_tcp, and mhip is 194 * only used by ip_input. 195 */ 196 mblk_t * 197 ip_input_common_v4(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, 198 struct mac_header_info_s *mhip, squeue_t *target_sqp, 199 mblk_t **last, uint_t *cnt) 200 { 201 mblk_t *mp; 202 ipha_t *ipha; 203 ip_recv_attr_t iras; /* Receive attributes */ 204 rtc_t rtc; 205 iaflags_t chain_flags = 0; /* Fixed for chain */ 206 mblk_t *ahead = NULL; /* Accepted head */ 207 mblk_t *atail = NULL; /* Accepted tail */ 208 uint_t acnt = 0; /* Accepted count */ 209 210 ASSERT(mp_chain != NULL); 211 ASSERT(ill != NULL); 212 213 /* These ones do not change as we loop over packets */ 214 iras.ira_ill = iras.ira_rill = ill; 215 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 216 iras.ira_rifindex = iras.ira_ruifindex; 217 iras.ira_sqp = NULL; 218 iras.ira_ring = ip_ring; 219 /* For ECMP and outbound transmit ring selection */ 220 iras.ira_xmit_hint = ILL_RING_TO_XMIT_HINT(ip_ring); 221 222 iras.ira_target_sqp = target_sqp; 223 iras.ira_target_sqp_mp = NULL; 224 if (target_sqp != NULL) 225 chain_flags |= IRAF_TARGET_SQP; 226 227 /* 228 * We try to have a mhip pointer when possible, but 229 * it might be NULL in some cases. In those cases we 230 * have to assume unicast. 231 */ 232 iras.ira_mhip = mhip; 233 iras.ira_flags = 0; 234 if (mhip != NULL) { 235 switch (mhip->mhi_dsttype) { 236 case MAC_ADDRTYPE_MULTICAST : 237 chain_flags |= IRAF_L2DST_MULTICAST; 238 break; 239 case MAC_ADDRTYPE_BROADCAST : 240 chain_flags |= IRAF_L2DST_BROADCAST; 241 break; 242 } 243 } 244 245 /* 246 * Initialize the one-element route cache. 247 * 248 * We do ire caching from one iteration to 249 * another. In the event the packet chain contains 250 * all packets from the same dst, this caching saves 251 * an ire_route_recursive for each of the succeeding 252 * packets in a packet chain. 253 */ 254 rtc.rtc_ire = NULL; 255 rtc.rtc_ipaddr = INADDR_ANY; 256 257 /* Loop over b_next */ 258 for (mp = mp_chain; mp != NULL; mp = mp_chain) { 259 mp_chain = mp->b_next; 260 mp->b_next = NULL; 261 262 ASSERT(DB_TYPE(mp) == M_DATA); 263 264 265 /* 266 * if db_ref > 1 then copymsg and free original. Packet 267 * may be changed and we do not want the other entity 268 * who has a reference to this message to trip over the 269 * changes. This is a blind change because trying to 270 * catch all places that might change the packet is too 271 * difficult. 272 * 273 * This corresponds to the fast path case, where we have 274 * a chain of M_DATA mblks. We check the db_ref count 275 * of only the 1st data block in the mblk chain. There 276 * doesn't seem to be a reason why a device driver would 277 * send up data with varying db_ref counts in the mblk 278 * chain. In any case the Fast path is a private 279 * interface, and our drivers don't do such a thing. 280 * Given the above assumption, there is no need to walk 281 * down the entire mblk chain (which could have a 282 * potential performance problem) 283 * 284 * The "(DB_REF(mp) > 1)" check was moved from ip_rput() 285 * to here because of exclusive ip stacks and vnics. 286 * Packets transmitted from exclusive stack over vnic 287 * can have db_ref > 1 and when it gets looped back to 288 * another vnic in a different zone, you have ip_input() 289 * getting dblks with db_ref > 1. So if someone 290 * complains of TCP performance under this scenario, 291 * take a serious look here on the impact of copymsg(). 292 */ 293 if (DB_REF(mp) > 1) { 294 if ((mp = ip_fix_dbref(mp, &iras)) == NULL) { 295 /* mhip might point into 1st packet in chain */ 296 iras.ira_mhip = NULL; 297 continue; 298 } 299 } 300 301 /* 302 * IP header ptr not aligned? 303 * OR IP header not complete in first mblk 304 */ 305 ipha = (ipha_t *)mp->b_rptr; 306 if (!OK_32PTR(ipha) || MBLKL(mp) < IP_SIMPLE_HDR_LENGTH) { 307 mp = ip_check_and_align_header(mp, IP_SIMPLE_HDR_LENGTH, 308 &iras); 309 if (mp == NULL) { 310 /* mhip might point into 1st packet in chain */ 311 iras.ira_mhip = NULL; 312 continue; 313 } 314 ipha = (ipha_t *)mp->b_rptr; 315 } 316 317 /* Protect against a mix of Ethertypes and IP versions */ 318 if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) { 319 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); 320 ip_drop_input("ipIfStatsInHdrErrors", mp, ill); 321 freemsg(mp); 322 /* mhip might point into 1st packet in the chain. */ 323 iras.ira_mhip = NULL; 324 continue; 325 } 326 327 /* 328 * Check for Martian addrs; we have to explicitly 329 * test for for zero dst since this is also used as 330 * an indication that the rtc is not used. 331 */ 332 if (ipha->ipha_dst == INADDR_ANY) { 333 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); 334 ip_drop_input("ipIfStatsInAddrErrors", mp, ill); 335 freemsg(mp); 336 /* mhip might point into 1st packet in the chain. */ 337 iras.ira_mhip = NULL; 338 continue; 339 } 340 341 /* 342 * Keep L2SRC from a previous packet in chain since mhip 343 * might point into an earlier packet in the chain. 344 * Keep IRAF_VERIFIED_SRC to avoid redoing broadcast 345 * source check in forwarding path. 346 */ 347 chain_flags |= (iras.ira_flags & 348 (IRAF_L2SRC_SET|IRAF_VERIFIED_SRC)); 349 350 iras.ira_flags = IRAF_IS_IPV4 | IRAF_VERIFY_IP_CKSUM | 351 IRAF_VERIFY_ULP_CKSUM | chain_flags; 352 iras.ira_free_flags = 0; 353 iras.ira_cred = NULL; 354 iras.ira_cpid = NOPID; 355 iras.ira_tsl = NULL; 356 iras.ira_zoneid = ALL_ZONES; /* Default for forwarding */ 357 358 /* 359 * We must count all incoming packets, even if they end 360 * up being dropped later on. Defer counting bytes until 361 * we have the whole IP header in first mblk. 362 */ 363 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); 364 365 iras.ira_pktlen = ntohs(ipha->ipha_length); 366 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, 367 iras.ira_pktlen); 368 369 /* 370 * Call one of: 371 * ill_input_full_v4 372 * ill_input_short_v4 373 * The former is used in unusual cases. See ill_set_inputfn(). 374 */ 375 (*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc); 376 377 /* Any references to clean up? No hold on ira_ill */ 378 if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED)) 379 ira_cleanup(&iras, B_FALSE); 380 381 if (iras.ira_target_sqp_mp != NULL) { 382 /* Better be called from ip_accept_tcp */ 383 ASSERT(target_sqp != NULL); 384 385 /* Found one packet to accept */ 386 mp = iras.ira_target_sqp_mp; 387 iras.ira_target_sqp_mp = NULL; 388 ASSERT(ip_recv_attr_is_mblk(mp)); 389 390 if (atail != NULL) 391 atail->b_next = mp; 392 else 393 ahead = mp; 394 atail = mp; 395 acnt++; 396 mp = NULL; 397 } 398 /* mhip might point into 1st packet in the chain. */ 399 iras.ira_mhip = NULL; 400 } 401 /* Any remaining references to the route cache? */ 402 if (rtc.rtc_ire != NULL) { 403 ASSERT(rtc.rtc_ipaddr != INADDR_ANY); 404 ire_refrele(rtc.rtc_ire); 405 } 406 407 if (ahead != NULL) { 408 /* Better be called from ip_accept_tcp */ 409 ASSERT(target_sqp != NULL); 410 *last = atail; 411 *cnt = acnt; 412 return (ahead); 413 } 414 415 return (NULL); 416 } 417 418 /* 419 * This input function is used when 420 * - is_system_labeled() 421 * - CGTP filtering 422 * - DHCP unicast before we have an IP address configured 423 * - there is an listener for IPPROTO_RSVP 424 */ 425 void 426 ill_input_full_v4(mblk_t *mp, void *iph_arg, void *nexthop_arg, 427 ip_recv_attr_t *ira, rtc_t *rtc) 428 { 429 ipha_t *ipha = (ipha_t *)iph_arg; 430 ipaddr_t nexthop = *(ipaddr_t *)nexthop_arg; 431 ill_t *ill = ira->ira_ill; 432 ip_stack_t *ipst = ill->ill_ipst; 433 int cgtp_flt_pkt; 434 435 ASSERT(ira->ira_tsl == NULL); 436 437 /* 438 * Attach any necessary label information to 439 * this packet 440 */ 441 if (is_system_labeled()) { 442 ira->ira_flags |= IRAF_SYSTEM_LABELED; 443 444 /* 445 * This updates ira_cred, ira_tsl and ira_free_flags based 446 * on the label. 447 */ 448 if (!tsol_get_pkt_label(mp, IPV4_VERSION, ira)) { 449 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 450 ip_drop_input("ipIfStatsInDiscards", mp, ill); 451 freemsg(mp); 452 return; 453 } 454 /* Note that ira_tsl can be NULL here. */ 455 456 /* tsol_get_pkt_label sometimes does pullupmsg */ 457 ipha = (ipha_t *)mp->b_rptr; 458 } 459 460 /* 461 * Invoke the CGTP (multirouting) filtering module to process 462 * the incoming packet. Packets identified as duplicates 463 * must be discarded. Filtering is active only if the 464 * the ip_cgtp_filter ndd variable is non-zero. 465 */ 466 cgtp_flt_pkt = CGTP_IP_PKT_NOT_CGTP; 467 if (ipst->ips_ip_cgtp_filter && 468 ipst->ips_ip_cgtp_filter_ops != NULL) { 469 netstackid_t stackid; 470 471 stackid = ipst->ips_netstack->netstack_stackid; 472 /* 473 * CGTP and IPMP are mutually exclusive so 474 * phyint_ifindex is fine here. 475 */ 476 cgtp_flt_pkt = 477 ipst->ips_ip_cgtp_filter_ops->cfo_filter(stackid, 478 ill->ill_phyint->phyint_ifindex, mp); 479 if (cgtp_flt_pkt == CGTP_IP_PKT_DUPLICATE) { 480 ip_drop_input("CGTP_IP_PKT_DUPLICATE", mp, ill); 481 freemsg(mp); 482 return; 483 } 484 } 485 486 /* 487 * Brutal hack for DHCPv4 unicast: RFC2131 allows a DHCP 488 * server to unicast DHCP packets to a DHCP client using the 489 * IP address it is offering to the client. This can be 490 * disabled through the "broadcast bit", but not all DHCP 491 * servers honor that bit. Therefore, to interoperate with as 492 * many DHCP servers as possible, the DHCP client allows the 493 * server to unicast, but we treat those packets as broadcast 494 * here. Note that we don't rewrite the packet itself since 495 * (a) that would mess up the checksums and (b) the DHCP 496 * client conn is bound to INADDR_ANY so ip_fanout_udp() will 497 * hand it the packet regardless. 498 */ 499 if (ill->ill_dhcpinit != 0 && 500 ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION && 501 ipha->ipha_protocol == IPPROTO_UDP) { 502 udpha_t *udpha; 503 504 ipha = ip_pullup(mp, sizeof (ipha_t) + sizeof (udpha_t), ira); 505 if (ipha == NULL) { 506 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 507 ip_drop_input("ipIfStatsInDiscards - dhcp", mp, ill); 508 freemsg(mp); 509 return; 510 } 511 /* Reload since pullupmsg() can change b_rptr. */ 512 udpha = (udpha_t *)&ipha[1]; 513 514 if (ntohs(udpha->uha_dst_port) == IPPORT_BOOTPC) { 515 DTRACE_PROBE2(ip4__dhcpinit__pkt, ill_t *, ill, 516 mblk_t *, mp); 517 /* 518 * This assumes that we deliver to all conns for 519 * multicast and broadcast packets. 520 */ 521 nexthop = INADDR_BROADCAST; 522 ira->ira_flags |= IRAF_DHCP_UNICAST; 523 } 524 } 525 526 /* 527 * If rsvpd is running, let RSVP daemon handle its processing 528 * and forwarding of RSVP multicast/unicast packets. 529 * If rsvpd is not running but mrouted is running, RSVP 530 * multicast packets are forwarded as multicast traffic 531 * and RSVP unicast packets are forwarded by unicast router. 532 * If neither rsvpd nor mrouted is running, RSVP multicast 533 * packets are not forwarded, but the unicast packets are 534 * forwarded like unicast traffic. 535 */ 536 if (ipha->ipha_protocol == IPPROTO_RSVP && 537 ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head != NULL) { 538 /* RSVP packet and rsvpd running. Treat as ours */ 539 ip2dbg(("ip_input: RSVP for us: 0x%x\n", ntohl(nexthop))); 540 /* 541 * We use a multicast address to get the packet to 542 * ire_recv_multicast_v4. There will not be a membership 543 * check since we set IRAF_RSVP 544 */ 545 nexthop = htonl(INADDR_UNSPEC_GROUP); 546 ira->ira_flags |= IRAF_RSVP; 547 } 548 549 ill_input_short_v4(mp, ipha, &nexthop, ira, rtc); 550 } 551 552 /* 553 * This is the tail-end of the full receive side packet handling. 554 * It can be used directly when the configuration is simple. 555 */ 556 void 557 ill_input_short_v4(mblk_t *mp, void *iph_arg, void *nexthop_arg, 558 ip_recv_attr_t *ira, rtc_t *rtc) 559 { 560 ire_t *ire; 561 uint_t opt_len; 562 ill_t *ill = ira->ira_ill; 563 ip_stack_t *ipst = ill->ill_ipst; 564 uint_t pkt_len; 565 ssize_t len; 566 ipha_t *ipha = (ipha_t *)iph_arg; 567 ipaddr_t nexthop = *(ipaddr_t *)nexthop_arg; 568 ilb_stack_t *ilbs = ipst->ips_netstack->netstack_ilb; 569 uint_t irr_flags; 570 #define rptr ((uchar_t *)ipha) 571 572 ASSERT(DB_TYPE(mp) == M_DATA); 573 574 /* 575 * The following test for loopback is faster than 576 * IP_LOOPBACK_ADDR(), because it avoids any bitwise 577 * operations. 578 * Note that these addresses are always in network byte order 579 */ 580 if (((*(uchar_t *)&ipha->ipha_dst) == IN_LOOPBACKNET) || 581 ((*(uchar_t *)&ipha->ipha_src) == IN_LOOPBACKNET)) { 582 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); 583 ip_drop_input("ipIfStatsInAddrErrors", mp, ill); 584 freemsg(mp); 585 return; 586 } 587 588 len = mp->b_wptr - rptr; 589 pkt_len = ira->ira_pktlen; 590 591 /* multiple mblk or too short */ 592 len -= pkt_len; 593 if (len != 0) { 594 mp = ip_check_length(mp, rptr, len, pkt_len, 595 IP_SIMPLE_HDR_LENGTH, ira); 596 if (mp == NULL) 597 return; 598 ipha = (ipha_t *)mp->b_rptr; 599 } 600 601 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 602 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, 603 int, 0); 604 605 /* 606 * The event for packets being received from a 'physical' 607 * interface is placed after validation of the source and/or 608 * destination address as being local so that packets can be 609 * redirected to loopback addresses using ipnat. 610 */ 611 DTRACE_PROBE4(ip4__physical__in__start, 612 ill_t *, ill, ill_t *, NULL, 613 ipha_t *, ipha, mblk_t *, mp); 614 615 if (HOOKS4_INTERESTED_PHYSICAL_IN(ipst)) { 616 int ll_multicast = 0; 617 int error; 618 ipaddr_t orig_dst = ipha->ipha_dst; 619 620 if (ira->ira_flags & IRAF_L2DST_MULTICAST) 621 ll_multicast = HPE_MULTICAST; 622 else if (ira->ira_flags & IRAF_L2DST_BROADCAST) 623 ll_multicast = HPE_BROADCAST; 624 625 FW_HOOKS(ipst->ips_ip4_physical_in_event, 626 ipst->ips_ipv4firewall_physical_in, 627 ill, NULL, ipha, mp, mp, ll_multicast, ipst, error); 628 629 DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, mp); 630 631 if (mp == NULL) 632 return; 633 /* The length could have changed */ 634 ipha = (ipha_t *)mp->b_rptr; 635 ira->ira_pktlen = ntohs(ipha->ipha_length); 636 pkt_len = ira->ira_pktlen; 637 638 /* 639 * In case the destination changed we override any previous 640 * change to nexthop. 641 */ 642 if (orig_dst != ipha->ipha_dst) 643 nexthop = ipha->ipha_dst; 644 if (nexthop == INADDR_ANY) { 645 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); 646 ip_drop_input("ipIfStatsInAddrErrors", mp, ill); 647 freemsg(mp); 648 return; 649 } 650 } 651 652 if (ipst->ips_ip4_observe.he_interested) { 653 zoneid_t dzone; 654 655 /* 656 * On the inbound path the src zone will be unknown as 657 * this packet has come from the wire. 658 */ 659 dzone = ip_get_zoneid_v4(nexthop, mp, ira, ALL_ZONES); 660 ipobs_hook(mp, IPOBS_HOOK_INBOUND, ALL_ZONES, dzone, ill, ipst); 661 } 662 663 /* 664 * If there is a good HW IP header checksum we clear the need 665 * look at the IP header checksum. 666 */ 667 if ((DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) && 668 ILL_HCKSUM_CAPABLE(ill) && dohwcksum) { 669 /* Header checksum was ok. Clear the flag */ 670 DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; 671 ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM; 672 } 673 674 /* 675 * Here we check to see if we machine is setup as 676 * L3 loadbalancer and if the incoming packet is for a VIP 677 * 678 * Check the following: 679 * - there is at least a rule 680 * - protocol of the packet is supported 681 */ 682 if (ilb_has_rules(ilbs) && ILB_SUPP_L4(ipha->ipha_protocol)) { 683 ipaddr_t lb_dst; 684 int lb_ret; 685 686 /* For convenience, we pull up the mblk. */ 687 if (mp->b_cont != NULL) { 688 if (pullupmsg(mp, -1) == 0) { 689 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 690 ip_drop_input("ipIfStatsInDiscards - pullupmsg", 691 mp, ill); 692 freemsg(mp); 693 return; 694 } 695 ipha = (ipha_t *)mp->b_rptr; 696 } 697 698 /* 699 * We just drop all fragments going to any VIP, at 700 * least for now.... 701 */ 702 if (ntohs(ipha->ipha_fragment_offset_and_flags) & 703 (IPH_MF | IPH_OFFSET)) { 704 if (!ilb_rule_match_vip_v4(ilbs, nexthop, NULL)) { 705 goto after_ilb; 706 } 707 708 ILB_KSTAT_UPDATE(ilbs, ip_frag_in, 1); 709 ILB_KSTAT_UPDATE(ilbs, ip_frag_dropped, 1); 710 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 711 ip_drop_input("ILB fragment", mp, ill); 712 freemsg(mp); 713 return; 714 } 715 lb_ret = ilb_check_v4(ilbs, ill, mp, ipha, ipha->ipha_protocol, 716 (uint8_t *)ipha + IPH_HDR_LENGTH(ipha), &lb_dst); 717 718 if (lb_ret == ILB_DROPPED) { 719 /* Is this the right counter to increase? */ 720 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 721 ip_drop_input("ILB_DROPPED", mp, ill); 722 freemsg(mp); 723 return; 724 } 725 if (lb_ret == ILB_BALANCED) { 726 /* Set the dst to that of the chosen server */ 727 nexthop = lb_dst; 728 DB_CKSUMFLAGS(mp) = 0; 729 } 730 } 731 732 after_ilb: 733 opt_len = ipha->ipha_version_and_hdr_length - IP_SIMPLE_HDR_VERSION; 734 ira->ira_ip_hdr_length = IP_SIMPLE_HDR_LENGTH; 735 if (opt_len != 0) { 736 int error = 0; 737 738 ira->ira_ip_hdr_length += (opt_len << 2); 739 ira->ira_flags |= IRAF_IPV4_OPTIONS; 740 741 /* IP Options present! Validate the length. */ 742 mp = ip_check_optlen(mp, ipha, opt_len, pkt_len, ira); 743 if (mp == NULL) 744 return; 745 746 /* Might have changed */ 747 ipha = (ipha_t *)mp->b_rptr; 748 749 /* Verify IP header checksum before parsing the options */ 750 if ((ira->ira_flags & IRAF_VERIFY_IP_CKSUM) && 751 ip_csum_hdr(ipha)) { 752 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); 753 ip_drop_input("ipIfStatsInCksumErrs", mp, ill); 754 freemsg(mp); 755 return; 756 } 757 ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM; 758 759 /* 760 * Go off to ip_input_options which returns the next hop 761 * destination address, which may have been affected 762 * by source routing. 763 */ 764 IP_STAT(ipst, ip_opt); 765 766 nexthop = ip_input_options(ipha, nexthop, mp, ira, &error); 767 if (error != 0) { 768 /* 769 * An ICMP error has been sent and the packet has 770 * been dropped. 771 */ 772 return; 773 } 774 } 775 776 if (ill->ill_flags & ILLF_ROUTER) 777 irr_flags = IRR_ALLOCATE; 778 else 779 irr_flags = IRR_NONE; 780 781 /* Can not use route cache with TX since the labels can differ */ 782 if (ira->ira_flags & IRAF_SYSTEM_LABELED) { 783 if (CLASSD(nexthop)) { 784 ire = ire_multicast(ill); 785 } else { 786 /* Match destination and label */ 787 ire = ire_route_recursive_v4(nexthop, 0, NULL, 788 ALL_ZONES, ira->ira_tsl, MATCH_IRE_SECATTR, 789 irr_flags, ira->ira_xmit_hint, ipst, NULL, NULL, 790 NULL); 791 } 792 /* Update the route cache so we do the ire_refrele */ 793 ASSERT(ire != NULL); 794 if (rtc->rtc_ire != NULL) 795 ire_refrele(rtc->rtc_ire); 796 rtc->rtc_ire = ire; 797 rtc->rtc_ipaddr = nexthop; 798 } else if (nexthop == rtc->rtc_ipaddr && rtc->rtc_ire != NULL) { 799 /* Use the route cache */ 800 ire = rtc->rtc_ire; 801 } else { 802 /* Update the route cache */ 803 if (CLASSD(nexthop)) { 804 ire = ire_multicast(ill); 805 } else { 806 /* Just match the destination */ 807 ire = ire_route_recursive_dstonly_v4(nexthop, irr_flags, 808 ira->ira_xmit_hint, ipst); 809 } 810 ASSERT(ire != NULL); 811 if (rtc->rtc_ire != NULL) 812 ire_refrele(rtc->rtc_ire); 813 rtc->rtc_ire = ire; 814 rtc->rtc_ipaddr = nexthop; 815 } 816 817 ire->ire_ib_pkt_count++; 818 819 /* 820 * Based on ire_type and ire_flags call one of: 821 * ire_recv_local_v4 - for IRE_LOCAL 822 * ire_recv_loopback_v4 - for IRE_LOOPBACK 823 * ire_recv_multirt_v4 - if RTF_MULTIRT 824 * ire_recv_noroute_v4 - if RTF_REJECT or RTF_BLACHOLE 825 * ire_recv_multicast_v4 - for IRE_MULTICAST 826 * ire_recv_broadcast_v4 - for IRE_BROADCAST 827 * ire_recv_noaccept_v4 - for ire_noaccept ones 828 * ire_recv_forward_v4 - for the rest. 829 */ 830 (*ire->ire_recvfn)(ire, mp, ipha, ira); 831 } 832 #undef rptr 833 834 /* 835 * ire_recvfn for IREs that need forwarding 836 */ 837 void 838 ire_recv_forward_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira) 839 { 840 ipha_t *ipha = (ipha_t *)iph_arg; 841 ill_t *ill = ira->ira_ill; 842 ip_stack_t *ipst = ill->ill_ipst; 843 ill_t *dst_ill; 844 nce_t *nce; 845 ipaddr_t src = ipha->ipha_src; 846 uint32_t added_tx_len; 847 uint32_t mtu, iremtu; 848 849 if (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) { 850 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); 851 ip_drop_input("l2 multicast not forwarded", mp, ill); 852 freemsg(mp); 853 return; 854 } 855 856 if (!(ill->ill_flags & ILLF_ROUTER) && !ip_source_routed(ipha, ipst)) { 857 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); 858 ip_drop_input("ipIfStatsForwProhibits", mp, ill); 859 freemsg(mp); 860 return; 861 } 862 863 /* 864 * Either ire_nce_capable or ire_dep_parent would be set for the IRE 865 * when it is found by ire_route_recursive, but that some other thread 866 * could have changed the routes with the effect of clearing 867 * ire_dep_parent. In that case we'd end up dropping the packet, or 868 * finding a new nce below. 869 * Get, allocate, or update the nce. 870 * We get a refhold on ire_nce_cache as a result of this to avoid races 871 * where ire_nce_cache is deleted. 872 * 873 * This ensures that we don't forward if the interface is down since 874 * ipif_down removes all the nces. 875 */ 876 mutex_enter(&ire->ire_lock); 877 nce = ire->ire_nce_cache; 878 if (nce == NULL) { 879 /* Not yet set up - try to set one up */ 880 mutex_exit(&ire->ire_lock); 881 (void) ire_revalidate_nce(ire); 882 mutex_enter(&ire->ire_lock); 883 nce = ire->ire_nce_cache; 884 if (nce == NULL) { 885 mutex_exit(&ire->ire_lock); 886 /* The ire_dep_parent chain went bad, or no memory */ 887 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 888 ip_drop_input("No ire_dep_parent", mp, ill); 889 freemsg(mp); 890 return; 891 } 892 } 893 nce_refhold(nce); 894 mutex_exit(&ire->ire_lock); 895 896 if (nce->nce_is_condemned) { 897 nce_t *nce1; 898 899 nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_FALSE); 900 nce_refrele(nce); 901 if (nce1 == NULL) { 902 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 903 ip_drop_input("No nce", mp, ill); 904 freemsg(mp); 905 return; 906 } 907 nce = nce1; 908 } 909 dst_ill = nce->nce_ill; 910 911 /* 912 * Unless we are forwarding, drop the packet. 913 * We have to let source routed packets through if they go out 914 * the same interface i.e., they are 'ping -l' packets. 915 */ 916 if (!(dst_ill->ill_flags & ILLF_ROUTER) && 917 !(ip_source_routed(ipha, ipst) && dst_ill == ill)) { 918 if (ip_source_routed(ipha, ipst)) { 919 ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ill); 920 icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira); 921 nce_refrele(nce); 922 return; 923 } 924 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); 925 ip_drop_input("ipIfStatsForwProhibits", mp, ill); 926 freemsg(mp); 927 nce_refrele(nce); 928 return; 929 } 930 931 if (ire->ire_zoneid != GLOBAL_ZONEID && ire->ire_zoneid != ALL_ZONES) { 932 ipaddr_t dst = ipha->ipha_dst; 933 934 ire->ire_ib_pkt_count--; 935 /* 936 * Should only use IREs that are visible from the 937 * global zone for forwarding. 938 * Take a source route into account the same way as ip_input 939 * did. 940 */ 941 if (ira->ira_flags & IRAF_IPV4_OPTIONS) { 942 int error = 0; 943 944 dst = ip_input_options(ipha, dst, mp, ira, &error); 945 ASSERT(error == 0); /* ip_input checked */ 946 } 947 ire = ire_route_recursive_v4(dst, 0, NULL, GLOBAL_ZONEID, 948 ira->ira_tsl, MATCH_IRE_SECATTR, 949 (ill->ill_flags & ILLF_ROUTER) ? IRR_ALLOCATE : IRR_NONE, 950 ira->ira_xmit_hint, ipst, NULL, NULL, NULL); 951 ire->ire_ib_pkt_count++; 952 (*ire->ire_recvfn)(ire, mp, ipha, ira); 953 ire_refrele(ire); 954 nce_refrele(nce); 955 return; 956 } 957 958 /* 959 * ipIfStatsHCInForwDatagrams should only be increment if there 960 * will be an attempt to forward the packet, which is why we 961 * increment after the above condition has been checked. 962 */ 963 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams); 964 965 /* Initiate Read side IPPF processing */ 966 if (IPP_ENABLED(IPP_FWD_IN, ipst)) { 967 /* ip_process translates an IS_UNDER_IPMP */ 968 mp = ip_process(IPP_FWD_IN, mp, ill, ill); 969 if (mp == NULL) { 970 /* ip_drop_packet and MIB done */ 971 ip2dbg(("ire_recv_forward_v4: pkt dropped/deferred " 972 "during IPPF processing\n")); 973 nce_refrele(nce); 974 return; 975 } 976 } 977 978 DTRACE_PROBE4(ip4__forwarding__start, 979 ill_t *, ill, ill_t *, dst_ill, ipha_t *, ipha, mblk_t *, mp); 980 981 if (HOOKS4_INTERESTED_FORWARDING(ipst)) { 982 int error; 983 984 FW_HOOKS(ipst->ips_ip4_forwarding_event, 985 ipst->ips_ipv4firewall_forwarding, 986 ill, dst_ill, ipha, mp, mp, 0, ipst, error); 987 988 DTRACE_PROBE1(ip4__forwarding__end, mblk_t *, mp); 989 990 if (mp == NULL) { 991 nce_refrele(nce); 992 return; 993 } 994 /* 995 * Even if the destination was changed by the filter we use the 996 * forwarding decision that was made based on the address 997 * in ip_input. 998 */ 999 1000 /* Might have changed */ 1001 ipha = (ipha_t *)mp->b_rptr; 1002 ira->ira_pktlen = ntohs(ipha->ipha_length); 1003 } 1004 1005 /* Packet is being forwarded. Turning off hwcksum flag. */ 1006 DB_CKSUMFLAGS(mp) = 0; 1007 1008 /* 1009 * Martian Address Filtering [RFC 1812, Section 5.3.7] 1010 * The loopback address check for both src and dst has already 1011 * been checked in ip_input 1012 * In the future one can envision adding RPF checks using number 3. 1013 * If we already checked the same source address we can skip this. 1014 */ 1015 if (!(ira->ira_flags & IRAF_VERIFIED_SRC) || 1016 src != ira->ira_verified_src) { 1017 switch (ipst->ips_src_check) { 1018 case 0: 1019 break; 1020 case 2: 1021 if (ip_type_v4(src, ipst) == IRE_BROADCAST) { 1022 BUMP_MIB(ill->ill_ip_mib, 1023 ipIfStatsForwProhibits); 1024 BUMP_MIB(ill->ill_ip_mib, 1025 ipIfStatsInAddrErrors); 1026 ip_drop_input("ipIfStatsInAddrErrors", mp, ill); 1027 freemsg(mp); 1028 nce_refrele(nce); 1029 return; 1030 } 1031 /* FALLTHRU */ 1032 1033 case 1: 1034 if (CLASSD(src)) { 1035 BUMP_MIB(ill->ill_ip_mib, 1036 ipIfStatsForwProhibits); 1037 BUMP_MIB(ill->ill_ip_mib, 1038 ipIfStatsInAddrErrors); 1039 ip_drop_input("ipIfStatsInAddrErrors", mp, ill); 1040 freemsg(mp); 1041 nce_refrele(nce); 1042 return; 1043 } 1044 break; 1045 } 1046 /* Remember for next packet */ 1047 ira->ira_flags |= IRAF_VERIFIED_SRC; 1048 ira->ira_verified_src = src; 1049 } 1050 1051 /* 1052 * Check if packet is going out the same link on which it arrived. 1053 * Means we might need to send a redirect. 1054 */ 1055 if (IS_ON_SAME_LAN(dst_ill, ill) && ipst->ips_ip_g_send_redirects) { 1056 ip_send_potential_redirect_v4(mp, ipha, ire, ira); 1057 } 1058 1059 added_tx_len = 0; 1060 if (ira->ira_flags & IRAF_SYSTEM_LABELED) { 1061 mblk_t *mp1; 1062 uint32_t old_pkt_len = ira->ira_pktlen; 1063 1064 /* Verify IP header checksum before adding/removing options */ 1065 if ((ira->ira_flags & IRAF_VERIFY_IP_CKSUM) && 1066 ip_csum_hdr(ipha)) { 1067 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); 1068 ip_drop_input("ipIfStatsInCksumErrs", mp, ill); 1069 freemsg(mp); 1070 nce_refrele(nce); 1071 return; 1072 } 1073 ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM; 1074 1075 /* 1076 * Check if it can be forwarded and add/remove 1077 * CIPSO options as needed. 1078 */ 1079 if ((mp1 = tsol_ip_forward(ire, mp, ira)) == NULL) { 1080 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); 1081 ip_drop_input("tsol_ip_forward", mp, ill); 1082 freemsg(mp); 1083 nce_refrele(nce); 1084 return; 1085 } 1086 /* 1087 * Size may have changed. Remember amount added in case 1088 * IP needs to send an ICMP too big. 1089 */ 1090 mp = mp1; 1091 ipha = (ipha_t *)mp->b_rptr; 1092 ira->ira_pktlen = ntohs(ipha->ipha_length); 1093 ira->ira_ip_hdr_length = IPH_HDR_LENGTH(ipha); 1094 if (ira->ira_pktlen > old_pkt_len) 1095 added_tx_len = ira->ira_pktlen - old_pkt_len; 1096 1097 /* Options can have been added or removed */ 1098 if (ira->ira_ip_hdr_length != IP_SIMPLE_HDR_LENGTH) 1099 ira->ira_flags |= IRAF_IPV4_OPTIONS; 1100 else 1101 ira->ira_flags &= ~IRAF_IPV4_OPTIONS; 1102 } 1103 1104 mtu = dst_ill->ill_mtu; 1105 if ((iremtu = ire->ire_metrics.iulp_mtu) != 0 && iremtu < mtu) 1106 mtu = iremtu; 1107 ip_forward_xmit_v4(nce, ill, mp, ipha, ira, mtu, added_tx_len); 1108 nce_refrele(nce); 1109 } 1110 1111 /* 1112 * Used for sending out unicast and multicast packets that are 1113 * forwarded. 1114 */ 1115 void 1116 ip_forward_xmit_v4(nce_t *nce, ill_t *ill, mblk_t *mp, ipha_t *ipha, 1117 ip_recv_attr_t *ira, uint32_t mtu, uint32_t added_tx_len) 1118 { 1119 ill_t *dst_ill = nce->nce_ill; 1120 uint32_t pkt_len; 1121 uint32_t sum; 1122 iaflags_t iraflags = ira->ira_flags; 1123 ip_stack_t *ipst = ill->ill_ipst; 1124 iaflags_t ixaflags; 1125 1126 if (ipha->ipha_ttl <= 1) { 1127 /* Perhaps the checksum was bad */ 1128 if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) { 1129 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); 1130 ip_drop_input("ipIfStatsInCksumErrs", mp, ill); 1131 freemsg(mp); 1132 return; 1133 } 1134 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1135 ip_drop_input("ICMP_TTL_EXCEEDED", mp, ill); 1136 icmp_time_exceeded(mp, ICMP_TTL_EXCEEDED, ira); 1137 return; 1138 } 1139 ipha->ipha_ttl--; 1140 /* Adjust the checksum to reflect the ttl decrement. */ 1141 sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST; 1142 ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16)); 1143 1144 /* Check if there are options to update */ 1145 if (iraflags & IRAF_IPV4_OPTIONS) { 1146 ASSERT(ipha->ipha_version_and_hdr_length != 1147 IP_SIMPLE_HDR_VERSION); 1148 ASSERT(!(iraflags & IRAF_VERIFY_IP_CKSUM)); 1149 1150 if (!ip_forward_options(mp, ipha, dst_ill, ira)) { 1151 /* ipIfStatsForwProhibits and ip_drop_input done */ 1152 return; 1153 } 1154 1155 ipha->ipha_hdr_checksum = 0; 1156 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1157 } 1158 1159 /* Initiate Write side IPPF processing before any fragmentation */ 1160 if (IPP_ENABLED(IPP_FWD_OUT, ipst)) { 1161 /* ip_process translates an IS_UNDER_IPMP */ 1162 mp = ip_process(IPP_FWD_OUT, mp, dst_ill, dst_ill); 1163 if (mp == NULL) { 1164 /* ip_drop_packet and MIB done */ 1165 ip2dbg(("ire_recv_forward_v4: pkt dropped/deferred" \ 1166 " during IPPF processing\n")); 1167 return; 1168 } 1169 } 1170 1171 pkt_len = ira->ira_pktlen; 1172 1173 BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams); 1174 1175 ixaflags = IXAF_IS_IPV4 | IXAF_NO_DEV_FLOW_CTL; 1176 1177 if (pkt_len > mtu) { 1178 /* 1179 * It needs fragging on its way out. If we haven't 1180 * verified the header checksum yet we do it now since 1181 * are going to put a surely good checksum in the 1182 * outgoing header, we have to make sure that it 1183 * was good coming in. 1184 */ 1185 if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) { 1186 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); 1187 ip_drop_input("ipIfStatsInCksumErrs", mp, ill); 1188 freemsg(mp); 1189 return; 1190 } 1191 if (ipha->ipha_fragment_offset_and_flags & IPH_DF_HTONS) { 1192 BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutFragFails); 1193 ip_drop_output("ipIfStatsOutFragFails", mp, dst_ill); 1194 if (iraflags & IRAF_SYSTEM_LABELED) { 1195 /* 1196 * Remove any CIPSO option added by 1197 * tsol_ip_forward, and make sure we report 1198 * a path MTU so that there 1199 * is room to add such a CIPSO option for future 1200 * packets. 1201 */ 1202 mtu = tsol_pmtu_adjust(mp, mtu, added_tx_len, 1203 AF_INET); 1204 } 1205 1206 icmp_frag_needed(mp, mtu, ira); 1207 return; 1208 } 1209 1210 (void) ip_fragment_v4(mp, nce, ixaflags, pkt_len, mtu, 1211 ira->ira_xmit_hint, GLOBAL_ZONEID, 0, ip_xmit, NULL); 1212 return; 1213 } 1214 1215 ASSERT(pkt_len == ntohs(((ipha_t *)mp->b_rptr)->ipha_length)); 1216 if (iraflags & IRAF_LOOPBACK_COPY) { 1217 /* 1218 * IXAF_NO_LOOP_ZONEID is not set hence 7th arg 1219 * is don't care 1220 */ 1221 (void) ip_postfrag_loopcheck(mp, nce, 1222 ixaflags | IXAF_LOOPBACK_COPY, 1223 pkt_len, ira->ira_xmit_hint, GLOBAL_ZONEID, 0, NULL); 1224 } else { 1225 (void) ip_xmit(mp, nce, ixaflags, pkt_len, ira->ira_xmit_hint, 1226 GLOBAL_ZONEID, 0, NULL); 1227 } 1228 } 1229 1230 /* 1231 * ire_recvfn for RTF_REJECT and RTF_BLACKHOLE routes, including IRE_NOROUTE, 1232 * which is what ire_route_recursive returns when there is no matching ire. 1233 * Send ICMP unreachable unless blackhole. 1234 */ 1235 void 1236 ire_recv_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira) 1237 { 1238 ipha_t *ipha = (ipha_t *)iph_arg; 1239 ill_t *ill = ira->ira_ill; 1240 ip_stack_t *ipst = ill->ill_ipst; 1241 1242 /* Would we have forwarded this packet if we had a route? */ 1243 if (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) { 1244 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); 1245 ip_drop_input("l2 multicast not forwarded", mp, ill); 1246 freemsg(mp); 1247 return; 1248 } 1249 1250 if (!(ill->ill_flags & ILLF_ROUTER)) { 1251 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); 1252 ip_drop_input("ipIfStatsForwProhibits", mp, ill); 1253 freemsg(mp); 1254 return; 1255 } 1256 /* 1257 * If we had a route this could have been forwarded. Count as such. 1258 * 1259 * ipIfStatsHCInForwDatagrams should only be increment if there 1260 * will be an attempt to forward the packet, which is why we 1261 * increment after the above condition has been checked. 1262 */ 1263 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams); 1264 1265 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInNoRoutes); 1266 1267 ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0, RTA_DST, 1268 ipst); 1269 1270 if (ire->ire_flags & RTF_BLACKHOLE) { 1271 ip_drop_input("ipIfStatsInNoRoutes RTF_BLACKHOLE", mp, ill); 1272 freemsg(mp); 1273 } else { 1274 ip_drop_input("ipIfStatsInNoRoutes RTF_REJECT", mp, ill); 1275 1276 if (ip_source_routed(ipha, ipst)) { 1277 icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira); 1278 } else { 1279 icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, ira); 1280 } 1281 } 1282 } 1283 1284 /* 1285 * ire_recvfn for IRE_LOCALs marked with ire_noaccept. Such IREs are used for 1286 * VRRP when in noaccept mode. 1287 * We silently drop the packet. ARP handles packets even if noaccept is set. 1288 */ 1289 /* ARGSUSED */ 1290 void 1291 ire_recv_noaccept_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1292 ip_recv_attr_t *ira) 1293 { 1294 ill_t *ill = ira->ira_ill; 1295 1296 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1297 ip_drop_input("ipIfStatsInDiscards - noaccept", mp, ill); 1298 freemsg(mp); 1299 } 1300 1301 /* 1302 * ire_recvfn for IRE_BROADCAST. 1303 */ 1304 void 1305 ire_recv_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1306 ip_recv_attr_t *ira) 1307 { 1308 ipha_t *ipha = (ipha_t *)iph_arg; 1309 ill_t *ill = ira->ira_ill; 1310 ill_t *dst_ill = ire->ire_ill; 1311 ip_stack_t *ipst = ill->ill_ipst; 1312 ire_t *alt_ire; 1313 nce_t *nce; 1314 ipaddr_t ipha_dst; 1315 1316 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInBcastPkts); 1317 1318 /* Tag for higher-level protocols */ 1319 ira->ira_flags |= IRAF_BROADCAST; 1320 1321 /* 1322 * Whether local or directed broadcast forwarding: don't allow 1323 * for TCP. 1324 */ 1325 if (ipha->ipha_protocol == IPPROTO_TCP) { 1326 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1327 ip_drop_input("ipIfStatsInDiscards", mp, ill); 1328 freemsg(mp); 1329 return; 1330 } 1331 1332 /* 1333 * So that we don't end up with dups, only one ill an IPMP group is 1334 * nominated to receive broadcast traffic. 1335 * If we have no cast_ill we are liberal and accept everything. 1336 */ 1337 if (IS_UNDER_IPMP(ill)) { 1338 /* For an under ill_grp can change under lock */ 1339 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 1340 if (!ill->ill_nom_cast && ill->ill_grp != NULL && 1341 ill->ill_grp->ig_cast_ill != NULL) { 1342 rw_exit(&ipst->ips_ill_g_lock); 1343 /* No MIB since this is normal operation */ 1344 ip_drop_input("not nom_cast", mp, ill); 1345 freemsg(mp); 1346 return; 1347 } 1348 rw_exit(&ipst->ips_ill_g_lock); 1349 1350 ira->ira_ruifindex = ill_get_upper_ifindex(ill); 1351 } 1352 1353 /* 1354 * After reassembly and IPsec we will need to duplicate the 1355 * broadcast packet for all matching zones on the ill. 1356 */ 1357 ira->ira_zoneid = ALL_ZONES; 1358 1359 /* 1360 * Check for directed broadcast i.e. ire->ire_ill is different than 1361 * the incoming ill. 1362 * The same broadcast address can be assigned to multiple interfaces 1363 * so have to check explicitly for that case by looking up the alt_ire 1364 */ 1365 if (dst_ill == ill && !(ire->ire_flags & RTF_MULTIRT)) { 1366 /* Reassemble on the ill on which the packet arrived */ 1367 ip_input_local_v4(ire, mp, ipha, ira); 1368 /* Restore */ 1369 ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex; 1370 return; 1371 } 1372 1373 /* Is there an IRE_BROADCAST on the incoming ill? */ 1374 ipha_dst = ((ira->ira_flags & IRAF_DHCP_UNICAST) ? INADDR_BROADCAST : 1375 ipha->ipha_dst); 1376 alt_ire = ire_ftable_lookup_v4(ipha_dst, 0, 0, IRE_BROADCAST, ill, 1377 ALL_ZONES, ira->ira_tsl, 1378 MATCH_IRE_TYPE|MATCH_IRE_ILL|MATCH_IRE_SECATTR, 0, ipst, NULL); 1379 if (alt_ire != NULL) { 1380 /* Not a directed broadcast */ 1381 /* 1382 * In the special case of multirouted broadcast 1383 * packets, we unconditionally need to "gateway" 1384 * them to the appropriate interface here so that reassembly 1385 * works. We know that the IRE_BROADCAST on cgtp0 doesn't 1386 * have RTF_MULTIRT set so we look for such an IRE in the 1387 * bucket. 1388 */ 1389 if (alt_ire->ire_flags & RTF_MULTIRT) { 1390 irb_t *irb; 1391 ire_t *ire1; 1392 1393 irb = ire->ire_bucket; 1394 irb_refhold(irb); 1395 for (ire1 = irb->irb_ire; ire1 != NULL; 1396 ire1 = ire1->ire_next) { 1397 if (IRE_IS_CONDEMNED(ire1)) 1398 continue; 1399 if (!(ire1->ire_type & IRE_BROADCAST) || 1400 (ire1->ire_flags & RTF_MULTIRT)) 1401 continue; 1402 ill = ire1->ire_ill; 1403 ill_refhold(ill); 1404 break; 1405 } 1406 irb_refrele(irb); 1407 if (ire1 != NULL) { 1408 ill_t *orig_ill = ira->ira_ill; 1409 1410 ire_refrele(alt_ire); 1411 /* Reassemble on the new ill */ 1412 ira->ira_ill = ill; 1413 ip_input_local_v4(ire, mp, ipha, ira); 1414 ill_refrele(ill); 1415 /* Restore */ 1416 ira->ira_ill = orig_ill; 1417 ira->ira_ruifindex = 1418 orig_ill->ill_phyint->phyint_ifindex; 1419 return; 1420 } 1421 } 1422 ire_refrele(alt_ire); 1423 /* Reassemble on the ill on which the packet arrived */ 1424 ip_input_local_v4(ire, mp, ipha, ira); 1425 goto done; 1426 } 1427 1428 /* 1429 * This is a directed broadcast 1430 * 1431 * If directed broadcast is allowed, then forward the packet out 1432 * the destination interface with IXAF_LOOPBACK_COPY set. That will 1433 * result in ip_input() receiving a copy of the packet on the 1434 * appropriate ill. (We could optimize this to avoid the extra trip 1435 * via ip_input(), but since directed broadcasts are normally disabled 1436 * it doesn't make sense to optimize it.) 1437 */ 1438 if (!ipst->ips_ip_g_forward_directed_bcast || 1439 (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST))) { 1440 ip_drop_input("directed broadcast not allowed", mp, ill); 1441 freemsg(mp); 1442 goto done; 1443 } 1444 if ((ira->ira_flags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) { 1445 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); 1446 ip_drop_input("ipIfStatsInCksumErrs", mp, ill); 1447 freemsg(mp); 1448 goto done; 1449 } 1450 1451 /* 1452 * Clear the indication that this may have hardware 1453 * checksum as we are not using it for forwarding. 1454 */ 1455 DB_CKSUMFLAGS(mp) = 0; 1456 1457 /* 1458 * Adjust ttl to 2 (1+1 - the forward engine will decrement it by one. 1459 */ 1460 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl + 1; 1461 ipha->ipha_hdr_checksum = 0; 1462 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1463 1464 /* 1465 * We use ip_forward_xmit to do any fragmentation. 1466 * and loopback copy on the outbound interface. 1467 * 1468 * Make it so that IXAF_LOOPBACK_COPY to be set on transmit side. 1469 */ 1470 ira->ira_flags |= IRAF_LOOPBACK_COPY; 1471 1472 nce = arp_nce_init(dst_ill, ipha->ipha_dst, IRE_BROADCAST); 1473 if (nce == NULL) { 1474 BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutDiscards); 1475 ip_drop_output("No nce", mp, dst_ill); 1476 freemsg(mp); 1477 goto done; 1478 } 1479 1480 ip_forward_xmit_v4(nce, ill, mp, ipha, ira, dst_ill->ill_mc_mtu, 0); 1481 nce_refrele(nce); 1482 done: 1483 /* Restore */ 1484 ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex; 1485 } 1486 1487 /* 1488 * ire_recvfn for IRE_MULTICAST. 1489 */ 1490 void 1491 ire_recv_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1492 ip_recv_attr_t *ira) 1493 { 1494 ipha_t *ipha = (ipha_t *)iph_arg; 1495 ill_t *ill = ira->ira_ill; 1496 ip_stack_t *ipst = ill->ill_ipst; 1497 1498 ASSERT(ire->ire_ill == ira->ira_ill); 1499 1500 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastPkts); 1501 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastOctets, ira->ira_pktlen); 1502 1503 /* RSVP hook */ 1504 if (ira->ira_flags & IRAF_RSVP) 1505 goto forus; 1506 1507 /* Tag for higher-level protocols */ 1508 ira->ira_flags |= IRAF_MULTICAST; 1509 1510 /* 1511 * So that we don't end up with dups, only one ill an IPMP group is 1512 * nominated to receive multicast traffic. 1513 * If we have no cast_ill we are liberal and accept everything. 1514 */ 1515 if (IS_UNDER_IPMP(ill)) { 1516 ip_stack_t *ipst = ill->ill_ipst; 1517 1518 /* For an under ill_grp can change under lock */ 1519 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 1520 if (!ill->ill_nom_cast && ill->ill_grp != NULL && 1521 ill->ill_grp->ig_cast_ill != NULL) { 1522 rw_exit(&ipst->ips_ill_g_lock); 1523 ip_drop_input("not on cast ill", mp, ill); 1524 freemsg(mp); 1525 return; 1526 } 1527 rw_exit(&ipst->ips_ill_g_lock); 1528 /* 1529 * We switch to the upper ill so that mrouter and hasmembers 1530 * can operate on upper here and in ip_input_multicast. 1531 */ 1532 ill = ipmp_ill_hold_ipmp_ill(ill); 1533 if (ill != NULL) { 1534 ASSERT(ill != ira->ira_ill); 1535 ASSERT(ire->ire_ill == ira->ira_ill); 1536 ira->ira_ill = ill; 1537 ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex; 1538 } else { 1539 ill = ira->ira_ill; 1540 } 1541 } 1542 1543 /* 1544 * Check if we are a multicast router - send ip_mforward a copy of 1545 * the packet. 1546 * Due to mroute_decap tunnels we consider forwarding packets even if 1547 * mrouted has not joined the allmulti group on this interface. 1548 */ 1549 if (ipst->ips_ip_g_mrouter) { 1550 int retval; 1551 1552 /* 1553 * Clear the indication that this may have hardware 1554 * checksum as we are not using it for forwarding. 1555 */ 1556 DB_CKSUMFLAGS(mp) = 0; 1557 1558 /* 1559 * ip_mforward helps us make these distinctions: If received 1560 * on tunnel and not IGMP, then drop. 1561 * If IGMP packet, then don't check membership 1562 * If received on a phyint and IGMP or PIM, then 1563 * don't check membership 1564 */ 1565 retval = ip_mforward(mp, ira); 1566 /* ip_mforward updates mib variables if needed */ 1567 1568 switch (retval) { 1569 case 0: 1570 /* 1571 * pkt is okay and arrived on phyint. 1572 * 1573 * If we are running as a multicast router 1574 * we need to see all IGMP and/or PIM packets. 1575 */ 1576 if ((ipha->ipha_protocol == IPPROTO_IGMP) || 1577 (ipha->ipha_protocol == IPPROTO_PIM)) { 1578 goto forus; 1579 } 1580 break; 1581 case -1: 1582 /* pkt is mal-formed, toss it */ 1583 freemsg(mp); 1584 goto done; 1585 case 1: 1586 /* 1587 * pkt is okay and arrived on a tunnel 1588 * 1589 * If we are running a multicast router 1590 * we need to see all igmp packets. 1591 */ 1592 if (ipha->ipha_protocol == IPPROTO_IGMP) { 1593 goto forus; 1594 } 1595 ip_drop_input("Multicast on tunnel ignored", mp, ill); 1596 freemsg(mp); 1597 goto done; 1598 } 1599 } 1600 1601 /* 1602 * Check if we have members on this ill. This is not necessary for 1603 * correctness because even if the NIC/GLD had a leaky filter, we 1604 * filter before passing to each conn_t. 1605 */ 1606 if (!ill_hasmembers_v4(ill, ipha->ipha_dst)) { 1607 /* 1608 * Nobody interested 1609 * 1610 * This might just be caused by the fact that 1611 * multiple IP Multicast addresses map to the same 1612 * link layer multicast - no need to increment counter! 1613 */ 1614 ip_drop_input("Multicast with no members", mp, ill); 1615 freemsg(mp); 1616 goto done; 1617 } 1618 forus: 1619 ip2dbg(("ire_recv_multicast_v4: multicast for us: 0x%x\n", 1620 ntohl(ipha->ipha_dst))); 1621 1622 /* 1623 * After reassembly and IPsec we will need to duplicate the 1624 * multicast packet for all matching zones on the ill. 1625 */ 1626 ira->ira_zoneid = ALL_ZONES; 1627 1628 /* Reassemble on the ill on which the packet arrived */ 1629 ip_input_local_v4(ire, mp, ipha, ira); 1630 done: 1631 if (ill != ire->ire_ill) { 1632 ill_refrele(ill); 1633 ira->ira_ill = ire->ire_ill; 1634 ira->ira_ruifindex = ira->ira_ill->ill_phyint->phyint_ifindex; 1635 } 1636 } 1637 1638 /* 1639 * ire_recvfn for IRE_OFFLINK with RTF_MULTIRT. 1640 * Drop packets since we don't forward out multirt routes. 1641 */ 1642 /* ARGSUSED */ 1643 void 1644 ire_recv_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira) 1645 { 1646 ill_t *ill = ira->ira_ill; 1647 1648 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInNoRoutes); 1649 ip_drop_input("Not forwarding out MULTIRT", mp, ill); 1650 freemsg(mp); 1651 } 1652 1653 /* 1654 * ire_recvfn for IRE_LOOPBACK. This is only used when a FW_HOOK 1655 * has rewritten the packet to have a loopback destination address (We 1656 * filter out packet with a loopback destination from arriving over the wire). 1657 * We don't know what zone to use, thus we always use the GLOBAL_ZONEID. 1658 */ 1659 void 1660 ire_recv_loopback_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira) 1661 { 1662 ipha_t *ipha = (ipha_t *)iph_arg; 1663 ill_t *ill = ira->ira_ill; 1664 ill_t *ire_ill = ire->ire_ill; 1665 1666 ira->ira_zoneid = GLOBAL_ZONEID; 1667 1668 /* Switch to the lo0 ill for further processing */ 1669 if (ire_ill != ill) { 1670 /* 1671 * Update ira_ill to be the ILL on which the IP address 1672 * is hosted. 1673 * No need to hold the ill since we have a hold on the ire 1674 */ 1675 ASSERT(ira->ira_ill == ira->ira_rill); 1676 ira->ira_ill = ire_ill; 1677 1678 ip_input_local_v4(ire, mp, ipha, ira); 1679 1680 /* Restore */ 1681 ASSERT(ira->ira_ill == ire_ill); 1682 ira->ira_ill = ill; 1683 return; 1684 1685 } 1686 ip_input_local_v4(ire, mp, ipha, ira); 1687 } 1688 1689 /* 1690 * ire_recvfn for IRE_LOCAL. 1691 */ 1692 void 1693 ire_recv_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira) 1694 { 1695 ipha_t *ipha = (ipha_t *)iph_arg; 1696 ill_t *ill = ira->ira_ill; 1697 ill_t *ire_ill = ire->ire_ill; 1698 1699 /* Make a note for DAD that this address is in use */ 1700 ire->ire_last_used_time = LBOLT_FASTPATH; 1701 1702 /* Only target the IRE_LOCAL with the right zoneid. */ 1703 ira->ira_zoneid = ire->ire_zoneid; 1704 1705 /* 1706 * If the packet arrived on the wrong ill, we check that 1707 * this is ok. 1708 * If it is, then we ensure that we do the reassembly on 1709 * the ill on which the address is hosted. We keep ira_rill as 1710 * the one on which the packet arrived, so that IP_PKTINFO and 1711 * friends can report this. 1712 */ 1713 if (ire_ill != ill) { 1714 ire_t *new_ire; 1715 1716 new_ire = ip_check_multihome(&ipha->ipha_dst, ire, ill); 1717 if (new_ire == NULL) { 1718 /* Drop packet */ 1719 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); 1720 ip_drop_input("ipIfStatsInForwProhibits", mp, ill); 1721 freemsg(mp); 1722 return; 1723 } 1724 /* 1725 * Update ira_ill to be the ILL on which the IP address 1726 * is hosted. No need to hold the ill since we have a 1727 * hold on the ire. Note that we do the switch even if 1728 * new_ire == ire (for IPMP, ire would be the one corresponding 1729 * to the IPMP ill). 1730 */ 1731 ASSERT(ira->ira_ill == ira->ira_rill); 1732 ira->ira_ill = new_ire->ire_ill; 1733 1734 /* ira_ruifindex tracks the upper for ira_rill */ 1735 if (IS_UNDER_IPMP(ill)) 1736 ira->ira_ruifindex = ill_get_upper_ifindex(ill); 1737 1738 ip_input_local_v4(new_ire, mp, ipha, ira); 1739 1740 /* Restore */ 1741 ASSERT(ira->ira_ill == new_ire->ire_ill); 1742 ira->ira_ill = ill; 1743 ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex; 1744 1745 if (new_ire != ire) 1746 ire_refrele(new_ire); 1747 return; 1748 } 1749 1750 ip_input_local_v4(ire, mp, ipha, ira); 1751 } 1752 1753 /* 1754 * Common function for packets arriving for the host. Handles 1755 * checksum verification, reassembly checks, etc. 1756 */ 1757 static void 1758 ip_input_local_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira) 1759 { 1760 ill_t *ill = ira->ira_ill; 1761 iaflags_t iraflags = ira->ira_flags; 1762 1763 /* 1764 * Verify IP header checksum. If the packet was AH or ESP then 1765 * this flag has already been cleared. Likewise if the packet 1766 * had a hardware checksum. 1767 */ 1768 if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) { 1769 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); 1770 ip_drop_input("ipIfStatsInCksumErrs", mp, ill); 1771 freemsg(mp); 1772 return; 1773 } 1774 1775 if (iraflags & IRAF_IPV4_OPTIONS) { 1776 if (!ip_input_local_options(mp, ipha, ira)) { 1777 /* Error has been sent and mp consumed */ 1778 return; 1779 } 1780 /* 1781 * Some old hardware does partial checksum by including the 1782 * whole IP header, so the partial checksum value might have 1783 * become invalid if any option in the packet have been 1784 * updated. Always clear partial checksum flag here. 1785 */ 1786 DB_CKSUMFLAGS(mp) &= ~HCK_PARTIALCKSUM; 1787 } 1788 1789 /* 1790 * Is packet part of fragmented IP packet? 1791 * We compare against defined values in network byte order 1792 */ 1793 if (ipha->ipha_fragment_offset_and_flags & 1794 (IPH_MF_HTONS | IPH_OFFSET_HTONS)) { 1795 /* 1796 * Make sure we have ira_l2src before we loose the original 1797 * mblk 1798 */ 1799 if (!(ira->ira_flags & IRAF_L2SRC_SET)) 1800 ip_setl2src(mp, ira, ira->ira_rill); 1801 1802 mp = ip_input_fragment(mp, ipha, ira); 1803 if (mp == NULL) 1804 return; 1805 /* Completed reassembly */ 1806 ipha = (ipha_t *)mp->b_rptr; 1807 } 1808 1809 /* 1810 * For broadcast and multicast we need some extra work before 1811 * we call ip_fanout_v4(), since in the case of shared-IP zones 1812 * we need to pretend that a packet arrived for each zoneid. 1813 */ 1814 if (iraflags & IRAF_MULTIBROADCAST) { 1815 if (iraflags & IRAF_BROADCAST) 1816 ip_input_broadcast_v4(ire, mp, ipha, ira); 1817 else 1818 ip_input_multicast_v4(ire, mp, ipha, ira); 1819 return; 1820 } 1821 ip_fanout_v4(mp, ipha, ira); 1822 } 1823 1824 1825 /* 1826 * Handle multiple zones which match the same broadcast address 1827 * and ill by delivering a packet to each of them. 1828 * Walk the bucket and look for different ire_zoneid but otherwise 1829 * the same IRE (same ill/addr/mask/type). 1830 * Note that ire_add() tracks IREs that are identical in all 1831 * fields (addr/mask/type/gw/ill/zoneid) within a single IRE by 1832 * increasing ire_identical_cnt. Thus we don't need to be concerned 1833 * about those. 1834 */ 1835 static void 1836 ip_input_broadcast_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira) 1837 { 1838 ill_t *ill = ira->ira_ill; 1839 ip_stack_t *ipst = ill->ill_ipst; 1840 netstack_t *ns = ipst->ips_netstack; 1841 irb_t *irb; 1842 ire_t *ire1; 1843 mblk_t *mp1; 1844 ipha_t *ipha1; 1845 uint_t ira_pktlen = ira->ira_pktlen; 1846 uint16_t ira_ip_hdr_length = ira->ira_ip_hdr_length; 1847 1848 irb = ire->ire_bucket; 1849 1850 /* 1851 * If we don't have more than one shared-IP zone, or if 1852 * there can't be more than one IRE_BROADCAST for this 1853 * IP address, then just set the zoneid and proceed. 1854 */ 1855 if (ns->netstack_numzones == 1 || irb->irb_ire_cnt == 1) { 1856 ira->ira_zoneid = ire->ire_zoneid; 1857 1858 ip_fanout_v4(mp, ipha, ira); 1859 return; 1860 } 1861 irb_refhold(irb); 1862 for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 1863 /* We do the main IRE after the end of the loop */ 1864 if (ire1 == ire) 1865 continue; 1866 1867 /* 1868 * Only IREs for the same IP address should be in the same 1869 * bucket. 1870 * But could have IRE_HOSTs in the case of CGTP. 1871 */ 1872 ASSERT(ire1->ire_addr == ire->ire_addr); 1873 if (!(ire1->ire_type & IRE_BROADCAST)) 1874 continue; 1875 1876 if (IRE_IS_CONDEMNED(ire1)) 1877 continue; 1878 1879 mp1 = copymsg(mp); 1880 if (mp1 == NULL) { 1881 /* Failed to deliver to one zone */ 1882 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1883 ip_drop_input("ipIfStatsInDiscards", mp, ill); 1884 continue; 1885 } 1886 ira->ira_zoneid = ire1->ire_zoneid; 1887 ipha1 = (ipha_t *)mp1->b_rptr; 1888 ip_fanout_v4(mp1, ipha1, ira); 1889 /* 1890 * IPsec might have modified ira_pktlen and ira_ip_hdr_length 1891 * so we restore them for a potential next iteration 1892 */ 1893 ira->ira_pktlen = ira_pktlen; 1894 ira->ira_ip_hdr_length = ira_ip_hdr_length; 1895 } 1896 irb_refrele(irb); 1897 /* Do the main ire */ 1898 ira->ira_zoneid = ire->ire_zoneid; 1899 ip_fanout_v4(mp, ipha, ira); 1900 } 1901 1902 /* 1903 * Handle multiple zones which want to receive the same multicast packets 1904 * on this ill by delivering a packet to each of them. 1905 * 1906 * Note that for packets delivered to transports we could instead do this 1907 * as part of the fanout code, but since we need to handle icmp_inbound 1908 * it is simpler to have multicast work the same as broadcast. 1909 * 1910 * The ip_fanout matching for multicast matches based on ilm independent of 1911 * zoneid since the zoneid restriction is applied when joining a multicast 1912 * group. 1913 */ 1914 /* ARGSUSED */ 1915 static void 1916 ip_input_multicast_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira) 1917 { 1918 ill_t *ill = ira->ira_ill; 1919 iaflags_t iraflags = ira->ira_flags; 1920 ip_stack_t *ipst = ill->ill_ipst; 1921 netstack_t *ns = ipst->ips_netstack; 1922 zoneid_t zoneid; 1923 mblk_t *mp1; 1924 ipha_t *ipha1; 1925 uint_t ira_pktlen = ira->ira_pktlen; 1926 uint16_t ira_ip_hdr_length = ira->ira_ip_hdr_length; 1927 1928 /* ire_recv_multicast has switched to the upper ill for IPMP */ 1929 ASSERT(!IS_UNDER_IPMP(ill)); 1930 1931 /* 1932 * If we don't have more than one shared-IP zone, or if 1933 * there are no members in anything but the global zone, 1934 * then just set the zoneid and proceed. 1935 */ 1936 if (ns->netstack_numzones == 1 || 1937 !ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst, 1938 GLOBAL_ZONEID)) { 1939 ira->ira_zoneid = GLOBAL_ZONEID; 1940 1941 /* If sender didn't want this zone to receive it, drop */ 1942 if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) && 1943 ira->ira_no_loop_zoneid == ira->ira_zoneid) { 1944 ip_drop_input("Multicast but wrong zoneid", mp, ill); 1945 freemsg(mp); 1946 return; 1947 } 1948 ip_fanout_v4(mp, ipha, ira); 1949 return; 1950 } 1951 1952 /* 1953 * Here we loop over all zoneids that have members in the group 1954 * and deliver a packet to ip_fanout for each zoneid. 1955 * 1956 * First find any members in the lowest numeric zoneid by looking for 1957 * first zoneid larger than -1 (ALL_ZONES). 1958 * We terminate the loop when we receive -1 (ALL_ZONES). 1959 */ 1960 zoneid = ill_hasmembers_nextzone_v4(ill, ipha->ipha_dst, ALL_ZONES); 1961 for (; zoneid != ALL_ZONES; 1962 zoneid = ill_hasmembers_nextzone_v4(ill, ipha->ipha_dst, zoneid)) { 1963 /* 1964 * Avoid an extra copymsg/freemsg by skipping global zone here 1965 * and doing that at the end. 1966 */ 1967 if (zoneid == GLOBAL_ZONEID) 1968 continue; 1969 1970 ira->ira_zoneid = zoneid; 1971 1972 /* If sender didn't want this zone to receive it, skip */ 1973 if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) && 1974 ira->ira_no_loop_zoneid == ira->ira_zoneid) 1975 continue; 1976 1977 mp1 = copymsg(mp); 1978 if (mp1 == NULL) { 1979 /* Failed to deliver to one zone */ 1980 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1981 ip_drop_input("ipIfStatsInDiscards", mp, ill); 1982 continue; 1983 } 1984 ipha1 = (ipha_t *)mp1->b_rptr; 1985 ip_fanout_v4(mp1, ipha1, ira); 1986 /* 1987 * IPsec might have modified ira_pktlen and ira_ip_hdr_length 1988 * so we restore them for a potential next iteration 1989 */ 1990 ira->ira_pktlen = ira_pktlen; 1991 ira->ira_ip_hdr_length = ira_ip_hdr_length; 1992 } 1993 1994 /* Do the main ire */ 1995 ira->ira_zoneid = GLOBAL_ZONEID; 1996 /* If sender didn't want this zone to receive it, drop */ 1997 if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) && 1998 ira->ira_no_loop_zoneid == ira->ira_zoneid) { 1999 ip_drop_input("Multicast but wrong zoneid", mp, ill); 2000 freemsg(mp); 2001 } else { 2002 ip_fanout_v4(mp, ipha, ira); 2003 } 2004 } 2005 2006 2007 /* 2008 * Determine the zoneid and IRAF_TX_* flags if trusted extensions 2009 * is in use. Updates ira_zoneid and ira_flags as a result. 2010 */ 2011 static void 2012 ip_fanout_tx_v4(mblk_t *mp, ipha_t *ipha, uint8_t protocol, 2013 uint_t ip_hdr_length, ip_recv_attr_t *ira) 2014 { 2015 uint16_t *up; 2016 uint16_t lport; 2017 zoneid_t zoneid; 2018 2019 ASSERT(ira->ira_flags & IRAF_SYSTEM_LABELED); 2020 2021 /* 2022 * If the packet is unlabeled we might allow read-down 2023 * for MAC_EXEMPT. Below we clear this if it is a multi-level 2024 * port (MLP). 2025 * Note that ira_tsl can be NULL here. 2026 */ 2027 if (ira->ira_tsl != NULL && ira->ira_tsl->tsl_flags & TSLF_UNLABELED) 2028 ira->ira_flags |= IRAF_TX_MAC_EXEMPTABLE; 2029 2030 if (ira->ira_zoneid != ALL_ZONES) 2031 return; 2032 2033 ira->ira_flags |= IRAF_TX_SHARED_ADDR; 2034 2035 up = (uint16_t *)((uchar_t *)ipha + ip_hdr_length); 2036 switch (protocol) { 2037 case IPPROTO_TCP: 2038 case IPPROTO_SCTP: 2039 case IPPROTO_UDP: 2040 /* Caller ensures this */ 2041 ASSERT(((uchar_t *)ipha) + ip_hdr_length +4 <= mp->b_wptr); 2042 2043 /* 2044 * Only these transports support MLP. 2045 * We know their destination port numbers is in 2046 * the same place in the header. 2047 */ 2048 lport = up[1]; 2049 2050 /* 2051 * No need to handle exclusive-stack zones 2052 * since ALL_ZONES only applies to the shared IP instance. 2053 */ 2054 zoneid = tsol_mlp_findzone(protocol, lport); 2055 /* 2056 * If no shared MLP is found, tsol_mlp_findzone returns 2057 * ALL_ZONES. In that case, we assume it's SLP, and 2058 * search for the zone based on the packet label. 2059 * 2060 * If there is such a zone, we prefer to find a 2061 * connection in it. Otherwise, we look for a 2062 * MAC-exempt connection in any zone whose label 2063 * dominates the default label on the packet. 2064 */ 2065 if (zoneid == ALL_ZONES) 2066 zoneid = tsol_attr_to_zoneid(ira); 2067 else 2068 ira->ira_flags &= ~IRAF_TX_MAC_EXEMPTABLE; 2069 break; 2070 default: 2071 /* Handle shared address for other protocols */ 2072 zoneid = tsol_attr_to_zoneid(ira); 2073 break; 2074 } 2075 ira->ira_zoneid = zoneid; 2076 } 2077 2078 /* 2079 * Increment checksum failure statistics 2080 */ 2081 static void 2082 ip_input_cksum_err_v4(uint8_t protocol, uint16_t hck_flags, ill_t *ill) 2083 { 2084 ip_stack_t *ipst = ill->ill_ipst; 2085 2086 switch (protocol) { 2087 case IPPROTO_TCP: 2088 BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs); 2089 2090 if (hck_flags & HCK_FULLCKSUM) 2091 IP_STAT(ipst, ip_tcp_in_full_hw_cksum_err); 2092 else if (hck_flags & HCK_PARTIALCKSUM) 2093 IP_STAT(ipst, ip_tcp_in_part_hw_cksum_err); 2094 else 2095 IP_STAT(ipst, ip_tcp_in_sw_cksum_err); 2096 break; 2097 case IPPROTO_UDP: 2098 BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs); 2099 if (hck_flags & HCK_FULLCKSUM) 2100 IP_STAT(ipst, ip_udp_in_full_hw_cksum_err); 2101 else if (hck_flags & HCK_PARTIALCKSUM) 2102 IP_STAT(ipst, ip_udp_in_part_hw_cksum_err); 2103 else 2104 IP_STAT(ipst, ip_udp_in_sw_cksum_err); 2105 break; 2106 case IPPROTO_ICMP: 2107 BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs); 2108 break; 2109 default: 2110 ASSERT(0); 2111 break; 2112 } 2113 } 2114 2115 /* Calculate the IPv4 pseudo-header checksum */ 2116 uint32_t 2117 ip_input_cksum_pseudo_v4(ipha_t *ipha, ip_recv_attr_t *ira) 2118 { 2119 uint_t ulp_len; 2120 uint32_t cksum; 2121 uint8_t protocol = ira->ira_protocol; 2122 uint16_t ip_hdr_length = ira->ira_ip_hdr_length; 2123 2124 #define iphs ((uint16_t *)ipha) 2125 2126 switch (protocol) { 2127 case IPPROTO_TCP: 2128 ulp_len = ira->ira_pktlen - ip_hdr_length; 2129 2130 /* Protocol and length */ 2131 cksum = htons(ulp_len) + IP_TCP_CSUM_COMP; 2132 /* IP addresses */ 2133 cksum += iphs[6] + iphs[7] + iphs[8] + iphs[9]; 2134 break; 2135 2136 case IPPROTO_UDP: { 2137 udpha_t *udpha; 2138 2139 udpha = (udpha_t *)((uchar_t *)ipha + ip_hdr_length); 2140 2141 /* Protocol and length */ 2142 cksum = udpha->uha_length + IP_UDP_CSUM_COMP; 2143 /* IP addresses */ 2144 cksum += iphs[6] + iphs[7] + iphs[8] + iphs[9]; 2145 break; 2146 } 2147 2148 default: 2149 cksum = 0; 2150 break; 2151 } 2152 #undef iphs 2153 return (cksum); 2154 } 2155 2156 2157 /* 2158 * Software verification of the ULP checksums. 2159 * Returns B_TRUE if ok. 2160 * Increments statistics of failed. 2161 */ 2162 static boolean_t 2163 ip_input_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira) 2164 { 2165 ip_stack_t *ipst = ira->ira_ill->ill_ipst; 2166 uint32_t cksum; 2167 uint8_t protocol = ira->ira_protocol; 2168 uint16_t ip_hdr_length = ira->ira_ip_hdr_length; 2169 2170 IP_STAT(ipst, ip_in_sw_cksum); 2171 2172 ASSERT(protocol == IPPROTO_TCP || protocol == IPPROTO_UDP); 2173 2174 cksum = ip_input_cksum_pseudo_v4(ipha, ira); 2175 cksum = IP_CSUM(mp, ip_hdr_length, cksum); 2176 if (cksum == 0) 2177 return (B_TRUE); 2178 2179 ip_input_cksum_err_v4(protocol, 0, ira->ira_ill); 2180 return (B_FALSE); 2181 } 2182 2183 /* 2184 * Verify the ULP checksums. 2185 * Returns B_TRUE if ok, or if the ULP doesn't have a well-defined checksum 2186 * algorithm. 2187 * Increments statistics if failed. 2188 */ 2189 static boolean_t 2190 ip_input_cksum_v4(iaflags_t iraflags, mblk_t *mp, ipha_t *ipha, 2191 ip_recv_attr_t *ira) 2192 { 2193 ill_t *ill = ira->ira_rill; 2194 uint16_t hck_flags; 2195 uint32_t cksum; 2196 mblk_t *mp1; 2197 int32_t len; 2198 uint8_t protocol = ira->ira_protocol; 2199 uint16_t ip_hdr_length = ira->ira_ip_hdr_length; 2200 2201 2202 switch (protocol) { 2203 case IPPROTO_TCP: 2204 break; 2205 2206 case IPPROTO_UDP: { 2207 udpha_t *udpha; 2208 2209 udpha = (udpha_t *)((uchar_t *)ipha + ip_hdr_length); 2210 if (udpha->uha_checksum == 0) { 2211 /* Packet doesn't have a UDP checksum */ 2212 return (B_TRUE); 2213 } 2214 break; 2215 } 2216 case IPPROTO_SCTP: { 2217 sctp_hdr_t *sctph; 2218 uint32_t pktsum; 2219 2220 sctph = (sctp_hdr_t *)((uchar_t *)ipha + ip_hdr_length); 2221 #ifdef DEBUG 2222 if (skip_sctp_cksum) 2223 return (B_TRUE); 2224 #endif 2225 pktsum = sctph->sh_chksum; 2226 sctph->sh_chksum = 0; 2227 cksum = sctp_cksum(mp, ip_hdr_length); 2228 sctph->sh_chksum = pktsum; 2229 if (cksum == pktsum) 2230 return (B_TRUE); 2231 2232 /* 2233 * Defer until later whether a bad checksum is ok 2234 * in order to allow RAW sockets to use Adler checksum 2235 * with SCTP. 2236 */ 2237 ira->ira_flags |= IRAF_SCTP_CSUM_ERR; 2238 return (B_TRUE); 2239 } 2240 2241 default: 2242 /* No ULP checksum to verify. */ 2243 return (B_TRUE); 2244 } 2245 /* 2246 * Revert to software checksum calculation if the interface 2247 * isn't capable of checksum offload. 2248 * We clear DB_CKSUMFLAGS when going through IPsec in ip_fanout. 2249 * Note: IRAF_NO_HW_CKSUM is not currently used. 2250 */ 2251 ASSERT(!IS_IPMP(ill)); 2252 if ((iraflags & IRAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) || 2253 !dohwcksum) { 2254 return (ip_input_sw_cksum_v4(mp, ipha, ira)); 2255 } 2256 2257 /* 2258 * We apply this for all ULP protocols. Does the HW know to 2259 * not set the flags for SCTP and other protocols. 2260 */ 2261 2262 hck_flags = DB_CKSUMFLAGS(mp); 2263 2264 if (hck_flags & HCK_FULLCKSUM_OK) { 2265 /* 2266 * Hardware has already verified the checksum. 2267 */ 2268 return (B_TRUE); 2269 } 2270 2271 if (hck_flags & HCK_FULLCKSUM) { 2272 /* 2273 * Full checksum has been computed by the hardware 2274 * and has been attached. If the driver wants us to 2275 * verify the correctness of the attached value, in 2276 * order to protect against faulty hardware, compare 2277 * it against -0 (0xFFFF) to see if it's valid. 2278 */ 2279 cksum = DB_CKSUM16(mp); 2280 if (cksum == 0xFFFF) 2281 return (B_TRUE); 2282 ip_input_cksum_err_v4(protocol, hck_flags, ira->ira_ill); 2283 return (B_FALSE); 2284 } 2285 2286 mp1 = mp->b_cont; 2287 if ((hck_flags & HCK_PARTIALCKSUM) && 2288 (mp1 == NULL || mp1->b_cont == NULL) && 2289 ip_hdr_length >= DB_CKSUMSTART(mp) && 2290 ((len = ip_hdr_length - DB_CKSUMSTART(mp)) & 1) == 0) { 2291 uint32_t adj; 2292 uchar_t *cksum_start; 2293 2294 cksum = ip_input_cksum_pseudo_v4(ipha, ira); 2295 2296 cksum_start = ((uchar_t *)ipha + DB_CKSUMSTART(mp)); 2297 2298 /* 2299 * Partial checksum has been calculated by hardware 2300 * and attached to the packet; in addition, any 2301 * prepended extraneous data is even byte aligned, 2302 * and there are at most two mblks associated with 2303 * the packet. If any such data exists, we adjust 2304 * the checksum; also take care any postpended data. 2305 */ 2306 IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, len, adj); 2307 /* 2308 * One's complement subtract extraneous checksum 2309 */ 2310 cksum += DB_CKSUM16(mp); 2311 if (adj >= cksum) 2312 cksum = ~(adj - cksum) & 0xFFFF; 2313 else 2314 cksum -= adj; 2315 cksum = (cksum & 0xFFFF) + ((int)cksum >> 16); 2316 cksum = (cksum & 0xFFFF) + ((int)cksum >> 16); 2317 if (!(~cksum & 0xFFFF)) 2318 return (B_TRUE); 2319 2320 ip_input_cksum_err_v4(protocol, hck_flags, ira->ira_ill); 2321 return (B_FALSE); 2322 } 2323 return (ip_input_sw_cksum_v4(mp, ipha, ira)); 2324 } 2325 2326 2327 /* 2328 * Handle fanout of received packets. 2329 * Unicast packets that are looped back (from ire_send_local_v4) and packets 2330 * from the wire are differentiated by checking IRAF_VERIFY_ULP_CKSUM. 2331 * 2332 * IPQoS Notes 2333 * Before sending it to the client, invoke IPPF processing. Policy processing 2334 * takes place only if the callout_position, IPP_LOCAL_IN, is enabled. 2335 */ 2336 void 2337 ip_fanout_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira) 2338 { 2339 ill_t *ill = ira->ira_ill; 2340 iaflags_t iraflags = ira->ira_flags; 2341 ip_stack_t *ipst = ill->ill_ipst; 2342 uint8_t protocol = ipha->ipha_protocol; 2343 conn_t *connp; 2344 #define rptr ((uchar_t *)ipha) 2345 uint_t ip_hdr_length; 2346 uint_t min_ulp_header_length; 2347 int offset; 2348 ssize_t len; 2349 netstack_t *ns = ipst->ips_netstack; 2350 ipsec_stack_t *ipss = ns->netstack_ipsec; 2351 ill_t *rill = ira->ira_rill; 2352 2353 ASSERT(ira->ira_pktlen == ntohs(ipha->ipha_length)); 2354 2355 ip_hdr_length = ira->ira_ip_hdr_length; 2356 ira->ira_protocol = protocol; 2357 2358 /* 2359 * Time for IPP once we've done reassembly and IPsec. 2360 * We skip this for loopback packets since we don't do IPQoS 2361 * on loopback. 2362 */ 2363 if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && 2364 !(iraflags & IRAF_LOOPBACK) && 2365 (protocol != IPPROTO_ESP || protocol != IPPROTO_AH)) { 2366 /* 2367 * Use the interface on which the packet arrived - not where 2368 * the IP address is hosted. 2369 */ 2370 /* ip_process translates an IS_UNDER_IPMP */ 2371 mp = ip_process(IPP_LOCAL_IN, mp, rill, ill); 2372 if (mp == NULL) { 2373 /* ip_drop_packet and MIB done */ 2374 return; 2375 } 2376 } 2377 2378 /* Determine the minimum required size of the upper-layer header */ 2379 /* Need to do this for at least the set of ULPs that TX handles. */ 2380 switch (protocol) { 2381 case IPPROTO_TCP: 2382 min_ulp_header_length = TCP_MIN_HEADER_LENGTH; 2383 break; 2384 case IPPROTO_SCTP: 2385 min_ulp_header_length = SCTP_COMMON_HDR_LENGTH; 2386 break; 2387 case IPPROTO_UDP: 2388 min_ulp_header_length = UDPH_SIZE; 2389 break; 2390 case IPPROTO_ICMP: 2391 min_ulp_header_length = ICMPH_SIZE; 2392 break; 2393 case IPPROTO_DCCP: 2394 min_ulp_header_length = DCCP_MIN_HEADER_LENGTH; 2395 break; 2396 default: 2397 min_ulp_header_length = 0; 2398 break; 2399 } 2400 /* Make sure we have the min ULP header length */ 2401 len = mp->b_wptr - rptr; 2402 if (len < ip_hdr_length + min_ulp_header_length) { 2403 if (ira->ira_pktlen < ip_hdr_length + min_ulp_header_length) { 2404 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); 2405 ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill); 2406 freemsg(mp); 2407 return; 2408 } 2409 IP_STAT(ipst, ip_recv_pullup); 2410 ipha = ip_pullup(mp, ip_hdr_length + min_ulp_header_length, 2411 ira); 2412 if (ipha == NULL) 2413 goto discard; 2414 len = mp->b_wptr - rptr; 2415 } 2416 2417 /* 2418 * If trusted extensions then determine the zoneid and TX specific 2419 * ira_flags. 2420 */ 2421 if (iraflags & IRAF_SYSTEM_LABELED) { 2422 /* This can update ira->ira_flags and ira->ira_zoneid */ 2423 ip_fanout_tx_v4(mp, ipha, protocol, ip_hdr_length, ira); 2424 iraflags = ira->ira_flags; 2425 } 2426 2427 2428 /* Verify ULP checksum. Handles TCP, UDP, and SCTP */ 2429 if (iraflags & IRAF_VERIFY_ULP_CKSUM) { 2430 if (!ip_input_cksum_v4(iraflags, mp, ipha, ira)) { 2431 /* Bad checksum. Stats are already incremented */ 2432 ip_drop_input("Bad ULP checksum", mp, ill); 2433 freemsg(mp); 2434 return; 2435 } 2436 /* IRAF_SCTP_CSUM_ERR could have been set */ 2437 iraflags = ira->ira_flags; 2438 } 2439 switch (protocol) { 2440 case IPPROTO_TCP: 2441 /* For TCP, discard broadcast and multicast packets. */ 2442 if (iraflags & IRAF_MULTIBROADCAST) 2443 goto discard; 2444 2445 /* First mblk contains IP+TCP headers per above check */ 2446 ASSERT(len >= ip_hdr_length + TCP_MIN_HEADER_LENGTH); 2447 2448 /* TCP options present? */ 2449 offset = ((uchar_t *)ipha)[ip_hdr_length + 12] >> 4; 2450 if (offset != 5) { 2451 if (offset < 5) 2452 goto discard; 2453 2454 /* 2455 * There must be TCP options. 2456 * Make sure we can grab them. 2457 */ 2458 offset <<= 2; 2459 offset += ip_hdr_length; 2460 if (len < offset) { 2461 if (ira->ira_pktlen < offset) { 2462 BUMP_MIB(ill->ill_ip_mib, 2463 ipIfStatsInTruncatedPkts); 2464 ip_drop_input( 2465 "ipIfStatsInTruncatedPkts", 2466 mp, ill); 2467 freemsg(mp); 2468 return; 2469 } 2470 IP_STAT(ipst, ip_recv_pullup); 2471 ipha = ip_pullup(mp, offset, ira); 2472 if (ipha == NULL) 2473 goto discard; 2474 len = mp->b_wptr - rptr; 2475 } 2476 } 2477 2478 /* 2479 * Pass up a squeue hint to tcp. 2480 * If ira_sqp is already set (this is loopback) we leave it 2481 * alone. 2482 */ 2483 if (ira->ira_sqp == NULL) { 2484 ira->ira_sqp = ip_squeue_get(ira->ira_ring); 2485 } 2486 2487 /* Look for AF_INET or AF_INET6 that matches */ 2488 connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_length, 2489 ira, ipst); 2490 if (connp == NULL) { 2491 /* Send the TH_RST */ 2492 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 2493 tcp_xmit_listeners_reset(mp, ira, ipst, NULL); 2494 return; 2495 } 2496 if (connp->conn_incoming_ifindex != 0 && 2497 connp->conn_incoming_ifindex != ira->ira_ruifindex) { 2498 CONN_DEC_REF(connp); 2499 2500 /* Send the TH_RST */ 2501 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 2502 tcp_xmit_listeners_reset(mp, ira, ipst, NULL); 2503 return; 2504 } 2505 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || 2506 (iraflags & IRAF_IPSEC_SECURE)) { 2507 mp = ipsec_check_inbound_policy(mp, connp, 2508 ipha, NULL, ira); 2509 if (mp == NULL) { 2510 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2511 /* Note that mp is NULL */ 2512 ip_drop_input("ipIfStatsInDiscards", mp, ill); 2513 CONN_DEC_REF(connp); 2514 return; 2515 } 2516 } 2517 /* Found a client; up it goes */ 2518 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 2519 ira->ira_ill = ira->ira_rill = NULL; 2520 if (!IPCL_IS_TCP(connp)) { 2521 /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */ 2522 (connp->conn_recv)(connp, mp, NULL, ira); 2523 CONN_DEC_REF(connp); 2524 ira->ira_ill = ill; 2525 ira->ira_rill = rill; 2526 return; 2527 } 2528 2529 /* 2530 * We do different processing whether called from 2531 * ip_accept_tcp and we match the target, don't match 2532 * the target, and when we are called by ip_input. 2533 */ 2534 if (iraflags & IRAF_TARGET_SQP) { 2535 if (ira->ira_target_sqp == connp->conn_sqp) { 2536 mblk_t *attrmp; 2537 2538 attrmp = ip_recv_attr_to_mblk(ira); 2539 if (attrmp == NULL) { 2540 BUMP_MIB(ill->ill_ip_mib, 2541 ipIfStatsInDiscards); 2542 ip_drop_input("ipIfStatsInDiscards", 2543 mp, ill); 2544 freemsg(mp); 2545 CONN_DEC_REF(connp); 2546 } else { 2547 SET_SQUEUE(attrmp, connp->conn_recv, 2548 connp); 2549 attrmp->b_cont = mp; 2550 ASSERT(ira->ira_target_sqp_mp == NULL); 2551 ira->ira_target_sqp_mp = attrmp; 2552 /* 2553 * Conn ref release when drained from 2554 * the squeue. 2555 */ 2556 } 2557 } else { 2558 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, 2559 connp->conn_recv, connp, ira, SQ_FILL, 2560 SQTAG_IP_TCP_INPUT); 2561 } 2562 } else { 2563 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, 2564 connp, ira, ip_squeue_flag, SQTAG_IP_TCP_INPUT); 2565 } 2566 ira->ira_ill = ill; 2567 ira->ira_rill = rill; 2568 return; 2569 2570 case IPPROTO_SCTP: { 2571 sctp_hdr_t *sctph; 2572 in6_addr_t map_src, map_dst; 2573 uint32_t ports; /* Source and destination ports */ 2574 sctp_stack_t *sctps = ipst->ips_netstack->netstack_sctp; 2575 2576 /* For SCTP, discard broadcast and multicast packets. */ 2577 if (iraflags & IRAF_MULTIBROADCAST) 2578 goto discard; 2579 2580 /* 2581 * Since there is no SCTP h/w cksum support yet, just 2582 * clear the flag. 2583 */ 2584 DB_CKSUMFLAGS(mp) = 0; 2585 2586 /* Length ensured above */ 2587 ASSERT(MBLKL(mp) >= ip_hdr_length + SCTP_COMMON_HDR_LENGTH); 2588 sctph = (sctp_hdr_t *)(rptr + ip_hdr_length); 2589 2590 /* get the ports */ 2591 ports = *(uint32_t *)&sctph->sh_sport; 2592 2593 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_dst); 2594 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_src); 2595 if (iraflags & IRAF_SCTP_CSUM_ERR) { 2596 /* 2597 * No potential sctp checksum errors go to the Sun 2598 * sctp stack however they might be Adler-32 summed 2599 * packets a userland stack bound to a raw IP socket 2600 * could reasonably use. Note though that Adler-32 is 2601 * a long deprecated algorithm and customer sctp 2602 * networks should eventually migrate to CRC-32 at 2603 * which time this facility should be removed. 2604 */ 2605 ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira); 2606 return; 2607 } 2608 connp = sctp_fanout(&map_src, &map_dst, ports, ira, mp, 2609 sctps, sctph); 2610 if (connp == NULL) { 2611 /* Check for raw socket or OOTB handling */ 2612 ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira); 2613 return; 2614 } 2615 if (connp->conn_incoming_ifindex != 0 && 2616 connp->conn_incoming_ifindex != ira->ira_ruifindex) { 2617 CONN_DEC_REF(connp); 2618 /* Check for raw socket or OOTB handling */ 2619 ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira); 2620 return; 2621 } 2622 2623 /* Found a client; up it goes */ 2624 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 2625 sctp_input(connp, ipha, NULL, mp, ira); 2626 /* sctp_input does a rele of the sctp_t */ 2627 return; 2628 } 2629 2630 case IPPROTO_UDP: 2631 /* First mblk contains IP+UDP headers as checked above */ 2632 ASSERT(MBLKL(mp) >= ip_hdr_length + UDPH_SIZE); 2633 2634 if (iraflags & IRAF_MULTIBROADCAST) { 2635 uint16_t *up; /* Pointer to ports in ULP header */ 2636 2637 up = (uint16_t *)((uchar_t *)ipha + ip_hdr_length); 2638 ip_fanout_udp_multi_v4(mp, ipha, up[1], up[0], ira); 2639 return; 2640 } 2641 2642 /* Look for AF_INET or AF_INET6 that matches */ 2643 connp = ipcl_classify_v4(mp, IPPROTO_UDP, ip_hdr_length, 2644 ira, ipst); 2645 if (connp == NULL) { 2646 no_udp_match: 2647 if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_UDP]. 2648 connf_head != NULL) { 2649 ASSERT(ira->ira_protocol == IPPROTO_UDP); 2650 ip_fanout_proto_v4(mp, ipha, ira); 2651 } else { 2652 ip_fanout_send_icmp_v4(mp, 2653 ICMP_DEST_UNREACHABLE, 2654 ICMP_PORT_UNREACHABLE, ira); 2655 } 2656 return; 2657 2658 } 2659 if (connp->conn_incoming_ifindex != 0 && 2660 connp->conn_incoming_ifindex != ira->ira_ruifindex) { 2661 CONN_DEC_REF(connp); 2662 goto no_udp_match; 2663 } 2664 if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld : 2665 !canputnext(connp->conn_rq)) { 2666 CONN_DEC_REF(connp); 2667 BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows); 2668 ip_drop_input("udpIfStatsInOverflows", mp, ill); 2669 freemsg(mp); 2670 return; 2671 } 2672 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || 2673 (iraflags & IRAF_IPSEC_SECURE)) { 2674 mp = ipsec_check_inbound_policy(mp, connp, 2675 ipha, NULL, ira); 2676 if (mp == NULL) { 2677 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2678 /* Note that mp is NULL */ 2679 ip_drop_input("ipIfStatsInDiscards", mp, ill); 2680 CONN_DEC_REF(connp); 2681 return; 2682 } 2683 } 2684 /* 2685 * Remove 0-spi if it's 0, or move everything behind 2686 * the UDP header over it and forward to ESP via 2687 * ip_fanout_v4(). 2688 */ 2689 if (connp->conn_udp->udp_nat_t_endpoint) { 2690 if (iraflags & IRAF_IPSEC_SECURE) { 2691 ip_drop_packet(mp, B_TRUE, ira->ira_ill, 2692 DROPPER(ipss, ipds_esp_nat_t_ipsec), 2693 &ipss->ipsec_dropper); 2694 CONN_DEC_REF(connp); 2695 return; 2696 } 2697 2698 mp = zero_spi_check(mp, ira); 2699 if (mp == NULL) { 2700 /* 2701 * Packet was consumed - probably sent to 2702 * ip_fanout_v4. 2703 */ 2704 CONN_DEC_REF(connp); 2705 return; 2706 } 2707 /* Else continue like a normal UDP packet. */ 2708 ipha = (ipha_t *)mp->b_rptr; 2709 protocol = ipha->ipha_protocol; 2710 ira->ira_protocol = protocol; 2711 } 2712 /* Found a client; up it goes */ 2713 IP_STAT(ipst, ip_udp_fannorm); 2714 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 2715 ira->ira_ill = ira->ira_rill = NULL; 2716 (connp->conn_recv)(connp, mp, NULL, ira); 2717 CONN_DEC_REF(connp); 2718 ira->ira_ill = ill; 2719 ira->ira_rill = rill; 2720 return; 2721 case IPPROTO_DCCP: 2722 /* For DCCP, discard broadcast and multicast packets */ 2723 if (iraflags & IRAF_MULTIBROADCAST) { 2724 goto discard; 2725 } 2726 2727 /* First mblk contains IP+DCCP headers per above check */ 2728 ASSERT(len >= ip_hdr_length + DCCP_MIN_HEADER_LENGTH); 2729 2730 /* Squeue hint */ 2731 if (ira->ira_sqp == NULL) { 2732 ira->ira_sqp = ip_squeue_get(ira->ira_ring); 2733 } 2734 2735 connp = ipcl_classify_v4(mp, IPPROTO_DCCP, ip_hdr_length, 2736 ira, ipst); 2737 if (connp == NULL) { 2738 cmn_err(CE_NOTE, "ip_input.c: ip_fanout_v4 connp not found"); 2739 /* Send the reset packet */ 2740 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 2741 dccp_xmit_listeners_reset(mp, ira, ipst, NULL); 2742 return; 2743 } 2744 2745 if (connp->conn_incoming_ifindex != 0 && 2746 connp->conn_incoming_ifindex != ira->ira_ruifindex) { 2747 cmn_err(CE_NOTE, "ip_input.c: ip_fanout_v4 ifindex problem"); 2748 /* Send the reset packet */ 2749 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 2750 dccp_xmit_listeners_reset(mp, ira, ipst, NULL); 2751 return; 2752 } 2753 2754 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || 2755 (iraflags & IRAF_IPSEC_SECURE)) { 2756 mp = ipsec_check_inbound_policy(mp, connp, 2757 ipha, NULL, ira); 2758 if (mp == NULL) { 2759 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2760 /* Note that mp is NULL */ 2761 ip_drop_input("ipIfStatsInDiscards", mp, ill); 2762 CONN_DEC_REF(connp); 2763 return; 2764 } 2765 } 2766 2767 /* Found a client; up it goes */ 2768 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 2769 ira->ira_ill = ira->ira_rill = NULL; 2770 2771 /* XXX SOCK_RAW for DCCP? */ 2772 2773 if (iraflags & IRAF_TARGET_SQP) { 2774 cmn_err(CE_NOTE, "IRAF_TARGET_SQP"); 2775 } else { 2776 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, 2777 connp, ira, ip_squeue_flag, SQTAG_IP_DCCP_INPUT); 2778 } 2779 2780 ira->ira_ill = ill; 2781 ira->ira_rill = rill; 2782 return; 2783 2784 default: 2785 break; 2786 } 2787 2788 /* 2789 * Clear hardware checksumming flag as it is currently only 2790 * used by TCP and UDP. 2791 */ 2792 DB_CKSUMFLAGS(mp) = 0; 2793 2794 switch (protocol) { 2795 case IPPROTO_ICMP: 2796 /* 2797 * We need to accomodate icmp messages coming in clear 2798 * until we get everything secure from the wire. If 2799 * icmp_accept_clear_messages is zero we check with 2800 * the global policy and act accordingly. If it is 2801 * non-zero, we accept the message without any checks. 2802 * But *this does not mean* that this will be delivered 2803 * to RAW socket clients. By accepting we might send 2804 * replies back, change our MTU value etc., 2805 * but delivery to the ULP/clients depends on their 2806 * policy dispositions. 2807 */ 2808 if (ipst->ips_icmp_accept_clear_messages == 0) { 2809 mp = ipsec_check_global_policy(mp, NULL, 2810 ipha, NULL, ira, ns); 2811 if (mp == NULL) 2812 return; 2813 } 2814 2815 /* 2816 * On a labeled system, we have to check whether the zone 2817 * itself is permitted to receive raw traffic. 2818 */ 2819 if (ira->ira_flags & IRAF_SYSTEM_LABELED) { 2820 if (!tsol_can_accept_raw(mp, ira, B_FALSE)) { 2821 BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); 2822 ip_drop_input("tsol_can_accept_raw", mp, ill); 2823 freemsg(mp); 2824 return; 2825 } 2826 } 2827 2828 /* 2829 * ICMP header checksum, including checksum field, 2830 * should be zero. 2831 */ 2832 if (IP_CSUM(mp, ip_hdr_length, 0)) { 2833 BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs); 2834 ip_drop_input("icmpInCksumErrs", mp, ill); 2835 freemsg(mp); 2836 return; 2837 } 2838 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 2839 mp = icmp_inbound_v4(mp, ira); 2840 if (mp == NULL) { 2841 /* No need to pass to RAW sockets */ 2842 return; 2843 } 2844 break; 2845 2846 case IPPROTO_IGMP: 2847 /* 2848 * If we are not willing to accept IGMP packets in clear, 2849 * then check with global policy. 2850 */ 2851 if (ipst->ips_igmp_accept_clear_messages == 0) { 2852 mp = ipsec_check_global_policy(mp, NULL, 2853 ipha, NULL, ira, ns); 2854 if (mp == NULL) 2855 return; 2856 } 2857 if ((ira->ira_flags & IRAF_SYSTEM_LABELED) && 2858 !tsol_can_accept_raw(mp, ira, B_TRUE)) { 2859 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2860 ip_drop_input("ipIfStatsInDiscards", mp, ill); 2861 freemsg(mp); 2862 return; 2863 } 2864 /* 2865 * Validate checksum 2866 */ 2867 if (IP_CSUM(mp, ip_hdr_length, 0)) { 2868 ++ipst->ips_igmpstat.igps_rcv_badsum; 2869 ip_drop_input("igps_rcv_badsum", mp, ill); 2870 freemsg(mp); 2871 return; 2872 } 2873 2874 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 2875 mp = igmp_input(mp, ira); 2876 if (mp == NULL) { 2877 /* Bad packet - discarded by igmp_input */ 2878 return; 2879 } 2880 break; 2881 case IPPROTO_PIM: 2882 /* 2883 * If we are not willing to accept PIM packets in clear, 2884 * then check with global policy. 2885 */ 2886 if (ipst->ips_pim_accept_clear_messages == 0) { 2887 mp = ipsec_check_global_policy(mp, NULL, 2888 ipha, NULL, ira, ns); 2889 if (mp == NULL) 2890 return; 2891 } 2892 if ((ira->ira_flags & IRAF_SYSTEM_LABELED) && 2893 !tsol_can_accept_raw(mp, ira, B_TRUE)) { 2894 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2895 ip_drop_input("ipIfStatsInDiscards", mp, ill); 2896 freemsg(mp); 2897 return; 2898 } 2899 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 2900 2901 /* Checksum is verified in pim_input */ 2902 mp = pim_input(mp, ira); 2903 if (mp == NULL) { 2904 /* Bad packet - discarded by pim_input */ 2905 return; 2906 } 2907 break; 2908 case IPPROTO_AH: 2909 case IPPROTO_ESP: { 2910 /* 2911 * Fast path for AH/ESP. 2912 */ 2913 netstack_t *ns = ipst->ips_netstack; 2914 ipsec_stack_t *ipss = ns->netstack_ipsec; 2915 2916 IP_STAT(ipst, ipsec_proto_ahesp); 2917 2918 if (!ipsec_loaded(ipss)) { 2919 ip_proto_not_sup(mp, ira); 2920 return; 2921 } 2922 2923 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 2924 /* select inbound SA and have IPsec process the pkt */ 2925 if (protocol == IPPROTO_ESP) { 2926 esph_t *esph; 2927 boolean_t esp_in_udp_sa; 2928 boolean_t esp_in_udp_packet; 2929 2930 mp = ipsec_inbound_esp_sa(mp, ira, &esph); 2931 if (mp == NULL) 2932 return; 2933 2934 ASSERT(esph != NULL); 2935 ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); 2936 ASSERT(ira->ira_ipsec_esp_sa != NULL); 2937 ASSERT(ira->ira_ipsec_esp_sa->ipsa_input_func != NULL); 2938 2939 esp_in_udp_sa = ((ira->ira_ipsec_esp_sa->ipsa_flags & 2940 IPSA_F_NATT) != 0); 2941 esp_in_udp_packet = 2942 (ira->ira_flags & IRAF_ESP_UDP_PORTS) != 0; 2943 2944 /* 2945 * The following is a fancy, but quick, way of saying: 2946 * ESP-in-UDP SA and Raw ESP packet --> drop 2947 * OR 2948 * ESP SA and ESP-in-UDP packet --> drop 2949 */ 2950 if (esp_in_udp_sa != esp_in_udp_packet) { 2951 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2952 ip_drop_packet(mp, B_TRUE, ira->ira_ill, 2953 DROPPER(ipss, ipds_esp_no_sa), 2954 &ipss->ipsec_dropper); 2955 return; 2956 } 2957 mp = ira->ira_ipsec_esp_sa->ipsa_input_func(mp, esph, 2958 ira); 2959 } else { 2960 ah_t *ah; 2961 2962 mp = ipsec_inbound_ah_sa(mp, ira, &ah); 2963 if (mp == NULL) 2964 return; 2965 2966 ASSERT(ah != NULL); 2967 ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); 2968 ASSERT(ira->ira_ipsec_ah_sa != NULL); 2969 ASSERT(ira->ira_ipsec_ah_sa->ipsa_input_func != NULL); 2970 mp = ira->ira_ipsec_ah_sa->ipsa_input_func(mp, ah, 2971 ira); 2972 } 2973 2974 if (mp == NULL) { 2975 /* 2976 * Either it failed or is pending. In the former case 2977 * ipIfStatsInDiscards was increased. 2978 */ 2979 return; 2980 } 2981 /* we're done with IPsec processing, send it up */ 2982 ip_input_post_ipsec(mp, ira); 2983 return; 2984 } 2985 case IPPROTO_ENCAP: { 2986 ipha_t *inner_ipha; 2987 2988 /* 2989 * Handle self-encapsulated packets (IP-in-IP where 2990 * the inner addresses == the outer addresses). 2991 */ 2992 if ((uchar_t *)ipha + ip_hdr_length + sizeof (ipha_t) > 2993 mp->b_wptr) { 2994 if (ira->ira_pktlen < 2995 ip_hdr_length + sizeof (ipha_t)) { 2996 BUMP_MIB(ill->ill_ip_mib, 2997 ipIfStatsInTruncatedPkts); 2998 ip_drop_input("ipIfStatsInTruncatedPkts", 2999 mp, ill); 3000 freemsg(mp); 3001 return; 3002 } 3003 ipha = ip_pullup(mp, (uchar_t *)ipha + ip_hdr_length + 3004 sizeof (ipha_t) - mp->b_rptr, ira); 3005 if (ipha == NULL) { 3006 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 3007 ip_drop_input("ipIfStatsInDiscards", mp, ill); 3008 freemsg(mp); 3009 return; 3010 } 3011 } 3012 inner_ipha = (ipha_t *)((uchar_t *)ipha + ip_hdr_length); 3013 /* 3014 * Check the sanity of the inner IP header. 3015 */ 3016 if ((IPH_HDR_VERSION(inner_ipha) != IPV4_VERSION)) { 3017 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 3018 ip_drop_input("ipIfStatsInDiscards", mp, ill); 3019 freemsg(mp); 3020 return; 3021 } 3022 if (IPH_HDR_LENGTH(inner_ipha) < sizeof (ipha_t)) { 3023 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 3024 ip_drop_input("ipIfStatsInDiscards", mp, ill); 3025 freemsg(mp); 3026 return; 3027 } 3028 if (inner_ipha->ipha_src != ipha->ipha_src || 3029 inner_ipha->ipha_dst != ipha->ipha_dst) { 3030 /* We fallthru to iptun fanout below */ 3031 goto iptun; 3032 } 3033 3034 /* 3035 * Self-encapsulated tunnel packet. Remove 3036 * the outer IP header and fanout again. 3037 * We also need to make sure that the inner 3038 * header is pulled up until options. 3039 */ 3040 mp->b_rptr = (uchar_t *)inner_ipha; 3041 ipha = inner_ipha; 3042 ip_hdr_length = IPH_HDR_LENGTH(ipha); 3043 if ((uchar_t *)ipha + ip_hdr_length > mp->b_wptr) { 3044 if (ira->ira_pktlen < 3045 (uchar_t *)ipha + ip_hdr_length - mp->b_rptr) { 3046 BUMP_MIB(ill->ill_ip_mib, 3047 ipIfStatsInTruncatedPkts); 3048 ip_drop_input("ipIfStatsInTruncatedPkts", 3049 mp, ill); 3050 freemsg(mp); 3051 return; 3052 } 3053 ipha = ip_pullup(mp, 3054 (uchar_t *)ipha + ip_hdr_length - mp->b_rptr, ira); 3055 if (ipha == NULL) { 3056 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 3057 ip_drop_input("ipIfStatsInDiscards", mp, ill); 3058 freemsg(mp); 3059 return; 3060 } 3061 } 3062 if (ip_hdr_length > sizeof (ipha_t)) { 3063 /* We got options on the inner packet. */ 3064 ipaddr_t dst = ipha->ipha_dst; 3065 int error = 0; 3066 3067 dst = ip_input_options(ipha, dst, mp, ira, &error); 3068 if (error != 0) { 3069 /* 3070 * An ICMP error has been sent and the packet 3071 * has been dropped. 3072 */ 3073 return; 3074 } 3075 if (dst != ipha->ipha_dst) { 3076 /* 3077 * Someone put a source-route in 3078 * the inside header of a self- 3079 * encapsulated packet. Drop it 3080 * with extreme prejudice and let 3081 * the sender know. 3082 */ 3083 ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", 3084 mp, ill); 3085 icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, 3086 ira); 3087 return; 3088 } 3089 } 3090 if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) { 3091 /* 3092 * This means that somebody is sending 3093 * Self-encapsualted packets without AH/ESP. 3094 * 3095 * Send this packet to find a tunnel endpoint. 3096 * if I can't find one, an ICMP 3097 * PROTOCOL_UNREACHABLE will get sent. 3098 */ 3099 protocol = ipha->ipha_protocol; 3100 ira->ira_protocol = protocol; 3101 goto iptun; 3102 } 3103 3104 /* Update based on removed IP header */ 3105 ira->ira_ip_hdr_length = ip_hdr_length; 3106 ira->ira_pktlen = ntohs(ipha->ipha_length); 3107 3108 if (ira->ira_flags & IRAF_IPSEC_DECAPS) { 3109 /* 3110 * This packet is self-encapsulated multiple 3111 * times. We don't want to recurse infinitely. 3112 * To keep it simple, drop the packet. 3113 */ 3114 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 3115 ip_drop_input("ipIfStatsInDiscards", mp, ill); 3116 freemsg(mp); 3117 return; 3118 } 3119 ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); 3120 ira->ira_flags |= IRAF_IPSEC_DECAPS; 3121 3122 ip_input_post_ipsec(mp, ira); 3123 return; 3124 } 3125 3126 iptun: /* IPPROTO_ENCAPS that is not self-encapsulated */ 3127 case IPPROTO_IPV6: 3128 /* iptun will verify trusted label */ 3129 connp = ipcl_classify_v4(mp, protocol, ip_hdr_length, 3130 ira, ipst); 3131 if (connp != NULL) { 3132 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 3133 ira->ira_ill = ira->ira_rill = NULL; 3134 (connp->conn_recv)(connp, mp, NULL, ira); 3135 CONN_DEC_REF(connp); 3136 ira->ira_ill = ill; 3137 ira->ira_rill = rill; 3138 return; 3139 } 3140 /* FALLTHRU */ 3141 default: 3142 /* 3143 * On a labeled system, we have to check whether the zone 3144 * itself is permitted to receive raw traffic. 3145 */ 3146 if (ira->ira_flags & IRAF_SYSTEM_LABELED) { 3147 if (!tsol_can_accept_raw(mp, ira, B_FALSE)) { 3148 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 3149 ip_drop_input("ipIfStatsInDiscards", mp, ill); 3150 freemsg(mp); 3151 return; 3152 } 3153 } 3154 break; 3155 } 3156 3157 /* 3158 * The above input functions may have returned the pulled up message. 3159 * So ipha need to be reinitialized. 3160 */ 3161 ipha = (ipha_t *)mp->b_rptr; 3162 ira->ira_protocol = protocol = ipha->ipha_protocol; 3163 if (ipst->ips_ipcl_proto_fanout_v4[protocol].connf_head == NULL) { 3164 /* 3165 * No user-level listener for these packets packets. 3166 * Check for IPPROTO_ENCAP... 3167 */ 3168 if (protocol == IPPROTO_ENCAP && ipst->ips_ip_g_mrouter) { 3169 /* 3170 * Check policy here, 3171 * THEN ship off to ip_mroute_decap(). 3172 * 3173 * BTW, If I match a configured IP-in-IP 3174 * tunnel above, this path will not be reached, and 3175 * ip_mroute_decap will never be called. 3176 */ 3177 mp = ipsec_check_global_policy(mp, connp, 3178 ipha, NULL, ira, ns); 3179 if (mp != NULL) { 3180 ip_mroute_decap(mp, ira); 3181 } /* Else we already freed everything! */ 3182 } else { 3183 ip_proto_not_sup(mp, ira); 3184 } 3185 return; 3186 } 3187 3188 /* 3189 * Handle fanout to raw sockets. There 3190 * can be more than one stream bound to a particular 3191 * protocol. When this is the case, each one gets a copy 3192 * of any incoming packets. 3193 */ 3194 ASSERT(ira->ira_protocol == ipha->ipha_protocol); 3195 ip_fanout_proto_v4(mp, ipha, ira); 3196 return; 3197 3198 discard: 3199 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 3200 ip_drop_input("ipIfStatsInDiscards", mp, ill); 3201 freemsg(mp); 3202 #undef rptr 3203 }