1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  24  *
  25  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  26  */
  27 /* Copyright (c) 1990 Mentat Inc. */
  28 
  29 #include <sys/types.h>
  30 #include <sys/stream.h>
  31 #include <sys/dlpi.h>
  32 #include <sys/stropts.h>
  33 #include <sys/sysmacros.h>
  34 #include <sys/strsubr.h>
  35 #include <sys/strlog.h>
  36 #include <sys/strsun.h>
  37 #include <sys/zone.h>
  38 #define _SUN_TPI_VERSION 2
  39 #include <sys/tihdr.h>
  40 #include <sys/xti_inet.h>
  41 #include <sys/ddi.h>
  42 #include <sys/sunddi.h>
  43 #include <sys/cmn_err.h>
  44 #include <sys/debug.h>
  45 #include <sys/kobj.h>
  46 #include <sys/modctl.h>
  47 #include <sys/atomic.h>
  48 #include <sys/policy.h>
  49 #include <sys/priv.h>
  50 
  51 #include <sys/systm.h>
  52 #include <sys/param.h>
  53 #include <sys/kmem.h>
  54 #include <sys/sdt.h>
  55 #include <sys/socket.h>
  56 #include <sys/vtrace.h>
  57 #include <sys/isa_defs.h>
  58 #include <sys/mac.h>
  59 #include <net/if.h>
  60 #include <net/if_arp.h>
  61 #include <net/route.h>
  62 #include <sys/sockio.h>
  63 #include <netinet/in.h>
  64 #include <net/if_dl.h>
  65 
  66 #include <inet/common.h>
  67 #include <inet/mi.h>
  68 #include <inet/mib2.h>
  69 #include <inet/nd.h>
  70 #include <inet/arp.h>
  71 #include <inet/snmpcom.h>
  72 #include <inet/kstatcom.h>
  73 
  74 #include <netinet/igmp_var.h>
  75 #include <netinet/ip6.h>
  76 #include <netinet/icmp6.h>
  77 #include <netinet/sctp.h>
  78 
  79 #include <inet/ip.h>
  80 #include <inet/ip_impl.h>
  81 #include <inet/ip6.h>
  82 #include <inet/ip6_asp.h>
  83 #include <inet/optcom.h>
  84 #include <inet/tcp.h>
  85 #include <inet/tcp_impl.h>
  86 #include <inet/ip_multi.h>
  87 #include <inet/ip_if.h>
  88 #include <inet/ip_ire.h>
  89 #include <inet/ip_ftable.h>
  90 #include <inet/ip_rts.h>
  91 #include <inet/ip_ndp.h>
  92 #include <inet/ip_listutils.h>
  93 #include <netinet/igmp.h>
  94 #include <netinet/ip_mroute.h>
  95 #include <inet/ipp_common.h>
  96 
  97 #include <net/pfkeyv2.h>
  98 #include <inet/sadb.h>
  99 #include <inet/ipsec_impl.h>
 100 #include <inet/ipdrop.h>
 101 #include <inet/ip_netinfo.h>
 102 #include <inet/ilb_ip.h>
 103 #include <sys/squeue_impl.h>
 104 #include <sys/squeue.h>
 105 
 106 #include <sys/ethernet.h>
 107 #include <net/if_types.h>
 108 #include <sys/cpuvar.h>
 109 
 110 #include <ipp/ipp.h>
 111 #include <ipp/ipp_impl.h>
 112 #include <ipp/ipgpc/ipgpc.h>
 113 
 114 #include <sys/pattr.h>
 115 #include <inet/ipclassifier.h>
 116 #include <inet/sctp_ip.h>
 117 #include <inet/sctp/sctp_impl.h>
 118 #include <inet/udp_impl.h>
 119 #include <inet/dccp_impl.h>
 120 #include <sys/sunddi.h>
 121 
 122 #include <sys/tsol/label.h>
 123 #include <sys/tsol/tnet.h>
 124 
 125 #include <sys/clock_impl.h>       /* For LBOLT_FASTPATH{,64} */
 126 
 127 #ifdef  DEBUG
 128 extern boolean_t skip_sctp_cksum;
 129 #endif
 130 
 131 static void     ip_input_local_v4(ire_t *, mblk_t *, ipha_t *,
 132     ip_recv_attr_t *);
 133 
 134 static void     ip_input_broadcast_v4(ire_t *, mblk_t *, ipha_t *,
 135     ip_recv_attr_t *);
 136 static void     ip_input_multicast_v4(ire_t *, mblk_t *, ipha_t *,
 137     ip_recv_attr_t *);
 138 
 139 #pragma inline(ip_input_common_v4, ip_input_local_v4, ip_forward_xmit_v4)
 140 
 141 /*
 142  * Direct read side procedure capable of dealing with chains. GLDv3 based
 143  * drivers call this function directly with mblk chains while STREAMS
 144  * read side procedure ip_rput() calls this for single packet with ip_ring
 145  * set to NULL to process one packet at a time.
 146  *
 147  * The ill will always be valid if this function is called directly from
 148  * the driver.
 149  *
 150  * If ip_input() is called from GLDv3:
 151  *
 152  *   - This must be a non-VLAN IP stream.
 153  *   - 'mp' is either an untagged or a special priority-tagged packet.
 154  *   - Any VLAN tag that was in the MAC header has been stripped.
 155  *
 156  * If the IP header in packet is not 32-bit aligned, every message in the
 157  * chain will be aligned before further operations. This is required on SPARC
 158  * platform.
 159  */
 160 void
 161 ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain,
 162     struct mac_header_info_s *mhip)
 163 {
 164         (void) ip_input_common_v4(ill, ip_ring, mp_chain, mhip, NULL, NULL,
 165             NULL);
 166 }
 167 
 168 /*
 169  * ip_accept_tcp() - This function is called by the squeue when it retrieves
 170  * a chain of packets in the poll mode. The packets have gone through the
 171  * data link processing but not IP processing. For performance and latency
 172  * reasons, the squeue wants to process the chain in line instead of feeding
 173  * it back via ip_input path.
 174  *
 175  * We set up the ip_recv_attr_t with IRAF_TARGET_SQP to that ip_fanout_v4
 176  * will pass back any TCP packets matching the target sqp to
 177  * ip_input_common_v4 using ira_target_sqp_mp. Other packets are handled by
 178  * ip_input_v4 and ip_fanout_v4 as normal.
 179  * The TCP packets that match the target squeue are returned to the caller
 180  * as a b_next chain after each packet has been prepend with an mblk
 181  * from ip_recv_attr_to_mblk.
 182  */
 183 mblk_t *
 184 ip_accept_tcp(ill_t *ill, ill_rx_ring_t *ip_ring, squeue_t *target_sqp,
 185     mblk_t *mp_chain, mblk_t **last, uint_t *cnt)
 186 {
 187         return (ip_input_common_v4(ill, ip_ring, mp_chain, NULL, target_sqp,
 188             last, cnt));
 189 }
 190 
 191 /*
 192  * Used by ip_input and ip_accept_tcp
 193  * The last three arguments are only used by ip_accept_tcp, and mhip is
 194  * only used by ip_input.
 195  */
 196 mblk_t *
 197 ip_input_common_v4(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain,
 198     struct mac_header_info_s *mhip, squeue_t *target_sqp,
 199     mblk_t **last, uint_t *cnt)
 200 {
 201         mblk_t          *mp;
 202         ipha_t          *ipha;
 203         ip_recv_attr_t  iras;   /* Receive attributes */
 204         rtc_t           rtc;
 205         iaflags_t       chain_flags = 0;        /* Fixed for chain */
 206         mblk_t          *ahead = NULL;  /* Accepted head */
 207         mblk_t          *atail = NULL;  /* Accepted tail */
 208         uint_t          acnt = 0;       /* Accepted count */
 209 
 210         ASSERT(mp_chain != NULL);
 211         ASSERT(ill != NULL);
 212 
 213         /* These ones do not change as we loop over packets */
 214         iras.ira_ill = iras.ira_rill = ill;
 215         iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
 216         iras.ira_rifindex = iras.ira_ruifindex;
 217         iras.ira_sqp = NULL;
 218         iras.ira_ring = ip_ring;
 219         /* For ECMP and outbound transmit ring selection */
 220         iras.ira_xmit_hint = ILL_RING_TO_XMIT_HINT(ip_ring);
 221 
 222         iras.ira_target_sqp = target_sqp;
 223         iras.ira_target_sqp_mp = NULL;
 224         if (target_sqp != NULL)
 225                 chain_flags |= IRAF_TARGET_SQP;
 226 
 227         /*
 228          * We try to have a mhip pointer when possible, but
 229          * it might be NULL in some cases. In those cases we
 230          * have to assume unicast.
 231          */
 232         iras.ira_mhip = mhip;
 233         iras.ira_flags = 0;
 234         if (mhip != NULL) {
 235                 switch (mhip->mhi_dsttype) {
 236                 case MAC_ADDRTYPE_MULTICAST :
 237                         chain_flags |= IRAF_L2DST_MULTICAST;
 238                         break;
 239                 case MAC_ADDRTYPE_BROADCAST :
 240                         chain_flags |= IRAF_L2DST_BROADCAST;
 241                         break;
 242                 }
 243         }
 244 
 245         /*
 246          * Initialize the one-element route cache.
 247          *
 248          * We do ire caching from one iteration to
 249          * another. In the event the packet chain contains
 250          * all packets from the same dst, this caching saves
 251          * an ire_route_recursive for each of the succeeding
 252          * packets in a packet chain.
 253          */
 254         rtc.rtc_ire = NULL;
 255         rtc.rtc_ipaddr = INADDR_ANY;
 256 
 257         /* Loop over b_next */
 258         for (mp = mp_chain; mp != NULL; mp = mp_chain) {
 259                 mp_chain = mp->b_next;
 260                 mp->b_next = NULL;
 261 
 262                 ASSERT(DB_TYPE(mp) == M_DATA);
 263 
 264 
 265                 /*
 266                  * if db_ref > 1 then copymsg and free original. Packet
 267                  * may be changed and we do not want the other entity
 268                  * who has a reference to this message to trip over the
 269                  * changes. This is a blind change because trying to
 270                  * catch all places that might change the packet is too
 271                  * difficult.
 272                  *
 273                  * This corresponds to the fast path case, where we have
 274                  * a chain of M_DATA mblks.  We check the db_ref count
 275                  * of only the 1st data block in the mblk chain. There
 276                  * doesn't seem to be a reason why a device driver would
 277                  * send up data with varying db_ref counts in the mblk
 278                  * chain. In any case the Fast path is a private
 279                  * interface, and our drivers don't do such a thing.
 280                  * Given the above assumption, there is no need to walk
 281                  * down the entire mblk chain (which could have a
 282                  * potential performance problem)
 283                  *
 284                  * The "(DB_REF(mp) > 1)" check was moved from ip_rput()
 285                  * to here because of exclusive ip stacks and vnics.
 286                  * Packets transmitted from exclusive stack over vnic
 287                  * can have db_ref > 1 and when it gets looped back to
 288                  * another vnic in a different zone, you have ip_input()
 289                  * getting dblks with db_ref > 1. So if someone
 290                  * complains of TCP performance under this scenario,
 291                  * take a serious look here on the impact of copymsg().
 292                  */
 293                 if (DB_REF(mp) > 1) {
 294                         if ((mp = ip_fix_dbref(mp, &iras)) == NULL) {
 295                                 /* mhip might point into 1st packet in chain */
 296                                 iras.ira_mhip = NULL;
 297                                 continue;
 298                         }
 299                 }
 300 
 301                 /*
 302                  * IP header ptr not aligned?
 303                  * OR IP header not complete in first mblk
 304                  */
 305                 ipha = (ipha_t *)mp->b_rptr;
 306                 if (!OK_32PTR(ipha) || MBLKL(mp) < IP_SIMPLE_HDR_LENGTH) {
 307                         mp = ip_check_and_align_header(mp, IP_SIMPLE_HDR_LENGTH,
 308                             &iras);
 309                         if (mp == NULL) {
 310                                 /* mhip might point into 1st packet in chain */
 311                                 iras.ira_mhip = NULL;
 312                                 continue;
 313                         }
 314                         ipha = (ipha_t *)mp->b_rptr;
 315                 }
 316 
 317                 /* Protect against a mix of Ethertypes and IP versions */
 318                 if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
 319                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
 320                         ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
 321                         freemsg(mp);
 322                         /* mhip might point into 1st packet in the chain. */
 323                         iras.ira_mhip = NULL;
 324                         continue;
 325                 }
 326 
 327                 /*
 328                  * Check for Martian addrs; we have to explicitly
 329                  * test for for zero dst since this is also used as
 330                  * an indication that the rtc is not used.
 331                  */
 332                 if (ipha->ipha_dst == INADDR_ANY) {
 333                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
 334                         ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
 335                         freemsg(mp);
 336                         /* mhip might point into 1st packet in the chain. */
 337                         iras.ira_mhip = NULL;
 338                         continue;
 339                 }
 340 
 341                 /*
 342                  * Keep L2SRC from a previous packet in chain since mhip
 343                  * might point into an earlier packet in the chain.
 344                  * Keep IRAF_VERIFIED_SRC to avoid redoing broadcast
 345                  * source check in forwarding path.
 346                  */
 347                 chain_flags |= (iras.ira_flags &
 348                     (IRAF_L2SRC_SET|IRAF_VERIFIED_SRC));
 349 
 350                 iras.ira_flags = IRAF_IS_IPV4 | IRAF_VERIFY_IP_CKSUM |
 351                     IRAF_VERIFY_ULP_CKSUM | chain_flags;
 352                 iras.ira_free_flags = 0;
 353                 iras.ira_cred = NULL;
 354                 iras.ira_cpid = NOPID;
 355                 iras.ira_tsl = NULL;
 356                 iras.ira_zoneid = ALL_ZONES;    /* Default for forwarding */
 357 
 358                 /*
 359                  * We must count all incoming packets, even if they end
 360                  * up being dropped later on. Defer counting bytes until
 361                  * we have the whole IP header in first mblk.
 362                  */
 363                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
 364 
 365                 iras.ira_pktlen = ntohs(ipha->ipha_length);
 366                 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets,
 367                     iras.ira_pktlen);
 368 
 369                 /*
 370                  * Call one of:
 371                  *      ill_input_full_v4
 372                  *      ill_input_short_v4
 373                  * The former is used in unusual cases. See ill_set_inputfn().
 374                  */
 375                 (*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc);
 376 
 377                 /* Any references to clean up? No hold on ira_ill */
 378                 if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED))
 379                         ira_cleanup(&iras, B_FALSE);
 380 
 381                 if (iras.ira_target_sqp_mp != NULL) {
 382                         /* Better be called from ip_accept_tcp */
 383                         ASSERT(target_sqp != NULL);
 384 
 385                         /* Found one packet to accept */
 386                         mp = iras.ira_target_sqp_mp;
 387                         iras.ira_target_sqp_mp = NULL;
 388                         ASSERT(ip_recv_attr_is_mblk(mp));
 389 
 390                         if (atail != NULL)
 391                                 atail->b_next = mp;
 392                         else
 393                                 ahead = mp;
 394                         atail = mp;
 395                         acnt++;
 396                         mp = NULL;
 397                 }
 398                 /* mhip might point into 1st packet in the chain. */
 399                 iras.ira_mhip = NULL;
 400         }
 401         /* Any remaining references to the route cache? */
 402         if (rtc.rtc_ire != NULL) {
 403                 ASSERT(rtc.rtc_ipaddr != INADDR_ANY);
 404                 ire_refrele(rtc.rtc_ire);
 405         }
 406 
 407         if (ahead != NULL) {
 408                 /* Better be called from ip_accept_tcp */
 409                 ASSERT(target_sqp != NULL);
 410                 *last = atail;
 411                 *cnt = acnt;
 412                 return (ahead);
 413         }
 414 
 415         return (NULL);
 416 }
 417 
 418 /*
 419  * This input function is used when
 420  *  - is_system_labeled()
 421  *  - CGTP filtering
 422  *  - DHCP unicast before we have an IP address configured
 423  *  - there is an listener for IPPROTO_RSVP
 424  */
 425 void
 426 ill_input_full_v4(mblk_t *mp, void *iph_arg, void *nexthop_arg,
 427     ip_recv_attr_t *ira, rtc_t *rtc)
 428 {
 429         ipha_t          *ipha = (ipha_t *)iph_arg;
 430         ipaddr_t        nexthop = *(ipaddr_t *)nexthop_arg;
 431         ill_t           *ill = ira->ira_ill;
 432         ip_stack_t      *ipst = ill->ill_ipst;
 433         int             cgtp_flt_pkt;
 434 
 435         ASSERT(ira->ira_tsl == NULL);
 436 
 437         /*
 438          * Attach any necessary label information to
 439          * this packet
 440          */
 441         if (is_system_labeled()) {
 442                 ira->ira_flags |= IRAF_SYSTEM_LABELED;
 443 
 444                 /*
 445                  * This updates ira_cred, ira_tsl and ira_free_flags based
 446                  * on the label.
 447                  */
 448                 if (!tsol_get_pkt_label(mp, IPV4_VERSION, ira)) {
 449                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 450                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
 451                         freemsg(mp);
 452                         return;
 453                 }
 454                 /* Note that ira_tsl can be NULL here. */
 455 
 456                 /* tsol_get_pkt_label sometimes does pullupmsg */
 457                 ipha = (ipha_t *)mp->b_rptr;
 458         }
 459 
 460         /*
 461          * Invoke the CGTP (multirouting) filtering module to process
 462          * the incoming packet. Packets identified as duplicates
 463          * must be discarded. Filtering is active only if the
 464          * the ip_cgtp_filter ndd variable is non-zero.
 465          */
 466         cgtp_flt_pkt = CGTP_IP_PKT_NOT_CGTP;
 467         if (ipst->ips_ip_cgtp_filter &&
 468             ipst->ips_ip_cgtp_filter_ops != NULL) {
 469                 netstackid_t stackid;
 470 
 471                 stackid = ipst->ips_netstack->netstack_stackid;
 472                 /*
 473                  * CGTP and IPMP are mutually exclusive so
 474                  * phyint_ifindex is fine here.
 475                  */
 476                 cgtp_flt_pkt =
 477                     ipst->ips_ip_cgtp_filter_ops->cfo_filter(stackid,
 478                     ill->ill_phyint->phyint_ifindex, mp);
 479                 if (cgtp_flt_pkt == CGTP_IP_PKT_DUPLICATE) {
 480                         ip_drop_input("CGTP_IP_PKT_DUPLICATE", mp, ill);
 481                         freemsg(mp);
 482                         return;
 483                 }
 484         }
 485 
 486         /*
 487          * Brutal hack for DHCPv4 unicast: RFC2131 allows a DHCP
 488          * server to unicast DHCP packets to a DHCP client using the
 489          * IP address it is offering to the client.  This can be
 490          * disabled through the "broadcast bit", but not all DHCP
 491          * servers honor that bit.  Therefore, to interoperate with as
 492          * many DHCP servers as possible, the DHCP client allows the
 493          * server to unicast, but we treat those packets as broadcast
 494          * here.  Note that we don't rewrite the packet itself since
 495          * (a) that would mess up the checksums and (b) the DHCP
 496          * client conn is bound to INADDR_ANY so ip_fanout_udp() will
 497          * hand it the packet regardless.
 498          */
 499         if (ill->ill_dhcpinit != 0 &&
 500             ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION &&
 501             ipha->ipha_protocol == IPPROTO_UDP) {
 502                 udpha_t *udpha;
 503 
 504                 ipha = ip_pullup(mp, sizeof (ipha_t) + sizeof (udpha_t), ira);
 505                 if (ipha == NULL) {
 506                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 507                         ip_drop_input("ipIfStatsInDiscards - dhcp", mp, ill);
 508                         freemsg(mp);
 509                         return;
 510                 }
 511                 /* Reload since pullupmsg() can change b_rptr. */
 512                 udpha = (udpha_t *)&ipha[1];
 513 
 514                 if (ntohs(udpha->uha_dst_port) == IPPORT_BOOTPC) {
 515                         DTRACE_PROBE2(ip4__dhcpinit__pkt, ill_t *, ill,
 516                             mblk_t *, mp);
 517                         /*
 518                          * This assumes that we deliver to all conns for
 519                          * multicast and broadcast packets.
 520                          */
 521                         nexthop = INADDR_BROADCAST;
 522                         ira->ira_flags |= IRAF_DHCP_UNICAST;
 523                 }
 524         }
 525 
 526         /*
 527          * If rsvpd is running, let RSVP daemon handle its processing
 528          * and forwarding of RSVP multicast/unicast packets.
 529          * If rsvpd is not running but mrouted is running, RSVP
 530          * multicast packets are forwarded as multicast traffic
 531          * and RSVP unicast packets are forwarded by unicast router.
 532          * If neither rsvpd nor mrouted is running, RSVP multicast
 533          * packets are not forwarded, but the unicast packets are
 534          * forwarded like unicast traffic.
 535          */
 536         if (ipha->ipha_protocol == IPPROTO_RSVP &&
 537             ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head != NULL) {
 538                 /* RSVP packet and rsvpd running. Treat as ours */
 539                 ip2dbg(("ip_input: RSVP for us: 0x%x\n", ntohl(nexthop)));
 540                 /*
 541                  * We use a multicast address to get the packet to
 542                  * ire_recv_multicast_v4. There will not be a membership
 543                  * check since we set IRAF_RSVP
 544                  */
 545                 nexthop = htonl(INADDR_UNSPEC_GROUP);
 546                 ira->ira_flags |= IRAF_RSVP;
 547         }
 548 
 549         ill_input_short_v4(mp, ipha, &nexthop, ira, rtc);
 550 }
 551 
 552 /*
 553  * This is the tail-end of the full receive side packet handling.
 554  * It can be used directly when the configuration is simple.
 555  */
 556 void
 557 ill_input_short_v4(mblk_t *mp, void *iph_arg, void *nexthop_arg,
 558     ip_recv_attr_t *ira, rtc_t *rtc)
 559 {
 560         ire_t           *ire;
 561         uint_t          opt_len;
 562         ill_t           *ill = ira->ira_ill;
 563         ip_stack_t      *ipst = ill->ill_ipst;
 564         uint_t          pkt_len;
 565         ssize_t         len;
 566         ipha_t          *ipha = (ipha_t *)iph_arg;
 567         ipaddr_t        nexthop = *(ipaddr_t *)nexthop_arg;
 568         ilb_stack_t     *ilbs = ipst->ips_netstack->netstack_ilb;
 569         uint_t          irr_flags;
 570 #define rptr    ((uchar_t *)ipha)
 571 
 572         ASSERT(DB_TYPE(mp) == M_DATA);
 573 
 574         /*
 575          * The following test for loopback is faster than
 576          * IP_LOOPBACK_ADDR(), because it avoids any bitwise
 577          * operations.
 578          * Note that these addresses are always in network byte order
 579          */
 580         if (((*(uchar_t *)&ipha->ipha_dst) == IN_LOOPBACKNET) ||
 581             ((*(uchar_t *)&ipha->ipha_src) == IN_LOOPBACKNET)) {
 582                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
 583                 ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
 584                 freemsg(mp);
 585                 return;
 586         }
 587 
 588         len = mp->b_wptr - rptr;
 589         pkt_len = ira->ira_pktlen;
 590 
 591         /* multiple mblk or too short */
 592         len -= pkt_len;
 593         if (len != 0) {
 594                 mp = ip_check_length(mp, rptr, len, pkt_len,
 595                     IP_SIMPLE_HDR_LENGTH, ira);
 596                 if (mp == NULL)
 597                         return;
 598                 ipha = (ipha_t *)mp->b_rptr;
 599         }
 600 
 601         DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
 602             ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL,
 603             int, 0);
 604 
 605         /*
 606          * The event for packets being received from a 'physical'
 607          * interface is placed after validation of the source and/or
 608          * destination address as being local so that packets can be
 609          * redirected to loopback addresses using ipnat.
 610          */
 611         DTRACE_PROBE4(ip4__physical__in__start,
 612             ill_t *, ill, ill_t *, NULL,
 613             ipha_t *, ipha, mblk_t *, mp);
 614 
 615         if (HOOKS4_INTERESTED_PHYSICAL_IN(ipst)) {
 616                 int     ll_multicast = 0;
 617                 int     error;
 618                 ipaddr_t orig_dst = ipha->ipha_dst;
 619 
 620                 if (ira->ira_flags & IRAF_L2DST_MULTICAST)
 621                         ll_multicast = HPE_MULTICAST;
 622                 else if (ira->ira_flags & IRAF_L2DST_BROADCAST)
 623                         ll_multicast = HPE_BROADCAST;
 624 
 625                 FW_HOOKS(ipst->ips_ip4_physical_in_event,
 626                     ipst->ips_ipv4firewall_physical_in,
 627                     ill, NULL, ipha, mp, mp, ll_multicast, ipst, error);
 628 
 629                 DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, mp);
 630 
 631                 if (mp == NULL)
 632                         return;
 633                 /* The length could have changed */
 634                 ipha = (ipha_t *)mp->b_rptr;
 635                 ira->ira_pktlen = ntohs(ipha->ipha_length);
 636                 pkt_len = ira->ira_pktlen;
 637 
 638                 /*
 639                  * In case the destination changed we override any previous
 640                  * change to nexthop.
 641                  */
 642                 if (orig_dst != ipha->ipha_dst)
 643                         nexthop = ipha->ipha_dst;
 644                 if (nexthop == INADDR_ANY) {
 645                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
 646                         ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
 647                         freemsg(mp);
 648                         return;
 649                 }
 650         }
 651 
 652         if (ipst->ips_ip4_observe.he_interested) {
 653                 zoneid_t dzone;
 654 
 655                 /*
 656                  * On the inbound path the src zone will be unknown as
 657                  * this packet has come from the wire.
 658                  */
 659                 dzone = ip_get_zoneid_v4(nexthop, mp, ira, ALL_ZONES);
 660                 ipobs_hook(mp, IPOBS_HOOK_INBOUND, ALL_ZONES, dzone, ill, ipst);
 661         }
 662 
        /*
         * If there is a good HW IP header checksum we clear the need
         * to look at the IP header checksum.
         */
 667         if ((DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) &&
 668             ILL_HCKSUM_CAPABLE(ill) && dohwcksum) {
 669                 /* Header checksum was ok. Clear the flag */
 670                 DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
 671                 ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM;
 672         }
 673 
        /*
         * Here we check to see if the machine is set up as an
         * L3 load balancer and if the incoming packet is for a VIP.
         *
         * Check the following:
         * - there is at least one rule
         * - protocol of the packet is supported
         */
 682         if (ilb_has_rules(ilbs) && ILB_SUPP_L4(ipha->ipha_protocol)) {
 683                 ipaddr_t        lb_dst;
 684                 int             lb_ret;
 685 
 686                 /* For convenience, we pull up the mblk. */
 687                 if (mp->b_cont != NULL) {
 688                         if (pullupmsg(mp, -1) == 0) {
 689                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 690                                 ip_drop_input("ipIfStatsInDiscards - pullupmsg",
 691                                     mp, ill);
 692                                 freemsg(mp);
 693                                 return;
 694                         }
 695                         ipha = (ipha_t *)mp->b_rptr;
 696                 }
 697 
 698                 /*
 699                  * We just drop all fragments going to any VIP, at
 700                  * least for now....
 701                  */
 702                 if (ntohs(ipha->ipha_fragment_offset_and_flags) &
 703                     (IPH_MF | IPH_OFFSET)) {
 704                         if (!ilb_rule_match_vip_v4(ilbs, nexthop, NULL)) {
 705                                 goto after_ilb;
 706                         }
 707 
 708                         ILB_KSTAT_UPDATE(ilbs, ip_frag_in, 1);
 709                         ILB_KSTAT_UPDATE(ilbs, ip_frag_dropped, 1);
 710                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 711                         ip_drop_input("ILB fragment", mp, ill);
 712                         freemsg(mp);
 713                         return;
 714                 }
 715                 lb_ret = ilb_check_v4(ilbs, ill, mp, ipha, ipha->ipha_protocol,
 716                     (uint8_t *)ipha + IPH_HDR_LENGTH(ipha), &lb_dst);
 717 
 718                 if (lb_ret == ILB_DROPPED) {
 719                         /* Is this the right counter to increase? */
 720                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 721                         ip_drop_input("ILB_DROPPED", mp, ill);
 722                         freemsg(mp);
 723                         return;
 724                 }
 725                 if (lb_ret == ILB_BALANCED) {
 726                         /* Set the dst to that of the chosen server */
 727                         nexthop = lb_dst;
 728                         DB_CKSUMFLAGS(mp) = 0;
 729                 }
 730         }
 731 
 732 after_ilb:
 733         opt_len = ipha->ipha_version_and_hdr_length - IP_SIMPLE_HDR_VERSION;
 734         ira->ira_ip_hdr_length = IP_SIMPLE_HDR_LENGTH;
 735         if (opt_len != 0) {
 736                 int error = 0;
 737 
 738                 ira->ira_ip_hdr_length += (opt_len << 2);
 739                 ira->ira_flags |= IRAF_IPV4_OPTIONS;
 740 
 741                 /* IP Options present!  Validate the length. */
 742                 mp = ip_check_optlen(mp, ipha, opt_len, pkt_len, ira);
 743                 if (mp == NULL)
 744                         return;
 745 
 746                 /* Might have changed */
 747                 ipha = (ipha_t *)mp->b_rptr;
 748 
 749                 /* Verify IP header checksum before parsing the options */
 750                 if ((ira->ira_flags & IRAF_VERIFY_IP_CKSUM) &&
 751                     ip_csum_hdr(ipha)) {
 752                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
 753                         ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
 754                         freemsg(mp);
 755                         return;
 756                 }
 757                 ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM;
 758 
 759                 /*
 760                  * Go off to ip_input_options which returns the next hop
 761                  * destination address, which may have been affected
 762                  * by source routing.
 763                  */
 764                 IP_STAT(ipst, ip_opt);
 765 
 766                 nexthop = ip_input_options(ipha, nexthop, mp, ira, &error);
 767                 if (error != 0) {
 768                         /*
 769                          * An ICMP error has been sent and the packet has
 770                          * been dropped.
 771                          */
 772                         return;
 773                 }
 774         }
 775 
 776         if (ill->ill_flags & ILLF_ROUTER)
 777                 irr_flags = IRR_ALLOCATE;
 778         else
 779                 irr_flags = IRR_NONE;
 780 
 781         /* Can not use route cache with TX since the labels can differ */
 782         if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
 783                 if (CLASSD(nexthop)) {
 784                         ire = ire_multicast(ill);
 785                 } else {
 786                         /* Match destination and label */
 787                         ire = ire_route_recursive_v4(nexthop, 0, NULL,
 788                             ALL_ZONES, ira->ira_tsl, MATCH_IRE_SECATTR,
 789                             irr_flags, ira->ira_xmit_hint, ipst, NULL, NULL,
 790                             NULL);
 791                 }
 792                 /* Update the route cache so we do the ire_refrele */
 793                 ASSERT(ire != NULL);
 794                 if (rtc->rtc_ire != NULL)
 795                         ire_refrele(rtc->rtc_ire);
 796                 rtc->rtc_ire = ire;
 797                 rtc->rtc_ipaddr = nexthop;
 798         } else if (nexthop == rtc->rtc_ipaddr && rtc->rtc_ire != NULL) {
 799                 /* Use the route cache */
 800                 ire = rtc->rtc_ire;
 801         } else {
 802                 /* Update the route cache */
 803                 if (CLASSD(nexthop)) {
 804                         ire = ire_multicast(ill);
 805                 } else {
 806                         /* Just match the destination */
 807                         ire = ire_route_recursive_dstonly_v4(nexthop, irr_flags,
 808                             ira->ira_xmit_hint, ipst);
 809                 }
 810                 ASSERT(ire != NULL);
 811                 if (rtc->rtc_ire != NULL)
 812                         ire_refrele(rtc->rtc_ire);
 813                 rtc->rtc_ire = ire;
 814                 rtc->rtc_ipaddr = nexthop;
 815         }
 816 
 817         ire->ire_ib_pkt_count++;
 818 
 819         /*
 820          * Based on ire_type and ire_flags call one of:
 821          *      ire_recv_local_v4 - for IRE_LOCAL
 822          *      ire_recv_loopback_v4 - for IRE_LOOPBACK
 823          *      ire_recv_multirt_v4 - if RTF_MULTIRT
         *      ire_recv_noroute_v4 - if RTF_REJECT or RTF_BLACKHOLE
 825          *      ire_recv_multicast_v4 - for IRE_MULTICAST
 826          *      ire_recv_broadcast_v4 - for IRE_BROADCAST
 827          *      ire_recv_noaccept_v4 - for ire_noaccept ones
 828          *      ire_recv_forward_v4 - for the rest.
 829          */
 830         (*ire->ire_recvfn)(ire, mp, ipha, ira);
 831 }
 832 #undef rptr
 833 
/*
 * ire_recvfn for IREs that need forwarding
 *
 * Arguments:
 *      ire     - the IRE the packet was matched against.
 *      mp      - the packet. It is never handed back to the caller: every
 *                path frees it or passes it on (ICMP error routine,
 *                another ire_recvfn, or ip_forward_xmit_v4).
 *      iph_arg - pointer to the IPv4 header of the packet.
 *      ira     - receive attributes. ira_pktlen, ira_ip_hdr_length,
 *                ira_flags and ira_verified_src may be updated here.
 *
 * The function takes its own hold on the nce it forwards through and
 * drops that hold on every exit path once the hold has been taken.
 */
void
ire_recv_forward_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
{
        ipha_t          *ipha = (ipha_t *)iph_arg;
        ill_t           *ill = ira->ira_ill;
        ip_stack_t      *ipst = ill->ill_ipst;
        ill_t           *dst_ill;
        nce_t           *nce;
        /* Captured up front; ipha may be re-pointed further down */
        ipaddr_t        src = ipha->ipha_src;
        uint32_t        added_tx_len;
        uint32_t        mtu, iremtu;

        /* Never forward something that arrived as a link-layer multi/bcast */
        if (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) {
                BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
                ip_drop_input("l2 multicast not forwarded", mp, ill);
                freemsg(mp);
                return;
        }

        /*
         * The receiving ill must have forwarding enabled, unless the
         * packet is source routed (handled further down).
         */
        if (!(ill->ill_flags & ILLF_ROUTER) && !ip_source_routed(ipha, ipst)) {
                BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
                ip_drop_input("ipIfStatsForwProhibits", mp, ill);
                freemsg(mp);
                return;
        }

        /*
         * Either ire_nce_capable or ire_dep_parent would be set for the IRE
         * when it is found by ire_route_recursive, but that some other thread
         * could have changed the routes with the effect of clearing
         * ire_dep_parent. In that case we'd end up dropping the packet, or
         * finding a new nce below.
         * Get, allocate, or update the nce.
         * We get a refhold on ire_nce_cache as a result of this to avoid races
         * where ire_nce_cache is deleted.
         *
         * This ensures that we don't forward if the interface is down since
         * ipif_down removes all the nces.
         */
        mutex_enter(&ire->ire_lock);
        nce = ire->ire_nce_cache;
        if (nce == NULL) {
                /* Not yet set up - try to set one up */
                /* ire_lock is dropped around the revalidation call */
                mutex_exit(&ire->ire_lock);
                (void) ire_revalidate_nce(ire);
                mutex_enter(&ire->ire_lock);
                nce = ire->ire_nce_cache;
                if (nce == NULL) {
                        mutex_exit(&ire->ire_lock);
                        /* The ire_dep_parent chain went bad, or no memory */
                        BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
                        ip_drop_input("No ire_dep_parent", mp, ill);
                        freemsg(mp);
                        return;
                }
        }
        /* Take our hold while ire_lock still protects ire_nce_cache */
        nce_refhold(nce);
        mutex_exit(&ire->ire_lock);

        if (nce->nce_is_condemned) {
                nce_t *nce1;

                /*
                 * Swap the condemned nce for a replacement. Our hold on the
                 * old one is released either way; from here on nce1 is the
                 * reference that must be released.
                 */
                nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_FALSE);
                nce_refrele(nce);
                if (nce1 == NULL) {
                        BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
                        ip_drop_input("No nce", mp, ill);
                        freemsg(mp);
                        return;
                }
                nce = nce1;
        }
        dst_ill = nce->nce_ill;

        /*
         * Unless we are forwarding, drop the packet.
         * We have to let source routed packets through if they go out
         * the same interface i.e., they are 'ping -l' packets.
         */
        if (!(dst_ill->ill_flags & ILLF_ROUTER) &&
            !(ip_source_routed(ipha, ipst) && dst_ill == ill)) {
                if (ip_source_routed(ipha, ipst)) {
                        /* mp is passed to icmp_unreachable, not freed here */
                        ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ill);
                        icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira);
                        nce_refrele(nce);
                        return;
                }
                BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
                ip_drop_input("ipIfStatsForwProhibits", mp, ill);
                freemsg(mp);
                nce_refrele(nce);
                return;
        }

        if (ire->ire_zoneid != GLOBAL_ZONEID && ire->ire_zoneid != ALL_ZONES) {
                ipaddr_t        dst = ipha->ipha_dst;

                /*
                 * This IRE is not going to be used after all; presumably
                 * this undoes the count taken before the ire_recvfn was
                 * invoked.
                 */
                ire->ire_ib_pkt_count--;
                /*
                 * Should only use IREs that are visible from the
                 * global zone for forwarding.
                 * Take a source route into account the same way as ip_input
                 * did.
                 */
                if (ira->ira_flags & IRAF_IPV4_OPTIONS) {
                        int             error = 0;

                        dst = ip_input_options(ipha, dst, mp, ira, &error);
                        ASSERT(error == 0);     /* ip_input checked */
                }
                /*
                 * Look the destination up again from the global zone's
                 * point of view and let that IRE's receive function decide.
                 * Only the IRE obtained here is released below.
                 */
                ire = ire_route_recursive_v4(dst, 0, NULL, GLOBAL_ZONEID,
                    ira->ira_tsl, MATCH_IRE_SECATTR,
                    (ill->ill_flags & ILLF_ROUTER) ? IRR_ALLOCATE : IRR_NONE,
                    ira->ira_xmit_hint, ipst, NULL, NULL, NULL);
                ire->ire_ib_pkt_count++;
                (*ire->ire_recvfn)(ire, mp, ipha, ira);
                ire_refrele(ire);
                nce_refrele(nce);
                return;
        }

        /*
         * ipIfStatsHCInForwDatagrams should only be incremented if there
         * will be an attempt to forward the packet, which is why we
         * increment after the above condition has been checked.
         */
        BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams);

        /* Initiate Read side IPPF processing */
        if (IPP_ENABLED(IPP_FWD_IN, ipst)) {
                /* ip_process translates an IS_UNDER_IPMP */
                mp = ip_process(IPP_FWD_IN, mp, ill, ill);
                if (mp == NULL) {
                        /* ip_drop_packet and MIB done */
                        ip2dbg(("ire_recv_forward_v4: pkt dropped/deferred "
                            "during IPPF processing\n"));
                        nce_refrele(nce);
                        return;
                }
        }

        DTRACE_PROBE4(ip4__forwarding__start,
            ill_t *, ill, ill_t *, dst_ill, ipha_t *, ipha, mblk_t *, mp);

        /* Give the firewall forwarding hooks a chance to drop or alter mp */
        if (HOOKS4_INTERESTED_FORWARDING(ipst)) {
                int error;

                FW_HOOKS(ipst->ips_ip4_forwarding_event,
                    ipst->ips_ipv4firewall_forwarding,
                    ill, dst_ill, ipha, mp, mp, 0, ipst, error);

                DTRACE_PROBE1(ip4__forwarding__end, mblk_t *, mp);

                if (mp == NULL) {
                        nce_refrele(nce);
                        return;
                }
                /*
                 * Even if the destination was changed by the filter we use the
                 * forwarding decision that was made based on the address
                 * in ip_input.
                 */

                /* Might have changed */
                ipha = (ipha_t *)mp->b_rptr;
                ira->ira_pktlen = ntohs(ipha->ipha_length);
        }

        /* Packet is being forwarded. Turning off hwcksum flag. */
        DB_CKSUMFLAGS(mp) = 0;

        /*
         * Martian Address Filtering [RFC 1812, Section 5.3.7]
         * The loopback address check for both src and dst has already
         * been checked in ip_input
         * In the future one can envision adding RPF checks using number 3.
         * If we already checked the same source address we can skip this.
         */
        if (!(ira->ira_flags & IRAF_VERIFIED_SRC) ||
            src != ira->ira_verified_src) {
                /*
                 * ips_src_check selects the strictness:
                 *      0 - no source check
                 *      1 - reject multicast (class D) sources
                 *      2 - additionally reject broadcast sources
                 */
                switch (ipst->ips_src_check) {
                case 0:
                        break;
                case 2:
                        if (ip_type_v4(src, ipst) == IRE_BROADCAST) {
                                BUMP_MIB(ill->ill_ip_mib,
                                    ipIfStatsForwProhibits);
                                BUMP_MIB(ill->ill_ip_mib,
                                    ipIfStatsInAddrErrors);
                                ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
                                freemsg(mp);
                                nce_refrele(nce);
                                return;
                        }
                        /* FALLTHRU */

                case 1:
                        if (CLASSD(src)) {
                                BUMP_MIB(ill->ill_ip_mib,
                                    ipIfStatsForwProhibits);
                                BUMP_MIB(ill->ill_ip_mib,
                                    ipIfStatsInAddrErrors);
                                ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
                                freemsg(mp);
                                nce_refrele(nce);
                                return;
                        }
                        break;
                }
                /* Remember for next packet */
                ira->ira_flags |= IRAF_VERIFIED_SRC;
                ira->ira_verified_src = src;
        }

        /*
         * Check if packet is going out the same link on which it arrived.
         * Means we might need to send a redirect.
         */
        if (IS_ON_SAME_LAN(dst_ill, ill) && ipst->ips_ip_g_send_redirects) {
                ip_send_potential_redirect_v4(mp, ipha, ire, ira);
        }

        /* Stays 0 unless the labeled path below grows the packet */
        added_tx_len = 0;
        if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
                mblk_t          *mp1;
                uint32_t        old_pkt_len = ira->ira_pktlen;

                /* Verify IP header checksum before adding/removing options */
                if ((ira->ira_flags & IRAF_VERIFY_IP_CKSUM) &&
                    ip_csum_hdr(ipha)) {
                        BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
                        ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
                        freemsg(mp);
                        nce_refrele(nce);
                        return;
                }
                /* Verified (or not required); don't check it again later */
                ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM;

                /*
                 * Check if it can be forwarded and add/remove
                 * CIPSO options as needed.
                 */
                if ((mp1 = tsol_ip_forward(ire, mp, ira)) == NULL) {
                        /* On failure the original mp is still ours to free */
                        BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
                        ip_drop_input("tsol_ip_forward", mp, ill);
                        freemsg(mp);
                        nce_refrele(nce);
                        return;
                }
                /*
                 * Size may have changed. Remember amount added in case
                 * IP needs to send an ICMP too big.
                 */
                mp = mp1;
                ipha = (ipha_t *)mp->b_rptr;
                ira->ira_pktlen = ntohs(ipha->ipha_length);
                ira->ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
                if (ira->ira_pktlen > old_pkt_len)
                        added_tx_len = ira->ira_pktlen - old_pkt_len;

                /* Options can have been added or removed */
                if (ira->ira_ip_hdr_length != IP_SIMPLE_HDR_LENGTH)
                        ira->ira_flags |= IRAF_IPV4_OPTIONS;
                else
                        ira->ira_flags &= ~IRAF_IPV4_OPTIONS;
        }

        /* Use the smaller of the outbound ill MTU and a non-zero route MTU */
        mtu = dst_ill->ill_mtu;
        if ((iremtu = ire->ire_metrics.iulp_mtu) != 0 && iremtu < mtu)
                mtu = iremtu;
        ip_forward_xmit_v4(nce, ill, mp, ipha, ira, mtu, added_tx_len);
        nce_refrele(nce);
}
1110 
1111 /*
1112  * Used for sending out unicast and multicast packets that are
1113  * forwarded.
1114  */
1115 void
1116 ip_forward_xmit_v4(nce_t *nce, ill_t *ill, mblk_t *mp, ipha_t *ipha,
1117     ip_recv_attr_t *ira, uint32_t mtu, uint32_t added_tx_len)
1118 {
1119         ill_t           *dst_ill = nce->nce_ill;
1120         uint32_t        pkt_len;
1121         uint32_t        sum;
1122         iaflags_t       iraflags = ira->ira_flags;
1123         ip_stack_t      *ipst = ill->ill_ipst;
1124         iaflags_t       ixaflags;
1125 
1126         if (ipha->ipha_ttl <= 1) {
1127                 /* Perhaps the checksum was bad */
1128                 if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) {
1129                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
1130                         ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
1131                         freemsg(mp);
1132                         return;
1133                 }
1134                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1135                 ip_drop_input("ICMP_TTL_EXCEEDED", mp, ill);
1136                 icmp_time_exceeded(mp, ICMP_TTL_EXCEEDED, ira);
1137                 return;
1138         }
1139         ipha->ipha_ttl--;
1140         /* Adjust the checksum to reflect the ttl decrement. */
1141         sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST;
1142         ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16));
1143 
1144         /* Check if there are options to update */
1145         if (iraflags & IRAF_IPV4_OPTIONS) {
1146                 ASSERT(ipha->ipha_version_and_hdr_length !=
1147                     IP_SIMPLE_HDR_VERSION);
1148                 ASSERT(!(iraflags & IRAF_VERIFY_IP_CKSUM));
1149 
1150                 if (!ip_forward_options(mp, ipha, dst_ill, ira)) {
1151                         /* ipIfStatsForwProhibits and ip_drop_input done */
1152                         return;
1153                 }
1154 
1155                 ipha->ipha_hdr_checksum = 0;
1156                 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1157         }
1158 
1159         /* Initiate Write side IPPF processing before any fragmentation */
1160         if (IPP_ENABLED(IPP_FWD_OUT, ipst)) {
1161                 /* ip_process translates an IS_UNDER_IPMP */
1162                 mp = ip_process(IPP_FWD_OUT, mp, dst_ill, dst_ill);
1163                 if (mp == NULL) {
1164                         /* ip_drop_packet and MIB done */
1165                         ip2dbg(("ire_recv_forward_v4: pkt dropped/deferred" \
1166                             " during IPPF processing\n"));
1167                         return;
1168                 }
1169         }
1170 
1171         pkt_len = ira->ira_pktlen;
1172 
1173         BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams);
1174 
1175         ixaflags = IXAF_IS_IPV4 | IXAF_NO_DEV_FLOW_CTL;
1176 
1177         if (pkt_len > mtu) {
1178                 /*
1179                  * It needs fragging on its way out.  If we haven't
1180                  * verified the header checksum yet we do it now since
1181                  * are going to put a surely good checksum in the
1182                  * outgoing header, we have to make sure that it
1183                  * was good coming in.
1184                  */
1185                 if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) {
1186                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
1187                         ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
1188                         freemsg(mp);
1189                         return;
1190                 }
1191                 if (ipha->ipha_fragment_offset_and_flags & IPH_DF_HTONS) {
1192                         BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutFragFails);
1193                         ip_drop_output("ipIfStatsOutFragFails", mp, dst_ill);
1194                         if (iraflags & IRAF_SYSTEM_LABELED) {
1195                                 /*
1196                                  * Remove any CIPSO option added by
1197                                  * tsol_ip_forward, and make sure we report
1198                                  * a path MTU so that there
1199                                  * is room to add such a CIPSO option for future
1200                                  * packets.
1201                                  */
1202                                 mtu = tsol_pmtu_adjust(mp, mtu, added_tx_len,
1203                                     AF_INET);
1204                         }
1205 
1206                         icmp_frag_needed(mp, mtu, ira);
1207                         return;
1208                 }
1209 
1210                 (void) ip_fragment_v4(mp, nce, ixaflags, pkt_len, mtu,
1211                     ira->ira_xmit_hint, GLOBAL_ZONEID, 0, ip_xmit, NULL);
1212                 return;
1213         }
1214 
1215         ASSERT(pkt_len == ntohs(((ipha_t *)mp->b_rptr)->ipha_length));
1216         if (iraflags & IRAF_LOOPBACK_COPY) {
1217                 /*
1218                  * IXAF_NO_LOOP_ZONEID is not set hence 7th arg
1219                  * is don't care
1220                  */
1221                 (void) ip_postfrag_loopcheck(mp, nce,
1222                     ixaflags | IXAF_LOOPBACK_COPY,
1223                     pkt_len, ira->ira_xmit_hint, GLOBAL_ZONEID, 0, NULL);
1224         } else {
1225                 (void) ip_xmit(mp, nce, ixaflags, pkt_len, ira->ira_xmit_hint,
1226                     GLOBAL_ZONEID, 0, NULL);
1227         }
1228 }
1229 
1230 /*
1231  * ire_recvfn for RTF_REJECT and RTF_BLACKHOLE routes, including IRE_NOROUTE,
1232  * which is what ire_route_recursive returns when there is no matching ire.
1233  * Send ICMP unreachable unless blackhole.
1234  */
1235 void
1236 ire_recv_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
1237 {
1238         ipha_t          *ipha = (ipha_t *)iph_arg;
1239         ill_t           *ill = ira->ira_ill;
1240         ip_stack_t      *ipst = ill->ill_ipst;
1241 
1242         /* Would we have forwarded this packet if we had a route? */
1243         if (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) {
1244                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
1245                 ip_drop_input("l2 multicast not forwarded", mp, ill);
1246                 freemsg(mp);
1247                 return;
1248         }
1249 
1250         if (!(ill->ill_flags & ILLF_ROUTER)) {
1251                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
1252                 ip_drop_input("ipIfStatsForwProhibits", mp, ill);
1253                 freemsg(mp);
1254                 return;
1255         }
1256         /*
1257          * If we had a route this could have been forwarded. Count as such.
1258          *
1259          * ipIfStatsHCInForwDatagrams should only be increment if there
1260          * will be an attempt to forward the packet, which is why we
1261          * increment after the above condition has been checked.
1262          */
1263         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams);
1264 
1265         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInNoRoutes);
1266 
1267         ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0, RTA_DST,
1268             ipst);
1269 
1270         if (ire->ire_flags & RTF_BLACKHOLE) {
1271                 ip_drop_input("ipIfStatsInNoRoutes RTF_BLACKHOLE", mp, ill);
1272                 freemsg(mp);
1273         } else {
1274                 ip_drop_input("ipIfStatsInNoRoutes RTF_REJECT", mp, ill);
1275 
1276                 if (ip_source_routed(ipha, ipst)) {
1277                         icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira);
1278                 } else {
1279                         icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, ira);
1280                 }
1281         }
1282 }
1283 
1284 /*
1285  * ire_recvfn for IRE_LOCALs marked with ire_noaccept. Such IREs are used for
1286  * VRRP when in noaccept mode.
1287  * We silently drop the packet. ARP handles packets even if noaccept is set.
1288  */
1289 /* ARGSUSED */
1290 void
1291 ire_recv_noaccept_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1292     ip_recv_attr_t *ira)
1293 {
1294         ill_t           *ill = ira->ira_ill;
1295 
1296         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1297         ip_drop_input("ipIfStatsInDiscards - noaccept", mp, ill);
1298         freemsg(mp);
1299 }
1300 
/*
 * ire_recvfn for IRE_BROADCAST.
 *
 * Arguments:
 *      ire     - the IRE_BROADCAST the packet matched; ire->ire_ill is the
 *                ill that owns the broadcast address.
 *      mp      - the packet. It is never handed back to the caller: every
 *                path frees it or passes it on.
 *      iph_arg - pointer to the IPv4 header of the packet.
 *      ira     - receive attributes. IRAF_BROADCAST is set and ira_zoneid
 *                becomes ALL_ZONES. ira_ill and ira_ruifindex may be
 *                changed temporarily and are restored before returning on
 *                the paths that changed them.
 *
 * The packet is either delivered locally (possibly on a different ill for
 * the multirt case) or, when it is a directed broadcast and that is
 * allowed, forwarded out ire->ire_ill with a loopback copy requested.
 */
void
ire_recv_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_recv_attr_t *ira)
{
        ipha_t          *ipha = (ipha_t *)iph_arg;
        ill_t           *ill = ira->ira_ill;
        ill_t           *dst_ill = ire->ire_ill;
        ip_stack_t      *ipst = ill->ill_ipst;
        ire_t           *alt_ire;
        nce_t           *nce;
        ipaddr_t        ipha_dst;

        BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInBcastPkts);

        /* Tag for higher-level protocols */
        ira->ira_flags |= IRAF_BROADCAST;

        /*
         * Whether local or directed broadcast forwarding: don't allow
         * for TCP.
         */
        if (ipha->ipha_protocol == IPPROTO_TCP) {
                BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
                ip_drop_input("ipIfStatsInDiscards", mp, ill);
                freemsg(mp);
                return;
        }

        /*
         * So that we don't end up with dups, only one ill in an IPMP group
         * is nominated to receive broadcast traffic.
         * If we have no cast_ill we are liberal and accept everything.
         */
        if (IS_UNDER_IPMP(ill)) {
                /* For an under ill_grp can change under lock */
                rw_enter(&ipst->ips_ill_g_lock, RW_READER);
                if (!ill->ill_nom_cast && ill->ill_grp != NULL &&
                    ill->ill_grp->ig_cast_ill != NULL) {
                        rw_exit(&ipst->ips_ill_g_lock);
                        /* No MIB since this is normal operation */
                        ip_drop_input("not nom_cast", mp, ill);
                        freemsg(mp);
                        return;
                }
                rw_exit(&ipst->ips_ill_g_lock);

                /*
                 * Switch ira_ruifindex to the value from
                 * ill_get_upper_ifindex; it is put back to this ill's own
                 * ifindex before returning.
                 */
                ira->ira_ruifindex = ill_get_upper_ifindex(ill);
        }

        /*
         * After reassembly and IPsec we will need to duplicate the
         * broadcast packet for all matching zones on the ill.
         */
        ira->ira_zoneid = ALL_ZONES;

        /*
         * Check for directed broadcast i.e. ire->ire_ill is different than
         * the incoming ill.
         * The same broadcast address can be assigned to multiple interfaces
         * so have to check explicitly for that case by looking up the alt_ire
         */
        if (dst_ill == ill && !(ire->ire_flags & RTF_MULTIRT)) {
                /* Reassemble on the ill on which the packet arrived */
                ip_input_local_v4(ire, mp, ipha, ira);
                /* Restore */
                ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
                return;
        }

        /* Is there an IRE_BROADCAST on the incoming ill? */
        /* With IRAF_DHCP_UNICAST set, look up INADDR_BROADCAST instead */
        ipha_dst = ((ira->ira_flags & IRAF_DHCP_UNICAST) ? INADDR_BROADCAST :
            ipha->ipha_dst);
        alt_ire = ire_ftable_lookup_v4(ipha_dst, 0, 0, IRE_BROADCAST, ill,
            ALL_ZONES, ira->ira_tsl,
            MATCH_IRE_TYPE|MATCH_IRE_ILL|MATCH_IRE_SECATTR, 0, ipst, NULL);
        if (alt_ire != NULL) {
                /* Not a directed broadcast */
                /*
                 * In the special case of multirouted broadcast
                 * packets, we unconditionally need to "gateway"
                 * them to the appropriate interface here so that reassembly
                 * works. We know that the IRE_BROADCAST on cgtp0 doesn't
                 * have RTF_MULTIRT set so we look for such an IRE in the
                 * bucket.
                 */
                if (alt_ire->ire_flags & RTF_MULTIRT) {
                        irb_t           *irb;
                        ire_t           *ire1;

                        /* Hold the bucket while walking its IRE list */
                        irb = ire->ire_bucket;
                        irb_refhold(irb);
                        for (ire1 = irb->irb_ire; ire1 != NULL;
                            ire1 = ire1->ire_next) {
                                if (IRE_IS_CONDEMNED(ire1))
                                        continue;
                                if (!(ire1->ire_type & IRE_BROADCAST) ||
                                    (ire1->ire_flags & RTF_MULTIRT))
                                        continue;
                                /*
                                 * Found a non-multirt broadcast IRE. From
                                 * here on the local 'ill' names its ill,
                                 * held so it remains valid after the
                                 * bucket is released.
                                 */
                                ill = ire1->ire_ill;
                                ill_refhold(ill);
                                break;
                        }
                        irb_refrele(irb);
                        /* ire1 is only non-NULL if the loop hit the break */
                        if (ire1 != NULL) {
                                ill_t *orig_ill = ira->ira_ill;

                                ire_refrele(alt_ire);
                                /* Reassemble on the new ill */
                                ira->ira_ill = ill;
                                ip_input_local_v4(ire, mp, ipha, ira);
                                ill_refrele(ill);
                                /* Restore */
                                ira->ira_ill = orig_ill;
                                ira->ira_ruifindex =
                                    orig_ill->ill_phyint->phyint_ifindex;
                                return;
                        }
                }
                ire_refrele(alt_ire);
                /* Reassemble on the ill on which the packet arrived */
                ip_input_local_v4(ire, mp, ipha, ira);
                goto done;
        }

        /*
         * This is a directed broadcast
         *
         * If directed broadcast is allowed, then forward the packet out
         * the destination interface with IXAF_LOOPBACK_COPY set. That will
         * result in ip_input() receiving a copy of the packet on the
         * appropriate ill. (We could optimize this to avoid the extra trip
         * via ip_input(), but since directed broadcasts are normally disabled
         * it doesn't make sense to optimize it.)
         */
        if (!ipst->ips_ip_g_forward_directed_bcast ||
            (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST))) {
                ip_drop_input("directed broadcast not allowed", mp, ill);
                freemsg(mp);
                goto done;
        }
        /* The header is rewritten below, so it must be valid coming in */
        if ((ira->ira_flags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) {
                BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
                ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
                freemsg(mp);
                goto done;
        }

        /*
         * Clear the indication that this may have hardware
         * checksum as we are not using it for forwarding.
         */
        DB_CKSUMFLAGS(mp) = 0;

        /*
         * Set the ttl to ips_ip_broadcast_ttl + 1; the forward engine will
         * decrement it by one. Then recompute the header checksum.
         */
        ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl + 1;
        ipha->ipha_hdr_checksum = 0;
        ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);

        /*
         * We use ip_forward_xmit to do any fragmentation.
         * and loopback copy on the outbound interface.
         *
         * Make it so that IXAF_LOOPBACK_COPY to be set on transmit side.
         */
        ira->ira_flags |= IRAF_LOOPBACK_COPY;

        /* Get an nce for the broadcast address on the outbound ill */
        nce = arp_nce_init(dst_ill, ipha->ipha_dst, IRE_BROADCAST);
        if (nce == NULL) {
                BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutDiscards);
                ip_drop_output("No nce", mp, dst_ill);
                freemsg(mp);
                goto done;
        }

        /* The MTU passed for this transmit is the ill's ill_mc_mtu */
        ip_forward_xmit_v4(nce, ill, mp, ipha, ira, dst_ill->ill_mc_mtu, 0);
        nce_refrele(nce);
done:
        /* Restore */
        ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
}
1486 
1487 /*
1488  * ire_recvfn for IRE_MULTICAST.
1489  */
1490 void
1491 ire_recv_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1492     ip_recv_attr_t *ira)
1493 {
1494         ipha_t          *ipha = (ipha_t *)iph_arg;
1495         ill_t           *ill = ira->ira_ill;
1496         ip_stack_t      *ipst = ill->ill_ipst;
1497 
1498         ASSERT(ire->ire_ill == ira->ira_ill);
1499 
1500         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastPkts);
1501         UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastOctets, ira->ira_pktlen);
1502 
1503         /* RSVP hook */
1504         if (ira->ira_flags & IRAF_RSVP)
1505                 goto forus;
1506 
1507         /* Tag for higher-level protocols */
1508         ira->ira_flags |= IRAF_MULTICAST;
1509 
1510         /*
1511          * So that we don't end up with dups, only one ill an IPMP group is
1512          * nominated to receive multicast traffic.
1513          * If we have no cast_ill we are liberal and accept everything.
1514          */
1515         if (IS_UNDER_IPMP(ill)) {
1516                 ip_stack_t      *ipst = ill->ill_ipst;
1517 
1518                 /* For an under ill_grp can change under lock */
1519                 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1520                 if (!ill->ill_nom_cast && ill->ill_grp != NULL &&
1521                     ill->ill_grp->ig_cast_ill != NULL) {
1522                         rw_exit(&ipst->ips_ill_g_lock);
1523                         ip_drop_input("not on cast ill", mp, ill);
1524                         freemsg(mp);
1525                         return;
1526                 }
1527                 rw_exit(&ipst->ips_ill_g_lock);
1528                 /*
1529                  * We switch to the upper ill so that mrouter and hasmembers
1530                  * can operate on upper here and in ip_input_multicast.
1531                  */
1532                 ill = ipmp_ill_hold_ipmp_ill(ill);
1533                 if (ill != NULL) {
1534                         ASSERT(ill != ira->ira_ill);
1535                         ASSERT(ire->ire_ill == ira->ira_ill);
1536                         ira->ira_ill = ill;
1537                         ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
1538                 } else {
1539                         ill = ira->ira_ill;
1540                 }
1541         }
1542 
1543         /*
1544          * Check if we are a multicast router - send ip_mforward a copy of
1545          * the packet.
1546          * Due to mroute_decap tunnels we consider forwarding packets even if
1547          * mrouted has not joined the allmulti group on this interface.
1548          */
1549         if (ipst->ips_ip_g_mrouter) {
1550                 int retval;
1551 
1552                 /*
1553                  * Clear the indication that this may have hardware
1554                  * checksum as we are not using it for forwarding.
1555                  */
1556                 DB_CKSUMFLAGS(mp) = 0;
1557 
1558                 /*
1559                  * ip_mforward helps us make these distinctions: If received
1560                  * on tunnel and not IGMP, then drop.
1561                  * If IGMP packet, then don't check membership
1562                  * If received on a phyint and IGMP or PIM, then
1563                  * don't check membership
1564                  */
1565                 retval = ip_mforward(mp, ira);
1566                 /* ip_mforward updates mib variables if needed */
1567 
1568                 switch (retval) {
1569                 case 0:
1570                         /*
1571                          * pkt is okay and arrived on phyint.
1572                          *
1573                          * If we are running as a multicast router
1574                          * we need to see all IGMP and/or PIM packets.
1575                          */
1576                         if ((ipha->ipha_protocol == IPPROTO_IGMP) ||
1577                             (ipha->ipha_protocol == IPPROTO_PIM)) {
1578                                 goto forus;
1579                         }
1580                         break;
1581                 case -1:
1582                         /* pkt is mal-formed, toss it */
1583                         freemsg(mp);
1584                         goto done;
1585                 case 1:
1586                         /*
1587                          * pkt is okay and arrived on a tunnel
1588                          *
1589                          * If we are running a multicast router
1590                          * we need to see all igmp packets.
1591                          */
1592                         if (ipha->ipha_protocol == IPPROTO_IGMP) {
1593                                 goto forus;
1594                         }
1595                         ip_drop_input("Multicast on tunnel ignored", mp, ill);
1596                         freemsg(mp);
1597                         goto done;
1598                 }
1599         }
1600 
1601         /*
1602          * Check if we have members on this ill. This is not necessary for
1603          * correctness because even if the NIC/GLD had a leaky filter, we
1604          * filter before passing to each conn_t.
1605          */
1606         if (!ill_hasmembers_v4(ill, ipha->ipha_dst)) {
1607                 /*
1608                  * Nobody interested
1609                  *
1610                  * This might just be caused by the fact that
1611                  * multiple IP Multicast addresses map to the same
1612                  * link layer multicast - no need to increment counter!
1613                  */
1614                 ip_drop_input("Multicast with no members", mp, ill);
1615                 freemsg(mp);
1616                 goto done;
1617         }
1618 forus:
1619         ip2dbg(("ire_recv_multicast_v4: multicast for us: 0x%x\n",
1620             ntohl(ipha->ipha_dst)));
1621 
1622         /*
1623          * After reassembly and IPsec we will need to duplicate the
1624          * multicast packet for all matching zones on the ill.
1625          */
1626         ira->ira_zoneid = ALL_ZONES;
1627 
1628         /* Reassemble on the ill on which the packet arrived */
1629         ip_input_local_v4(ire, mp, ipha, ira);
1630 done:
1631         if (ill != ire->ire_ill) {
1632                 ill_refrele(ill);
1633                 ira->ira_ill = ire->ire_ill;
1634                 ira->ira_ruifindex = ira->ira_ill->ill_phyint->phyint_ifindex;
1635         }
1636 }
1637 
1638 /*
1639  * ire_recvfn for IRE_OFFLINK with RTF_MULTIRT.
1640  * Drop packets since we don't forward out multirt routes.
1641  */
1642 /* ARGSUSED */
1643 void
1644 ire_recv_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
1645 {
1646         ill_t           *ill = ira->ira_ill;
1647 
1648         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInNoRoutes);
1649         ip_drop_input("Not forwarding out MULTIRT", mp, ill);
1650         freemsg(mp);
1651 }
1652 
1653 /*
1654  * ire_recvfn for IRE_LOOPBACK. This is only used when a FW_HOOK
1655  * has rewritten the packet to have a loopback destination address (We
1656  * filter out packet with a loopback destination from arriving over the wire).
1657  * We don't know what zone to use, thus we always use the GLOBAL_ZONEID.
1658  */
1659 void
1660 ire_recv_loopback_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
1661 {
1662         ipha_t          *ipha = (ipha_t *)iph_arg;
1663         ill_t           *ill = ira->ira_ill;
1664         ill_t           *ire_ill = ire->ire_ill;
1665 
1666         ira->ira_zoneid = GLOBAL_ZONEID;
1667 
1668         /* Switch to the lo0 ill for further processing  */
1669         if (ire_ill != ill) {
1670                 /*
1671                  * Update ira_ill to be the ILL on which the IP address
1672                  * is hosted.
1673                  * No need to hold the ill since we have a hold on the ire
1674                  */
1675                 ASSERT(ira->ira_ill == ira->ira_rill);
1676                 ira->ira_ill = ire_ill;
1677 
1678                 ip_input_local_v4(ire, mp, ipha, ira);
1679 
1680                 /* Restore */
1681                 ASSERT(ira->ira_ill == ire_ill);
1682                 ira->ira_ill = ill;
1683                 return;
1684 
1685         }
1686         ip_input_local_v4(ire, mp, ipha, ira);
1687 }
1688 
1689 /*
1690  * ire_recvfn for IRE_LOCAL.
1691  */
1692 void
1693 ire_recv_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
1694 {
1695         ipha_t          *ipha = (ipha_t *)iph_arg;
1696         ill_t           *ill = ira->ira_ill;
1697         ill_t           *ire_ill = ire->ire_ill;
1698 
1699         /* Make a note for DAD that this address is in use */
1700         ire->ire_last_used_time = LBOLT_FASTPATH;
1701 
1702         /* Only target the IRE_LOCAL with the right zoneid. */
1703         ira->ira_zoneid = ire->ire_zoneid;
1704 
1705         /*
1706          * If the packet arrived on the wrong ill, we check that
1707          * this is ok.
1708          * If it is, then we ensure that we do the reassembly on
1709          * the ill on which the address is hosted. We keep ira_rill as
1710          * the one on which the packet arrived, so that IP_PKTINFO and
1711          * friends can report this.
1712          */
1713         if (ire_ill != ill) {
1714                 ire_t *new_ire;
1715 
1716                 new_ire = ip_check_multihome(&ipha->ipha_dst, ire, ill);
1717                 if (new_ire == NULL) {
1718                         /* Drop packet */
1719                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
1720                         ip_drop_input("ipIfStatsInForwProhibits", mp, ill);
1721                         freemsg(mp);
1722                         return;
1723                 }
1724                 /*
1725                  * Update ira_ill to be the ILL on which the IP address
1726                  * is hosted. No need to hold the ill since we have a
1727                  * hold on the ire. Note that we do the switch even if
1728                  * new_ire == ire (for IPMP, ire would be the one corresponding
1729                  * to the IPMP ill).
1730                  */
1731                 ASSERT(ira->ira_ill == ira->ira_rill);
1732                 ira->ira_ill = new_ire->ire_ill;
1733 
1734                 /* ira_ruifindex tracks the upper for ira_rill */
1735                 if (IS_UNDER_IPMP(ill))
1736                         ira->ira_ruifindex = ill_get_upper_ifindex(ill);
1737 
1738                 ip_input_local_v4(new_ire, mp, ipha, ira);
1739 
1740                 /* Restore */
1741                 ASSERT(ira->ira_ill == new_ire->ire_ill);
1742                 ira->ira_ill = ill;
1743                 ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
1744 
1745                 if (new_ire != ire)
1746                         ire_refrele(new_ire);
1747                 return;
1748         }
1749 
1750         ip_input_local_v4(ire, mp, ipha, ira);
1751 }
1752 
1753 /*
1754  * Common function for packets arriving for the host. Handles
1755  * checksum verification, reassembly checks, etc.
1756  */
1757 static void
1758 ip_input_local_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
1759 {
1760         ill_t           *ill = ira->ira_ill;
1761         iaflags_t       iraflags = ira->ira_flags;
1762 
1763         /*
1764          * Verify IP header checksum. If the packet was AH or ESP then
1765          * this flag has already been cleared. Likewise if the packet
1766          * had a hardware checksum.
1767          */
1768         if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) {
1769                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
1770                 ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
1771                 freemsg(mp);
1772                 return;
1773         }
1774 
1775         if (iraflags & IRAF_IPV4_OPTIONS) {
1776                 if (!ip_input_local_options(mp, ipha, ira)) {
1777                         /* Error has been sent and mp consumed */
1778                         return;
1779                 }
1780                 /*
1781                  * Some old hardware does partial checksum by including the
1782                  * whole IP header, so the partial checksum value might have
1783                  * become invalid if any option in the packet have been
1784                  * updated. Always clear partial checksum flag here.
1785                  */
1786                 DB_CKSUMFLAGS(mp) &= ~HCK_PARTIALCKSUM;
1787         }
1788 
1789         /*
1790          * Is packet part of fragmented IP packet?
1791          * We compare against defined values in network byte order
1792          */
1793         if (ipha->ipha_fragment_offset_and_flags &
1794             (IPH_MF_HTONS | IPH_OFFSET_HTONS)) {
1795                 /*
1796                  * Make sure we have ira_l2src before we loose the original
1797                  * mblk
1798                  */
1799                 if (!(ira->ira_flags & IRAF_L2SRC_SET))
1800                         ip_setl2src(mp, ira, ira->ira_rill);
1801 
1802                 mp = ip_input_fragment(mp, ipha, ira);
1803                 if (mp == NULL)
1804                         return;
1805                 /* Completed reassembly */
1806                 ipha = (ipha_t *)mp->b_rptr;
1807         }
1808 
1809         /*
1810          * For broadcast and multicast we need some extra work before
1811          * we call ip_fanout_v4(), since in the case of shared-IP zones
1812          * we need to pretend that a packet arrived for each zoneid.
1813          */
1814         if (iraflags & IRAF_MULTIBROADCAST) {
1815                 if (iraflags & IRAF_BROADCAST)
1816                         ip_input_broadcast_v4(ire, mp, ipha, ira);
1817                 else
1818                         ip_input_multicast_v4(ire, mp, ipha, ira);
1819                 return;
1820         }
1821         ip_fanout_v4(mp, ipha, ira);
1822 }
1823 
1824 
1825 /*
1826  * Handle multiple zones which match the same broadcast address
1827  * and ill by delivering a packet to each of them.
1828  * Walk the bucket and look for different ire_zoneid but otherwise
1829  * the same IRE (same ill/addr/mask/type).
1830  * Note that ire_add() tracks IREs that are identical in all
1831  * fields (addr/mask/type/gw/ill/zoneid) within a single IRE by
1832  * increasing ire_identical_cnt. Thus we don't need to be concerned
1833  * about those.
1834  */
1835 static void
1836 ip_input_broadcast_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
1837 {
1838         ill_t           *ill = ira->ira_ill;
1839         ip_stack_t      *ipst = ill->ill_ipst;
1840         netstack_t      *ns = ipst->ips_netstack;
1841         irb_t           *irb;
1842         ire_t           *ire1;
1843         mblk_t          *mp1;
1844         ipha_t          *ipha1;
1845         uint_t          ira_pktlen = ira->ira_pktlen;
1846         uint16_t        ira_ip_hdr_length = ira->ira_ip_hdr_length;
1847 
1848         irb = ire->ire_bucket;
1849 
1850         /*
1851          * If we don't have more than one shared-IP zone, or if
1852          * there can't be more than one IRE_BROADCAST for this
1853          * IP address, then just set the zoneid and proceed.
1854          */
1855         if (ns->netstack_numzones == 1 || irb->irb_ire_cnt == 1) {
1856                 ira->ira_zoneid = ire->ire_zoneid;
1857 
1858                 ip_fanout_v4(mp, ipha, ira);
1859                 return;
1860         }
1861         irb_refhold(irb);
1862         for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
1863                 /* We do the main IRE after the end of the loop */
1864                 if (ire1 == ire)
1865                         continue;
1866 
1867                 /*
1868                  * Only IREs for the same IP address should be in the same
1869                  * bucket.
1870                  * But could have IRE_HOSTs in the case of CGTP.
1871                  */
1872                 ASSERT(ire1->ire_addr == ire->ire_addr);
1873                 if (!(ire1->ire_type & IRE_BROADCAST))
1874                         continue;
1875 
1876                 if (IRE_IS_CONDEMNED(ire1))
1877                         continue;
1878 
1879                 mp1 = copymsg(mp);
1880                 if (mp1 == NULL) {
1881                         /* Failed to deliver to one zone */
1882                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1883                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
1884                         continue;
1885                 }
1886                 ira->ira_zoneid = ire1->ire_zoneid;
1887                 ipha1 = (ipha_t *)mp1->b_rptr;
1888                 ip_fanout_v4(mp1, ipha1, ira);
1889                 /*
1890                  * IPsec might have modified ira_pktlen and ira_ip_hdr_length
1891                  * so we restore them for a potential next iteration
1892                  */
1893                 ira->ira_pktlen = ira_pktlen;
1894                 ira->ira_ip_hdr_length = ira_ip_hdr_length;
1895         }
1896         irb_refrele(irb);
1897         /* Do the main ire */
1898         ira->ira_zoneid = ire->ire_zoneid;
1899         ip_fanout_v4(mp, ipha, ira);
1900 }
1901 
1902 /*
1903  * Handle multiple zones which want to receive the same multicast packets
1904  * on this ill by delivering a packet to each of them.
1905  *
1906  * Note that for packets delivered to transports we could instead do this
1907  * as part of the fanout code, but since we need to handle icmp_inbound
1908  * it is simpler to have multicast work the same as broadcast.
1909  *
1910  * The ip_fanout matching for multicast matches based on ilm independent of
1911  * zoneid since the zoneid restriction is applied when joining a multicast
1912  * group.
1913  */
1914 /* ARGSUSED */
1915 static void
1916 ip_input_multicast_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
1917 {
1918         ill_t           *ill = ira->ira_ill;
1919         iaflags_t       iraflags = ira->ira_flags;
1920         ip_stack_t      *ipst = ill->ill_ipst;
1921         netstack_t      *ns = ipst->ips_netstack;
1922         zoneid_t        zoneid;
1923         mblk_t          *mp1;
1924         ipha_t          *ipha1;
1925         uint_t          ira_pktlen = ira->ira_pktlen;
1926         uint16_t        ira_ip_hdr_length = ira->ira_ip_hdr_length;
1927 
1928         /* ire_recv_multicast has switched to the upper ill for IPMP */
1929         ASSERT(!IS_UNDER_IPMP(ill));
1930 
1931         /*
1932          * If we don't have more than one shared-IP zone, or if
1933          * there are no members in anything but the global zone,
1934          * then just set the zoneid and proceed.
1935          */
1936         if (ns->netstack_numzones == 1 ||
1937             !ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst,
1938             GLOBAL_ZONEID)) {
1939                 ira->ira_zoneid = GLOBAL_ZONEID;
1940 
1941                 /* If sender didn't want this zone to receive it, drop */
1942                 if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
1943                     ira->ira_no_loop_zoneid == ira->ira_zoneid) {
1944                         ip_drop_input("Multicast but wrong zoneid", mp, ill);
1945                         freemsg(mp);
1946                         return;
1947                 }
1948                 ip_fanout_v4(mp, ipha, ira);
1949                 return;
1950         }
1951 
1952         /*
1953          * Here we loop over all zoneids that have members in the group
1954          * and deliver a packet to ip_fanout for each zoneid.
1955          *
1956          * First find any members in the lowest numeric zoneid by looking for
1957          * first zoneid larger than -1 (ALL_ZONES).
1958          * We terminate the loop when we receive -1 (ALL_ZONES).
1959          */
1960         zoneid = ill_hasmembers_nextzone_v4(ill, ipha->ipha_dst, ALL_ZONES);
1961         for (; zoneid != ALL_ZONES;
1962             zoneid = ill_hasmembers_nextzone_v4(ill, ipha->ipha_dst, zoneid)) {
1963                 /*
1964                  * Avoid an extra copymsg/freemsg by skipping global zone here
1965                  * and doing that at the end.
1966                  */
1967                 if (zoneid == GLOBAL_ZONEID)
1968                         continue;
1969 
1970                 ira->ira_zoneid = zoneid;
1971 
1972                 /* If sender didn't want this zone to receive it, skip */
1973                 if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
1974                     ira->ira_no_loop_zoneid == ira->ira_zoneid)
1975                         continue;
1976 
1977                 mp1 = copymsg(mp);
1978                 if (mp1 == NULL) {
1979                         /* Failed to deliver to one zone */
1980                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1981                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
1982                         continue;
1983                 }
1984                 ipha1 = (ipha_t *)mp1->b_rptr;
1985                 ip_fanout_v4(mp1, ipha1, ira);
1986                 /*
1987                  * IPsec might have modified ira_pktlen and ira_ip_hdr_length
1988                  * so we restore them for a potential next iteration
1989                  */
1990                 ira->ira_pktlen = ira_pktlen;
1991                 ira->ira_ip_hdr_length = ira_ip_hdr_length;
1992         }
1993 
1994         /* Do the main ire */
1995         ira->ira_zoneid = GLOBAL_ZONEID;
1996         /* If sender didn't want this zone to receive it, drop */
1997         if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
1998             ira->ira_no_loop_zoneid == ira->ira_zoneid) {
1999                 ip_drop_input("Multicast but wrong zoneid", mp, ill);
2000                 freemsg(mp);
2001         } else {
2002                 ip_fanout_v4(mp, ipha, ira);
2003         }
2004 }
2005 
2006 
2007 /*
2008  * Determine the zoneid and IRAF_TX_* flags if trusted extensions
2009  * is in use. Updates ira_zoneid and ira_flags as a result.
2010  */
2011 static void
2012 ip_fanout_tx_v4(mblk_t *mp, ipha_t *ipha, uint8_t protocol,
2013     uint_t ip_hdr_length, ip_recv_attr_t *ira)
2014 {
2015         uint16_t        *up;
2016         uint16_t        lport;
2017         zoneid_t        zoneid;
2018 
2019         ASSERT(ira->ira_flags & IRAF_SYSTEM_LABELED);
2020 
2021         /*
2022          * If the packet is unlabeled we might allow read-down
2023          * for MAC_EXEMPT. Below we clear this if it is a multi-level
2024          * port (MLP).
2025          * Note that ira_tsl can be NULL here.
2026          */
2027         if (ira->ira_tsl != NULL && ira->ira_tsl->tsl_flags & TSLF_UNLABELED)
2028                 ira->ira_flags |= IRAF_TX_MAC_EXEMPTABLE;
2029 
2030         if (ira->ira_zoneid != ALL_ZONES)
2031                 return;
2032 
2033         ira->ira_flags |= IRAF_TX_SHARED_ADDR;
2034 
2035         up = (uint16_t *)((uchar_t *)ipha + ip_hdr_length);
2036         switch (protocol) {
2037         case IPPROTO_TCP:
2038         case IPPROTO_SCTP:
2039         case IPPROTO_UDP:
2040                 /* Caller ensures this */
2041                 ASSERT(((uchar_t *)ipha) + ip_hdr_length +4 <= mp->b_wptr);
2042 
2043                 /*
2044                  * Only these transports support MLP.
2045                  * We know their destination port numbers is in
2046                  * the same place in the header.
2047                  */
2048                 lport = up[1];
2049 
2050                 /*
2051                  * No need to handle exclusive-stack zones
2052                  * since ALL_ZONES only applies to the shared IP instance.
2053                  */
2054                 zoneid = tsol_mlp_findzone(protocol, lport);
2055                 /*
2056                  * If no shared MLP is found, tsol_mlp_findzone returns
2057                  * ALL_ZONES.  In that case, we assume it's SLP, and
2058                  * search for the zone based on the packet label.
2059                  *
2060                  * If there is such a zone, we prefer to find a
2061                  * connection in it.  Otherwise, we look for a
2062                  * MAC-exempt connection in any zone whose label
2063                  * dominates the default label on the packet.
2064                  */
2065                 if (zoneid == ALL_ZONES)
2066                         zoneid = tsol_attr_to_zoneid(ira);
2067                 else
2068                         ira->ira_flags &= ~IRAF_TX_MAC_EXEMPTABLE;
2069                 break;
2070         default:
2071                 /* Handle shared address for other protocols */
2072                 zoneid = tsol_attr_to_zoneid(ira);
2073                 break;
2074         }
2075         ira->ira_zoneid = zoneid;
2076 }
2077 
2078 /*
2079  * Increment checksum failure statistics
2080  */
2081 static void
2082 ip_input_cksum_err_v4(uint8_t protocol, uint16_t hck_flags, ill_t *ill)
2083 {
2084         ip_stack_t      *ipst = ill->ill_ipst;
2085 
2086         switch (protocol) {
2087         case IPPROTO_TCP:
2088                 BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs);
2089 
2090                 if (hck_flags & HCK_FULLCKSUM)
2091                         IP_STAT(ipst, ip_tcp_in_full_hw_cksum_err);
2092                 else if (hck_flags & HCK_PARTIALCKSUM)
2093                         IP_STAT(ipst, ip_tcp_in_part_hw_cksum_err);
2094                 else
2095                         IP_STAT(ipst, ip_tcp_in_sw_cksum_err);
2096                 break;
2097         case IPPROTO_UDP:
2098                 BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs);
2099                 if (hck_flags & HCK_FULLCKSUM)
2100                         IP_STAT(ipst, ip_udp_in_full_hw_cksum_err);
2101                 else if (hck_flags & HCK_PARTIALCKSUM)
2102                         IP_STAT(ipst, ip_udp_in_part_hw_cksum_err);
2103                 else
2104                         IP_STAT(ipst, ip_udp_in_sw_cksum_err);
2105                 break;
2106         case IPPROTO_ICMP:
2107                 BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs);
2108                 break;
2109         default:
2110                 ASSERT(0);
2111                 break;
2112         }
2113 }
2114 
2115 /* Calculate the IPv4 pseudo-header checksum */
2116 uint32_t
2117 ip_input_cksum_pseudo_v4(ipha_t *ipha, ip_recv_attr_t *ira)
2118 {
2119         uint_t          ulp_len;
2120         uint32_t        cksum;
2121         uint8_t         protocol = ira->ira_protocol;
2122         uint16_t        ip_hdr_length = ira->ira_ip_hdr_length;
2123 
2124 #define iphs    ((uint16_t *)ipha)
2125 
2126         switch (protocol) {
2127         case IPPROTO_TCP:
2128                 ulp_len = ira->ira_pktlen - ip_hdr_length;
2129 
2130                 /* Protocol and length */
2131                 cksum = htons(ulp_len) + IP_TCP_CSUM_COMP;
2132                 /* IP addresses */
2133                 cksum += iphs[6] + iphs[7] + iphs[8] + iphs[9];
2134                 break;
2135 
2136         case IPPROTO_UDP: {
2137                 udpha_t         *udpha;
2138 
2139                 udpha = (udpha_t  *)((uchar_t *)ipha + ip_hdr_length);
2140 
2141                 /* Protocol and length */
2142                 cksum = udpha->uha_length + IP_UDP_CSUM_COMP;
2143                 /* IP addresses */
2144                 cksum += iphs[6] + iphs[7] + iphs[8] + iphs[9];
2145                 break;
2146         }
2147 
2148         default:
2149                 cksum = 0;
2150                 break;
2151         }
2152 #undef  iphs
2153         return (cksum);
2154 }
2155 
2156 
2157 /*
2158  * Software verification of the ULP checksums.
2159  * Returns B_TRUE if ok.
2160  * Increments statistics of failed.
2161  */
2162 static boolean_t
2163 ip_input_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
2164 {
2165         ip_stack_t      *ipst = ira->ira_ill->ill_ipst;
2166         uint32_t        cksum;
2167         uint8_t         protocol = ira->ira_protocol;
2168         uint16_t        ip_hdr_length = ira->ira_ip_hdr_length;
2169 
2170         IP_STAT(ipst, ip_in_sw_cksum);
2171 
2172         ASSERT(protocol == IPPROTO_TCP || protocol == IPPROTO_UDP);
2173 
2174         cksum = ip_input_cksum_pseudo_v4(ipha, ira);
2175         cksum = IP_CSUM(mp, ip_hdr_length, cksum);
2176         if (cksum == 0)
2177                 return (B_TRUE);
2178 
2179         ip_input_cksum_err_v4(protocol, 0, ira->ira_ill);
2180         return (B_FALSE);
2181 }
2182 
2183 /*
2184  * Verify the ULP checksums.
2185  * Returns B_TRUE if ok, or if the ULP doesn't have a well-defined checksum
2186  * algorithm.
2187  * Increments statistics if failed.
2188  */
2189 static boolean_t
2190 ip_input_cksum_v4(iaflags_t iraflags, mblk_t *mp, ipha_t *ipha,
2191     ip_recv_attr_t *ira)
2192 {
2193         ill_t           *ill = ira->ira_rill;
2194         uint16_t        hck_flags;
2195         uint32_t        cksum;
2196         mblk_t          *mp1;
2197         int32_t         len;
2198         uint8_t         protocol = ira->ira_protocol;
2199         uint16_t        ip_hdr_length = ira->ira_ip_hdr_length;
2200 
2201 
2202         switch (protocol) {
2203         case IPPROTO_TCP:
2204                 break;
2205 
2206         case IPPROTO_UDP: {
2207                 udpha_t         *udpha;
2208 
2209                 udpha = (udpha_t  *)((uchar_t *)ipha + ip_hdr_length);
2210                 if (udpha->uha_checksum == 0) {
2211                         /* Packet doesn't have a UDP checksum */
2212                         return (B_TRUE);
2213                 }
2214                 break;
2215         }
2216         case IPPROTO_SCTP: {
2217                 sctp_hdr_t      *sctph;
2218                 uint32_t        pktsum;
2219 
2220                 sctph = (sctp_hdr_t *)((uchar_t *)ipha + ip_hdr_length);
2221 #ifdef  DEBUG
2222                 if (skip_sctp_cksum)
2223                         return (B_TRUE);
2224 #endif
2225                 pktsum = sctph->sh_chksum;
2226                 sctph->sh_chksum = 0;
2227                 cksum = sctp_cksum(mp, ip_hdr_length);
2228                 sctph->sh_chksum = pktsum;
2229                 if (cksum == pktsum)
2230                         return (B_TRUE);
2231 
2232                 /*
2233                  * Defer until later whether a bad checksum is ok
2234                  * in order to allow RAW sockets to use Adler checksum
2235                  * with SCTP.
2236                  */
2237                 ira->ira_flags |= IRAF_SCTP_CSUM_ERR;
2238                 return (B_TRUE);
2239         }
2240 
2241         default:
2242                 /* No ULP checksum to verify. */
2243                 return (B_TRUE);
2244         }
2245         /*
2246          * Revert to software checksum calculation if the interface
2247          * isn't capable of checksum offload.
2248          * We clear DB_CKSUMFLAGS when going through IPsec in ip_fanout.
2249          * Note: IRAF_NO_HW_CKSUM is not currently used.
2250          */
2251         ASSERT(!IS_IPMP(ill));
2252         if ((iraflags & IRAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) ||
2253             !dohwcksum) {
2254                 return (ip_input_sw_cksum_v4(mp, ipha, ira));
2255         }
2256 
2257         /*
2258          * We apply this for all ULP protocols. Does the HW know to
2259          * not set the flags for SCTP and other protocols.
2260          */
2261 
2262         hck_flags = DB_CKSUMFLAGS(mp);
2263 
2264         if (hck_flags & HCK_FULLCKSUM_OK) {
2265                 /*
2266                  * Hardware has already verified the checksum.
2267                  */
2268                 return (B_TRUE);
2269         }
2270 
2271         if (hck_flags & HCK_FULLCKSUM) {
2272                 /*
2273                  * Full checksum has been computed by the hardware
2274                  * and has been attached.  If the driver wants us to
2275                  * verify the correctness of the attached value, in
2276                  * order to protect against faulty hardware, compare
2277                  * it against -0 (0xFFFF) to see if it's valid.
2278                  */
2279                 cksum = DB_CKSUM16(mp);
2280                 if (cksum == 0xFFFF)
2281                         return (B_TRUE);
2282                 ip_input_cksum_err_v4(protocol, hck_flags, ira->ira_ill);
2283                 return (B_FALSE);
2284         }
2285 
2286         mp1 = mp->b_cont;
2287         if ((hck_flags & HCK_PARTIALCKSUM) &&
2288             (mp1 == NULL || mp1->b_cont == NULL) &&
2289             ip_hdr_length >= DB_CKSUMSTART(mp) &&
2290             ((len = ip_hdr_length - DB_CKSUMSTART(mp)) & 1) == 0) {
2291                 uint32_t        adj;
2292                 uchar_t         *cksum_start;
2293 
2294                 cksum = ip_input_cksum_pseudo_v4(ipha, ira);
2295 
2296                 cksum_start = ((uchar_t *)ipha + DB_CKSUMSTART(mp));
2297 
2298                 /*
2299                  * Partial checksum has been calculated by hardware
2300                  * and attached to the packet; in addition, any
2301                  * prepended extraneous data is even byte aligned,
2302                  * and there are at most two mblks associated with
2303                  * the packet.  If any such data exists, we adjust
2304                  * the checksum; also take care any postpended data.
2305                  */
2306                 IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, len, adj);
2307                 /*
2308                  * One's complement subtract extraneous checksum
2309                  */
2310                 cksum += DB_CKSUM16(mp);
2311                 if (adj >= cksum)
2312                         cksum = ~(adj - cksum) & 0xFFFF;
2313                 else
2314                         cksum -= adj;
2315                 cksum = (cksum & 0xFFFF) + ((int)cksum >> 16);
2316                 cksum = (cksum & 0xFFFF) + ((int)cksum >> 16);
2317                 if (!(~cksum & 0xFFFF))
2318                         return (B_TRUE);
2319 
2320                 ip_input_cksum_err_v4(protocol, hck_flags, ira->ira_ill);
2321                 return (B_FALSE);
2322         }
2323         return (ip_input_sw_cksum_v4(mp, ipha, ira));
2324 }
2325 
2326 
2327 /*
2328  * Handle fanout of received packets.
2329  * Unicast packets that are looped back (from ire_send_local_v4) and packets
2330  * from the wire are differentiated by checking IRAF_VERIFY_ULP_CKSUM.
2331  *
2332  * IPQoS Notes
2333  * Before sending it to the client, invoke IPPF processing. Policy processing
2334  * takes place only if the callout_position, IPP_LOCAL_IN, is enabled.
2335  */
2336 void
2337 ip_fanout_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
2338 {
2339         ill_t           *ill = ira->ira_ill;
2340         iaflags_t       iraflags = ira->ira_flags;
2341         ip_stack_t      *ipst = ill->ill_ipst;
2342         uint8_t         protocol = ipha->ipha_protocol;
2343         conn_t          *connp;
2344 #define rptr    ((uchar_t *)ipha)
2345         uint_t          ip_hdr_length;
2346         uint_t          min_ulp_header_length;
2347         int             offset;
2348         ssize_t         len;
2349         netstack_t      *ns = ipst->ips_netstack;
2350         ipsec_stack_t   *ipss = ns->netstack_ipsec;
2351         ill_t           *rill = ira->ira_rill;
2352 
2353         ASSERT(ira->ira_pktlen == ntohs(ipha->ipha_length));
2354 
2355         ip_hdr_length = ira->ira_ip_hdr_length;
2356         ira->ira_protocol = protocol;
2357 
2358         /*
2359          * Time for IPP once we've done reassembly and IPsec.
2360          * We skip this for loopback packets since we don't do IPQoS
2361          * on loopback.
2362          */
2363         if (IPP_ENABLED(IPP_LOCAL_IN, ipst) &&
2364             !(iraflags & IRAF_LOOPBACK) &&
2365             (protocol != IPPROTO_ESP || protocol != IPPROTO_AH)) {
2366                 /*
2367                  * Use the interface on which the packet arrived - not where
2368                  * the IP address is hosted.
2369                  */
2370                 /* ip_process translates an IS_UNDER_IPMP */
2371                 mp = ip_process(IPP_LOCAL_IN, mp, rill, ill);
2372                 if (mp == NULL) {
2373                         /* ip_drop_packet and MIB done */
2374                         return;
2375                 }
2376         }
2377 
2378         /* Determine the minimum required size of the upper-layer header */
2379         /* Need to do this for at least the set of ULPs that TX handles. */
2380         switch (protocol) {
2381         case IPPROTO_TCP:
2382                 min_ulp_header_length = TCP_MIN_HEADER_LENGTH;
2383                 break;
2384         case IPPROTO_SCTP:
2385                 min_ulp_header_length = SCTP_COMMON_HDR_LENGTH;
2386                 break;
2387         case IPPROTO_UDP:
2388                 min_ulp_header_length = UDPH_SIZE;
2389                 break;
2390         case IPPROTO_ICMP:
2391                 min_ulp_header_length = ICMPH_SIZE;
2392                 break;
2393         case IPPROTO_DCCP:
2394                 min_ulp_header_length = DCCP_MIN_HEADER_LENGTH;
2395                 break;
2396         default:
2397                 min_ulp_header_length = 0;
2398                 break;
2399         }
2400         /* Make sure we have the min ULP header length */
2401         len = mp->b_wptr - rptr;
2402         if (len < ip_hdr_length + min_ulp_header_length) {
2403                 if (ira->ira_pktlen < ip_hdr_length + min_ulp_header_length) {
2404                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
2405                         ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
2406                         freemsg(mp);
2407                         return;
2408                 }
2409                 IP_STAT(ipst, ip_recv_pullup);
2410                 ipha = ip_pullup(mp, ip_hdr_length + min_ulp_header_length,
2411                     ira);
2412                 if (ipha == NULL)
2413                         goto discard;
2414                 len = mp->b_wptr - rptr;
2415         }
2416 
2417         /*
2418          * If trusted extensions then determine the zoneid and TX specific
2419          * ira_flags.
2420          */
2421         if (iraflags & IRAF_SYSTEM_LABELED) {
2422                 /* This can update ira->ira_flags and ira->ira_zoneid */
2423                 ip_fanout_tx_v4(mp, ipha, protocol, ip_hdr_length, ira);
2424                 iraflags = ira->ira_flags;
2425         }
2426 
2427 
2428         /* Verify ULP checksum. Handles TCP, UDP, and SCTP */
2429         if (iraflags & IRAF_VERIFY_ULP_CKSUM) {
2430                 if (!ip_input_cksum_v4(iraflags, mp, ipha, ira)) {
2431                         /* Bad checksum. Stats are already incremented */
2432                         ip_drop_input("Bad ULP checksum", mp, ill);
2433                         freemsg(mp);
2434                         return;
2435                 }
2436                 /* IRAF_SCTP_CSUM_ERR could have been set */
2437                 iraflags = ira->ira_flags;
2438         }
2439         switch (protocol) {
2440         case IPPROTO_TCP:
2441                 /* For TCP, discard broadcast and multicast packets. */
2442                 if (iraflags & IRAF_MULTIBROADCAST)
2443                         goto discard;
2444 
2445                 /* First mblk contains IP+TCP headers per above check */
2446                 ASSERT(len >= ip_hdr_length + TCP_MIN_HEADER_LENGTH);
2447 
2448                 /* TCP options present? */
2449                 offset = ((uchar_t *)ipha)[ip_hdr_length + 12] >> 4;
2450                 if (offset != 5) {
2451                         if (offset < 5)
2452                                 goto discard;
2453 
2454                         /*
2455                          * There must be TCP options.
2456                          * Make sure we can grab them.
2457                          */
2458                         offset <<= 2;
2459                         offset += ip_hdr_length;
2460                         if (len < offset) {
2461                                 if (ira->ira_pktlen < offset) {
2462                                         BUMP_MIB(ill->ill_ip_mib,
2463                                             ipIfStatsInTruncatedPkts);
2464                                         ip_drop_input(
2465                                             "ipIfStatsInTruncatedPkts",
2466                                             mp, ill);
2467                                         freemsg(mp);
2468                                         return;
2469                                 }
2470                                 IP_STAT(ipst, ip_recv_pullup);
2471                                 ipha = ip_pullup(mp, offset, ira);
2472                                 if (ipha == NULL)
2473                                         goto discard;
2474                                 len = mp->b_wptr - rptr;
2475                         }
2476                 }
2477 
2478                 /*
2479                  * Pass up a squeue hint to tcp.
2480                  * If ira_sqp is already set (this is loopback) we leave it
2481                  * alone.
2482                  */
2483                 if (ira->ira_sqp == NULL) {
2484                         ira->ira_sqp = ip_squeue_get(ira->ira_ring);
2485                 }
2486 
2487                 /* Look for AF_INET or AF_INET6 that matches */
2488                 connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_length,
2489                     ira, ipst);
2490                 if (connp == NULL) {
2491                         /* Send the TH_RST */
2492                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2493                         tcp_xmit_listeners_reset(mp, ira, ipst, NULL);
2494                         return;
2495                 }
2496                 if (connp->conn_incoming_ifindex != 0 &&
2497                     connp->conn_incoming_ifindex != ira->ira_ruifindex) {
2498                         CONN_DEC_REF(connp);
2499 
2500                         /* Send the TH_RST */
2501                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2502                         tcp_xmit_listeners_reset(mp, ira, ipst, NULL);
2503                         return;
2504                 }
2505                 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) ||
2506                     (iraflags & IRAF_IPSEC_SECURE)) {
2507                         mp = ipsec_check_inbound_policy(mp, connp,
2508                             ipha, NULL, ira);
2509                         if (mp == NULL) {
2510                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2511                                 /* Note that mp is NULL */
2512                                 ip_drop_input("ipIfStatsInDiscards", mp, ill);
2513                                 CONN_DEC_REF(connp);
2514                                 return;
2515                         }
2516                 }
2517                 /* Found a client; up it goes */
2518                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2519                 ira->ira_ill = ira->ira_rill = NULL;
2520                 if (!IPCL_IS_TCP(connp)) {
2521                         /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
2522                         (connp->conn_recv)(connp, mp, NULL, ira);
2523                         CONN_DEC_REF(connp);
2524                         ira->ira_ill = ill;
2525                         ira->ira_rill = rill;
2526                         return;
2527                 }
2528 
2529                 /*
2530                  * We do different processing whether called from
2531                  * ip_accept_tcp and we match the target, don't match
2532                  * the target, and when we are called by ip_input.
2533                  */
2534                 if (iraflags & IRAF_TARGET_SQP) {
2535                         if (ira->ira_target_sqp == connp->conn_sqp) {
2536                                 mblk_t  *attrmp;
2537 
2538                                 attrmp = ip_recv_attr_to_mblk(ira);
2539                                 if (attrmp == NULL) {
2540                                         BUMP_MIB(ill->ill_ip_mib,
2541                                             ipIfStatsInDiscards);
2542                                         ip_drop_input("ipIfStatsInDiscards",
2543                                             mp, ill);
2544                                         freemsg(mp);
2545                                         CONN_DEC_REF(connp);
2546                                 } else {
2547                                         SET_SQUEUE(attrmp, connp->conn_recv,
2548                                             connp);
2549                                         attrmp->b_cont = mp;
2550                                         ASSERT(ira->ira_target_sqp_mp == NULL);
2551                                         ira->ira_target_sqp_mp = attrmp;
2552                                         /*
2553                                          * Conn ref release when drained from
2554                                          * the squeue.
2555                                          */
2556                                 }
2557                         } else {
2558                                 SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
2559                                     connp->conn_recv, connp, ira, SQ_FILL,
2560                                     SQTAG_IP_TCP_INPUT);
2561                         }
2562                 } else {
2563                         SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv,
2564                             connp, ira, ip_squeue_flag, SQTAG_IP_TCP_INPUT);
2565                 }
2566                 ira->ira_ill = ill;
2567                 ira->ira_rill = rill;
2568                 return;
2569 
2570         case IPPROTO_SCTP: {
2571                 sctp_hdr_t      *sctph;
2572                 in6_addr_t      map_src, map_dst;
2573                 uint32_t        ports;  /* Source and destination ports */
2574                 sctp_stack_t    *sctps = ipst->ips_netstack->netstack_sctp;
2575 
2576                 /* For SCTP, discard broadcast and multicast packets. */
2577                 if (iraflags & IRAF_MULTIBROADCAST)
2578                         goto discard;
2579 
2580                 /*
2581                  * Since there is no SCTP h/w cksum support yet, just
2582                  * clear the flag.
2583                  */
2584                 DB_CKSUMFLAGS(mp) = 0;
2585 
2586                 /* Length ensured above */
2587                 ASSERT(MBLKL(mp) >= ip_hdr_length + SCTP_COMMON_HDR_LENGTH);
2588                 sctph = (sctp_hdr_t *)(rptr + ip_hdr_length);
2589 
2590                 /* get the ports */
2591                 ports = *(uint32_t *)&sctph->sh_sport;
2592 
2593                 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_dst);
2594                 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_src);
2595                 if (iraflags & IRAF_SCTP_CSUM_ERR) {
2596                         /*
2597                          * No potential sctp checksum errors go to the Sun
2598                          * sctp stack however they might be Adler-32 summed
2599                          * packets a userland stack bound to a raw IP socket
2600                          * could reasonably use. Note though that Adler-32 is
2601                          * a long deprecated algorithm and customer sctp
2602                          * networks should eventually migrate to CRC-32 at
2603                          * which time this facility should be removed.
2604                          */
2605                         ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira);
2606                         return;
2607                 }
2608                 connp = sctp_fanout(&map_src, &map_dst, ports, ira, mp,
2609                     sctps, sctph);
2610                 if (connp == NULL) {
2611                         /* Check for raw socket or OOTB handling */
2612                         ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira);
2613                         return;
2614                 }
2615                 if (connp->conn_incoming_ifindex != 0 &&
2616                     connp->conn_incoming_ifindex != ira->ira_ruifindex) {
2617                         CONN_DEC_REF(connp);
2618                         /* Check for raw socket or OOTB handling */
2619                         ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira);
2620                         return;
2621                 }
2622 
2623                 /* Found a client; up it goes */
2624                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2625                 sctp_input(connp, ipha, NULL, mp, ira);
2626                 /* sctp_input does a rele of the sctp_t */
2627                 return;
2628         }
2629 
2630         case IPPROTO_UDP:
2631                 /* First mblk contains IP+UDP headers as checked above */
2632                 ASSERT(MBLKL(mp) >= ip_hdr_length + UDPH_SIZE);
2633 
2634                 if (iraflags & IRAF_MULTIBROADCAST) {
2635                         uint16_t *up;   /* Pointer to ports in ULP header */
2636 
2637                         up = (uint16_t *)((uchar_t *)ipha + ip_hdr_length);
2638                         ip_fanout_udp_multi_v4(mp, ipha, up[1], up[0], ira);
2639                         return;
2640                 }
2641 
2642                 /* Look for AF_INET or AF_INET6 that matches */
2643                 connp = ipcl_classify_v4(mp, IPPROTO_UDP, ip_hdr_length,
2644                     ira, ipst);
2645                 if (connp == NULL) {
2646         no_udp_match:
2647                         if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_UDP].
2648                             connf_head != NULL) {
2649                                 ASSERT(ira->ira_protocol == IPPROTO_UDP);
2650                                 ip_fanout_proto_v4(mp, ipha, ira);
2651                         } else {
2652                                 ip_fanout_send_icmp_v4(mp,
2653                                     ICMP_DEST_UNREACHABLE,
2654                                     ICMP_PORT_UNREACHABLE, ira);
2655                         }
2656                         return;
2657 
2658                 }
2659                 if (connp->conn_incoming_ifindex != 0 &&
2660                     connp->conn_incoming_ifindex != ira->ira_ruifindex) {
2661                         CONN_DEC_REF(connp);
2662                         goto no_udp_match;
2663                 }
2664                 if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld :
2665                     !canputnext(connp->conn_rq)) {
2666                         CONN_DEC_REF(connp);
2667                         BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows);
2668                         ip_drop_input("udpIfStatsInOverflows", mp, ill);
2669                         freemsg(mp);
2670                         return;
2671                 }
2672                 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) ||
2673                     (iraflags & IRAF_IPSEC_SECURE)) {
2674                         mp = ipsec_check_inbound_policy(mp, connp,
2675                             ipha, NULL, ira);
2676                         if (mp == NULL) {
2677                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2678                                 /* Note that mp is NULL */
2679                                 ip_drop_input("ipIfStatsInDiscards", mp, ill);
2680                                 CONN_DEC_REF(connp);
2681                                 return;
2682                         }
2683                 }
2684                 /*
2685                  * Remove 0-spi if it's 0, or move everything behind
2686                  * the UDP header over it and forward to ESP via
2687                  * ip_fanout_v4().
2688                  */
2689                 if (connp->conn_udp->udp_nat_t_endpoint) {
2690                         if (iraflags & IRAF_IPSEC_SECURE) {
2691                                 ip_drop_packet(mp, B_TRUE, ira->ira_ill,
2692                                     DROPPER(ipss, ipds_esp_nat_t_ipsec),
2693                                     &ipss->ipsec_dropper);
2694                                 CONN_DEC_REF(connp);
2695                                 return;
2696                         }
2697 
2698                         mp = zero_spi_check(mp, ira);
2699                         if (mp == NULL) {
2700                                 /*
2701                                  * Packet was consumed - probably sent to
2702                                  * ip_fanout_v4.
2703                                  */
2704                                 CONN_DEC_REF(connp);
2705                                 return;
2706                         }
2707                         /* Else continue like a normal UDP packet. */
2708                         ipha = (ipha_t *)mp->b_rptr;
2709                         protocol = ipha->ipha_protocol;
2710                         ira->ira_protocol = protocol;
2711                 }
2712                 /* Found a client; up it goes */
2713                 IP_STAT(ipst, ip_udp_fannorm);
2714                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2715                 ira->ira_ill = ira->ira_rill = NULL;
2716                 (connp->conn_recv)(connp, mp, NULL, ira);
2717                 CONN_DEC_REF(connp);
2718                 ira->ira_ill = ill;
2719                 ira->ira_rill = rill;
2720                 return;
2721         case IPPROTO_DCCP:
2722                 /* For DCCP, discard broadcast and multicast packets */
2723                 if (iraflags & IRAF_MULTIBROADCAST) {
2724                         goto discard;
2725                 }
2726 
2727                 /* First mblk contains IP+DCCP headers per above check */
2728                 ASSERT(len >= ip_hdr_length + DCCP_MIN_HEADER_LENGTH);
2729 
2730                 /* Squeue hint */
2731                 if (ira->ira_sqp == NULL) {
2732                         ira->ira_sqp = ip_squeue_get(ira->ira_ring);
2733                 }
2734 
2735                 connp = ipcl_classify_v4(mp, IPPROTO_DCCP, ip_hdr_length,
2736                     ira, ipst);
2737                 if (connp == NULL) {
2738                         cmn_err(CE_NOTE, "ip_input.c: ip_fanout_v4 connp not found");
2739                         /* Send the reset packet */
2740                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2741                         dccp_xmit_listeners_reset(mp, ira, ipst, NULL);
2742                         return;
2743                 }
2744 
2745                 if (connp->conn_incoming_ifindex != 0 &&
2746                     connp->conn_incoming_ifindex != ira->ira_ruifindex) {
2747                         cmn_err(CE_NOTE, "ip_input.c: ip_fanout_v4 ifindex problem");
2748                         /* Send the reset packet */
2749                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2750                         dccp_xmit_listeners_reset(mp, ira, ipst, NULL);
2751                         return;
2752                 }
2753 
2754                 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) ||
2755                     (iraflags & IRAF_IPSEC_SECURE)) {
2756                         mp = ipsec_check_inbound_policy(mp, connp,
2757                             ipha, NULL, ira);
2758                         if (mp == NULL) {
2759                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2760                                 /* Note that mp is NULL */
2761                                 ip_drop_input("ipIfStatsInDiscards", mp, ill);
2762                                 CONN_DEC_REF(connp);
2763                                 return;
2764                         }
2765                 }
2766 
2767                 /* Found a client; up it goes */
2768                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2769                 ira->ira_ill = ira->ira_rill = NULL;
2770 
2771                 /* XXX SOCK_RAW for DCCP? */
2772 
2773                 if (iraflags & IRAF_TARGET_SQP) {
2774                         cmn_err(CE_NOTE, "IRAF_TARGET_SQP");
2775                 } else {
2776                         SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv,
2777                             connp, ira, ip_squeue_flag, SQTAG_IP_DCCP_INPUT);
2778                 }
2779 
2780                 ira->ira_ill = ill;
2781                 ira->ira_rill = rill;
2782                 return;
2783 
2784         default:
2785                 break;
2786         }
2787 
2788         /*
2789          * Clear hardware checksumming flag as it is currently only
2790          * used by TCP and UDP.
2791          */
2792         DB_CKSUMFLAGS(mp) = 0;
2793 
2794         switch (protocol) {
2795         case IPPROTO_ICMP:
2796                 /*
2797                  * We need to accommodate icmp messages coming in clear
2798                  * until we get everything secure from the wire. If
2799                  * icmp_accept_clear_messages is zero we check with
2800                  * the global policy and act accordingly. If it is
2801                  * non-zero, we accept the message without any checks.
2802                  * But *this does not mean* that this will be delivered
2803                  * to RAW socket clients. By accepting we might send
2804                  * replies back, change our MTU value etc.,
2805                  * but delivery to the ULP/clients depends on their
2806                  * policy dispositions.
2807                  */
2808                 if (ipst->ips_icmp_accept_clear_messages == 0) {
2809                         mp = ipsec_check_global_policy(mp, NULL,
2810                             ipha, NULL, ira, ns);
2811                         if (mp == NULL)
2812                                 return;
2813                 }
2814 
2815                 /*
2816                  * On a labeled system, we have to check whether the zone
2817                  * itself is permitted to receive raw traffic.
2818                  */
2819                 if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
2820                         if (!tsol_can_accept_raw(mp, ira, B_FALSE)) {
2821                                 BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
2822                                 ip_drop_input("tsol_can_accept_raw", mp, ill);
2823                                 freemsg(mp);
2824                                 return;
2825                         }
2826                 }
2827 
2828                 /*
2829                  * ICMP header checksum, including checksum field,
2830                  * should be zero.
2831                  */
2832                 if (IP_CSUM(mp, ip_hdr_length, 0)) {
2833                         BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs);
2834                         ip_drop_input("icmpInCksumErrs", mp, ill);
2835                         freemsg(mp);
2836                         return;
2837                 }
2838                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2839                 mp = icmp_inbound_v4(mp, ira);
2840                 if (mp == NULL) {
2841                         /* No need to pass to RAW sockets */
2842                         return;
2843                 }
2844                 break;
2845 
2846         case IPPROTO_IGMP:
2847                 /*
2848                  * If we are not willing to accept IGMP packets in clear,
2849                  * then check with global policy.
2850                  */
2851                 if (ipst->ips_igmp_accept_clear_messages == 0) {
2852                         mp = ipsec_check_global_policy(mp, NULL,
2853                             ipha, NULL, ira, ns);
2854                         if (mp == NULL)
2855                                 return;
2856                 }
2857                 if ((ira->ira_flags & IRAF_SYSTEM_LABELED) &&
2858                     !tsol_can_accept_raw(mp, ira, B_TRUE)) {
2859                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2860                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
2861                         freemsg(mp);
2862                         return;
2863                 }
2864                 /*
2865                  * Validate checksum
2866                  */
2867                 if (IP_CSUM(mp, ip_hdr_length, 0)) {
2868                         ++ipst->ips_igmpstat.igps_rcv_badsum;
2869                         ip_drop_input("igps_rcv_badsum", mp, ill);
2870                         freemsg(mp);
2871                         return;
2872                 }
2873 
2874                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2875                 mp = igmp_input(mp, ira);
2876                 if (mp == NULL) {
2877                         /* Bad packet - discarded by igmp_input */
2878                         return;
2879                 }
2880                 break;
2881         case IPPROTO_PIM:
2882                 /*
2883                  * If we are not willing to accept PIM packets in clear,
2884                  * then check with global policy.
2885                  */
2886                 if (ipst->ips_pim_accept_clear_messages == 0) {
2887                         mp = ipsec_check_global_policy(mp, NULL,
2888                             ipha, NULL, ira, ns);
2889                         if (mp == NULL)
2890                                 return;
2891                 }
2892                 if ((ira->ira_flags & IRAF_SYSTEM_LABELED) &&
2893                     !tsol_can_accept_raw(mp, ira, B_TRUE)) {
2894                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2895                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
2896                         freemsg(mp);
2897                         return;
2898                 }
2899                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2900 
2901                 /* Checksum is verified in pim_input */
2902                 mp = pim_input(mp, ira);
2903                 if (mp == NULL) {
2904                         /* Bad packet - discarded by pim_input */
2905                         return;
2906                 }
2907                 break;
2908         case IPPROTO_AH:
2909         case IPPROTO_ESP: {
2910                 /*
2911                  * Fast path for AH/ESP.
2912                  */
2913                 netstack_t *ns = ipst->ips_netstack;
2914                 ipsec_stack_t *ipss = ns->netstack_ipsec;
2915 
2916                 IP_STAT(ipst, ipsec_proto_ahesp);
2917 
2918                 if (!ipsec_loaded(ipss)) {
2919                         ip_proto_not_sup(mp, ira);
2920                         return;
2921                 }
2922 
2923                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2924                 /* select inbound SA and have IPsec process the pkt */
2925                 if (protocol == IPPROTO_ESP) {
2926                         esph_t *esph;
2927                         boolean_t esp_in_udp_sa;
2928                         boolean_t esp_in_udp_packet;
2929 
2930                         mp = ipsec_inbound_esp_sa(mp, ira, &esph);
2931                         if (mp == NULL)
2932                                 return;
2933 
2934                         ASSERT(esph != NULL);
2935                         ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
2936                         ASSERT(ira->ira_ipsec_esp_sa != NULL);
2937                         ASSERT(ira->ira_ipsec_esp_sa->ipsa_input_func != NULL);
2938 
2939                         esp_in_udp_sa = ((ira->ira_ipsec_esp_sa->ipsa_flags &
2940                             IPSA_F_NATT) != 0);
2941                         esp_in_udp_packet =
2942                             (ira->ira_flags & IRAF_ESP_UDP_PORTS) != 0;
2943 
2944                         /*
2945                          * The following is a fancy, but quick, way of saying:
2946                          * ESP-in-UDP SA and Raw ESP packet --> drop
2947                          *    OR
2948                          * ESP SA and ESP-in-UDP packet --> drop
2949                          */
2950                         if (esp_in_udp_sa != esp_in_udp_packet) {
2951                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2952                                 ip_drop_packet(mp, B_TRUE, ira->ira_ill,
2953                                     DROPPER(ipss, ipds_esp_no_sa),
2954                                     &ipss->ipsec_dropper);
2955                                 return;
2956                         }
2957                         mp = ira->ira_ipsec_esp_sa->ipsa_input_func(mp, esph,
2958                             ira);
2959                 } else {
2960                         ah_t *ah;
2961 
2962                         mp = ipsec_inbound_ah_sa(mp, ira, &ah);
2963                         if (mp == NULL)
2964                                 return;
2965 
2966                         ASSERT(ah != NULL);
2967                         ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
2968                         ASSERT(ira->ira_ipsec_ah_sa != NULL);
2969                         ASSERT(ira->ira_ipsec_ah_sa->ipsa_input_func != NULL);
2970                         mp = ira->ira_ipsec_ah_sa->ipsa_input_func(mp, ah,
2971                             ira);
2972                 }
2973 
2974                 if (mp == NULL) {
2975                         /*
2976                          * Either it failed or is pending. In the former case
2977                          * ipIfStatsInDiscards was increased.
2978                          */
2979                         return;
2980                 }
2981                 /* we're done with IPsec processing, send it up */
2982                 ip_input_post_ipsec(mp, ira);
2983                 return;
2984         }
2985         case IPPROTO_ENCAP: {
2986                 ipha_t          *inner_ipha;
2987 
2988                 /*
2989                  * Handle self-encapsulated packets (IP-in-IP where
2990                  * the inner addresses == the outer addresses).
2991                  */
2992                 if ((uchar_t *)ipha + ip_hdr_length + sizeof (ipha_t) >
2993                     mp->b_wptr) {
2994                         if (ira->ira_pktlen <
2995                             ip_hdr_length + sizeof (ipha_t)) {
2996                                 BUMP_MIB(ill->ill_ip_mib,
2997                                     ipIfStatsInTruncatedPkts);
2998                                 ip_drop_input("ipIfStatsInTruncatedPkts",
2999                                     mp, ill);
3000                                 freemsg(mp);
3001                                 return;
3002                         }
3003                         ipha = ip_pullup(mp, (uchar_t *)ipha + ip_hdr_length +
3004                             sizeof (ipha_t) - mp->b_rptr, ira);
3005                         if (ipha == NULL) {
3006                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3007                                 ip_drop_input("ipIfStatsInDiscards", mp, ill);
3008                                 freemsg(mp);
3009                                 return;
3010                         }
3011                 }
3012                 inner_ipha = (ipha_t *)((uchar_t *)ipha + ip_hdr_length);
3013                 /*
3014                  * Check the sanity of the inner IP header.
3015                  */
3016                 if ((IPH_HDR_VERSION(inner_ipha) != IPV4_VERSION)) {
3017                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3018                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
3019                         freemsg(mp);
3020                         return;
3021                 }
3022                 if (IPH_HDR_LENGTH(inner_ipha) < sizeof (ipha_t)) {
3023                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3024                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
3025                         freemsg(mp);
3026                         return;
3027                 }
3028                 if (inner_ipha->ipha_src != ipha->ipha_src ||
3029                     inner_ipha->ipha_dst != ipha->ipha_dst) {
3030                         /* We fallthru to iptun fanout below */
3031                         goto iptun;
3032                 }
3033 
3034                 /*
3035                  * Self-encapsulated tunnel packet. Remove
3036                  * the outer IP header and fanout again.
3037                  * We also need to make sure that the inner
3038                  * header is pulled up until options.
3039                  */
3040                 mp->b_rptr = (uchar_t *)inner_ipha;
3041                 ipha = inner_ipha;
3042                 ip_hdr_length = IPH_HDR_LENGTH(ipha);
3043                 if ((uchar_t *)ipha + ip_hdr_length > mp->b_wptr) {
3044                         if (ira->ira_pktlen <
3045                             (uchar_t *)ipha + ip_hdr_length - mp->b_rptr) {
3046                                 BUMP_MIB(ill->ill_ip_mib,
3047                                     ipIfStatsInTruncatedPkts);
3048                                 ip_drop_input("ipIfStatsInTruncatedPkts",
3049                                     mp, ill);
3050                                 freemsg(mp);
3051                                 return;
3052                         }
3053                         ipha = ip_pullup(mp,
3054                             (uchar_t *)ipha + ip_hdr_length - mp->b_rptr, ira);
3055                         if (ipha == NULL) {
3056                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3057                                 ip_drop_input("ipIfStatsInDiscards", mp, ill);
3058                                 freemsg(mp);
3059                                 return;
3060                         }
3061                 }
3062                 if (ip_hdr_length > sizeof (ipha_t)) {
3063                         /* We got options on the inner packet. */
3064                         ipaddr_t        dst = ipha->ipha_dst;
3065                         int             error = 0;
3066 
3067                         dst = ip_input_options(ipha, dst, mp, ira, &error);
3068                         if (error != 0) {
3069                                 /*
3070                                  * An ICMP error has been sent and the packet
3071                                  * has been dropped.
3072                                  */
3073                                 return;
3074                         }
3075                         if (dst != ipha->ipha_dst) {
3076                                 /*
3077                                  * Someone put a source-route in
3078                                  * the inside header of a self-
3079                                  * encapsulated packet.  Drop it
3080                                  * with extreme prejudice and let
3081                                  * the sender know.
3082                                  */
3083                                 ip_drop_input("ICMP_SOURCE_ROUTE_FAILED",
3084                                     mp, ill);
3085                                 icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED,
3086                                     ira);
3087                                 return;
3088                         }
3089                 }
3090                 if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
3091                         /*
3092                          * This means that somebody is sending
3093                          * Self-encapsulated packets without AH/ESP.
3094                          *
3095                          * Send this packet to find a tunnel endpoint.
3096                          * if I can't find one, an ICMP
3097                          * PROTOCOL_UNREACHABLE will get sent.
3098                          */
3099                         protocol = ipha->ipha_protocol;
3100                         ira->ira_protocol = protocol;
3101                         goto iptun;
3102                 }
3103 
3104                 /* Update based on removed IP header */
3105                 ira->ira_ip_hdr_length = ip_hdr_length;
3106                 ira->ira_pktlen = ntohs(ipha->ipha_length);
3107 
3108                 if (ira->ira_flags & IRAF_IPSEC_DECAPS) {
3109                         /*
3110                          * This packet is self-encapsulated multiple
3111                          * times. We don't want to recurse infinitely.
3112                          * To keep it simple, drop the packet.
3113                          */
3114                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3115                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
3116                         freemsg(mp);
3117                         return;
3118                 }
3119                 ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
3120                 ira->ira_flags |= IRAF_IPSEC_DECAPS;
3121 
3122                 ip_input_post_ipsec(mp, ira);
3123                 return;
3124         }
3125 
3126         iptun:  /* IPPROTO_ENCAPS that is not self-encapsulated */
3127         case IPPROTO_IPV6:
3128                 /* iptun will verify trusted label */
3129                 connp = ipcl_classify_v4(mp, protocol, ip_hdr_length,
3130                     ira, ipst);
3131                 if (connp != NULL) {
3132                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
3133                         ira->ira_ill = ira->ira_rill = NULL;
3134                         (connp->conn_recv)(connp, mp, NULL, ira);
3135                         CONN_DEC_REF(connp);
3136                         ira->ira_ill = ill;
3137                         ira->ira_rill = rill;
3138                         return;
3139                 }
3140                 /* FALLTHRU */
3141         default:
3142                 /*
3143                  * On a labeled system, we have to check whether the zone
3144                  * itself is permitted to receive raw traffic.
3145                  */
3146                 if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
3147                         if (!tsol_can_accept_raw(mp, ira, B_FALSE)) {
3148                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3149                                 ip_drop_input("ipIfStatsInDiscards", mp, ill);
3150                                 freemsg(mp);
3151                                 return;
3152                         }
3153                 }
3154                 break;
3155         }
3156 
3157         /*
3158          * The above input functions may have returned the pulled up message.
3159          * So ipha needs to be reinitialized.
3160          */
3161         ipha = (ipha_t *)mp->b_rptr;
3162         ira->ira_protocol = protocol = ipha->ipha_protocol;
3163         if (ipst->ips_ipcl_proto_fanout_v4[protocol].connf_head == NULL) {
3164                 /*
3165                  * No user-level listener for these packets.
3166                  * Check for IPPROTO_ENCAP...
3167                  */
3168                 if (protocol == IPPROTO_ENCAP && ipst->ips_ip_g_mrouter) {
3169                         /*
3170                          * Check policy here,
3171                          * THEN ship off to ip_mroute_decap().
3172                          *
3173                          * BTW,  If I match a configured IP-in-IP
3174                          * tunnel above, this path will not be reached, and
3175                          * ip_mroute_decap will never be called.
3176                          */
3177                         mp = ipsec_check_global_policy(mp, connp,
3178                             ipha, NULL, ira, ns);
3179                         if (mp != NULL) {
3180                                 ip_mroute_decap(mp, ira);
3181                         } /* Else we already freed everything! */
3182                 } else {
3183                         ip_proto_not_sup(mp, ira);
3184                 }
3185                 return;
3186         }
3187 
3188         /*
3189          * Handle fanout to raw sockets.  There
3190          * can be more than one stream bound to a particular
3191          * protocol.  When this is the case, each one gets a copy
3192          * of any incoming packets.
3193          */
3194         ASSERT(ira->ira_protocol == ipha->ipha_protocol);
3195         ip_fanout_proto_v4(mp, ipha, ira);
3196         return;
3197 
3198 discard:
3199         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3200         ip_drop_input("ipIfStatsInDiscards", mp, ill);
3201         freemsg(mp);
3202 #undef rptr
3203 }