1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  24  *
  25  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  26  */
  27 /* Copyright (c) 1990 Mentat Inc. */
  28 
  29 #include <sys/types.h>
  30 #include <sys/stream.h>
  31 #include <sys/dlpi.h>
  32 #include <sys/stropts.h>
  33 #include <sys/sysmacros.h>
  34 #include <sys/strsubr.h>
  35 #include <sys/strlog.h>
  36 #include <sys/strsun.h>
  37 #include <sys/zone.h>
  38 #define _SUN_TPI_VERSION 2
  39 #include <sys/tihdr.h>
  40 #include <sys/xti_inet.h>
  41 #include <sys/ddi.h>
  42 #include <sys/sunddi.h>
  43 #include <sys/cmn_err.h>
  44 #include <sys/debug.h>
  45 #include <sys/kobj.h>
  46 #include <sys/modctl.h>
  47 #include <sys/atomic.h>
  48 #include <sys/policy.h>
  49 #include <sys/priv.h>
  50 
  51 #include <sys/systm.h>
  52 #include <sys/param.h>
  53 #include <sys/kmem.h>
  54 #include <sys/sdt.h>
  55 #include <sys/socket.h>
  56 #include <sys/vtrace.h>
  57 #include <sys/isa_defs.h>
  58 #include <sys/mac.h>
  59 #include <net/if.h>
  60 #include <net/if_arp.h>
  61 #include <net/route.h>
  62 #include <sys/sockio.h>
  63 #include <netinet/in.h>
  64 #include <net/if_dl.h>
  65 
  66 #include <inet/common.h>
  67 #include <inet/mi.h>
  68 #include <inet/mib2.h>
  69 #include <inet/nd.h>
  70 #include <inet/arp.h>
  71 #include <inet/snmpcom.h>
  72 #include <inet/kstatcom.h>
  73 
  74 #include <netinet/igmp_var.h>
  75 #include <netinet/ip6.h>
  76 #include <netinet/icmp6.h>
  77 #include <netinet/sctp.h>
  78 
  79 #include <inet/ip.h>
  80 #include <inet/ip_impl.h>
  81 #include <inet/ip6.h>
  82 #include <inet/ip6_asp.h>
  83 #include <inet/optcom.h>
  84 #include <inet/tcp.h>
  85 #include <inet/tcp_impl.h>
  86 #include <inet/ip_multi.h>
  87 #include <inet/ip_if.h>
  88 #include <inet/ip_ire.h>
  89 #include <inet/ip_ftable.h>
  90 #include <inet/ip_rts.h>
  91 #include <inet/ip_ndp.h>
  92 #include <inet/ip_listutils.h>
  93 #include <netinet/igmp.h>
  94 #include <netinet/ip_mroute.h>
  95 #include <inet/ipp_common.h>
  96 
  97 #include <net/pfkeyv2.h>
  98 #include <inet/sadb.h>
  99 #include <inet/ipsec_impl.h>
 100 #include <inet/ipdrop.h>
 101 #include <inet/ip_netinfo.h>
 102 #include <inet/ilb_ip.h>
 103 #include <sys/squeue_impl.h>
 104 #include <sys/squeue.h>
 105 
 106 #include <sys/ethernet.h>
 107 #include <net/if_types.h>
 108 #include <sys/cpuvar.h>
 109 
 110 #include <ipp/ipp.h>
 111 #include <ipp/ipp_impl.h>
 112 #include <ipp/ipgpc/ipgpc.h>
 113 
 114 #include <sys/pattr.h>
 115 #include <inet/ipclassifier.h>
 116 #include <inet/sctp_ip.h>
 117 #include <inet/sctp/sctp_impl.h>
 118 #include <inet/udp_impl.h>
 119 #include <sys/sunddi.h>
 120 
 121 #include <sys/tsol/label.h>
 122 #include <sys/tsol/tnet.h>
 123 
 124 #include <sys/clock_impl.h>       /* For LBOLT_FASTPATH{,64} */
 125 
/*
 * skip_sctp_cksum is defined elsewhere and is only declared in DEBUG builds.
 * NOTE(review): presumably a debug knob to bypass SCTP checksum
 * verification - confirm against its definition.
 */
#ifdef  DEBUG
extern boolean_t skip_sctp_cksum;
#endif

/*
 * Forward declarations of file-local (static) IPv4 receive helpers. All
 * three take the matched ire, the packet, its IPv4 header and the receive
 * attributes. Their definitions are not in the part of the file shown here.
 */
static void     ip_input_local_v4(ire_t *, mblk_t *, ipha_t *,
    ip_recv_attr_t *);

static void     ip_input_broadcast_v4(ire_t *, mblk_t *, ipha_t *,
    ip_recv_attr_t *);
static void     ip_input_multicast_v4(ire_t *, mblk_t *, ipha_t *,
    ip_recv_attr_t *);

/*
 * Compiler-specific request to inline these receive-path functions.
 * NOTE(review): ip_forward_xmit_v4 is not declared in the visible part of
 * this file - confirm it is still defined where the pragma can see it.
 */
#pragma inline(ip_input_common_v4, ip_input_local_v4, ip_forward_xmit_v4)
 139 
 140 /*
 141  * Direct read side procedure capable of dealing with chains. GLDv3 based
 142  * drivers call this function directly with mblk chains while STREAMS
 143  * read side procedure ip_rput() calls this for single packet with ip_ring
 144  * set to NULL to process one packet at a time.
 145  *
 146  * The ill will always be valid if this function is called directly from
 147  * the driver.
 148  *
 149  * If ip_input() is called from GLDv3:
 150  *
 151  *   - This must be a non-VLAN IP stream.
 152  *   - 'mp' is either an untagged or a special priority-tagged packet.
 153  *   - Any VLAN tag that was in the MAC header has been stripped.
 154  *
 155  * If the IP header in packet is not 32-bit aligned, every message in the
 156  * chain will be aligned before further operations. This is required on SPARC
 157  * platform.
 158  */
 159 void
 160 ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain,
 161     struct mac_header_info_s *mhip)
 162 {
 163         (void) ip_input_common_v4(ill, ip_ring, mp_chain, mhip, NULL, NULL,
 164             NULL);
 165 }
 166 
 167 /*
 168  * ip_accept_tcp() - This function is called by the squeue when it retrieves
 169  * a chain of packets in the poll mode. The packets have gone through the
 170  * data link processing but not IP processing. For performance and latency
 171  * reasons, the squeue wants to process the chain in line instead of feeding
 172  * it back via ip_input path.
 173  *
 174  * We set up the ip_recv_attr_t with IRAF_TARGET_SQP to that ip_fanout_v4
 175  * will pass back any TCP packets matching the target sqp to
 176  * ip_input_common_v4 using ira_target_sqp_mp. Other packets are handled by
 177  * ip_input_v4 and ip_fanout_v4 as normal.
 178  * The TCP packets that match the target squeue are returned to the caller
 179  * as a b_next chain after each packet has been prepend with an mblk
 180  * from ip_recv_attr_to_mblk.
 181  */
 182 mblk_t *
 183 ip_accept_tcp(ill_t *ill, ill_rx_ring_t *ip_ring, squeue_t *target_sqp,
 184     mblk_t *mp_chain, mblk_t **last, uint_t *cnt)
 185 {
 186         return (ip_input_common_v4(ill, ip_ring, mp_chain, NULL, target_sqp,
 187             last, cnt));
 188 }
 189 
 190 /*
 191  * Used by ip_input and ip_accept_tcp
 192  * The last three arguments are only used by ip_accept_tcp, and mhip is
 193  * only used by ip_input.
 194  */
 195 mblk_t *
 196 ip_input_common_v4(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain,
 197     struct mac_header_info_s *mhip, squeue_t *target_sqp,
 198     mblk_t **last, uint_t *cnt)
 199 {
 200         mblk_t          *mp;
 201         ipha_t          *ipha;
 202         ip_recv_attr_t  iras;   /* Receive attributes */
 203         rtc_t           rtc;
 204         iaflags_t       chain_flags = 0;        /* Fixed for chain */
 205         mblk_t          *ahead = NULL;  /* Accepted head */
 206         mblk_t          *atail = NULL;  /* Accepted tail */
 207         uint_t          acnt = 0;       /* Accepted count */
 208 
 209         ASSERT(mp_chain != NULL);
 210         ASSERT(ill != NULL);
 211 
 212         /* These ones do not change as we loop over packets */
 213         iras.ira_ill = iras.ira_rill = ill;
 214         iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
 215         iras.ira_rifindex = iras.ira_ruifindex;
 216         iras.ira_sqp = NULL;
 217         iras.ira_ring = ip_ring;
 218         /* For ECMP and outbound transmit ring selection */
 219         iras.ira_xmit_hint = ILL_RING_TO_XMIT_HINT(ip_ring);
 220 
 221         iras.ira_target_sqp = target_sqp;
 222         iras.ira_target_sqp_mp = NULL;
 223         if (target_sqp != NULL)
 224                 chain_flags |= IRAF_TARGET_SQP;
 225 
 226         /*
 227          * We try to have a mhip pointer when possible, but
 228          * it might be NULL in some cases. In those cases we
 229          * have to assume unicast.
 230          */
 231         iras.ira_mhip = mhip;
 232         iras.ira_flags = 0;
 233         if (mhip != NULL) {
 234                 switch (mhip->mhi_dsttype) {
 235                 case MAC_ADDRTYPE_MULTICAST :
 236                         chain_flags |= IRAF_L2DST_MULTICAST;
 237                         break;
 238                 case MAC_ADDRTYPE_BROADCAST :
 239                         chain_flags |= IRAF_L2DST_BROADCAST;
 240                         break;
 241                 }
 242         }
 243 
 244         /*
 245          * Initialize the one-element route cache.
 246          *
 247          * We do ire caching from one iteration to
 248          * another. In the event the packet chain contains
 249          * all packets from the same dst, this caching saves
 250          * an ire_route_recursive for each of the succeeding
 251          * packets in a packet chain.
 252          */
 253         rtc.rtc_ire = NULL;
 254         rtc.rtc_ipaddr = INADDR_ANY;
 255 
 256         /* Loop over b_next */
 257         for (mp = mp_chain; mp != NULL; mp = mp_chain) {
 258                 mp_chain = mp->b_next;
 259                 mp->b_next = NULL;
 260 
 261                 ASSERT(DB_TYPE(mp) == M_DATA);
 262 
 263 
 264                 /*
 265                  * if db_ref > 1 then copymsg and free original. Packet
 266                  * may be changed and we do not want the other entity
 267                  * who has a reference to this message to trip over the
 268                  * changes. This is a blind change because trying to
 269                  * catch all places that might change the packet is too
 270                  * difficult.
 271                  *
 272                  * This corresponds to the fast path case, where we have
 273                  * a chain of M_DATA mblks.  We check the db_ref count
 274                  * of only the 1st data block in the mblk chain. There
 275                  * doesn't seem to be a reason why a device driver would
 276                  * send up data with varying db_ref counts in the mblk
 277                  * chain. In any case the Fast path is a private
 278                  * interface, and our drivers don't do such a thing.
 279                  * Given the above assumption, there is no need to walk
 280                  * down the entire mblk chain (which could have a
 281                  * potential performance problem)
 282                  *
 283                  * The "(DB_REF(mp) > 1)" check was moved from ip_rput()
 284                  * to here because of exclusive ip stacks and vnics.
 285                  * Packets transmitted from exclusive stack over vnic
 286                  * can have db_ref > 1 and when it gets looped back to
 287                  * another vnic in a different zone, you have ip_input()
 288                  * getting dblks with db_ref > 1. So if someone
 289                  * complains of TCP performance under this scenario,
 290                  * take a serious look here on the impact of copymsg().
 291                  */
 292                 if (DB_REF(mp) > 1) {
 293                         if ((mp = ip_fix_dbref(mp, &iras)) == NULL) {
 294                                 /* mhip might point into 1st packet in chain */
 295                                 iras.ira_mhip = NULL;
 296                                 continue;
 297                         }
 298                 }
 299 
 300                 /*
 301                  * IP header ptr not aligned?
 302                  * OR IP header not complete in first mblk
 303                  */
 304                 ipha = (ipha_t *)mp->b_rptr;
 305                 if (!OK_32PTR(ipha) || MBLKL(mp) < IP_SIMPLE_HDR_LENGTH) {
 306                         mp = ip_check_and_align_header(mp, IP_SIMPLE_HDR_LENGTH,
 307                             &iras);
 308                         if (mp == NULL) {
 309                                 /* mhip might point into 1st packet in chain */
 310                                 iras.ira_mhip = NULL;
 311                                 continue;
 312                         }
 313                         ipha = (ipha_t *)mp->b_rptr;
 314                 }
 315 
 316                 /* Protect against a mix of Ethertypes and IP versions */
 317                 if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
 318                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
 319                         ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
 320                         freemsg(mp);
 321                         /* mhip might point into 1st packet in the chain. */
 322                         iras.ira_mhip = NULL;
 323                         continue;
 324                 }
 325 
 326                 /*
 327                  * Check for Martian addrs; we have to explicitly
 328                  * test for for zero dst since this is also used as
 329                  * an indication that the rtc is not used.
 330                  */
 331                 if (ipha->ipha_dst == INADDR_ANY) {
 332                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
 333                         ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
 334                         freemsg(mp);
 335                         /* mhip might point into 1st packet in the chain. */
 336                         iras.ira_mhip = NULL;
 337                         continue;
 338                 }
 339 
 340                 /*
 341                  * Keep L2SRC from a previous packet in chain since mhip
 342                  * might point into an earlier packet in the chain.
 343                  * Keep IRAF_VERIFIED_SRC to avoid redoing broadcast
 344                  * source check in forwarding path.
 345                  */
 346                 chain_flags |= (iras.ira_flags &
 347                     (IRAF_L2SRC_SET|IRAF_VERIFIED_SRC));
 348 
 349                 iras.ira_flags = IRAF_IS_IPV4 | IRAF_VERIFY_IP_CKSUM |
 350                     IRAF_VERIFY_ULP_CKSUM | chain_flags;
 351                 iras.ira_free_flags = 0;
 352                 iras.ira_cred = NULL;
 353                 iras.ira_cpid = NOPID;
 354                 iras.ira_tsl = NULL;
 355                 iras.ira_zoneid = ALL_ZONES;    /* Default for forwarding */
 356 
 357                 /*
 358                  * We must count all incoming packets, even if they end
 359                  * up being dropped later on. Defer counting bytes until
 360                  * we have the whole IP header in first mblk.
 361                  */
 362                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
 363 
 364                 iras.ira_pktlen = ntohs(ipha->ipha_length);
 365                 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets,
 366                     iras.ira_pktlen);
 367 
 368                 /*
 369                  * Call one of:
 370                  *      ill_input_full_v4
 371                  *      ill_input_short_v4
 372                  * The former is used in unusual cases. See ill_set_inputfn().
 373                  */
 374                 (*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc);
 375 
 376                 /* Any references to clean up? No hold on ira_ill */
 377                 if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED))
 378                         ira_cleanup(&iras, B_FALSE);
 379 
 380                 if (iras.ira_target_sqp_mp != NULL) {
 381                         /* Better be called from ip_accept_tcp */
 382                         ASSERT(target_sqp != NULL);
 383 
 384                         /* Found one packet to accept */
 385                         mp = iras.ira_target_sqp_mp;
 386                         iras.ira_target_sqp_mp = NULL;
 387                         ASSERT(ip_recv_attr_is_mblk(mp));
 388 
 389                         if (atail != NULL)
 390                                 atail->b_next = mp;
 391                         else
 392                                 ahead = mp;
 393                         atail = mp;
 394                         acnt++;
 395                         mp = NULL;
 396                 }
 397                 /* mhip might point into 1st packet in the chain. */
 398                 iras.ira_mhip = NULL;
 399         }
 400         /* Any remaining references to the route cache? */
 401         if (rtc.rtc_ire != NULL) {
 402                 ASSERT(rtc.rtc_ipaddr != INADDR_ANY);
 403                 ire_refrele(rtc.rtc_ire);
 404         }
 405 
 406         if (ahead != NULL) {
 407                 /* Better be called from ip_accept_tcp */
 408                 ASSERT(target_sqp != NULL);
 409                 *last = atail;
 410                 *cnt = acnt;
 411                 return (ahead);
 412         }
 413 
 414         return (NULL);
 415 }
 416 
 417 /*
 418  * This input function is used when
 419  *  - is_system_labeled()
 420  *  - CGTP filtering
 421  *  - DHCP unicast before we have an IP address configured
 422  *  - there is an listener for IPPROTO_RSVP
 423  */
 424 void
 425 ill_input_full_v4(mblk_t *mp, void *iph_arg, void *nexthop_arg,
 426     ip_recv_attr_t *ira, rtc_t *rtc)
 427 {
 428         ipha_t          *ipha = (ipha_t *)iph_arg;
 429         ipaddr_t        nexthop = *(ipaddr_t *)nexthop_arg;
 430         ill_t           *ill = ira->ira_ill;
 431         ip_stack_t      *ipst = ill->ill_ipst;
 432         int             cgtp_flt_pkt;
 433 
 434         ASSERT(ira->ira_tsl == NULL);
 435 
 436         /*
 437          * Attach any necessary label information to
 438          * this packet
 439          */
 440         if (is_system_labeled()) {
 441                 ira->ira_flags |= IRAF_SYSTEM_LABELED;
 442 
 443                 /*
 444                  * This updates ira_cred, ira_tsl and ira_free_flags based
 445                  * on the label.
 446                  */
 447                 if (!tsol_get_pkt_label(mp, IPV4_VERSION, ira)) {
 448                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 449                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
 450                         freemsg(mp);
 451                         return;
 452                 }
 453                 /* Note that ira_tsl can be NULL here. */
 454 
 455                 /* tsol_get_pkt_label sometimes does pullupmsg */
 456                 ipha = (ipha_t *)mp->b_rptr;
 457         }
 458 
 459         /*
 460          * Invoke the CGTP (multirouting) filtering module to process
 461          * the incoming packet. Packets identified as duplicates
 462          * must be discarded. Filtering is active only if the
 463          * the ip_cgtp_filter ndd variable is non-zero.
 464          */
 465         cgtp_flt_pkt = CGTP_IP_PKT_NOT_CGTP;
 466         if (ipst->ips_ip_cgtp_filter &&
 467             ipst->ips_ip_cgtp_filter_ops != NULL) {
 468                 netstackid_t stackid;
 469 
 470                 stackid = ipst->ips_netstack->netstack_stackid;
 471                 /*
 472                  * CGTP and IPMP are mutually exclusive so
 473                  * phyint_ifindex is fine here.
 474                  */
 475                 cgtp_flt_pkt =
 476                     ipst->ips_ip_cgtp_filter_ops->cfo_filter(stackid,
 477                     ill->ill_phyint->phyint_ifindex, mp);
 478                 if (cgtp_flt_pkt == CGTP_IP_PKT_DUPLICATE) {
 479                         ip_drop_input("CGTP_IP_PKT_DUPLICATE", mp, ill);
 480                         freemsg(mp);
 481                         return;
 482                 }
 483         }
 484 
 485         /*
 486          * Brutal hack for DHCPv4 unicast: RFC2131 allows a DHCP
 487          * server to unicast DHCP packets to a DHCP client using the
 488          * IP address it is offering to the client.  This can be
 489          * disabled through the "broadcast bit", but not all DHCP
 490          * servers honor that bit.  Therefore, to interoperate with as
 491          * many DHCP servers as possible, the DHCP client allows the
 492          * server to unicast, but we treat those packets as broadcast
 493          * here.  Note that we don't rewrite the packet itself since
 494          * (a) that would mess up the checksums and (b) the DHCP
 495          * client conn is bound to INADDR_ANY so ip_fanout_udp() will
 496          * hand it the packet regardless.
 497          */
 498         if (ill->ill_dhcpinit != 0 &&
 499             ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION &&
 500             ipha->ipha_protocol == IPPROTO_UDP) {
 501                 udpha_t *udpha;
 502 
 503                 ipha = ip_pullup(mp, sizeof (ipha_t) + sizeof (udpha_t), ira);
 504                 if (ipha == NULL) {
 505                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 506                         ip_drop_input("ipIfStatsInDiscards - dhcp", mp, ill);
 507                         freemsg(mp);
 508                         return;
 509                 }
 510                 /* Reload since pullupmsg() can change b_rptr. */
 511                 udpha = (udpha_t *)&ipha[1];
 512 
 513                 if (ntohs(udpha->uha_dst_port) == IPPORT_BOOTPC) {
 514                         DTRACE_PROBE2(ip4__dhcpinit__pkt, ill_t *, ill,
 515                             mblk_t *, mp);
 516                         /*
 517                          * This assumes that we deliver to all conns for
 518                          * multicast and broadcast packets.
 519                          */
 520                         nexthop = INADDR_BROADCAST;
 521                         ira->ira_flags |= IRAF_DHCP_UNICAST;
 522                 }
 523         }
 524 
 525         /*
 526          * If rsvpd is running, let RSVP daemon handle its processing
 527          * and forwarding of RSVP multicast/unicast packets.
 528          * If rsvpd is not running but mrouted is running, RSVP
 529          * multicast packets are forwarded as multicast traffic
 530          * and RSVP unicast packets are forwarded by unicast router.
 531          * If neither rsvpd nor mrouted is running, RSVP multicast
 532          * packets are not forwarded, but the unicast packets are
 533          * forwarded like unicast traffic.
 534          */
 535         if (ipha->ipha_protocol == IPPROTO_RSVP &&
 536             ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head != NULL) {
 537                 /* RSVP packet and rsvpd running. Treat as ours */
 538                 ip2dbg(("ip_input: RSVP for us: 0x%x\n", ntohl(nexthop)));
 539                 /*
 540                  * We use a multicast address to get the packet to
 541                  * ire_recv_multicast_v4. There will not be a membership
 542                  * check since we set IRAF_RSVP
 543                  */
 544                 nexthop = htonl(INADDR_UNSPEC_GROUP);
 545                 ira->ira_flags |= IRAF_RSVP;
 546         }
 547 
 548         ill_input_short_v4(mp, ipha, &nexthop, ira, rtc);
 549 }
 550 
 551 /*
 552  * This is the tail-end of the full receive side packet handling.
 553  * It can be used directly when the configuration is simple.
 554  */
 555 void
 556 ill_input_short_v4(mblk_t *mp, void *iph_arg, void *nexthop_arg,
 557     ip_recv_attr_t *ira, rtc_t *rtc)
 558 {
 559         ire_t           *ire;
 560         uint_t          opt_len;
 561         ill_t           *ill = ira->ira_ill;
 562         ip_stack_t      *ipst = ill->ill_ipst;
 563         uint_t          pkt_len;
 564         ssize_t         len;
 565         ipha_t          *ipha = (ipha_t *)iph_arg;
 566         ipaddr_t        nexthop = *(ipaddr_t *)nexthop_arg;
 567         ilb_stack_t     *ilbs = ipst->ips_netstack->netstack_ilb;
 568         uint_t          irr_flags;
 569 #define rptr    ((uchar_t *)ipha)
 570 
 571         ASSERT(DB_TYPE(mp) == M_DATA);
 572 
 573         /*
 574          * The following test for loopback is faster than
 575          * IP_LOOPBACK_ADDR(), because it avoids any bitwise
 576          * operations.
 577          * Note that these addresses are always in network byte order
 578          */
 579         if (((*(uchar_t *)&ipha->ipha_dst) == IN_LOOPBACKNET) ||
 580             ((*(uchar_t *)&ipha->ipha_src) == IN_LOOPBACKNET)) {
 581                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
 582                 ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
 583                 freemsg(mp);
 584                 return;
 585         }
 586 
 587         len = mp->b_wptr - rptr;
 588         pkt_len = ira->ira_pktlen;
 589 
 590         /* multiple mblk or too short */
 591         len -= pkt_len;
 592         if (len != 0) {
 593                 mp = ip_check_length(mp, rptr, len, pkt_len,
 594                     IP_SIMPLE_HDR_LENGTH, ira);
 595                 if (mp == NULL)
 596                         return;
 597                 ipha = (ipha_t *)mp->b_rptr;
 598         }
 599 
 600         DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
 601             ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL,
 602             int, 0);
 603 
 604         /*
 605          * The event for packets being received from a 'physical'
 606          * interface is placed after validation of the source and/or
 607          * destination address as being local so that packets can be
 608          * redirected to loopback addresses using ipnat.
 609          */
 610         DTRACE_PROBE4(ip4__physical__in__start,
 611             ill_t *, ill, ill_t *, NULL,
 612             ipha_t *, ipha, mblk_t *, mp);
 613 
 614         if (HOOKS4_INTERESTED_PHYSICAL_IN(ipst)) {
 615                 int     ll_multicast = 0;
 616                 int     error;
 617                 ipaddr_t orig_dst = ipha->ipha_dst;
 618 
 619                 if (ira->ira_flags & IRAF_L2DST_MULTICAST)
 620                         ll_multicast = HPE_MULTICAST;
 621                 else if (ira->ira_flags & IRAF_L2DST_BROADCAST)
 622                         ll_multicast = HPE_BROADCAST;
 623 
 624                 FW_HOOKS(ipst->ips_ip4_physical_in_event,
 625                     ipst->ips_ipv4firewall_physical_in,
 626                     ill, NULL, ipha, mp, mp, ll_multicast, ipst, error);
 627 
 628                 DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, mp);
 629 
 630                 if (mp == NULL)
 631                         return;
 632                 /* The length could have changed */
 633                 ipha = (ipha_t *)mp->b_rptr;
 634                 ira->ira_pktlen = ntohs(ipha->ipha_length);
 635                 pkt_len = ira->ira_pktlen;
 636 
 637                 /*
 638                  * In case the destination changed we override any previous
 639                  * change to nexthop.
 640                  */
 641                 if (orig_dst != ipha->ipha_dst)
 642                         nexthop = ipha->ipha_dst;
 643                 if (nexthop == INADDR_ANY) {
 644                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
 645                         ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
 646                         freemsg(mp);
 647                         return;
 648                 }
 649         }
 650 
 651         if (ipst->ips_ip4_observe.he_interested) {
 652                 zoneid_t dzone;
 653 
 654                 /*
 655                  * On the inbound path the src zone will be unknown as
 656                  * this packet has come from the wire.
 657                  */
 658                 dzone = ip_get_zoneid_v4(nexthop, mp, ira, ALL_ZONES);
 659                 ipobs_hook(mp, IPOBS_HOOK_INBOUND, ALL_ZONES, dzone, ill, ipst);
 660         }
 661 
 662         /*
 663          * If there is a good HW IP header checksum we clear the need
 664          * look at the IP header checksum.
 665          */
 666         if ((DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) &&
 667             ILL_HCKSUM_CAPABLE(ill) && dohwcksum) {
 668                 /* Header checksum was ok. Clear the flag */
 669                 DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
 670                 ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM;
 671         }
 672 
 673         /*
 674          * Here we check to see if we machine is setup as
 675          * L3 loadbalancer and if the incoming packet is for a VIP
 676          *
 677          * Check the following:
 678          * - there is at least a rule
 679          * - protocol of the packet is supported
 680          */
 681         if (ilb_has_rules(ilbs) && ILB_SUPP_L4(ipha->ipha_protocol)) {
 682                 ipaddr_t        lb_dst;
 683                 int             lb_ret;
 684 
 685                 /* For convenience, we pull up the mblk. */
 686                 if (mp->b_cont != NULL) {
 687                         if (pullupmsg(mp, -1) == 0) {
 688                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 689                                 ip_drop_input("ipIfStatsInDiscards - pullupmsg",
 690                                     mp, ill);
 691                                 freemsg(mp);
 692                                 return;
 693                         }
 694                         ipha = (ipha_t *)mp->b_rptr;
 695                 }
 696 
 697                 /*
 698                  * We just drop all fragments going to any VIP, at
 699                  * least for now....
 700                  */
 701                 if (ntohs(ipha->ipha_fragment_offset_and_flags) &
 702                     (IPH_MF | IPH_OFFSET)) {
 703                         if (!ilb_rule_match_vip_v4(ilbs, nexthop, NULL)) {
 704                                 goto after_ilb;
 705                         }
 706 
 707                         ILB_KSTAT_UPDATE(ilbs, ip_frag_in, 1);
 708                         ILB_KSTAT_UPDATE(ilbs, ip_frag_dropped, 1);
 709                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 710                         ip_drop_input("ILB fragment", mp, ill);
 711                         freemsg(mp);
 712                         return;
 713                 }
 714                 lb_ret = ilb_check_v4(ilbs, ill, mp, ipha, ipha->ipha_protocol,
 715                     (uint8_t *)ipha + IPH_HDR_LENGTH(ipha), &lb_dst);
 716 
 717                 if (lb_ret == ILB_DROPPED) {
 718                         /* Is this the right counter to increase? */
 719                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 720                         ip_drop_input("ILB_DROPPED", mp, ill);
 721                         freemsg(mp);
 722                         return;
 723                 }
 724                 if (lb_ret == ILB_BALANCED) {
 725                         /* Set the dst to that of the chosen server */
 726                         nexthop = lb_dst;
 727                         DB_CKSUMFLAGS(mp) = 0;
 728                 }
 729         }
 730 
 731 after_ilb:
 732         opt_len = ipha->ipha_version_and_hdr_length - IP_SIMPLE_HDR_VERSION;
 733         ira->ira_ip_hdr_length = IP_SIMPLE_HDR_LENGTH;
 734         if (opt_len != 0) {
 735                 int error = 0;
 736 
 737                 ira->ira_ip_hdr_length += (opt_len << 2);
 738                 ira->ira_flags |= IRAF_IPV4_OPTIONS;
 739 
 740                 /* IP Options present!  Validate the length. */
 741                 mp = ip_check_optlen(mp, ipha, opt_len, pkt_len, ira);
 742                 if (mp == NULL)
 743                         return;
 744 
 745                 /* Might have changed */
 746                 ipha = (ipha_t *)mp->b_rptr;
 747 
 748                 /* Verify IP header checksum before parsing the options */
 749                 if ((ira->ira_flags & IRAF_VERIFY_IP_CKSUM) &&
 750                     ip_csum_hdr(ipha)) {
 751                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
 752                         ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
 753                         freemsg(mp);
 754                         return;
 755                 }
 756                 ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM;
 757 
 758                 /*
 759                  * Go off to ip_input_options which returns the next hop
 760                  * destination address, which may have been affected
 761                  * by source routing.
 762                  */
 763                 IP_STAT(ipst, ip_opt);
 764 
 765                 nexthop = ip_input_options(ipha, nexthop, mp, ira, &error);
 766                 if (error != 0) {
 767                         /*
 768                          * An ICMP error has been sent and the packet has
 769                          * been dropped.
 770                          */
 771                         return;
 772                 }
 773         }
 774 
 775         if (ill->ill_flags & ILLF_ROUTER)
 776                 irr_flags = IRR_ALLOCATE;
 777         else
 778                 irr_flags = IRR_NONE;
 779 
 780         /* Can not use route cache with TX since the labels can differ */
 781         if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
 782                 if (CLASSD(nexthop)) {
 783                         ire = ire_multicast(ill);
 784                 } else {
 785                         /* Match destination and label */
 786                         ire = ire_route_recursive_v4(nexthop, 0, NULL,
 787                             ALL_ZONES, ira->ira_tsl, MATCH_IRE_SECATTR,
 788                             irr_flags, ira->ira_xmit_hint, ipst, NULL, NULL,
 789                             NULL);
 790                 }
 791                 /* Update the route cache so we do the ire_refrele */
 792                 ASSERT(ire != NULL);
 793                 if (rtc->rtc_ire != NULL)
 794                         ire_refrele(rtc->rtc_ire);
 795                 rtc->rtc_ire = ire;
 796                 rtc->rtc_ipaddr = nexthop;
 797         } else if (nexthop == rtc->rtc_ipaddr && rtc->rtc_ire != NULL) {
 798                 /* Use the route cache */
 799                 ire = rtc->rtc_ire;
 800         } else {
 801                 /* Update the route cache */
 802                 if (CLASSD(nexthop)) {
 803                         ire = ire_multicast(ill);
 804                 } else {
 805                         /* Just match the destination */
 806                         ire = ire_route_recursive_dstonly_v4(nexthop, irr_flags,
 807                             ira->ira_xmit_hint, ipst);
 808                 }
 809                 ASSERT(ire != NULL);
 810                 if (rtc->rtc_ire != NULL)
 811                         ire_refrele(rtc->rtc_ire);
 812                 rtc->rtc_ire = ire;
 813                 rtc->rtc_ipaddr = nexthop;
 814         }
 815 
 816         ire->ire_ib_pkt_count++;
 817 
 818         /*
 819          * Based on ire_type and ire_flags call one of:
 820          *      ire_recv_local_v4 - for IRE_LOCAL
 821          *      ire_recv_loopback_v4 - for IRE_LOOPBACK
 822          *      ire_recv_multirt_v4 - if RTF_MULTIRT
 823          *      ire_recv_noroute_v4 - if RTF_REJECT or RTF_BLACKHOLE
 824          *      ire_recv_multicast_v4 - for IRE_MULTICAST
 825          *      ire_recv_broadcast_v4 - for IRE_BROADCAST
 826          *      ire_recv_noaccept_v4 - for ire_noaccept ones
 827          *      ire_recv_forward_v4 - for the rest.
 828          */
 829         (*ire->ire_recvfn)(ire, mp, ipha, ira);
 830 }
 831 #undef rptr
 832 
 833 /*
 834  * ire_recvfn for IREs that need forwarding
      *
      * ire     - the IRE chosen for the packet. Its ire_nce_cache supplies the
      *           nce, and through nce_ill the outbound interface (dst_ill).
      * mp      - the packet being forwarded.
      * iph_arg - the packet's IPv4 header, used as an ipha_t *.
      * ira     - receive attributes; ira_ill is the ill that input-side
      *           counters and drops are charged to. ira_flags, ira_pktlen,
      *           ira_ip_hdr_length and ira_verified_src can be updated here.
      *
      * Returns nothing. A packet that is not handed to ip_forward_xmit_v4() is
      * either freed with freemsg(), passed to icmp_unreachable(), re-dispatched
      * through another IRE's ire_recvfn, or was already taken by ip_process()
      * or the firewall hooks (mp comes back NULL).
      * Once the nce reference has been taken, every exit path releases it.
 835  */
 836 void
 837 ire_recv_forward_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
 838 {
 839         ipha_t          *ipha = (ipha_t *)iph_arg;
 840         ill_t           *ill = ira->ira_ill;
 841         ip_stack_t      *ipst = ill->ill_ipst;
 842         ill_t           *dst_ill;
 843         nce_t           *nce;
 844         ipaddr_t        src = ipha->ipha_src;
 845         uint32_t        added_tx_len;
 846         uint32_t        mtu, iremtu;
 847 
             /* Packets flagged as link-layer multicast/broadcast are not forwarded */
 848         if (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) {
 849                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
 850                 ip_drop_input("l2 multicast not forwarded", mp, ill);
 851                 freemsg(mp);
 852                 return;
 853         }
 854 
             /*
              * The inbound ill is not marked ILLF_ROUTER: only let the packet
              * continue if ip_source_routed() reports it as source routed.
              */
 855         if (!(ill->ill_flags & ILLF_ROUTER) && !ip_source_routed(ipha, ipst)) {
 856                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
 857                 ip_drop_input("ipIfStatsForwProhibits", mp, ill);
 858                 freemsg(mp);
 859                 return;
 860         }
 861 
 862         /*
 863          * Either ire_nce_capable or ire_dep_parent would be set for the IRE
 864          * when it is found by ire_route_recursive, but some other thread
 865          * could have changed the routes with the effect of clearing
 866          * ire_dep_parent. In that case we'd end up dropping the packet, or
 867          * finding a new nce below.
 868          * Get, allocate, or update the nce.
 869          * We get a refhold on ire_nce_cache as a result of this to avoid races
 870          * where ire_nce_cache is deleted.
 871          *
 872          * This ensures that we don't forward if the interface is down since
 873          * ipif_down removes all the nces.
 874          */
 875         mutex_enter(&ire->ire_lock);
 876         nce = ire->ire_nce_cache;
 877         if (nce == NULL) {
 878                 /* Not yet set up - try to set one up */
                     /* ire_lock is dropped around the call and retaken to re-read */
 879                 mutex_exit(&ire->ire_lock);
 880                 (void) ire_revalidate_nce(ire);
 881                 mutex_enter(&ire->ire_lock);
 882                 nce = ire->ire_nce_cache;
 883                 if (nce == NULL) {
 884                         mutex_exit(&ire->ire_lock);
 885                         /* The ire_dep_parent chain went bad, or no memory */
 886                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 887                         ip_drop_input("No ire_dep_parent", mp, ill);
 888                         freemsg(mp);
 889                         return;
 890                 }
 891         }
             /* Take our own hold while ire_lock still protects ire_nce_cache */
 892         nce_refhold(nce);
 893         mutex_exit(&ire->ire_lock);
 894 
             /*
              * The cached nce is condemned. Ask ire_handle_condemned_nce() for a
              * replacement and drop our hold on the old one. A non-NULL result is
              * treated as held from here on: it is released on every exit path.
              */
 895         if (nce->nce_is_condemned) {
 896                 nce_t *nce1;
 897 
 898                 nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_FALSE);
 899                 nce_refrele(nce);
 900                 if (nce1 == NULL) {
 901                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 902                         ip_drop_input("No nce", mp, ill);
 903                         freemsg(mp);
 904                         return;
 905                 }
 906                 nce = nce1;
 907         }
             /* The outbound interface is whatever ill the nce belongs to */
 908         dst_ill = nce->nce_ill;
 909 
 910         /*
 911          * Unless we are forwarding, drop the packet.
 912          * We have to let source routed packets through if they go out
 913          * the same interface i.e., they are 'ping -l' packets.
 914          */
 915         if (!(dst_ill->ill_flags & ILLF_ROUTER) &&
 916             !(ip_source_routed(ipha, ipst) && dst_ill == ill)) {
                     /* Source routed but leaving via a different, non-router ill */
 917                 if (ip_source_routed(ipha, ipst)) {
 918                         ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ill);
 919                         icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira);
 920                         nce_refrele(nce);
 921                         return;
 922                 }
 923                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
 924                 ip_drop_input("ipIfStatsForwProhibits", mp, ill);
 925                 freemsg(mp);
 926                 nce_refrele(nce);
 927                 return;
 928         }
 929 
 930         if (ire->ire_zoneid != GLOBAL_ZONEID && ire->ire_zoneid != ALL_ZONES) {
 931                 ipaddr_t        dst = ipha->ipha_dst;
 932 
                     /*
                      * Take this packet back out of the zone-specific IRE's
                      * count (presumably added by the caller before dispatch --
                      * confirm); the IRE looked up below is counted instead.
                      */
 933                 ire->ire_ib_pkt_count--;
 934                 /*
 935                  * Should only use IREs that are visible from the
 936                  * global zone for forwarding.
 937                  * Take a source route into account the same way as ip_input
 938                  * did.
 939                  */
 940                 if (ira->ira_flags & IRAF_IPV4_OPTIONS) {
 941                         int             error = 0;
 942 
 943                         dst = ip_input_options(ipha, dst, mp, ira, &error);
 944                         ASSERT(error == 0);     /* ip_input checked */
 945                 }
 946                 ire = ire_route_recursive_v4(dst, 0, NULL, GLOBAL_ZONEID,
 947                     ira->ira_tsl, MATCH_IRE_SECATTR,
 948                     (ill->ill_flags & ILLF_ROUTER) ? IRR_ALLOCATE : IRR_NONE,
 949                     ira->ira_xmit_hint, ipst, NULL, NULL, NULL);
                     /*
                      * Hand the packet to the new IRE's receive function, then
                      * release the lookup's IRE reference and our nce hold.
                      */
 950                 ire->ire_ib_pkt_count++;
 951                 (*ire->ire_recvfn)(ire, mp, ipha, ira);
 952                 ire_refrele(ire);
 953                 nce_refrele(nce);
 954                 return;
 955         }
 956 
 957         /*
 958          * ipIfStatsHCInForwDatagrams should only be incremented if there
 959          * will be an attempt to forward the packet, which is why we
 960          * increment after the above condition has been checked.
 961          */
 962         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams);
 963 
 964         /* Initiate Read side IPPF processing */
 965         if (IPP_ENABLED(IPP_FWD_IN, ipst)) {
 966                 /* ip_process translates an IS_UNDER_IPMP */
 967                 mp = ip_process(IPP_FWD_IN, mp, ill, ill);
 968                 if (mp == NULL) {
 969                         /* ip_drop_packet and MIB done */
 970                         ip2dbg(("ire_recv_forward_v4: pkt dropped/deferred "
 971                             "during IPPF processing\n"));
 972                         nce_refrele(nce);
 973                         return;
 974                 }
 975         }
 976 
 977         DTRACE_PROBE4(ip4__forwarding__start,
 978             ill_t *, ill, ill_t *, dst_ill, ipha_t *, ipha, mblk_t *, mp);
 979 
 980         if (HOOKS4_INTERESTED_FORWARDING(ipst)) {
 981                 int error;
 982 
 983                 FW_HOOKS(ipst->ips_ip4_forwarding_event,
 984                     ipst->ips_ipv4firewall_forwarding,
 985                     ill, dst_ill, ipha, mp, mp, 0, ipst, error);
 986 
 987                 DTRACE_PROBE1(ip4__forwarding__end, mblk_t *, mp);
 988 
                     /* The hooks left no packet to forward */
 989                 if (mp == NULL) {
 990                         nce_refrele(nce);
 991                         return;
 992                 }
 993                 /*
 994                  * Even if the destination was changed by the filter we use the
 995                  * forwarding decision that was made based on the address
 996                  * in ip_input.
 997                  */
 998 
 999                 /* Might have changed */
1000                 ipha = (ipha_t *)mp->b_rptr;
1001                 ira->ira_pktlen = ntohs(ipha->ipha_length);
1002         }
1003 
1004         /* Packet is being forwarded. Turning off hwcksum flag. */
1005         DB_CKSUMFLAGS(mp) = 0;
1006 
1007         /*
1008          * Martian Address Filtering [RFC 1812, Section 5.3.7]
1009          * The loopback address check for both src and dst has already
1010          * been checked in ip_input
1011          * In the future one can envision adding RPF checks using number 3.
1012          * If we already checked the same source address we can skip this.
1013          */
             /*
              * Note: src was read from the header at function entry, before the
              * hooks above could replace the packet.
              */
1014         if (!(ira->ira_flags & IRAF_VERIFIED_SRC) ||
1015             src != ira->ira_verified_src) {
1016                 switch (ipst->ips_src_check) {
1017                 case 0:
1018                         break;
1019                 case 2:
                             /* Level 2 adds a broadcast-source check to level 1 */
1020                         if (ip_type_v4(src, ipst) == IRE_BROADCAST) {
1021                                 BUMP_MIB(ill->ill_ip_mib,
1022                                     ipIfStatsForwProhibits);
1023                                 BUMP_MIB(ill->ill_ip_mib,
1024                                     ipIfStatsInAddrErrors);
1025                                 ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
1026                                 freemsg(mp);
1027                                 nce_refrele(nce);
1028                                 return;
1029                         }
1030                         /* FALLTHRU */
1031 
1032                 case 1:
                             /* A multicast (class D) source address is rejected */
1033                         if (CLASSD(src)) {
1034                                 BUMP_MIB(ill->ill_ip_mib,
1035                                     ipIfStatsForwProhibits);
1036                                 BUMP_MIB(ill->ill_ip_mib,
1037                                     ipIfStatsInAddrErrors);
1038                                 ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
1039                                 freemsg(mp);
1040                                 nce_refrele(nce);
1041                                 return;
1042                         }
1043                         break;
1044                 }
1045                 /* Remember for next packet */
1046                 ira->ira_flags |= IRAF_VERIFIED_SRC;
1047                 ira->ira_verified_src = src;
1048         }
1049 
1050         /*
1051          * Check if packet is going out the same link on which it arrived.
1052          * Means we might need to send a redirect.
1053          */
1054         if (IS_ON_SAME_LAN(dst_ill, ill) && ipst->ips_ip_g_send_redirects) {
1055                 ip_send_potential_redirect_v4(mp, ipha, ire, ira);
1056         }
1057 
             /* Stays 0 unless the labeled path below grows the packet */
1058         added_tx_len = 0;
1059         if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
1060                 mblk_t          *mp1;
1061                 uint32_t        old_pkt_len = ira->ira_pktlen;
1062 
1063                 /* Verify IP header checksum before adding/removing options */
1064                 if ((ira->ira_flags & IRAF_VERIFY_IP_CKSUM) &&
1065                     ip_csum_hdr(ipha)) {
1066                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
1067                         ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
1068                         freemsg(mp);
1069                         nce_refrele(nce);
1070                         return;
1071                 }
                     /* Verified (or not required): don't check it again later */
1072                 ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM;
1073 
1074                 /*
1075                  * Check if it can be forwarded and add/remove
1076                  * CIPSO options as needed.
1077                  */
1078                 if ((mp1 = tsol_ip_forward(ire, mp, ira)) == NULL) {
1079                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
1080                         ip_drop_input("tsol_ip_forward", mp, ill);
1081                         freemsg(mp);
1082                         nce_refrele(nce);
1083                         return;
1084                 }
1085                 /*
1086                  * Size may have changed. Remember amount added in case
1087                  * IP needs to send an ICMP too big.
1088                  */
1089                 mp = mp1;
1090                 ipha = (ipha_t *)mp->b_rptr;
1091                 ira->ira_pktlen = ntohs(ipha->ipha_length);
1092                 ira->ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
1093                 if (ira->ira_pktlen > old_pkt_len)
1094                         added_tx_len = ira->ira_pktlen - old_pkt_len;
1095 
1096                 /* Options can have been added or removed */
1097                 if (ira->ira_ip_hdr_length != IP_SIMPLE_HDR_LENGTH)
1098                         ira->ira_flags |= IRAF_IPV4_OPTIONS;
1099                 else
1100                         ira->ira_flags &= ~IRAF_IPV4_OPTIONS;
1101         }
1102 
             /*
              * Effective MTU: the outbound ill's MTU, lowered to the IRE's
              * metric MTU when that is set (non-zero) and smaller.
              */
1103         mtu = dst_ill->ill_mtu;
1104         if ((iremtu = ire->ire_metrics.iulp_mtu) != 0 && iremtu < mtu)
1105                 mtu = iremtu;
1106         ip_forward_xmit_v4(nce, ill, mp, ipha, ira, mtu, added_tx_len);
1107         nce_refrele(nce);
1108 }
1109 
1110 /*
1111  * Used for sending out unicast and multicast packets that are
1112  * forwarded.
      *
      * nce          - neighbor entry to transmit through; nce->nce_ill is the
      *                outbound ill. This function never releases the nce; the
      *                caller keeps (and later drops) its own reference.
      * ill          - the ill charged for input-side counters and drops.
      * mp, ipha     - the packet and its IPv4 header.
      * ira          - receive attributes (ira_flags, ira_pktlen, ira_xmit_hint).
      * mtu          - limit for the outgoing packet. A larger packet is
      *                fragmented, or answered with icmp_frag_needed() when DF
      *                is set.
      * added_tx_len - passed to tsol_pmtu_adjust() for labeled packets when a
      *                DF packet does not fit.
      *
      * The ttl is decremented here and the header checksum adjusted to match.
      * mp is not handed back: on every path it is freed, passed to an ICMP
      * error routine, or passed on to the fragment/transmit routines.
1113  */
1114 void
1115 ip_forward_xmit_v4(nce_t *nce, ill_t *ill, mblk_t *mp, ipha_t *ipha,
1116     ip_recv_attr_t *ira, uint32_t mtu, uint32_t added_tx_len)
1117 {
1118         ill_t           *dst_ill = nce->nce_ill;
1119         uint32_t        pkt_len;
1120         uint32_t        sum;
1121         iaflags_t       iraflags = ira->ira_flags;
1122         ip_stack_t      *ipst = ill->ill_ipst;
1123         iaflags_t       ixaflags;
1124 
             /*
              * A ttl of 0 or 1 cannot survive the decrement below, so the
              * packet is not forwarded.
              */
1125         if (ipha->ipha_ttl <= 1) {
1126                 /* Perhaps the checksum was bad */
1127                 if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) {
1128                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
1129                         ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
1130                         freemsg(mp);
1131                         return;
1132                 }
1133                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1134                 ip_drop_input("ICMP_TTL_EXCEEDED", mp, ill);
1135                 icmp_time_exceeded(mp, ICMP_TTL_EXCEEDED, ira);
1136                 return;
1137         }
1138         ipha->ipha_ttl--;
1139         /* Adjust the checksum to reflect the ttl decrement. */
1140         sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST;
             /* Fold the carry out of the low 16 bits back in */
1141         ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16));
1142 
1143         /* Check if there are options to update */
1144         if (iraflags & IRAF_IPV4_OPTIONS) {
1145                 ASSERT(ipha->ipha_version_and_hdr_length !=
1146                     IP_SIMPLE_HDR_VERSION);
1147                 ASSERT(!(iraflags & IRAF_VERIFY_IP_CKSUM));
1148 
1149                 if (!ip_forward_options(mp, ipha, dst_ill, ira)) {
1150                         /* ipIfStatsForwProhibits and ip_drop_input done */
1151                         return;
1152                 }
1153 
                     /* Recompute the header checksum from scratch after the update */
1154                 ipha->ipha_hdr_checksum = 0;
1155                 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1156         }
1157 
1158         /* Initiate Write side IPPF processing before any fragmentation */
1159         if (IPP_ENABLED(IPP_FWD_OUT, ipst)) {
1160                 /* ip_process translates an IS_UNDER_IPMP */
1161                 mp = ip_process(IPP_FWD_OUT, mp, dst_ill, dst_ill);
1162                 if (mp == NULL) {
1163                         /* ip_drop_packet and MIB done */
1164                         ip2dbg(("ip_forward_xmit_v4: pkt dropped/deferred"
1165                             " during IPPF processing\n"));
1166                         return;
1167                 }
1168         }
1169 
1170         pkt_len = ira->ira_pktlen;
1171 
1172         BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams);
1173 
1174         ixaflags = IXAF_IS_IPV4 | IXAF_NO_DEV_FLOW_CTL;
1175 
1176         if (pkt_len > mtu) {
1177                 /*
1178                  * It needs fragging on its way out.  If we haven't
1179                  * verified the header checksum yet we do it now since
1180                  * we are going to put a surely good checksum in the
1181                  * outgoing header, we have to make sure that it
1182                  * was good coming in.
1183                  */
1184                 if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) {
1185                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
1186                         ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
1187                         freemsg(mp);
1188                         return;
1189                 }
                     /* DF is set: report the MTU back instead of fragmenting */
1190                 if (ipha->ipha_fragment_offset_and_flags & IPH_DF_HTONS) {
1191                         BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutFragFails);
1192                         ip_drop_output("ipIfStatsOutFragFails", mp, dst_ill);
1193                         if (iraflags & IRAF_SYSTEM_LABELED) {
1194                                 /*
1195                                  * Remove any CIPSO option added by
1196                                  * tsol_ip_forward, and make sure we report
1197                                  * a path MTU so that there
1198                                  * is room to add such a CIPSO option for future
1199                                  * packets.
1200                                  */
1201                                 mtu = tsol_pmtu_adjust(mp, mtu, added_tx_len,
1202                                     AF_INET);
1203                         }
1204 
1205                         icmp_frag_needed(mp, mtu, ira);
1206                         return;
1207                 }
1208 
                     /*
                      * Fragment to mtu; ip_xmit is passed along, presumably as
                      * the routine that sends each fragment (confirm against
                      * ip_fragment_v4).
                      */
1209                 (void) ip_fragment_v4(mp, nce, ixaflags, pkt_len, mtu,
1210                     ira->ira_xmit_hint, GLOBAL_ZONEID, 0, ip_xmit, NULL);
1211                 return;
1212         }
1213 
1214         ASSERT(pkt_len == ntohs(((ipha_t *)mp->b_rptr)->ipha_length));
1215         if (iraflags & IRAF_LOOPBACK_COPY) {
1216                 /*
1217                  * IXAF_NO_LOOP_ZONEID is not set hence 7th arg
1218                  * is don't care
1219                  */
1220                 (void) ip_postfrag_loopcheck(mp, nce,
1221                     ixaflags | IXAF_LOOPBACK_COPY,
1222                     pkt_len, ira->ira_xmit_hint, GLOBAL_ZONEID, 0, NULL);
1223         } else {
1224                 (void) ip_xmit(mp, nce, ixaflags, pkt_len, ira->ira_xmit_hint,
1225                     GLOBAL_ZONEID, 0, NULL);
1226         }
1227 }
1228 
1229 /*
1230  * Receive routine attached to RTF_REJECT and RTF_BLACKHOLE routes. That set
1231  * includes IRE_NOROUTE, the entry ire_route_recursive hands back when no
1232  * route matches. Blackhole routes discard the packet quietly; any other
1233  * route answers with an ICMP unreachable.
1234  */
1235 void
1236 ire_recv_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
1237 {
1238         ill_t           *in_ill = ira->ira_ill;
1239         ip_stack_t      *stack = in_ill->ill_ipst;
1240         ipha_t          *iph = (ipha_t *)iph_arg;
1241 
1242         /* Link-layer multicast/broadcast is never a forwarding candidate. */
1243         if ((ira->ira_flags &
1244             (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) != 0) {
1245                 BUMP_MIB(in_ill->ill_ip_mib, ipIfStatsForwProhibits);
1246                 ip_drop_input("l2 multicast not forwarded", mp, in_ill);
1247                 freemsg(mp);
1248                 return;
1249         }
1250 
1251         /* Nor is anything that arrived on a non-router interface. */
1252         if ((in_ill->ill_flags & ILLF_ROUTER) == 0) {
1253                 BUMP_MIB(in_ill->ill_ip_mib, ipIfStatsForwProhibits);
1254                 ip_drop_input("ipIfStatsForwProhibits", mp, in_ill);
1255                 freemsg(mp);
1256                 return;
1257         }
1258 
1259         /*
1260          * From here on a route is the only thing missing, so the packet is
1261          * counted as a forwarding attempt as well as a no-route event, and
1262          * an RTM_MISS for the destination goes out through ip_rts_change.
1263          */
1264         BUMP_MIB(in_ill->ill_ip_mib, ipIfStatsHCInForwDatagrams);
1265         BUMP_MIB(in_ill->ill_ip_mib, ipIfStatsInNoRoutes);
1266         ip_rts_change(RTM_MISS, iph->ipha_dst, 0, 0, 0, 0, 0, 0, RTA_DST,
1267             stack);
1268 
1269         if ((ire->ire_flags & RTF_BLACKHOLE) != 0) {
1270                 ip_drop_input("ipIfStatsInNoRoutes RTF_BLACKHOLE", mp,
1271                     in_ill);
1272                 freemsg(mp);
1273                 return;
1274         }
1275         ip_drop_input("ipIfStatsInNoRoutes RTF_REJECT", mp, in_ill);
1276         if (!ip_source_routed(iph, stack)) {
1277                 icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, ira);
1278                 return;
1279         }
1280         icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira);
1281 }
1282 
1283 /*
1284  * Receive routine for IRE_LOCAL entries flagged with ire_noaccept, which
1285  * VRRP uses in noaccept mode. The packet is discarded without a reply;
1286  * ARP still handles packets when noaccept is set.
1287  */
1288 /* ARGSUSED */
1289 void
1290 ire_recv_noaccept_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1291     ip_recv_attr_t *ira)
1292 {
1293         /* Charge the discard to ira_ill, then log and free the packet. */
1294         BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
1295 
1296         ip_drop_input("ipIfStatsInDiscards - noaccept", mp, ira->ira_ill);
1297         freemsg(mp);
1298 }
1299 
1300 /*
1301  * ire_recvfn for IRE_BROADCAST.
      *
      * ire     - the matching IRE_BROADCAST; ire->ire_ill is compared with the
      *           arrival ill to tell local from directed broadcasts.
      * mp      - the packet.
      * iph_arg - the packet's IPv4 header, used as an ipha_t *.
      * ira     - receive attributes. IRAF_BROADCAST is set in ira_flags and
      *           ira_zoneid is set to ALL_ZONES. ira_ruifindex (and, on the
      *           multirt path, ira_ill) are changed temporarily and reset
      *           before returning.
      *
      * TCP packets are always dropped. Other packets are delivered locally
      * through ip_input_local_v4() or, for a directed broadcast that is
      * allowed, sent out ire->ire_ill through ip_forward_xmit_v4() with
      * IRAF_LOOPBACK_COPY set.
1302  */
1303 void
1304 ire_recv_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1305     ip_recv_attr_t *ira)
1306 {
1307         ipha_t          *ipha = (ipha_t *)iph_arg;
1308         ill_t           *ill = ira->ira_ill;
1309         ill_t           *dst_ill = ire->ire_ill;
1310         ip_stack_t      *ipst = ill->ill_ipst;
1311         ire_t           *alt_ire;
1312         nce_t           *nce;
1313         ipaddr_t        ipha_dst;
1314 
1315         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInBcastPkts);
1316 
1317         /* Tag for higher-level protocols */
1318         ira->ira_flags |= IRAF_BROADCAST;
1319 
1320         /*
1321          * Whether local or directed broadcast forwarding: don't allow
1322          * for TCP.
1323          */
1324         if (ipha->ipha_protocol == IPPROTO_TCP) {
1325                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1326                 ip_drop_input("ipIfStatsInDiscards", mp, ill);
1327                 freemsg(mp);
1328                 return;
1329         }
1330 
1331         /*
1332          * So that we don't end up with dups, only one ill in an IPMP group is
1333          * nominated to receive broadcast traffic.
1334          * If we have no cast_ill we are liberal and accept everything.
1335          */
1336         if (IS_UNDER_IPMP(ill)) {
1337                 /* For an under ill_grp can change under lock */
1338                 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1339                 if (!ill->ill_nom_cast && ill->ill_grp != NULL &&
1340                     ill->ill_grp->ig_cast_ill != NULL) {
1341                         rw_exit(&ipst->ips_ill_g_lock);
1342                         /* No MIB since this is normal operation */
1343                         ip_drop_input("not nom_cast", mp, ill);
1344                         freemsg(mp);
1345                         return;
1346                 }
1347                 rw_exit(&ipst->ips_ill_g_lock);
1348 
                     /* Temporary; reset to the ill's own ifindex before returning */
1349                 ira->ira_ruifindex = ill_get_upper_ifindex(ill);
1350         }
1351 
1352         /*
1353          * After reassembly and IPsec we will need to duplicate the
1354          * broadcast packet for all matching zones on the ill.
1355          */
1356         ira->ira_zoneid = ALL_ZONES;
1357 
1358         /*
1359          * Check for directed broadcast i.e. ire->ire_ill is different than
1360          * the incoming ill.
1361          * The same broadcast address can be assigned to multiple interfaces
1362          * so have to check explicitly for that case by looking up the alt_ire
1363          */
1364         if (dst_ill == ill && !(ire->ire_flags & RTF_MULTIRT)) {
1365                 /* Reassemble on the ill on which the packet arrived */
1366                 ip_input_local_v4(ire, mp, ipha, ira);
1367                 /* Restore */
1368                 ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
1369                 return;
1370         }
1371 
1372         /* Is there an IRE_BROADCAST on the incoming ill? */
             /*
              * For packets flagged IRAF_DHCP_UNICAST the lookup uses
              * INADDR_BROADCAST rather than the header's destination address.
              */
1373         ipha_dst = ((ira->ira_flags & IRAF_DHCP_UNICAST) ? INADDR_BROADCAST :
1374             ipha->ipha_dst);
1375         alt_ire = ire_ftable_lookup_v4(ipha_dst, 0, 0, IRE_BROADCAST, ill,
1376             ALL_ZONES, ira->ira_tsl,
1377             MATCH_IRE_TYPE|MATCH_IRE_ILL|MATCH_IRE_SECATTR, 0, ipst, NULL);
1378         if (alt_ire != NULL) {
1379                 /* Not a directed broadcast */
1380                 /*
1381                  * In the special case of multirouted broadcast
1382                  * packets, we unconditionally need to "gateway"
1383                  * them to the appropriate interface here so that reassembly
1384                  * works. We know that the IRE_BROADCAST on cgtp0 doesn't
1385                  * have RTF_MULTIRT set so we look for such an IRE in the
1386                  * bucket.
1387                  */
1388                 if (alt_ire->ire_flags & RTF_MULTIRT) {
1389                         irb_t           *irb;
1390                         ire_t           *ire1;
1391 
1392                         irb = ire->ire_bucket;
1393                         irb_refhold(irb);
1394                         for (ire1 = irb->irb_ire; ire1 != NULL;
1395                             ire1 = ire1->ire_next) {
1396                                 if (IRE_IS_CONDEMNED(ire1))
1397                                         continue;
1398                                 if (!(ire1->ire_type & IRE_BROADCAST) ||
1399                                     (ire1->ire_flags & RTF_MULTIRT))
1400                                         continue;
                                     /*
                                      * Found one. The local 'ill' is repointed
                                      * (and held) only here, and is used only on
                                      * the ire1 != NULL path below, which returns.
                                      */
1401                                 ill = ire1->ire_ill;
1402                                 ill_refhold(ill);
1403                                 break;
1404                         }
1405                         irb_refrele(irb);
1406                         if (ire1 != NULL) {
1407                                 ill_t *orig_ill = ira->ira_ill;
1408 
1409                                 ire_refrele(alt_ire);
1410                                 /* Reassemble on the new ill */
1411                                 ira->ira_ill = ill;
1412                                 ip_input_local_v4(ire, mp, ipha, ira);
1413                                 ill_refrele(ill);
1414                                 /* Restore */
1415                                 ira->ira_ill = orig_ill;
1416                                 ira->ira_ruifindex =
1417                                     orig_ill->ill_phyint->phyint_ifindex;
1418                                 return;
1419                         }
1420                 }
1421                 ire_refrele(alt_ire);
1422                 /* Reassemble on the ill on which the packet arrived */
1423                 ip_input_local_v4(ire, mp, ipha, ira);
1424                 goto done;
1425         }
1426 
1427         /*
1428          * This is a directed broadcast
1429          *
1430          * If directed broadcast is allowed, then forward the packet out
1431          * the destination interface with IXAF_LOOPBACK_COPY set. That will
1432          * result in ip_input() receiving a copy of the packet on the
1433          * appropriate ill. (We could optimize this to avoid the extra trip
1434          * via ip_input(), but since directed broadcasts are normally disabled
1435          * it doesn't make sense to optimize it.)
1436          */
1437         if (!ipst->ips_ip_g_forward_directed_bcast ||
1438             (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST))) {
1439                 ip_drop_input("directed broadcast not allowed", mp, ill);
1440                 freemsg(mp);
1441                 goto done;
1442         }
             /* The header is rewritten below, so a pending checksum check is done now */
1443         if ((ira->ira_flags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) {
1444                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
1445                 ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
1446                 freemsg(mp);
1447                 goto done;
1448         }
1449 
1450         /*
1451          * Clear the indication that this may have hardware
1452          * checksum as we are not using it for forwarding.
1453          */
1454         DB_CKSUMFLAGS(mp) = 0;
1455 
1456         /*
1457          * Set the ttl to ips_ip_broadcast_ttl + 1; the forward engine
              * (ip_forward_xmit_v4) will decrement it by one.
1458          */
1459         ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl + 1;
1460         ipha->ipha_hdr_checksum = 0;
1461         ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1462 
1463         /*
1464          * We use ip_forward_xmit to do any fragmentation.
1465          * and loopback copy on the outbound interface.
1466          *
1467          * Make it so that IXAF_LOOPBACK_COPY to be set on transmit side.
1468          */
1469         ira->ira_flags |= IRAF_LOOPBACK_COPY;
1470 
             /* arp_nce_init() result is treated as held: released after the send */
1471         nce = arp_nce_init(dst_ill, ipha->ipha_dst, IRE_BROADCAST);
1472         if (nce == NULL) {
1473                 BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutDiscards);
1474                 ip_drop_output("No nce", mp, dst_ill);
1475                 freemsg(mp);
1476                 goto done;
1477         }
1478 
1479         ip_forward_xmit_v4(nce, ill, mp, ipha, ira, dst_ill->ill_mc_mtu, 0);
1480         nce_refrele(nce);
1481 done:
1482         /* Restore */
1483         ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
1484 }
1485 
1486 /*
1487  * ire_recvfn for IRE_MULTICAST.
1488  */
1489 void
1490 ire_recv_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1491     ip_recv_attr_t *ira)
1492 {
1493         ipha_t          *ipha = (ipha_t *)iph_arg;
1494         ill_t           *ill = ira->ira_ill;
1495         ip_stack_t      *ipst = ill->ill_ipst;
1496 
1497         ASSERT(ire->ire_ill == ira->ira_ill);
1498 
1499         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastPkts);
1500         UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastOctets, ira->ira_pktlen);
1501 
1502         /* RSVP hook */
1503         if (ira->ira_flags & IRAF_RSVP)
1504                 goto forus;
1505 
1506         /* Tag for higher-level protocols */
1507         ira->ira_flags |= IRAF_MULTICAST;
1508 
1509         /*
1510          * So that we don't end up with dups, only one ill an IPMP group is
1511          * nominated to receive multicast traffic.
1512          * If we have no cast_ill we are liberal and accept everything.
1513          */
1514         if (IS_UNDER_IPMP(ill)) {
1515                 ip_stack_t      *ipst = ill->ill_ipst;
1516 
1517                 /* For an under ill_grp can change under lock */
1518                 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1519                 if (!ill->ill_nom_cast && ill->ill_grp != NULL &&
1520                     ill->ill_grp->ig_cast_ill != NULL) {
1521                         rw_exit(&ipst->ips_ill_g_lock);
1522                         ip_drop_input("not on cast ill", mp, ill);
1523                         freemsg(mp);
1524                         return;
1525                 }
1526                 rw_exit(&ipst->ips_ill_g_lock);
1527                 /*
1528                  * We switch to the upper ill so that mrouter and hasmembers
1529                  * can operate on upper here and in ip_input_multicast.
1530                  */
1531                 ill = ipmp_ill_hold_ipmp_ill(ill);
1532                 if (ill != NULL) {
1533                         ASSERT(ill != ira->ira_ill);
1534                         ASSERT(ire->ire_ill == ira->ira_ill);
1535                         ira->ira_ill = ill;
1536                         ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
1537                 } else {
1538                         ill = ira->ira_ill;
1539                 }
1540         }
1541 
1542         /*
1543          * Check if we are a multicast router - send ip_mforward a copy of
1544          * the packet.
1545          * Due to mroute_decap tunnels we consider forwarding packets even if
1546          * mrouted has not joined the allmulti group on this interface.
1547          */
1548         if (ipst->ips_ip_g_mrouter) {
1549                 int retval;
1550 
1551                 /*
1552                  * Clear the indication that this may have hardware
1553                  * checksum as we are not using it for forwarding.
1554                  */
1555                 DB_CKSUMFLAGS(mp) = 0;
1556 
1557                 /*
1558                  * ip_mforward helps us make these distinctions: If received
1559                  * on tunnel and not IGMP, then drop.
1560                  * If IGMP packet, then don't check membership
1561                  * If received on a phyint and IGMP or PIM, then
1562                  * don't check membership
1563                  */
1564                 retval = ip_mforward(mp, ira);
1565                 /* ip_mforward updates mib variables if needed */
1566 
1567                 switch (retval) {
1568                 case 0:
1569                         /*
1570                          * pkt is okay and arrived on phyint.
1571                          *
1572                          * If we are running as a multicast router
1573                          * we need to see all IGMP and/or PIM packets.
1574                          */
1575                         if ((ipha->ipha_protocol == IPPROTO_IGMP) ||
1576                             (ipha->ipha_protocol == IPPROTO_PIM)) {
1577                                 goto forus;
1578                         }
1579                         break;
1580                 case -1:
1581                         /* pkt is mal-formed, toss it */
1582                         freemsg(mp);
1583                         goto done;
1584                 case 1:
1585                         /*
1586                          * pkt is okay and arrived on a tunnel
1587                          *
1588                          * If we are running a multicast router
1589                          * we need to see all igmp packets.
1590                          */
1591                         if (ipha->ipha_protocol == IPPROTO_IGMP) {
1592                                 goto forus;
1593                         }
1594                         ip_drop_input("Multicast on tunnel ignored", mp, ill);
1595                         freemsg(mp);
1596                         goto done;
1597                 }
1598         }
1599 
1600         /*
1601          * Check if we have members on this ill. This is not necessary for
1602          * correctness because even if the NIC/GLD had a leaky filter, we
1603          * filter before passing to each conn_t.
1604          */
1605         if (!ill_hasmembers_v4(ill, ipha->ipha_dst)) {
1606                 /*
1607                  * Nobody interested
1608                  *
1609                  * This might just be caused by the fact that
1610                  * multiple IP Multicast addresses map to the same
1611                  * link layer multicast - no need to increment counter!
1612                  */
1613                 ip_drop_input("Multicast with no members", mp, ill);
1614                 freemsg(mp);
1615                 goto done;
1616         }
1617 forus:
1618         ip2dbg(("ire_recv_multicast_v4: multicast for us: 0x%x\n",
1619             ntohl(ipha->ipha_dst)));
1620 
1621         /*
1622          * After reassembly and IPsec we will need to duplicate the
1623          * multicast packet for all matching zones on the ill.
1624          */
1625         ira->ira_zoneid = ALL_ZONES;
1626 
1627         /* Reassemble on the ill on which the packet arrived */
1628         ip_input_local_v4(ire, mp, ipha, ira);
1629 done:
1630         if (ill != ire->ire_ill) {
1631                 ill_refrele(ill);
1632                 ira->ira_ill = ire->ire_ill;
1633                 ira->ira_ruifindex = ira->ira_ill->ill_phyint->phyint_ifindex;
1634         }
1635 }
1636 
1637 /*
1638  * ire_recvfn for IRE_OFFLINK with RTF_MULTIRT.
1639  * Drop packets since we don't forward out multirt routes.
1640  */
1641 /* ARGSUSED */
1642 void
1643 ire_recv_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
1644 {
1645         ill_t           *ill = ira->ira_ill;
1646 
1647         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInNoRoutes);
1648         ip_drop_input("Not forwarding out MULTIRT", mp, ill);
1649         freemsg(mp);
1650 }
1651 
1652 /*
1653  * ire_recvfn for IRE_LOOPBACK. This is only used when a FW_HOOK
1654  * has rewritten the packet to have a loopback destination address (We
1655  * filter out packet with a loopback destination from arriving over the wire).
1656  * We don't know what zone to use, thus we always use the GLOBAL_ZONEID.
1657  */
1658 void
1659 ire_recv_loopback_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
1660 {
1661         ipha_t          *ipha = (ipha_t *)iph_arg;
1662         ill_t           *ill = ira->ira_ill;
1663         ill_t           *ire_ill = ire->ire_ill;
1664 
1665         ira->ira_zoneid = GLOBAL_ZONEID;
1666 
1667         /* Switch to the lo0 ill for further processing  */
1668         if (ire_ill != ill) {
1669                 /*
1670                  * Update ira_ill to be the ILL on which the IP address
1671                  * is hosted.
1672                  * No need to hold the ill since we have a hold on the ire
1673                  */
1674                 ASSERT(ira->ira_ill == ira->ira_rill);
1675                 ira->ira_ill = ire_ill;
1676 
1677                 ip_input_local_v4(ire, mp, ipha, ira);
1678 
1679                 /* Restore */
1680                 ASSERT(ira->ira_ill == ire_ill);
1681                 ira->ira_ill = ill;
1682                 return;
1683 
1684         }
1685         ip_input_local_v4(ire, mp, ipha, ira);
1686 }
1687 
1688 /*
1689  * ire_recvfn for IRE_LOCAL.
1690  */
1691 void
1692 ire_recv_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
1693 {
1694         ipha_t          *ipha = (ipha_t *)iph_arg;
1695         ill_t           *ill = ira->ira_ill;
1696         ill_t           *ire_ill = ire->ire_ill;
1697 
1698         /* Make a note for DAD that this address is in use */
1699         ire->ire_last_used_time = LBOLT_FASTPATH;
1700 
1701         /* Only target the IRE_LOCAL with the right zoneid. */
1702         ira->ira_zoneid = ire->ire_zoneid;
1703 
1704         /*
1705          * If the packet arrived on the wrong ill, we check that
1706          * this is ok.
1707          * If it is, then we ensure that we do the reassembly on
1708          * the ill on which the address is hosted. We keep ira_rill as
1709          * the one on which the packet arrived, so that IP_PKTINFO and
1710          * friends can report this.
1711          */
1712         if (ire_ill != ill) {
1713                 ire_t *new_ire;
1714 
1715                 new_ire = ip_check_multihome(&ipha->ipha_dst, ire, ill);
1716                 if (new_ire == NULL) {
1717                         /* Drop packet */
1718                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
1719                         ip_drop_input("ipIfStatsInForwProhibits", mp, ill);
1720                         freemsg(mp);
1721                         return;
1722                 }
1723                 /*
1724                  * Update ira_ill to be the ILL on which the IP address
1725                  * is hosted. No need to hold the ill since we have a
1726                  * hold on the ire. Note that we do the switch even if
1727                  * new_ire == ire (for IPMP, ire would be the one corresponding
1728                  * to the IPMP ill).
1729                  */
1730                 ASSERT(ira->ira_ill == ira->ira_rill);
1731                 ira->ira_ill = new_ire->ire_ill;
1732 
1733                 /* ira_ruifindex tracks the upper for ira_rill */
1734                 if (IS_UNDER_IPMP(ill))
1735                         ira->ira_ruifindex = ill_get_upper_ifindex(ill);
1736 
1737                 ip_input_local_v4(new_ire, mp, ipha, ira);
1738 
1739                 /* Restore */
1740                 ASSERT(ira->ira_ill == new_ire->ire_ill);
1741                 ira->ira_ill = ill;
1742                 ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
1743 
1744                 if (new_ire != ire)
1745                         ire_refrele(new_ire);
1746                 return;
1747         }
1748 
1749         ip_input_local_v4(ire, mp, ipha, ira);
1750 }
1751 
1752 /*
1753  * Common function for packets arriving for the host. Handles
1754  * checksum verification, reassembly checks, etc.
1755  */
1756 static void
1757 ip_input_local_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
1758 {
1759         ill_t           *ill = ira->ira_ill;
1760         iaflags_t       iraflags = ira->ira_flags;
1761 
1762         /*
1763          * Verify IP header checksum. If the packet was AH or ESP then
1764          * this flag has already been cleared. Likewise if the packet
1765          * had a hardware checksum.
1766          */
1767         if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) {
1768                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
1769                 ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
1770                 freemsg(mp);
1771                 return;
1772         }
1773 
1774         if (iraflags & IRAF_IPV4_OPTIONS) {
1775                 if (!ip_input_local_options(mp, ipha, ira)) {
1776                         /* Error has been sent and mp consumed */
1777                         return;
1778                 }
1779                 /*
1780                  * Some old hardware does partial checksum by including the
1781                  * whole IP header, so the partial checksum value might have
1782                  * become invalid if any option in the packet have been
1783                  * updated. Always clear partial checksum flag here.
1784                  */
1785                 DB_CKSUMFLAGS(mp) &= ~HCK_PARTIALCKSUM;
1786         }
1787 
1788         /*
1789          * Is packet part of fragmented IP packet?
1790          * We compare against defined values in network byte order
1791          */
1792         if (ipha->ipha_fragment_offset_and_flags &
1793             (IPH_MF_HTONS | IPH_OFFSET_HTONS)) {
1794                 /*
1795                  * Make sure we have ira_l2src before we loose the original
1796                  * mblk
1797                  */
1798                 if (!(ira->ira_flags & IRAF_L2SRC_SET))
1799                         ip_setl2src(mp, ira, ira->ira_rill);
1800 
1801                 mp = ip_input_fragment(mp, ipha, ira);
1802                 if (mp == NULL)
1803                         return;
1804                 /* Completed reassembly */
1805                 ipha = (ipha_t *)mp->b_rptr;
1806         }
1807 
1808         /*
1809          * For broadcast and multicast we need some extra work before
1810          * we call ip_fanout_v4(), since in the case of shared-IP zones
1811          * we need to pretend that a packet arrived for each zoneid.
1812          */
1813         if (iraflags & IRAF_MULTIBROADCAST) {
1814                 if (iraflags & IRAF_BROADCAST)
1815                         ip_input_broadcast_v4(ire, mp, ipha, ira);
1816                 else
1817                         ip_input_multicast_v4(ire, mp, ipha, ira);
1818                 return;
1819         }
1820         ip_fanout_v4(mp, ipha, ira);
1821 }
1822 
1823 
1824 /*
1825  * Handle multiple zones which match the same broadcast address
1826  * and ill by delivering a packet to each of them.
1827  * Walk the bucket and look for different ire_zoneid but otherwise
1828  * the same IRE (same ill/addr/mask/type).
1829  * Note that ire_add() tracks IREs that are identical in all
1830  * fields (addr/mask/type/gw/ill/zoneid) within a single IRE by
1831  * increasing ire_identical_cnt. Thus we don't need to be concerned
1832  * about those.
1833  */
1834 static void
1835 ip_input_broadcast_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
1836 {
1837         ill_t           *ill = ira->ira_ill;
1838         ip_stack_t      *ipst = ill->ill_ipst;
1839         netstack_t      *ns = ipst->ips_netstack;
1840         irb_t           *irb;
1841         ire_t           *ire1;
1842         mblk_t          *mp1;
1843         ipha_t          *ipha1;
1844         uint_t          ira_pktlen = ira->ira_pktlen;
1845         uint16_t        ira_ip_hdr_length = ira->ira_ip_hdr_length;
1846 
1847         irb = ire->ire_bucket;
1848 
1849         /*
1850          * If we don't have more than one shared-IP zone, or if
1851          * there can't be more than one IRE_BROADCAST for this
1852          * IP address, then just set the zoneid and proceed.
1853          */
1854         if (ns->netstack_numzones == 1 || irb->irb_ire_cnt == 1) {
1855                 ira->ira_zoneid = ire->ire_zoneid;
1856 
1857                 ip_fanout_v4(mp, ipha, ira);
1858                 return;
1859         }
1860         irb_refhold(irb);
1861         for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
1862                 /* We do the main IRE after the end of the loop */
1863                 if (ire1 == ire)
1864                         continue;
1865 
1866                 /*
1867                  * Only IREs for the same IP address should be in the same
1868                  * bucket.
1869                  * But could have IRE_HOSTs in the case of CGTP.
1870                  */
1871                 ASSERT(ire1->ire_addr == ire->ire_addr);
1872                 if (!(ire1->ire_type & IRE_BROADCAST))
1873                         continue;
1874 
1875                 if (IRE_IS_CONDEMNED(ire1))
1876                         continue;
1877 
1878                 mp1 = copymsg(mp);
1879                 if (mp1 == NULL) {
1880                         /* Failed to deliver to one zone */
1881                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1882                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
1883                         continue;
1884                 }
1885                 ira->ira_zoneid = ire1->ire_zoneid;
1886                 ipha1 = (ipha_t *)mp1->b_rptr;
1887                 ip_fanout_v4(mp1, ipha1, ira);
1888                 /*
1889                  * IPsec might have modified ira_pktlen and ira_ip_hdr_length
1890                  * so we restore them for a potential next iteration
1891                  */
1892                 ira->ira_pktlen = ira_pktlen;
1893                 ira->ira_ip_hdr_length = ira_ip_hdr_length;
1894         }
1895         irb_refrele(irb);
1896         /* Do the main ire */
1897         ira->ira_zoneid = ire->ire_zoneid;
1898         ip_fanout_v4(mp, ipha, ira);
1899 }
1900 
1901 /*
1902  * Handle multiple zones which want to receive the same multicast packets
1903  * on this ill by delivering a packet to each of them.
1904  *
1905  * Note that for packets delivered to transports we could instead do this
1906  * as part of the fanout code, but since we need to handle icmp_inbound
1907  * it is simpler to have multicast work the same as broadcast.
1908  *
1909  * The ip_fanout matching for multicast matches based on ilm independent of
1910  * zoneid since the zoneid restriction is applied when joining a multicast
1911  * group.
1912  */
1913 /* ARGSUSED */
1914 static void
1915 ip_input_multicast_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
1916 {
1917         ill_t           *ill = ira->ira_ill;
1918         iaflags_t       iraflags = ira->ira_flags;
1919         ip_stack_t      *ipst = ill->ill_ipst;
1920         netstack_t      *ns = ipst->ips_netstack;
1921         zoneid_t        zoneid;
1922         mblk_t          *mp1;
1923         ipha_t          *ipha1;
1924         uint_t          ira_pktlen = ira->ira_pktlen;
1925         uint16_t        ira_ip_hdr_length = ira->ira_ip_hdr_length;
1926 
1927         /* ire_recv_multicast has switched to the upper ill for IPMP */
1928         ASSERT(!IS_UNDER_IPMP(ill));
1929 
1930         /*
1931          * If we don't have more than one shared-IP zone, or if
1932          * there are no members in anything but the global zone,
1933          * then just set the zoneid and proceed.
1934          */
1935         if (ns->netstack_numzones == 1 ||
1936             !ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst,
1937             GLOBAL_ZONEID)) {
1938                 ira->ira_zoneid = GLOBAL_ZONEID;
1939 
1940                 /* If sender didn't want this zone to receive it, drop */
1941                 if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
1942                     ira->ira_no_loop_zoneid == ira->ira_zoneid) {
1943                         ip_drop_input("Multicast but wrong zoneid", mp, ill);
1944                         freemsg(mp);
1945                         return;
1946                 }
1947                 ip_fanout_v4(mp, ipha, ira);
1948                 return;
1949         }
1950 
1951         /*
1952          * Here we loop over all zoneids that have members in the group
1953          * and deliver a packet to ip_fanout for each zoneid.
1954          *
1955          * First find any members in the lowest numeric zoneid by looking for
1956          * first zoneid larger than -1 (ALL_ZONES).
1957          * We terminate the loop when we receive -1 (ALL_ZONES).
1958          */
1959         zoneid = ill_hasmembers_nextzone_v4(ill, ipha->ipha_dst, ALL_ZONES);
1960         for (; zoneid != ALL_ZONES;
1961             zoneid = ill_hasmembers_nextzone_v4(ill, ipha->ipha_dst, zoneid)) {
1962                 /*
1963                  * Avoid an extra copymsg/freemsg by skipping global zone here
1964                  * and doing that at the end.
1965                  */
1966                 if (zoneid == GLOBAL_ZONEID)
1967                         continue;
1968 
1969                 ira->ira_zoneid = zoneid;
1970 
1971                 /* If sender didn't want this zone to receive it, skip */
1972                 if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
1973                     ira->ira_no_loop_zoneid == ira->ira_zoneid)
1974                         continue;
1975 
1976                 mp1 = copymsg(mp);
1977                 if (mp1 == NULL) {
1978                         /* Failed to deliver to one zone */
1979                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1980                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
1981                         continue;
1982                 }
1983                 ipha1 = (ipha_t *)mp1->b_rptr;
1984                 ip_fanout_v4(mp1, ipha1, ira);
1985                 /*
1986                  * IPsec might have modified ira_pktlen and ira_ip_hdr_length
1987                  * so we restore them for a potential next iteration
1988                  */
1989                 ira->ira_pktlen = ira_pktlen;
1990                 ira->ira_ip_hdr_length = ira_ip_hdr_length;
1991         }
1992 
1993         /* Do the main ire */
1994         ira->ira_zoneid = GLOBAL_ZONEID;
1995         /* If sender didn't want this zone to receive it, drop */
1996         if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
1997             ira->ira_no_loop_zoneid == ira->ira_zoneid) {
1998                 ip_drop_input("Multicast but wrong zoneid", mp, ill);
1999                 freemsg(mp);
2000         } else {
2001                 ip_fanout_v4(mp, ipha, ira);
2002         }
2003 }
2004 
2005 
2006 /*
2007  * Determine the zoneid and IRAF_TX_* flags if trusted extensions
2008  * is in use. Updates ira_zoneid and ira_flags as a result.
2009  */
2010 static void
2011 ip_fanout_tx_v4(mblk_t *mp, ipha_t *ipha, uint8_t protocol,
2012     uint_t ip_hdr_length, ip_recv_attr_t *ira)
2013 {
2014         uint16_t        *up;
2015         uint16_t        lport;
2016         zoneid_t        zoneid;
2017 
2018         ASSERT(ira->ira_flags & IRAF_SYSTEM_LABELED);
2019 
2020         /*
2021          * If the packet is unlabeled we might allow read-down
2022          * for MAC_EXEMPT. Below we clear this if it is a multi-level
2023          * port (MLP).
2024          * Note that ira_tsl can be NULL here.
2025          */
2026         if (ira->ira_tsl != NULL && ira->ira_tsl->tsl_flags & TSLF_UNLABELED)
2027                 ira->ira_flags |= IRAF_TX_MAC_EXEMPTABLE;
2028 
2029         if (ira->ira_zoneid != ALL_ZONES)
2030                 return;
2031 
2032         ira->ira_flags |= IRAF_TX_SHARED_ADDR;
2033 
2034         up = (uint16_t *)((uchar_t *)ipha + ip_hdr_length);
2035         switch (protocol) {
2036         case IPPROTO_TCP:
2037         case IPPROTO_SCTP:
2038         case IPPROTO_UDP:
2039                 /* Caller ensures this */
2040                 ASSERT(((uchar_t *)ipha) + ip_hdr_length +4 <= mp->b_wptr);
2041 
2042                 /*
2043                  * Only these transports support MLP.
2044                  * We know their destination port numbers is in
2045                  * the same place in the header.
2046                  */
2047                 lport = up[1];
2048 
2049                 /*
2050                  * No need to handle exclusive-stack zones
2051                  * since ALL_ZONES only applies to the shared IP instance.
2052                  */
2053                 zoneid = tsol_mlp_findzone(protocol, lport);
2054                 /*
2055                  * If no shared MLP is found, tsol_mlp_findzone returns
2056                  * ALL_ZONES.  In that case, we assume it's SLP, and
2057                  * search for the zone based on the packet label.
2058                  *
2059                  * If there is such a zone, we prefer to find a
2060                  * connection in it.  Otherwise, we look for a
2061                  * MAC-exempt connection in any zone whose label
2062                  * dominates the default label on the packet.
2063                  */
2064                 if (zoneid == ALL_ZONES)
2065                         zoneid = tsol_attr_to_zoneid(ira);
2066                 else
2067                         ira->ira_flags &= ~IRAF_TX_MAC_EXEMPTABLE;
2068                 break;
2069         default:
2070                 /* Handle shared address for other protocols */
2071                 zoneid = tsol_attr_to_zoneid(ira);
2072                 break;
2073         }
2074         ira->ira_zoneid = zoneid;
2075 }
2076 
2077 /*
2078  * Increment checksum failure statistics
2079  */
2080 static void
2081 ip_input_cksum_err_v4(uint8_t protocol, uint16_t hck_flags, ill_t *ill)
2082 {
2083         ip_stack_t      *ipst = ill->ill_ipst;
2084 
2085         switch (protocol) {
2086         case IPPROTO_TCP:
2087                 BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs);
2088 
2089                 if (hck_flags & HCK_FULLCKSUM)
2090                         IP_STAT(ipst, ip_tcp_in_full_hw_cksum_err);
2091                 else if (hck_flags & HCK_PARTIALCKSUM)
2092                         IP_STAT(ipst, ip_tcp_in_part_hw_cksum_err);
2093                 else
2094                         IP_STAT(ipst, ip_tcp_in_sw_cksum_err);
2095                 break;
2096         case IPPROTO_UDP:
2097                 BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs);
2098                 if (hck_flags & HCK_FULLCKSUM)
2099                         IP_STAT(ipst, ip_udp_in_full_hw_cksum_err);
2100                 else if (hck_flags & HCK_PARTIALCKSUM)
2101                         IP_STAT(ipst, ip_udp_in_part_hw_cksum_err);
2102                 else
2103                         IP_STAT(ipst, ip_udp_in_sw_cksum_err);
2104                 break;
2105         case IPPROTO_ICMP:
2106                 BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs);
2107                 break;
2108         default:
2109                 ASSERT(0);
2110                 break;
2111         }
2112 }
2113 
2114 /* Calculate the IPv4 pseudo-header checksum */
2115 uint32_t
2116 ip_input_cksum_pseudo_v4(ipha_t *ipha, ip_recv_attr_t *ira)
2117 {
2118         uint_t          ulp_len;
2119         uint32_t        cksum;
2120         uint8_t         protocol = ira->ira_protocol;
2121         uint16_t        ip_hdr_length = ira->ira_ip_hdr_length;
2122 
2123 #define iphs    ((uint16_t *)ipha)
2124 
2125         switch (protocol) {
2126         case IPPROTO_TCP:
2127                 ulp_len = ira->ira_pktlen - ip_hdr_length;
2128 
2129                 /* Protocol and length */
2130                 cksum = htons(ulp_len) + IP_TCP_CSUM_COMP;
2131                 /* IP addresses */
2132                 cksum += iphs[6] + iphs[7] + iphs[8] + iphs[9];
2133                 break;
2134 
2135         case IPPROTO_UDP: {
2136                 udpha_t         *udpha;
2137 
2138                 udpha = (udpha_t  *)((uchar_t *)ipha + ip_hdr_length);
2139 
2140                 /* Protocol and length */
2141                 cksum = udpha->uha_length + IP_UDP_CSUM_COMP;
2142                 /* IP addresses */
2143                 cksum += iphs[6] + iphs[7] + iphs[8] + iphs[9];
2144                 break;
2145         }
2146 
2147         default:
2148                 cksum = 0;
2149                 break;
2150         }
2151 #undef  iphs
2152         return (cksum);
2153 }
2154 
2155 
2156 /*
2157  * Software verification of the ULP checksums.
2158  * Returns B_TRUE if ok.
2159  * Increments statistics of failed.
2160  */
2161 static boolean_t
2162 ip_input_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
2163 {
2164         ip_stack_t      *ipst = ira->ira_ill->ill_ipst;
2165         uint32_t        cksum;
2166         uint8_t         protocol = ira->ira_protocol;
2167         uint16_t        ip_hdr_length = ira->ira_ip_hdr_length;
2168 
2169         IP_STAT(ipst, ip_in_sw_cksum);
2170 
2171         ASSERT(protocol == IPPROTO_TCP || protocol == IPPROTO_UDP);
2172 
2173         cksum = ip_input_cksum_pseudo_v4(ipha, ira);
2174         cksum = IP_CSUM(mp, ip_hdr_length, cksum);
2175         if (cksum == 0)
2176                 return (B_TRUE);
2177 
2178         ip_input_cksum_err_v4(protocol, 0, ira->ira_ill);
2179         return (B_FALSE);
2180 }
2181 
2182 /*
2183  * Verify the ULP checksums.
2184  * Returns B_TRUE if ok, or if the ULP doesn't have a well-defined checksum
2185  * algorithm.
2186  * Increments statistics if failed.
2187  */
2188 static boolean_t
2189 ip_input_cksum_v4(iaflags_t iraflags, mblk_t *mp, ipha_t *ipha,
2190     ip_recv_attr_t *ira)
2191 {
2192         ill_t           *ill = ira->ira_rill;
2193         uint16_t        hck_flags;
2194         uint32_t        cksum;
2195         mblk_t          *mp1;
2196         int32_t         len;
2197         uint8_t         protocol = ira->ira_protocol;
2198         uint16_t        ip_hdr_length = ira->ira_ip_hdr_length;
2199 
2200 
2201         switch (protocol) {
2202         case IPPROTO_TCP:
2203                 break;
2204 
2205         case IPPROTO_UDP: {
2206                 udpha_t         *udpha;
2207 
2208                 udpha = (udpha_t  *)((uchar_t *)ipha + ip_hdr_length);
2209                 if (udpha->uha_checksum == 0) {
2210                         /* Packet doesn't have a UDP checksum */
2211                         return (B_TRUE);
2212                 }
2213                 break;
2214         }
2215         case IPPROTO_SCTP: {
2216                 sctp_hdr_t      *sctph;
2217                 uint32_t        pktsum;
2218 
2219                 sctph = (sctp_hdr_t *)((uchar_t *)ipha + ip_hdr_length);
2220 #ifdef  DEBUG
2221                 if (skip_sctp_cksum)
2222                         return (B_TRUE);
2223 #endif
2224                 pktsum = sctph->sh_chksum;
2225                 sctph->sh_chksum = 0;
2226                 cksum = sctp_cksum(mp, ip_hdr_length);
2227                 sctph->sh_chksum = pktsum;
2228                 if (cksum == pktsum)
2229                         return (B_TRUE);
2230 
2231                 /*
2232                  * Defer until later whether a bad checksum is ok
2233                  * in order to allow RAW sockets to use Adler checksum
2234                  * with SCTP.
2235                  */
2236                 ira->ira_flags |= IRAF_SCTP_CSUM_ERR;
2237                 return (B_TRUE);
2238         }
2239 
2240         default:
2241                 /* No ULP checksum to verify. */
2242                 return (B_TRUE);
2243         }
2244         /*
2245          * Revert to software checksum calculation if the interface
2246          * isn't capable of checksum offload.
2247          * We clear DB_CKSUMFLAGS when going through IPsec in ip_fanout.
2248          * Note: IRAF_NO_HW_CKSUM is not currently used.
2249          */
2250         ASSERT(!IS_IPMP(ill));
2251         if ((iraflags & IRAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) ||
2252             !dohwcksum) {
2253                 return (ip_input_sw_cksum_v4(mp, ipha, ira));
2254         }
2255 
2256         /*
2257          * We apply this for all ULP protocols. Does the HW know to
2258          * not set the flags for SCTP and other protocols.
2259          */
2260 
2261         hck_flags = DB_CKSUMFLAGS(mp);
2262 
2263         if (hck_flags & HCK_FULLCKSUM_OK) {
2264                 /*
2265                  * Hardware has already verified the checksum.
2266                  */
2267                 return (B_TRUE);
2268         }
2269 
2270         if (hck_flags & HCK_FULLCKSUM) {
2271                 /*
2272                  * Full checksum has been computed by the hardware
2273                  * and has been attached.  If the driver wants us to
2274                  * verify the correctness of the attached value, in
2275                  * order to protect against faulty hardware, compare
2276                  * it against -0 (0xFFFF) to see if it's valid.
2277                  */
2278                 cksum = DB_CKSUM16(mp);
2279                 if (cksum == 0xFFFF)
2280                         return (B_TRUE);
2281                 ip_input_cksum_err_v4(protocol, hck_flags, ira->ira_ill);
2282                 return (B_FALSE);
2283         }
2284 
2285         mp1 = mp->b_cont;
2286         if ((hck_flags & HCK_PARTIALCKSUM) &&
2287             (mp1 == NULL || mp1->b_cont == NULL) &&
2288             ip_hdr_length >= DB_CKSUMSTART(mp) &&
2289             ((len = ip_hdr_length - DB_CKSUMSTART(mp)) & 1) == 0) {
2290                 uint32_t        adj;
2291                 uchar_t         *cksum_start;
2292 
2293                 cksum = ip_input_cksum_pseudo_v4(ipha, ira);
2294 
2295                 cksum_start = ((uchar_t *)ipha + DB_CKSUMSTART(mp));
2296 
2297                 /*
2298                  * Partial checksum has been calculated by hardware
2299                  * and attached to the packet; in addition, any
2300                  * prepended extraneous data is even byte aligned,
2301                  * and there are at most two mblks associated with
2302                  * the packet.  If any such data exists, we adjust
2303                  * the checksum; also take care any postpended data.
2304                  */
2305                 IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, len, adj);
2306                 /*
2307                  * One's complement subtract extraneous checksum
2308                  */
2309                 cksum += DB_CKSUM16(mp);
2310                 if (adj >= cksum)
2311                         cksum = ~(adj - cksum) & 0xFFFF;
2312                 else
2313                         cksum -= adj;
2314                 cksum = (cksum & 0xFFFF) + ((int)cksum >> 16);
2315                 cksum = (cksum & 0xFFFF) + ((int)cksum >> 16);
2316                 if (!(~cksum & 0xFFFF))
2317                         return (B_TRUE);
2318 
2319                 ip_input_cksum_err_v4(protocol, hck_flags, ira->ira_ill);
2320                 return (B_FALSE);
2321         }
2322         return (ip_input_sw_cksum_v4(mp, ipha, ira));
2323 }
2324 
2325 
2326 /*
2327  * Handle fanout of received packets.
2328  * Unicast packets that are looped back (from ire_send_local_v4) and packets
2329  * from the wire are differentiated by checking IRAF_VERIFY_ULP_CKSUM.
2330  *
2331  * IPQoS Notes
2332  * Before sending it to the client, invoke IPPF processing. Policy processing
2333  * takes place only if the callout_position, IPP_LOCAL_IN, is enabled.
2334  */
2335 void
2336 ip_fanout_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
2337 {
2338         ill_t           *ill = ira->ira_ill;
2339         iaflags_t       iraflags = ira->ira_flags;
2340         ip_stack_t      *ipst = ill->ill_ipst;
2341         uint8_t         protocol = ipha->ipha_protocol;
2342         conn_t          *connp;
2343 #define rptr    ((uchar_t *)ipha)
2344         uint_t          ip_hdr_length;
2345         uint_t          min_ulp_header_length;
2346         int             offset;
2347         ssize_t         len;
2348         netstack_t      *ns = ipst->ips_netstack;
2349         ipsec_stack_t   *ipss = ns->netstack_ipsec;
2350         ill_t           *rill = ira->ira_rill;
2351 
2352         ASSERT(ira->ira_pktlen == ntohs(ipha->ipha_length));
2353 
2354         ip_hdr_length = ira->ira_ip_hdr_length;
2355         ira->ira_protocol = protocol;
2356 
2357         /*
2358          * Time for IPP once we've done reassembly and IPsec.
2359          * We skip this for loopback packets since we don't do IPQoS
2360          * on loopback.
2361          */
2362         if (IPP_ENABLED(IPP_LOCAL_IN, ipst) &&
2363             !(iraflags & IRAF_LOOPBACK) &&
2364             (protocol != IPPROTO_ESP || protocol != IPPROTO_AH)) {
2365                 /*
2366                  * Use the interface on which the packet arrived - not where
2367                  * the IP address is hosted.
2368                  */
2369                 /* ip_process translates an IS_UNDER_IPMP */
2370                 mp = ip_process(IPP_LOCAL_IN, mp, rill, ill);
2371                 if (mp == NULL) {
2372                         /* ip_drop_packet and MIB done */
2373                         return;
2374                 }
2375         }
2376 
2377         /* Determine the minimum required size of the upper-layer header */
2378         /* Need to do this for at least the set of ULPs that TX handles. */
2379         switch (protocol) {
2380         case IPPROTO_TCP:
2381                 min_ulp_header_length = TCP_MIN_HEADER_LENGTH;
2382                 break;
2383         case IPPROTO_SCTP:
2384                 min_ulp_header_length = SCTP_COMMON_HDR_LENGTH;
2385                 break;
2386         case IPPROTO_UDP:
2387                 min_ulp_header_length = UDPH_SIZE;
2388                 break;
2389         case IPPROTO_ICMP:
2390                 min_ulp_header_length = ICMPH_SIZE;
2391                 break;
2392         default:
2393                 min_ulp_header_length = 0;
2394                 break;
2395         }
2396         /* Make sure we have the min ULP header length */
2397         len = mp->b_wptr - rptr;
2398         if (len < ip_hdr_length + min_ulp_header_length) {
2399                 if (ira->ira_pktlen < ip_hdr_length + min_ulp_header_length) {
2400                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
2401                         ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
2402                         freemsg(mp);
2403                         return;
2404                 }
2405                 IP_STAT(ipst, ip_recv_pullup);
2406                 ipha = ip_pullup(mp, ip_hdr_length + min_ulp_header_length,
2407                     ira);
2408                 if (ipha == NULL)
2409                         goto discard;
2410                 len = mp->b_wptr - rptr;
2411         }
2412 
2413         /*
2414          * If trusted extensions then determine the zoneid and TX specific
2415          * ira_flags.
2416          */
2417         if (iraflags & IRAF_SYSTEM_LABELED) {
2418                 /* This can update ira->ira_flags and ira->ira_zoneid */
2419                 ip_fanout_tx_v4(mp, ipha, protocol, ip_hdr_length, ira);
2420                 iraflags = ira->ira_flags;
2421         }
2422 
2423 
2424         /* Verify ULP checksum. Handles TCP, UDP, and SCTP */
2425         if (iraflags & IRAF_VERIFY_ULP_CKSUM) {
2426                 if (!ip_input_cksum_v4(iraflags, mp, ipha, ira)) {
2427                         /* Bad checksum. Stats are already incremented */
2428                         ip_drop_input("Bad ULP checksum", mp, ill);
2429                         freemsg(mp);
2430                         return;
2431                 }
2432                 /* IRAF_SCTP_CSUM_ERR could have been set */
2433                 iraflags = ira->ira_flags;
2434         }
2435         switch (protocol) {
2436         case IPPROTO_TCP:
2437                 /* For TCP, discard broadcast and multicast packets. */
2438                 if (iraflags & IRAF_MULTIBROADCAST)
2439                         goto discard;
2440 
2441                 /* First mblk contains IP+TCP headers per above check */
2442                 ASSERT(len >= ip_hdr_length + TCP_MIN_HEADER_LENGTH);
2443 
2444                 /* TCP options present? */
2445                 offset = ((uchar_t *)ipha)[ip_hdr_length + 12] >> 4;
2446                 if (offset != 5) {
2447                         if (offset < 5)
2448                                 goto discard;
2449 
2450                         /*
2451                          * There must be TCP options.
2452                          * Make sure we can grab them.
2453                          */
2454                         offset <<= 2;
2455                         offset += ip_hdr_length;
2456                         if (len < offset) {
2457                                 if (ira->ira_pktlen < offset) {
2458                                         BUMP_MIB(ill->ill_ip_mib,
2459                                             ipIfStatsInTruncatedPkts);
2460                                         ip_drop_input(
2461                                             "ipIfStatsInTruncatedPkts",
2462                                             mp, ill);
2463                                         freemsg(mp);
2464                                         return;
2465                                 }
2466                                 IP_STAT(ipst, ip_recv_pullup);
2467                                 ipha = ip_pullup(mp, offset, ira);
2468                                 if (ipha == NULL)
2469                                         goto discard;
2470                                 len = mp->b_wptr - rptr;
2471                         }
2472                 }
2473 
2474                 /*
2475                  * Pass up a squeue hint to tcp.
2476                  * If ira_sqp is already set (this is loopback) we leave it
2477                  * alone.
2478                  */
2479                 if (ira->ira_sqp == NULL) {
2480                         ira->ira_sqp = ip_squeue_get(ira->ira_ring);
2481                 }
2482 
2483                 /* Look for AF_INET or AF_INET6 that matches */
2484                 connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_length,
2485                     ira, ipst);
2486                 if (connp == NULL) {
2487                         /* Send the TH_RST */
2488                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2489                         tcp_xmit_listeners_reset(mp, ira, ipst, NULL);
2490                         return;
2491                 }
2492                 if (connp->conn_incoming_ifindex != 0 &&
2493                     connp->conn_incoming_ifindex != ira->ira_ruifindex) {
2494                         CONN_DEC_REF(connp);
2495 
2496                         /* Send the TH_RST */
2497                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2498                         tcp_xmit_listeners_reset(mp, ira, ipst, NULL);
2499                         return;
2500                 }
2501                 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) ||
2502                     (iraflags & IRAF_IPSEC_SECURE)) {
2503                         mp = ipsec_check_inbound_policy(mp, connp,
2504                             ipha, NULL, ira);
2505                         if (mp == NULL) {
2506                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2507                                 /* Note that mp is NULL */
2508                                 ip_drop_input("ipIfStatsInDiscards", mp, ill);
2509                                 CONN_DEC_REF(connp);
2510                                 return;
2511                         }
2512                 }
2513                 /* Found a client; up it goes */
2514                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2515                 ira->ira_ill = ira->ira_rill = NULL;
2516                 if (!IPCL_IS_TCP(connp)) {
2517                         /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
2518                         (connp->conn_recv)(connp, mp, NULL, ira);
2519                         CONN_DEC_REF(connp);
2520                         ira->ira_ill = ill;
2521                         ira->ira_rill = rill;
2522                         return;
2523                 }
2524 
2525                 /*
2526                  * We do different processing whether called from
2527                  * ip_accept_tcp and we match the target, don't match
2528                  * the target, and when we are called by ip_input.
2529                  */
2530                 if (iraflags & IRAF_TARGET_SQP) {
2531                         if (ira->ira_target_sqp == connp->conn_sqp) {
2532                                 mblk_t  *attrmp;
2533 
2534                                 attrmp = ip_recv_attr_to_mblk(ira);
2535                                 if (attrmp == NULL) {
2536                                         BUMP_MIB(ill->ill_ip_mib,
2537                                             ipIfStatsInDiscards);
2538                                         ip_drop_input("ipIfStatsInDiscards",
2539                                             mp, ill);
2540                                         freemsg(mp);
2541                                         CONN_DEC_REF(connp);
2542                                 } else {
2543                                         SET_SQUEUE(attrmp, connp->conn_recv,
2544                                             connp);
2545                                         attrmp->b_cont = mp;
2546                                         ASSERT(ira->ira_target_sqp_mp == NULL);
2547                                         ira->ira_target_sqp_mp = attrmp;
2548                                         /*
2549                                          * Conn ref release when drained from
2550                                          * the squeue.
2551                                          */
2552                                 }
2553                         } else {
2554                                 SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
2555                                     connp->conn_recv, connp, ira, SQ_FILL,
2556                                     SQTAG_IP_TCP_INPUT);
2557                         }
2558                 } else {
2559                         SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv,
2560                             connp, ira, ip_squeue_flag, SQTAG_IP_TCP_INPUT);
2561                 }
2562                 ira->ira_ill = ill;
2563                 ira->ira_rill = rill;
2564                 return;
2565 
2566         case IPPROTO_SCTP: {
2567                 sctp_hdr_t      *sctph;
2568                 in6_addr_t      map_src, map_dst;
2569                 uint32_t        ports;  /* Source and destination ports */
2570                 sctp_stack_t    *sctps = ipst->ips_netstack->netstack_sctp;
2571 
2572                 /* For SCTP, discard broadcast and multicast packets. */
2573                 if (iraflags & IRAF_MULTIBROADCAST)
2574                         goto discard;
2575 
2576                 /*
2577                  * Since there is no SCTP h/w cksum support yet, just
2578                  * clear the flag.
2579                  */
2580                 DB_CKSUMFLAGS(mp) = 0;
2581 
2582                 /* Length ensured above */
2583                 ASSERT(MBLKL(mp) >= ip_hdr_length + SCTP_COMMON_HDR_LENGTH);
2584                 sctph = (sctp_hdr_t *)(rptr + ip_hdr_length);
2585 
2586                 /* get the ports */
2587                 ports = *(uint32_t *)&sctph->sh_sport;
2588 
2589                 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_dst);
2590                 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_src);
2591                 if (iraflags & IRAF_SCTP_CSUM_ERR) {
2592                         /*
2593                          * No potential sctp checksum errors go to the Sun
2594                          * sctp stack however they might be Adler-32 summed
2595                          * packets a userland stack bound to a raw IP socket
2596                          * could reasonably use. Note though that Adler-32 is
2597                          * a long deprecated algorithm and customer sctp
2598                          * networks should eventually migrate to CRC-32 at
2599                          * which time this facility should be removed.
2600                          */
2601                         ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira);
2602                         return;
2603                 }
2604                 connp = sctp_fanout(&map_src, &map_dst, ports, ira, mp,
2605                     sctps, sctph);
2606                 if (connp == NULL) {
2607                         /* Check for raw socket or OOTB handling */
2608                         ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira);
2609                         return;
2610                 }
2611                 if (connp->conn_incoming_ifindex != 0 &&
2612                     connp->conn_incoming_ifindex != ira->ira_ruifindex) {
2613                         CONN_DEC_REF(connp);
2614                         /* Check for raw socket or OOTB handling */
2615                         ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira);
2616                         return;
2617                 }
2618 
2619                 /* Found a client; up it goes */
2620                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2621                 sctp_input(connp, ipha, NULL, mp, ira);
2622                 /* sctp_input does a rele of the sctp_t */
2623                 return;
2624         }
2625 
2626         case IPPROTO_UDP:
2627                 /* First mblk contains IP+UDP headers as checked above */
2628                 ASSERT(MBLKL(mp) >= ip_hdr_length + UDPH_SIZE);
2629 
2630                 if (iraflags & IRAF_MULTIBROADCAST) {
2631                         uint16_t *up;   /* Pointer to ports in ULP header */
2632 
2633                         up = (uint16_t *)((uchar_t *)ipha + ip_hdr_length);
2634                         ip_fanout_udp_multi_v4(mp, ipha, up[1], up[0], ira);
2635                         return;
2636                 }
2637 
2638                 /* Look for AF_INET or AF_INET6 that matches */
2639                 connp = ipcl_classify_v4(mp, IPPROTO_UDP, ip_hdr_length,
2640                     ira, ipst);
2641                 if (connp == NULL) {
2642         no_udp_match:
2643                         if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_UDP].
2644                             connf_head != NULL) {
2645                                 ASSERT(ira->ira_protocol == IPPROTO_UDP);
2646                                 ip_fanout_proto_v4(mp, ipha, ira);
2647                         } else {
2648                                 ip_fanout_send_icmp_v4(mp,
2649                                     ICMP_DEST_UNREACHABLE,
2650                                     ICMP_PORT_UNREACHABLE, ira);
2651                         }
2652                         return;
2653 
2654                 }
2655                 if (connp->conn_incoming_ifindex != 0 &&
2656                     connp->conn_incoming_ifindex != ira->ira_ruifindex) {
2657                         CONN_DEC_REF(connp);
2658                         goto no_udp_match;
2659                 }
2660                 if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld :
2661                     !canputnext(connp->conn_rq)) {
2662                         CONN_DEC_REF(connp);
2663                         BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows);
2664                         ip_drop_input("udpIfStatsInOverflows", mp, ill);
2665                         freemsg(mp);
2666                         return;
2667                 }
2668                 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) ||
2669                     (iraflags & IRAF_IPSEC_SECURE)) {
2670                         mp = ipsec_check_inbound_policy(mp, connp,
2671                             ipha, NULL, ira);
2672                         if (mp == NULL) {
2673                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2674                                 /* Note that mp is NULL */
2675                                 ip_drop_input("ipIfStatsInDiscards", mp, ill);
2676                                 CONN_DEC_REF(connp);
2677                                 return;
2678                         }
2679                 }
2680                 /*
2681                  * Remove 0-spi if it's 0, or move everything behind
2682                  * the UDP header over it and forward to ESP via
2683                  * ip_fanout_v4().
2684                  */
2685                 if (connp->conn_udp->udp_nat_t_endpoint) {
2686                         if (iraflags & IRAF_IPSEC_SECURE) {
2687                                 ip_drop_packet(mp, B_TRUE, ira->ira_ill,
2688                                     DROPPER(ipss, ipds_esp_nat_t_ipsec),
2689                                     &ipss->ipsec_dropper);
2690                                 CONN_DEC_REF(connp);
2691                                 return;
2692                         }
2693 
2694                         mp = zero_spi_check(mp, ira);
2695                         if (mp == NULL) {
2696                                 /*
2697                                  * Packet was consumed - probably sent to
2698                                  * ip_fanout_v4.
2699                                  */
2700                                 CONN_DEC_REF(connp);
2701                                 return;
2702                         }
2703                         /* Else continue like a normal UDP packet. */
2704                         ipha = (ipha_t *)mp->b_rptr;
2705                         protocol = ipha->ipha_protocol;
2706                         ira->ira_protocol = protocol;
2707                 }
2708                 /* Found a client; up it goes */
2709                 IP_STAT(ipst, ip_udp_fannorm);
2710                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2711                 ira->ira_ill = ira->ira_rill = NULL;
2712                 (connp->conn_recv)(connp, mp, NULL, ira);
2713                 CONN_DEC_REF(connp);
2714                 ira->ira_ill = ill;
2715                 ira->ira_rill = rill;
2716                 return;
2717         default:
2718                 break;
2719         }
2720 
2721         /*
2722          * Clear hardware checksumming flag as it is currently only
2723          * used by TCP and UDP.
2724          */
2725         DB_CKSUMFLAGS(mp) = 0;
2726 
2727         switch (protocol) {
2728         case IPPROTO_ICMP:
2729                 /*
2730                  * We need to accommodate icmp messages coming in clear
2731                  * until we get everything secure from the wire. If
2732                  * icmp_accept_clear_messages is zero we check with
2733                  * the global policy and act accordingly. If it is
2734                  * non-zero, we accept the message without any checks.
2735                  * But *this does not mean* that this will be delivered
2736                  * to RAW socket clients. By accepting we might send
2737                  * replies back, change our MTU value etc.,
2738                  * but delivery to the ULP/clients depends on their
2739                  * policy dispositions.
2740                  */
2741                 if (ipst->ips_icmp_accept_clear_messages == 0) {
2742                         mp = ipsec_check_global_policy(mp, NULL,
2743                             ipha, NULL, ira, ns);
2744                         if (mp == NULL)
2745                                 return;
2746                 }
2747 
2748                 /*
2749                  * On a labeled system, we have to check whether the zone
2750                  * itself is permitted to receive raw traffic.
2751                  */
2752                 if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
2753                         if (!tsol_can_accept_raw(mp, ira, B_FALSE)) {
2754                                 BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
2755                                 ip_drop_input("tsol_can_accept_raw", mp, ill);
2756                                 freemsg(mp);
2757                                 return;
2758                         }
2759                 }
2760 
2761                 /*
2762                  * ICMP header checksum, including checksum field,
2763                  * should be zero.
2764                  */
2765                 if (IP_CSUM(mp, ip_hdr_length, 0)) {
2766                         BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs);
2767                         ip_drop_input("icmpInCksumErrs", mp, ill);
2768                         freemsg(mp);
2769                         return;
2770                 }
2771                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2772                 mp = icmp_inbound_v4(mp, ira);
2773                 if (mp == NULL) {
2774                         /* No need to pass to RAW sockets */
2775                         return;
2776                 }
2777                 break;
2778 
2779         case IPPROTO_IGMP:
2780                 /*
2781                  * If we are not willing to accept IGMP packets in clear,
2782                  * then check with global policy.
2783                  */
2784                 if (ipst->ips_igmp_accept_clear_messages == 0) {
2785                         mp = ipsec_check_global_policy(mp, NULL,
2786                             ipha, NULL, ira, ns);
2787                         if (mp == NULL)
2788                                 return;
2789                 }
2790                 if ((ira->ira_flags & IRAF_SYSTEM_LABELED) &&
2791                     !tsol_can_accept_raw(mp, ira, B_TRUE)) {
2792                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2793                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
2794                         freemsg(mp);
2795                         return;
2796                 }
2797                 /*
2798                  * Validate checksum
2799                  */
2800                 if (IP_CSUM(mp, ip_hdr_length, 0)) {
2801                         ++ipst->ips_igmpstat.igps_rcv_badsum;
2802                         ip_drop_input("igps_rcv_badsum", mp, ill);
2803                         freemsg(mp);
2804                         return;
2805                 }
2806 
2807                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2808                 mp = igmp_input(mp, ira);
2809                 if (mp == NULL) {
2810                         /* Bad packet - discarded by igmp_input */
2811                         return;
2812                 }
2813                 break;
2814         case IPPROTO_PIM:
2815                 /*
2816                  * If we are not willing to accept PIM packets in clear,
2817                  * then check with global policy.
2818                  */
2819                 if (ipst->ips_pim_accept_clear_messages == 0) {
2820                         mp = ipsec_check_global_policy(mp, NULL,
2821                             ipha, NULL, ira, ns);
2822                         if (mp == NULL)
2823                                 return;
2824                 }
2825                 if ((ira->ira_flags & IRAF_SYSTEM_LABELED) &&
2826                     !tsol_can_accept_raw(mp, ira, B_TRUE)) {
2827                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2828                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
2829                         freemsg(mp);
2830                         return;
2831                 }
2832                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2833 
2834                 /* Checksum is verified in pim_input */
2835                 mp = pim_input(mp, ira);
2836                 if (mp == NULL) {
2837                         /* Bad packet - discarded by pim_input */
2838                         return;
2839                 }
2840                 break;
2841         case IPPROTO_AH:
2842         case IPPROTO_ESP: {
2843                 /*
2844                  * Fast path for AH/ESP.
2845                  */
2846                 netstack_t *ns = ipst->ips_netstack;
2847                 ipsec_stack_t *ipss = ns->netstack_ipsec;
2848 
2849                 IP_STAT(ipst, ipsec_proto_ahesp);
2850 
2851                 if (!ipsec_loaded(ipss)) {
2852                         ip_proto_not_sup(mp, ira);
2853                         return;
2854                 }
2855 
2856                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2857                 /* select inbound SA and have IPsec process the pkt */
2858                 if (protocol == IPPROTO_ESP) {
2859                         esph_t *esph;
2860                         boolean_t esp_in_udp_sa;
2861                         boolean_t esp_in_udp_packet;
2862 
2863                         mp = ipsec_inbound_esp_sa(mp, ira, &esph);
2864                         if (mp == NULL)
2865                                 return;
2866 
2867                         ASSERT(esph != NULL);
2868                         ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
2869                         ASSERT(ira->ira_ipsec_esp_sa != NULL);
2870                         ASSERT(ira->ira_ipsec_esp_sa->ipsa_input_func != NULL);
2871 
2872                         esp_in_udp_sa = ((ira->ira_ipsec_esp_sa->ipsa_flags &
2873                             IPSA_F_NATT) != 0);
2874                         esp_in_udp_packet =
2875                             (ira->ira_flags & IRAF_ESP_UDP_PORTS) != 0;
2876 
2877                         /*
2878                          * The following is a fancy, but quick, way of saying:
2879                          * ESP-in-UDP SA and Raw ESP packet --> drop
2880                          *    OR
2881                          * ESP SA and ESP-in-UDP packet --> drop
2882                          */
2883                         if (esp_in_udp_sa != esp_in_udp_packet) {
2884                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2885                                 ip_drop_packet(mp, B_TRUE, ira->ira_ill,
2886                                     DROPPER(ipss, ipds_esp_no_sa),
2887                                     &ipss->ipsec_dropper);
2888                                 return;
2889                         }
2890                         mp = ira->ira_ipsec_esp_sa->ipsa_input_func(mp, esph,
2891                             ira);
2892                 } else {
2893                         ah_t *ah;
2894 
2895                         mp = ipsec_inbound_ah_sa(mp, ira, &ah);
2896                         if (mp == NULL)
2897                                 return;
2898 
2899                         ASSERT(ah != NULL);
2900                         ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
2901                         ASSERT(ira->ira_ipsec_ah_sa != NULL);
2902                         ASSERT(ira->ira_ipsec_ah_sa->ipsa_input_func != NULL);
2903                         mp = ira->ira_ipsec_ah_sa->ipsa_input_func(mp, ah,
2904                             ira);
2905                 }
2906 
2907                 if (mp == NULL) {
2908                         /*
2909                          * Either it failed or is pending. In the former case
2910                          * ipIfStatsInDiscards was increased.
2911                          */
2912                         return;
2913                 }
2914                 /* we're done with IPsec processing, send it up */
2915                 ip_input_post_ipsec(mp, ira);
2916                 return;
2917         }
2918         case IPPROTO_ENCAP: {
2919                 ipha_t          *inner_ipha;
2920 
2921                 /*
2922                  * Handle self-encapsulated packets (IP-in-IP where
2923                  * the inner addresses == the outer addresses).
2924                  */
2925                 if ((uchar_t *)ipha + ip_hdr_length + sizeof (ipha_t) >
2926                     mp->b_wptr) {
2927                         if (ira->ira_pktlen <
2928                             ip_hdr_length + sizeof (ipha_t)) {
2929                                 BUMP_MIB(ill->ill_ip_mib,
2930                                     ipIfStatsInTruncatedPkts);
2931                                 ip_drop_input("ipIfStatsInTruncatedPkts",
2932                                     mp, ill);
2933                                 freemsg(mp);
2934                                 return;
2935                         }
2936                         ipha = ip_pullup(mp, (uchar_t *)ipha + ip_hdr_length +
2937                             sizeof (ipha_t) - mp->b_rptr, ira);
2938                         if (ipha == NULL) {
2939                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2940                                 ip_drop_input("ipIfStatsInDiscards", mp, ill);
2941                                 freemsg(mp);
2942                                 return;
2943                         }
2944                 }
2945                 inner_ipha = (ipha_t *)((uchar_t *)ipha + ip_hdr_length);
2946                 /*
2947                  * Check the sanity of the inner IP header.
2948                  */
2949                 if ((IPH_HDR_VERSION(inner_ipha) != IPV4_VERSION)) {
2950                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2951                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
2952                         freemsg(mp);
2953                         return;
2954                 }
2955                 if (IPH_HDR_LENGTH(inner_ipha) < sizeof (ipha_t)) {
2956                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2957                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
2958                         freemsg(mp);
2959                         return;
2960                 }
2961                 if (inner_ipha->ipha_src != ipha->ipha_src ||
2962                     inner_ipha->ipha_dst != ipha->ipha_dst) {
2963                         /* We fallthru to iptun fanout below */
2964                         goto iptun;
2965                 }
2966 
2967                 /*
2968                  * Self-encapsulated tunnel packet. Remove
2969                  * the outer IP header and fanout again.
2970                  * We also need to make sure that the inner
2971                  * header is pulled up until options.
2972                  */
2973                 mp->b_rptr = (uchar_t *)inner_ipha;
2974                 ipha = inner_ipha;
2975                 ip_hdr_length = IPH_HDR_LENGTH(ipha);
2976                 if ((uchar_t *)ipha + ip_hdr_length > mp->b_wptr) {
2977                         if (ira->ira_pktlen <
2978                             (uchar_t *)ipha + ip_hdr_length - mp->b_rptr) {
2979                                 BUMP_MIB(ill->ill_ip_mib,
2980                                     ipIfStatsInTruncatedPkts);
2981                                 ip_drop_input("ipIfStatsInTruncatedPkts",
2982                                     mp, ill);
2983                                 freemsg(mp);
2984                                 return;
2985                         }
2986                         ipha = ip_pullup(mp,
2987                             (uchar_t *)ipha + ip_hdr_length - mp->b_rptr, ira);
2988                         if (ipha == NULL) {
2989                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2990                                 ip_drop_input("ipIfStatsInDiscards", mp, ill);
2991                                 freemsg(mp);
2992                                 return;
2993                         }
2994                 }
2995                 if (ip_hdr_length > sizeof (ipha_t)) {
2996                         /* We got options on the inner packet. */
2997                         ipaddr_t        dst = ipha->ipha_dst;
2998                         int             error = 0;
2999 
3000                         dst = ip_input_options(ipha, dst, mp, ira, &error);
3001                         if (error != 0) {
3002                                 /*
3003                                  * An ICMP error has been sent and the packet
3004                                  * has been dropped.
3005                                  */
3006                                 return;
3007                         }
3008                         if (dst != ipha->ipha_dst) {
3009                                 /*
3010                                  * Someone put a source-route in
3011                                  * the inside header of a self-
3012                                  * encapsulated packet.  Drop it
3013                                  * with extreme prejudice and let
3014                                  * the sender know.
3015                                  */
3016                                 ip_drop_input("ICMP_SOURCE_ROUTE_FAILED",
3017                                     mp, ill);
3018                                 icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED,
3019                                     ira);
3020                                 return;
3021                         }
3022                 }
3023                 if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
3024                         /*
3025                          * This means that somebody is sending
3026                          * Self-encapsulated packets without AH/ESP.
3027                          *
3028                          * Send this packet to find a tunnel endpoint.
3029                          * if I can't find one, an ICMP
3030                          * PROTOCOL_UNREACHABLE will get sent.
3031                          */
3032                         protocol = ipha->ipha_protocol;
3033                         ira->ira_protocol = protocol;
3034                         goto iptun;
3035                 }
3036 
3037                 /* Update based on removed IP header */
3038                 ira->ira_ip_hdr_length = ip_hdr_length;
3039                 ira->ira_pktlen = ntohs(ipha->ipha_length);
3040 
3041                 if (ira->ira_flags & IRAF_IPSEC_DECAPS) {
3042                         /*
3043                          * This packet is self-encapsulated multiple
3044                          * times. We don't want to recurse infinitely.
3045                          * To keep it simple, drop the packet.
3046                          */
3047                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3048                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
3049                         freemsg(mp);
3050                         return;
3051                 }
3052                 ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
3053                 ira->ira_flags |= IRAF_IPSEC_DECAPS;
3054 
3055                 ip_input_post_ipsec(mp, ira);
3056                 return;
3057         }
3058 
3059         iptun:  /* IPPROTO_ENCAPS that is not self-encapsulated */
3060         case IPPROTO_IPV6:
3061                 /* iptun will verify trusted label */
3062                 connp = ipcl_classify_v4(mp, protocol, ip_hdr_length,
3063                     ira, ipst);
3064                 if (connp != NULL) {
3065                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
3066                         ira->ira_ill = ira->ira_rill = NULL;
3067                         (connp->conn_recv)(connp, mp, NULL, ira);
3068                         CONN_DEC_REF(connp);
3069                         ira->ira_ill = ill;
3070                         ira->ira_rill = rill;
3071                         return;
3072                 }
3073                 /* FALLTHRU */
3074         default:
3075                 /*
3076                  * On a labeled system, we have to check whether the zone
3077                  * itself is permitted to receive raw traffic.
3078                  */
3079                 if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
3080                         if (!tsol_can_accept_raw(mp, ira, B_FALSE)) {
3081                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3082                                 ip_drop_input("ipIfStatsInDiscards", mp, ill);
3083                                 freemsg(mp);
3084                                 return;
3085                         }
3086                 }
3087                 break;
3088         }
3089 
3090         /*
3091          * The above input functions may have returned the pulled up message.
3092          * So ipha needs to be reinitialized.
3093          */
3094         ipha = (ipha_t *)mp->b_rptr;
3095         ira->ira_protocol = protocol = ipha->ipha_protocol;
3096         if (ipst->ips_ipcl_proto_fanout_v4[protocol].connf_head == NULL) {
3097                 /*
3098                  * No user-level listener for these packets.
3099                  * Check for IPPROTO_ENCAP...
3100                  */
3101                 if (protocol == IPPROTO_ENCAP && ipst->ips_ip_g_mrouter) {
3102                         /*
3103                          * Check policy here,
3104                          * THEN ship off to ip_mroute_decap().
3105                          *
3106                          * BTW,  If I match a configured IP-in-IP
3107                          * tunnel above, this path will not be reached, and
3108                          * ip_mroute_decap will never be called.
3109                          */
3110                         mp = ipsec_check_global_policy(mp, connp,
3111                             ipha, NULL, ira, ns);
3112                         if (mp != NULL) {
3113                                 ip_mroute_decap(mp, ira);
3114                         } /* Else we already freed everything! */
3115                 } else {
3116                         ip_proto_not_sup(mp, ira);
3117                 }
3118                 return;
3119         }
3120 
3121         /*
3122          * Handle fanout to raw sockets.  There
3123          * can be more than one stream bound to a particular
3124          * protocol.  When this is the case, each one gets a copy
3125          * of any incoming packets.
3126          */
3127         ASSERT(ira->ira_protocol == ipha->ipha_protocol);
3128         ip_fanout_proto_v4(mp, ipha, ira);
3129         return;
3130 
3131 discard:
3132         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3133         ip_drop_input("ipIfStatsInDiscards", mp, ill);
3134         freemsg(mp);
3135 #undef rptr
3136 }